In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import randint

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV , RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

import joblib
import os

# Import functions

In [2]:
def load_data():
    '''
    A function for loading csv data into dataframe df.
    '''

    #Location of csv file
    csv_file = '../airpollutionlevels/raw_data/air_pollution_data_upd.csv'

    #Loading csv file into df dataframe
    df = pd.read_csv(csv_file)

    return df

def clean_data(df):
    '''
    A function to clean raw data:
    - Dropping unuseful columns
    - Dropping rows with year = NA
    - Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    '''

    #Dropping columns: web_link, reference, iso3, who_ms, population_source, version, pm10_tempcov, pm25_tempcov, no2_tempcov
    df.drop(columns=['web_link',
                     'reference',
                     'iso3',
                     'who_ms',
                     'population_source',
                     'version',
                     'pm10_tempcov',
                     'pm25_tempcov',
                     'no2_tempcov'],
            inplace=True)

    #Dropping rows where year is NA (3 rows for India)
    df.dropna(subset=['year'], inplace=True)

    #Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    df.dropna(how='all', subset=['pm10_concentration', 'pm25_concentration', 'no2_concentration'], inplace=True)

    return df


In [3]:
def classify_concentrations(df):
    '''
    Classifies the concentrations of NO2, PM10, and PM2.5 into categories based on the European Air Quality Index (AQI) classification.
    Sets the target class as the maximum of the three classified pollutant concentrations.
    '''
    # Define classification limits
    no2_limits = [0, 40, 90, 120, 230, 340, 1000]
    pm25_limits = [0, 10, 20, 25, 50, 75, 800]
    pm10_limits = [0, 20, 40, 50, 100, 150, 1200]

    # Classify PM10 concentrations
    df['pm10_class'] = pd.cut(df['pm10_concentration'], bins=pm10_limits, labels=[1, 2, 3, 4, 5, 6])

    # Classify PM2.5 concentrations
    df['pm25_class'] = pd.cut(df['pm25_concentration'], bins=pm25_limits, labels=[1, 2, 3, 4, 5, 6])

    # Classify NO2 concentrations
    df['no2_class'] = pd.cut(df['no2_concentration'], bins=no2_limits, labels=[1, 2, 3, 4, 5, 6])

    # Determine the target class as the maximum of the three pollutant classes
    df['target_class'] = df[['pm10_class', 'pm25_class', 'no2_class']].apply(lambda row: row.max(), axis=1)

    # Drop the intermediate class columns
    df = df.drop(columns=['pm10_class', 'no2_class', 'pm25_class'])
    # Saving in a csv file as we need that to fetch information for Predictions
    df.to_csv('../airpollutionlevels/raw_data/data_lib.csv', index=False)
    return df

In [4]:
def simplify_stations(station_type):
    '''
    Simplifies the station type string by removing duplicates and sorting.

    Args:
    - station_type (str): A string containing station types separated by ', ' e.g. Urban, urban, urban.

    Returns:
    - str: Simplified station types joined into a single string e.g "Urban, urban, urban" returns "Urban"

    If station_type is NaN (missing), returns 'unknown'.'''

    if pd.isna(station_type):
        return "unknown"
    unique_types = sorted(set(station_type.split(', ')))
    return ', '.join(unique_types)

def simplified_station_type(df):
    '''
    Adds a new column 'simplified_station_type' to the DataFrame 'df' based on simplifying 'type_of_stations'.

    Args:
    - df (pandas.DataFrame): The DataFrame containing the column 'type_of_stations' to be simplified.

    Returns:
    - pandas.DataFrame: The input DataFrame 'df' with an additional column 'simplified_station_type'.

    This function applies the 'simplify_stations' function to each value in the 'type_of_stations' column
    and stores the simplified result in a new column 'simplified_station_type'
    '''

    df['type_of_stations'] = df['type_of_stations'].astype('string') #converts type_of_stations column into a string in order to apply simplify_stations function
    df['simplified_station_type'] = df['type_of_stations'].apply(simplify_stations)
    return df

def impute_stations(df):
    '''
    Imputes the values of missing type_of_stations based on similar pollution metrics of know types of stations using KNN imputer'''

    #first simplify station names using simplified_station_type function
    simplified_station_type(df)

    # Manually map known types of stations to numerical labels from stations3 df
    type_mapping = {
        'Unknown': np.nan, #will need this to be nan for imputer to work
        'Urban': 1,
        'Rural': 2,
        'Suburban': 3,
        'Suburban, Urban': 4,
        'Rural, Urban': 5,
        'Rural, Suburban, Urban': 6,
        'Rural, Suburban': 7,
        'Background': 8,
        'Residential And Commercial Area': 9,
        'Traffic': 10,
        'Residential And Commercial Area, Urban Traffic': 11,
        'Background, Traffic': 12,
        'Industrial': 13,
        'Residential And Commercial Area, Urban Traffic': 14,
        'Industrial, Urban': 15,
        'Industrial, Rural, Urban': 16,
        'Residential': 17,
        'Fond Urbain, Traffic': 18,
        'Residential - industrial': 19
    }

    df['encoded_station_type'] = df['simplified_station_type'].map(type_mapping) # encode simpified_station_type column to feed into KNN imputer

    # Select features for imputation
    features = ['pm10_concentration', 'pm25_concentration', 'no2_concentration', 'encoded_station_type'] #features to be learned by imputer

    # Perform KNN imputation
    imputer = KNNImputer(n_neighbors=5) #instantiate imputer
    df_imputed = imputer.fit_transform(df[features]) #returns array with learned features

    # Assign imputed values back to DataFrame
    df['encoded_station_type_imputed'] = df_imputed[:, -1]  # Assuming encoded_station_type is the last column after imputation

    # Revert encoded_station_type back to original categorical values
    reverse_mapping = {v: k for k, v in type_mapping.items() if pd.notna(v)}  # Reverse mapping excluding NaNs. source >> https://stackoverflow.com/questions/483666/reverse-invert-a-dictionary-mapping

    df['final_station_type'] = df['encoded_station_type_imputed'].round().astype(int).map(reverse_mapping).fillna(np.nan)

    return df

In [11]:
def encode_scale_data(df):
    """
    Encode and scale the data for prediction.
    Add a temporary unique_id for tracking purposes.

    Parameters:
    df (DataFrame): Input DataFrame containing raw data.

    Returns:
    DataFrame: Transformed DataFrame with encoded and scaled features.
    """
    df = df.copy()
    # Drop rows with missing values in critical columns
    df = df.dropna(subset=['country_name', 'year', 'population', 'latitude', 'longitude'])

    # Convert 'year' to integer
    df['year'] = df['year'].astype(int)

    # Columns to drop if they exist in the DataFrame
    columns_to_drop = ['pm10_concentration', 'pm25_concentration', 'no2_concentration', 'type_of_stations',
                       'simplified_station_type', 'encoded_station_type', 'encoded_station_type_imputed']
    df = df.drop(columns=columns_to_drop, axis=1)

    # Drop 'city' column if it exists
    if 'city' in df.columns:
        df = df.drop('city', axis=1)

    # Add a unique identifier for tracking purposes
    df['unique_id'] = range(len(df))

    # Reset index to ensure it's sequential and clean
    df = df.reset_index(drop=True)

    # Define the columns for encoding and scaling
    categorical_cols = ['who_region', 'country_name', 'final_station_type']
    numeric_cols = ['population', 'latitude', 'longitude']

    # Instantiate encoders and scalers
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    scaler = StandardScaler()

    # Pipeline for encoding and scaling
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', onehot_encoder, categorical_cols),
            ('scaler', scaler, numeric_cols)
        ],
        remainder='passthrough'  # Keep the 'unique_id' column unchanged
    )

    # Apply transformations (excluding 'pm25_concentration' if it exists)
    transformed_data = preprocessor.fit_transform(df.drop(columns=['target_class'], errors='ignore'))

    # Get the feature names after one-hot encoding
    ohe_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_cols)

    # Construct the final DataFrame columns
    final_columns = list(ohe_feature_names) + numeric_cols + ['year', 'unique_id']


    # Create the final DataFrame
    df_transformed = pd.DataFrame(transformed_data, columns=final_columns)
    df_transformed['target_class'] = df['target_class'].values

    # Save the transformed data to a CSV file
    df_transformed.to_csv('../airpollutionlevels/raw_data/air_pollution_data_encoded_class.csv', index=False)

    return df_transformed


# Preprocess

In [8]:
data_ = load_data()

In [9]:
data_ = clean_data(data_)

In [10]:
data_ = impute_stations(data_)

In [12]:
data2 = classify_concentrations(data_)

In [13]:
data = encode_scale_data(data2)

In [14]:
data

Unnamed: 0,who_region_2_Amr,who_region_3_Sear,who_region_4_Eur,who_region_5_Emr,who_region_6_Wpr,who_region_7_NonMS,country_name_Albania,country_name_Algeria,country_name_Andorra,country_name_Argentina,...,"final_station_type_Suburban, Urban",final_station_type_Traffic,final_station_type_Urban,final_station_type_nan,population,latitude,longitude,year,unique_id,target_class
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,-0.116974,0.198152,-0.333542,2013.0,0.0,2
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,-0.116974,0.198160,-0.333536,2014.0,1.0,2
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,-0.116974,0.198305,-0.333617,2015.0,2.0,2
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,-0.116974,0.198305,-0.333617,2016.0,3.0,2
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,-0.116974,0.198305,-0.333617,2017.0,4.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38614,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.242107,0.735796,0.166542,2019.0,38614.0,2
38615,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.242107,0.735796,0.166542,2020.0,38615.0,2
38616,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.247440,0.589022,0.145486,2010.0,38616.0,4
38617,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.247440,0.589351,0.145324,2013.0,38617.0,4


# Model Explore 

In [15]:
#data['year']=data['year'].astype(int)
X = data.drop(['target_class'], axis=1)
y = data['target_class']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Grid Search

In [16]:
# Define the Decision Tree classifier
dt = DecisionTreeClassifier(random_state=42)

In [17]:
param_grid = {
'criterion': ['gini'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['log2', None]}

In [18]:
# Convert y_train to categorical
y_train = y_train.astype('category')
y_test = y_test.astype('category')

In [19]:


# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [20]:
# Get the best model from GridSearchCV
best_dt = grid_search.best_estimator_

# Predict on the test set
y_pred = best_dt.predict(X_test)

# Evaluate performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Classification Report:
              precision    recall  f1-score   support

           1       0.84      0.84      0.84      3110
           2       0.77      0.77      0.77      2851
           3       0.40      0.42      0.41       433
           4       0.75      0.72      0.73       916
           5       0.65      0.68      0.67       298
           6       0.63      0.57      0.60       116

    accuracy                           0.77      7724
   macro avg       0.67      0.67      0.67      7724
weighted avg       0.77      0.77      0.77      7724

Accuracy: 0.7667


In [21]:
print("Best Parameters found by GridSearchCV:")
print(grid_search.best_params_)

print("Best Estimator found by GridSearchCV:")
print(grid_search.best_estimator_)

Best Parameters found by GridSearchCV:
{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Estimator found by GridSearchCV:
DecisionTreeClassifier(random_state=42)


## Random Search

In [22]:
param_dist = {
    'criterion': ['gini'],
    'max_depth': [None] + list(range(20, 40, 2)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 10),
    'max_features': [None]
}

In [23]:
random_search = RandomizedSearchCV(dt, param_distributions=param_dist,n_iter=100,
                                   cv=5, random_state=42, scoring='accuracy', verbose=1, n_jobs=-1)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [24]:
# Retrieve the best parameters and estimator
print("Best Parameters found by RandomizedSearchCV:")
print(random_search.best_params_)

print("Best Estimator found by RandomizedSearchCV:")
print(random_search.best_estimator_)

# Use the best estimator to make predictions
y_pred_random = random_search.best_estimator_.predict(X_test)

# Evaluate the model
print("Classification Report for RandomizedSearchCV:")
print(classification_report(y_test, y_pred_random))

# Accuracy score
accuracy_random = accuracy_score(y_test, y_pred_random)
print(f"Accuracy: {accuracy_random:.4f}")

Best Parameters found by RandomizedSearchCV:
{'criterion': 'gini', 'max_depth': 22, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Estimator found by RandomizedSearchCV:
DecisionTreeClassifier(max_depth=22, random_state=42)
Classification Report for RandomizedSearchCV:
              precision    recall  f1-score   support

           1       0.83      0.84      0.84      3110
           2       0.77      0.76      0.76      2851
           3       0.40      0.44      0.42       433
           4       0.75      0.73      0.74       916
           5       0.65      0.66      0.65       298
           6       0.67      0.60      0.63       116

    accuracy                           0.76      7724
   macro avg       0.68      0.67      0.67      7724
weighted avg       0.77      0.76      0.76      7724

Accuracy: 0.7644


## Final Model

In [25]:
# Define the model with the best parameters found
best_dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=30,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42
)

# Evaluate the model using cross-validation and using StratifiedKfold to  ensure same proportion of each target_class
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(best_dt, X_train, y_train, cv=stratified_kfold, scoring='accuracy', verbose=1)

# Output cross-validation scores
print(f"Cross-Validation Scores: {cross_val_scores}")
print("Accuracy:", format(accuracy, '.4f'))


Cross-Validation Scores: [0.7592233  0.7605178  0.76084142 0.74498382 0.75339806 0.75493687
 0.76173519 0.76853351 0.76400129 0.75169958]
Accuracy: 0.7667


## Functions

In [26]:
data_.head()


Unnamed: 0,who_region,country_name,city,year,pm10_concentration,pm25_concentration,no2_concentration,type_of_stations,population,latitude,longitude,simplified_station_type,encoded_station_type,encoded_station_type_imputed,final_station_type,pm10_class,pm25_class,no2_class,target_class
0,4_Eur,Spain,A Coruna,2013.0,23.238,11.491,28.841,"Urban, Urban, Suburban",246056.0,43.3679,-8.418571,"Suburban, Urban",4.0,4.0,"Suburban, Urban",2,2,1,2
1,4_Eur,Spain,A Coruna,2014.0,27.476,15.878,19.575,"Urban, Urban, Suburban",246056.0,43.368033,-8.418233,"Suburban, Urban",4.0,4.0,"Suburban, Urban",2,2,1,2
2,4_Eur,Spain,A Coruna,2015.0,25.515,14.004,22.731,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban",2,2,1,2
3,4_Eur,Spain,A Coruna,2016.0,23.057,13.16,20.204,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban",2,2,1,2
4,4_Eur,Spain,A Coruna,2017.0,26.849,14.114,21.543,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban",2,2,1,2


In [34]:
def train_and_save_model():
    """
    Train a Decision Tree model on the provided dataset and save it.

    Parameters:
    df (pd.DataFrame): Preprocessed dataset containing features and target_class.
    """
    # Data loading
    data_path = '../airpollutionlevels/raw_data/air_pollution_data_encoded_class.csv'
    df = pd.read_csv(data_path)
    # Specify the model file path
    model_dir = '.../airpollutionlevels/models/'
    os.makedirs(model_dir, exist_ok=True)  # Create directory if it doesn't exist
    model_filename = os.path.join(model_dir, 'decision_tree_model.pkl')

    # Split the data into features (X) and target (y)
    X = df.drop(columns=['target_class', 'unique_id'])
    y = df['target_class']

    # Initialize the Decision Tree model with best parameters
    dt = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_leaf=4, min_samples_split=2, random_state=42)

    # Fit the model
    dt.fit(X, y)

    # Save the model
    joblib.dump(dt, model_filename)
    print(f"Model saved at {model_filename}")

In [35]:
def evaluate_model():
    """
    Load a saved Decision Tree model and evaluate its performance on the provided dataset.

    Parameters:
    df (pd.DataFrame): Preprocessed dataset containing features and target_class.

    """
    # Load data
    data_path = '../airpollutionlevels/raw_data/air_pollution_data_encoded_class.csv'
    df = pd.read_csv(data_path)
    # Load the model
    model_filename = '../airpollutionlevels/models/decision_tree_model.pkl'
    dt = joblib.load(model_filename)

    # Split the data into features (X) and target (y)
    X = df.drop(columns=['target_class' , 'unique_id'])
    y = df['target_class']

    # Make predictions
    y_pred = dt.predict(X)

    # Print classification report
    print("Classification Report:")
    print(classification_report(y, y_pred))

    # Calculate and print accuracy
    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy}")


In [36]:
def predict(city, year):
    """
    Load a saved Decision Tree model and predict pollution level for a given city and year.
    If the year does not exist in the dataset, use the latest available data.

    Parameters:
    city (str): Name of the city.
    year (int): Year for the prediction.

    Returns:
    int: Predicted pollution level.
    """

    model_filename = '../airpollutionlevels/models/decision_tree_model.pkl'
    data = pd.read_csv('../airpollutionlevels//raw_data/data_lib.csv')

    # Verify if the 'city' column exists in the DataFrame
    if 'city' not in data.columns:
        raise ValueError("'city' column not found in the dataset. Please ensure the dataset contains a 'city' column.")

    # Check if the city exists in the dataset
    if city not in data['city'].values:
        raise ValueError(f"No data found for '{city}' in the dataset.")

    # Find the latest row for the provided city in the original dataset
    city_data = data[(data['city'] == city) & (data['year'] == data[data['city'] == city]['year'].max())].copy()

     # Add a unique identifier before encoding
    city_data.loc[:, 'unique_id'] = 0  # Set a temporary unique_id for tracking

    # Encode and scale the entire data
    encoded_data = encode_scale_data(data)

    # Locate the corresponding row in the encoded data using the unique identifier
    encoded_city_data = encoded_data[encoded_data['unique_id'] == 0].copy()

    # Replace the year in the encoded city data with the input year
    encoded_city_data.loc[:, 'year'] = year

    # Drop the unique_id column as it's no longer needed
    encoded_city_data = encoded_city_data.drop(columns=['unique_id'])

    # Prepare features for prediction (drop the 'target_class' column if it exists)
    X_predict = encoded_city_data.drop(columns=['target_class'], errors='ignore')

    # Load the model
    dt = joblib.load(model_filename)

    # Predict
    prediction = dt.predict(X_predict)[0]

    print(f"Predicted pollution level for {city} in {year}: {prediction}")
    return prediction

In [33]:
train_and_save_model()

Model saved at .../airpollutionlevels/models/decision_tree_model.pkl


In [38]:
evaluate_model()

Classification Report:
              precision    recall  f1-score   support

           1       0.91      0.94      0.93     15610
           2       0.89      0.88      0.88     14047
           3       0.71      0.65      0.68      2275
           4       0.86      0.86      0.86      4587
           5       0.86      0.82      0.84      1556
           6       0.88      0.76      0.82       544

    accuracy                           0.88     38619
   macro avg       0.85      0.82      0.83     38619
weighted avg       0.88      0.88      0.88     38619

Accuracy: 0.8840467127579689


In [40]:
predict('Berlin', 2018)

Predicted pollution level for Berlin in 2018: 2


2