In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Functions 

In [5]:
def load_data():
    '''
    A function for loading csv data into dataframe df.
    '''

    #Location of csv file
    csv_file = '../raw_data/air_pollution_data_upd.csv'

    #Loading csv file into df dataframe
    df = pd.read_csv(csv_file)

    return df

def clean_data(df):
    '''
    A function to clean raw data:
    - Dropping unuseful columns
    - Dropping rows with year = NA
    - Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    '''

    #Dropping columns: web_link, reference, iso3, who_ms, population_source, version, pm10_tempcov, pm25_tempcov, no2_tempcov
    df.drop(columns=['web_link',
                     'reference',
                     'iso3',
                     'who_ms',
                     'population_source',
                     'version',
                     'pm10_tempcov',
                     'pm25_tempcov',
                     'no2_tempcov'],
            inplace=True)

    #Dropping rows where year is NA (3 rows for India)
    df.dropna(subset=['year'], inplace=True)

    #Dropping rows where pm10_concentration AND pm25_concentration AND no2_concentration are NA
    df.dropna(how='all', subset=['pm10_concentration', 'pm25_concentration', 'no2_concentration'], inplace=True)

    return df

In [6]:
def simplify_stations(station_type):
    '''
    Simplifies the station type string by removing duplicates and sorting.

    Args:
    - station_type (str): A string containing station types separated by ', ' e.g. Urban, urban, urban.

    Returns:
    - str: Simplified station types joined into a single string e.g "Urban, urban, urban" returns "Urban"

    If station_type is NaN (missing), returns 'unknown'.'''

    if pd.isna(station_type):
        return "unknown"
    unique_types = sorted(set(station_type.split(', ')))
    return ', '.join(unique_types)

def simplified_station_type(df):
    '''
    Adds a new column 'simplified_station_type' to the DataFrame 'df' based on simplifying 'type_of_stations'.

    Args:
    - df (pandas.DataFrame): The DataFrame containing the column 'type_of_stations' to be simplified.

    Returns:
    - pandas.DataFrame: The input DataFrame 'df' with an additional column 'simplified_station_type'.

    This function applies the 'simplify_stations' function to each value in the 'type_of_stations' column
    and stores the simplified result in a new column 'simplified_station_type'
    '''

    df['type_of_stations'] = df['type_of_stations'].astype('string') #converts type_of_stations column into a string in order to apply simplify_stations function
    df['simplified_station_type'] = df['type_of_stations'].apply(simplify_stations)
    return df

def impute_stations(df):
    '''
    Imputes the values of missing type_of_stations based on similar pollution metrics of know types of stations using KNN imputer'''

    #first simplify station names using simplified_station_type function
    simplified_station_type(df)

    # Manually map known types of stations to numerical labels from stations3 df
    type_mapping = {
        'Unknown': np.nan, #will need this to be nan for imputer to work
        'Urban': 1,
        'Rural': 2,
        'Suburban': 3,
        'Suburban, Urban': 4,
        'Rural, Urban': 5,
        'Rural, Suburban, Urban': 6,
        'Rural, Suburban': 7,
        'Background': 8,
        'Residential And Commercial Area': 9,
        'Traffic': 10,
        'Residential And Commercial Area, Urban Traffic': 11,
        'Background, Traffic': 12,
        'Industrial': 13,
        'Residential And Commercial Area, Urban Traffic': 14,
        'Industrial, Urban': 15,
        'Industrial, Rural, Urban': 16,
        'Residential': 17,
        'Fond Urbain, Traffic': 18,
        'Residential - industrial': 19
    }

    df['encoded_station_type'] = df['simplified_station_type'].map(type_mapping) # encode simpified_station_type column to feed into KNN imputer

    # Select features for imputation
    features = ['population', 'pm25_concentration', 'encoded_station_type'] #features to be learned by imputer

    # Perform KNN imputation
    imputer = KNNImputer(n_neighbors=5) #instantiate imputer
    df_imputed = imputer.fit_transform(df[features]) #returns array with learned features

    # Assign imputed values back to DataFrame
    df['encoded_station_type_imputed'] = df_imputed[:, -1]  # Assuming encoded_station_type is the last column after imputation

    # Revert encoded_station_type back to original categorical values
    reverse_mapping = {v: k for k, v in type_mapping.items() if pd.notna(v)}  # Reverse mapping excluding NaNs. source >> https://stackoverflow.com/questions/483666/reverse-invert-a-dictionary-mapping

    df['final_station_type'] = df['encoded_station_type_imputed'].round().astype(int).map(reverse_mapping).fillna(np.nan)

    return df


In [7]:
def encode_scale_data(df):
    # Drop rows with missing values in critical columns
    df = df.dropna(subset=['country_name', 'year', 'population', 'latitude', 'longitude'])

    # Convert 'year' to integer
    df = df.copy()  # Make a copy to avoid modifying the original DataFrame slice
    df['year'] = df['year'].astype(int)

    # Columns to drop if they exist in the DataFrame
    columns_to_drop = ['type_of_stations', 'simplified_station_type', 'encoded_station_type', 'encoded_station_type_imputed']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], axis=1)

    # Drop 'city' column if it exists
    if 'city' in df.columns:
        df = df.drop(columns='city', axis=1)
    # Drop 'index' column if it exists (Dont know why it is there)
    if 'index' in df.columns:
        df = df.drop(columns='index', axis=1)

    # Reset index to ensure it's sequential and clean
    df = df.reset_index(drop=True)

    # Define the columns for encoding and scaling
    categorical_cols = ['who_region', 'country_name', 'final_station_type']
    numeric_cols = ['population', 'latitude', 'longitude']

    # Instantiate encoders and scalers
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    scaler = StandardScaler()

    # Pipeline for encoding and scaling
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', onehot_encoder, categorical_cols),
            ('scaler', scaler, numeric_cols)
        ],
        remainder='passthrough'  # Keep the year and pm25_concentration unchanged
    )

    # Apply transformations
    transformed_data = preprocessor.fit_transform(df)

    # Get the feature names after one-hot encoding
    ohe_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_cols)

    # Construct the final DataFrame
    final_columns = list(ohe_feature_names) + numeric_cols + ['year']
    if 'pm25_concentration' in df.columns:
        final_columns.append('pm25_concentration')


    # Check if the final columns match the transformed data shape
    if len(final_columns) != transformed_data.shape[1]:
        raise ValueError(f"Column mismatch: {len(final_columns)} final columns vs {transformed_data.shape[1]} transformed columns")

    df_transformed = pd.DataFrame(transformed_data, columns=final_columns)

    return df_transformed


# data   exploration and data preparation

In [8]:
data = load_data()

In [9]:
data = clean_data(data)

In [10]:
data.head()

Unnamed: 0,who_region,country_name,city,year,pm10_concentration,pm25_concentration,no2_concentration,type_of_stations,population,latitude,longitude
0,4_Eur,Spain,A Coruna,2013.0,23.238,11.491,28.841,"Urban, Urban, Suburban",246056.0,43.3679,-8.418571
1,4_Eur,Spain,A Coruna,2014.0,27.476,15.878,19.575,"Urban, Urban, Suburban",246056.0,43.368033,-8.418233
2,4_Eur,Spain,A Coruna,2015.0,25.515,14.004,22.731,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229
3,4_Eur,Spain,A Coruna,2016.0,23.057,13.16,20.204,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229
4,4_Eur,Spain,A Coruna,2017.0,26.849,14.114,21.543,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229


In [11]:
data.drop(columns=['pm10_concentration','no2_concentration'], inplace=True)

In [12]:
data = data.dropna(subset=['pm25_concentration', 'population']).reset_index()

In [13]:
#Generate a comprehensive summary for the DataFrame

# Number of missing values per column
missing_count = data.isna().sum()

# Percentage of missing values per column
missing_percentage = (data.isna().mean() * 100).round(2)

# Number of unique values per column
unique_count = data.nunique()

# Number of duplicate rows in the DataFrame
duplicate_count = data.duplicated().sum()


# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing Percentage (%)': missing_percentage,
    'Unique Count': unique_count
})

print("Comprehensive Data Summary:")

print(summary_df)
print(f"\nNumber of duplicate rows in the dataset: {duplicate_count}")
len(data)

Comprehensive Data Summary:
                    Missing Count  Missing Percentage (%)  Unique Count
index                           0                    0.00         20982
who_region                      0                    0.00             7
country_name                    0                    0.00           114
city                            0                    0.00          4209
year                            0                    0.00            13
pm25_concentration              0                    0.00         12335
type_of_stations            10373                   49.44           312
population                      0                    0.00          4923
latitude                        0                    0.00          7819
longitude                       0                    0.00          7817

Number of duplicate rows in the dataset: 0


20982

In [14]:
#Generate a comprehensive summary for the DataFrame

# Number of missing values per column
missing_count = data.isna().sum()

# Percentage of missing values per column
missing_percentage = (data.isna().mean() * 100).round(2)

# Number of unique values per column
unique_count = data.nunique()

# Number of duplicate rows in the DataFrame
duplicate_count = data.duplicated().sum()


# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing Percentage (%)': missing_percentage,
    'Unique Count': unique_count
})

print("Comprehensive Data Summary:")

print(summary_df)
print(f"\nNumber of duplicate rows in the dataset: {duplicate_count}")
len(data)

Comprehensive Data Summary:
                    Missing Count  Missing Percentage (%)  Unique Count
index                           0                    0.00         20982
who_region                      0                    0.00             7
country_name                    0                    0.00           114
city                            0                    0.00          4209
year                            0                    0.00            13
pm25_concentration              0                    0.00         12335
type_of_stations            10373                   49.44           312
population                      0                    0.00          4923
latitude                        0                    0.00          7819
longitude                       0                    0.00          7817

Number of duplicate rows in the dataset: 0


20982

In [15]:
data = impute_stations(data)

In [16]:
data.head()

Unnamed: 0,index,who_region,country_name,city,year,pm25_concentration,type_of_stations,population,latitude,longitude,simplified_station_type,encoded_station_type,encoded_station_type_imputed,final_station_type
0,0,4_Eur,Spain,A Coruna,2013.0,11.491,"Urban, Urban, Suburban",246056.0,43.3679,-8.418571,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
1,1,4_Eur,Spain,A Coruna,2014.0,15.878,"Urban, Urban, Suburban",246056.0,43.368033,-8.418233,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
2,2,4_Eur,Spain,A Coruna,2015.0,14.004,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
3,3,4_Eur,Spain,A Coruna,2016.0,13.16,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
4,4,4_Eur,Spain,A Coruna,2017.0,14.114,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.4229,"Suburban, Urban",4.0,4.0,"Suburban, Urban"


In [17]:
#Generate a comprehensive summary for the DataFrame

# Number of missing values per column
missing_count = data.isna().sum()

# Percentage of missing values per column
missing_percentage = (data.isna().mean() * 100).round(2)

# Number of unique values per column
unique_count = data.nunique()

# Number of duplicate rows in the DataFrame
duplicate_count = data.duplicated().sum()


# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing Percentage (%)': missing_percentage,
    'Unique Count': unique_count
})

print("Comprehensive Data Summary:")

print(summary_df)
print(f"\nNumber of duplicate rows in the dataset: {duplicate_count}")
len(data)

Comprehensive Data Summary:
                              Missing Count  Missing Percentage (%)  \
index                                     0                    0.00   
who_region                                0                    0.00   
country_name                              0                    0.00   
city                                      0                    0.00   
year                                      0                    0.00   
pm25_concentration                        0                    0.00   
type_of_stations                      10373                   49.44   
population                                0                    0.00   
latitude                                  0                    0.00   
longitude                                 0                    0.00   
simplified_station_type                   0                    0.00   
encoded_station_type                  10380                   49.47   
encoded_station_type_imputed              0      

20982

In [18]:
data_enc = encode_scale_data(data)

In [19]:
data_enc.head()

Unnamed: 0,who_region_2_Amr,who_region_3_Sear,who_region_4_Eur,who_region_5_Emr,who_region_6_Wpr,who_region_7_NonMS,country_name_Albania,country_name_Algeria,country_name_Argentina,country_name_Australia,...,"final_station_type_Rural, Urban",final_station_type_Suburban,"final_station_type_Suburban, Urban",final_station_type_Traffic,final_station_type_Urban,population,latitude,longitude,year,pm25_concentration
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.199658,0.247642,-0.260121,2013.0,11.491
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.199658,0.247651,-0.260116,2014.0,15.878
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.199658,0.247799,-0.260182,2015.0,14.004
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.199658,0.247799,-0.260182,2016.0,13.16
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.199658,0.247799,-0.260182,2017.0,14.114


### Group by country

In [20]:
grouped_data = data.drop(columns=['population', 'latitude', 'longitude'])

In [21]:
grouped_data = grouped_data.groupby(['country_name', 'year']).mean(numeric_only=True)

In [22]:
grouped_data.reset_index(inplace=True)

In [23]:
grouped_data

Unnamed: 0,country_name,year,index,pm25_concentration,encoded_station_type,encoded_station_type_imputed
0,Afghanistan,2019.0,16488.0,119.774000,,5.000000
1,Albania,2014.0,21654.0,13.191667,1.666667,1.666667
2,Albania,2015.0,21655.0,17.395667,1.666667,1.666667
3,Albania,2016.0,21656.0,17.570000,1.666667,1.666667
4,Albania,2017.0,21657.0,14.045000,1.666667,1.666667
...,...,...,...,...,...,...
687,Viet Nam,2020.0,12682.0,23.348000,1.000000,1.000000
688,"occupied Palestinian territory, including east...",2016.0,1007.0,16.300000,2.000000,2.000000
689,"occupied Palestinian territory, including east...",2017.0,1008.0,13.700000,2.000000,2.000000
690,"occupied Palestinian territory, including east...",2018.0,1009.0,15.200000,2.000000,2.000000


# Model exploring

In [24]:
from pycaret.regression import *

In [25]:
exp_reg = setup(
    data=data_enc,
    target='pm25_concentration',  # Specify the target variable
    session_id=123,               # For reproducibility
    normalize=False,               # Normalizing the data
    transformation=False,          # Apply data transformation (log, sqrt, etc.)
    remove_multicollinearity=False, # Remove correlated features
    feature_selection=False,       # Enable feature selection
    train_size=0.8,               # 80% for training, 20% for testing
    fold=5,                       # 5-fold cross-validation
)

best_model = compare_models(exclude=['lightgbm'])

print(best_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pm25_concentration
2,Target type,Regression
3,Original data shape,"(20982, 141)"
4,Transformed data shape,"(20982, 141)"
5,Transformed train set shape,"(16785, 141)"
6,Transformed test set shape,"(4197, 141)"
7,Numeric features,140
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2.4703,40.6735,6.2762,0.8715,0.1774,0.1361,0.65
et,Extra Trees Regressor,2.5894,44.9212,6.5887,0.8574,0.1857,0.1434,0.616
dt,Decision Tree Regressor,3.17,62.3361,7.8167,0.8016,0.229,0.1699,0.054
gbr,Gradient Boosting Regressor,4.2746,70.9381,8.3994,0.7767,0.2929,0.2673,0.502
knn,K Neighbors Regressor,4.33,83.0485,9.0744,0.7402,0.275,0.2387,0.116
ridge,Ridge Regression,5.3224,104.0007,10.1818,0.6731,0.3514,0.3106,0.318
br,Bayesian Ridge,5.3254,104.083,10.1861,0.6728,0.3521,0.3109,0.092
omp,Orthogonal Matching Pursuit,6.1014,125.486,11.1863,0.6061,0.407,0.3775,0.05
huber,Huber Regressor,6.5484,164.0711,12.7847,0.4858,0.4264,0.3978,0.454
lasso,Lasso Regression,7.2355,166.8766,12.8986,0.4765,0.4785,0.5079,0.296


RandomForestRegressor(n_jobs=-1, random_state=123)


## Run with scaled target to prevent outliers

In [26]:
data_enc['pm25_concentration'] = StandardScaler().fit_transform(data_enc[['pm25_concentration']])

In [27]:
exp_reg = setup(
    data=data_enc,
    target='pm25_concentration',  # Specify the target variable
    session_id=123,               # For reproducibility
    train_size=0.8,               # 80% for training, 20% for testing
    fold=5,                       # 5-fold cross-validation
)

best_model = compare_models(exclude=['lightgbm'])

print(best_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pm25_concentration
2,Target type,Regression
3,Original data shape,"(20982, 141)"
4,Transformed data shape,"(20982, 141)"
5,Transformed train set shape,"(16785, 141)"
6,Transformed test set shape,"(4197, 141)"
7,Numeric features,140
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.1401,0.1307,0.356,0.8719,0.1207,0.9877,0.606
et,Extra Trees Regressor,0.1471,0.1472,0.3769,0.8549,0.126,1.0818,0.634
dt,Decision Tree Regressor,0.1794,0.1993,0.4418,0.8029,0.1484,1.2005,0.06
gbr,Gradient Boosting Regressor,0.2425,0.2284,0.4766,0.7767,0.1832,1.7016,0.498
knn,K Neighbors Regressor,0.2457,0.2675,0.515,0.7402,0.191,1.6988,0.126
ridge,Ridge Regression,0.3021,0.335,0.5778,0.6731,0.2203,1.9586,0.04
br,Bayesian Ridge,0.3022,0.3352,0.5781,0.6728,0.2204,1.9582,0.056
omp,Orthogonal Matching Pursuit,0.3463,0.4042,0.6348,0.6061,0.2461,2.5489,0.038
huber,Huber Regressor,0.3618,0.5034,0.7073,0.5113,0.255,2.3354,0.492
en,Elastic Net,0.654,0.9512,0.9746,0.0715,0.5099,1.0468,0.044


RandomForestRegressor(n_jobs=-1, random_state=123)


In [28]:
tuned_model = tune_model(best_model, optimize='RMSE', fold=5, n_iter=10, choose_better=True)

final_model = finalize_model(tuned_model)

evaluate_model(final_model)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3327,0.4076,0.6385,0.6488,0.228,1.7229
1,0.3222,0.2941,0.5423,0.6922,0.2218,2.6111
2,0.3264,0.3067,0.5538,0.6805,0.2269,1.682
3,0.3208,0.3893,0.624,0.6141,0.223,2.4742
4,0.3385,0.3401,0.5832,0.6715,0.2333,2.6217
Mean,0.3281,0.3476,0.5883,0.6614,0.2266,2.2224
Std,0.0066,0.0446,0.0377,0.0276,0.0041,0.4279


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

: 