In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from pycaret.regression import setup, compare_models

import os
import pandas as pd
import numpy as np
import pickle

In [3]:
# Make the project's ml_logic directory importable so the preprocessing
# helpers in its `data` module can be used from this notebook.
# NOTE(review): this is an absolute, machine-specific path; on another machine
# the `from data import ...` line below will presumably fail until it is adjusted.
import sys
sys.path.append("/home/ilaria/code/elpbcn/air-pollution-levels/air-pollution-levels/air-pollution-levels/ml_logic/")
# NOTE(review): `load_data` is defined again in the next cell of this notebook,
# which replaces the version imported here.
from data import load_data, clean_data, classify_concentrations, simplify_stations,simplified_station_type, impute_stations


**Import data preprocessing**

In [4]:
def load_data(csv_file='../air_pollution_data_upd.csv'):
    '''
    Load the air-pollution CSV file into a pandas DataFrame.

    Note: this definition replaces the `load_data` imported from the
    project's `data` module earlier in the notebook.

    Parameters
    ----------
    csv_file : str or path-like, optional
        Location of the CSV file. Defaults to '../air_pollution_data_upd.csv',
        which is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file, parsed with `pd.read_csv` defaults.

    Raises
    ------
    FileNotFoundError
        If `csv_file` does not exist.
    '''
    # Loading csv file into a dataframe
    return pd.read_csv(csv_file)

In [5]:
# Load the raw CSV, then pass the frame through each preprocessing helper
# (imported from the project's `data` module), one after the other.
data = load_data()
for preprocessing_step in (
    clean_data,
    classify_concentrations,
    simplified_station_type,
    impute_stations,
):
    data = preprocessing_step(data)
data

Unnamed: 0,who_region,country_name,city,year,pm10_concentration,pm25_concentration,no2_concentration,type_of_stations,population,latitude,longitude,target_class,simplified_station_type,encoded_station_type,encoded_station_type_imputed,final_station_type
0,4_Eur,Spain,A Coruna,2013.0,23.238,11.491,28.841,"Urban, Urban, Suburban",246056.0,43.367900,-8.418571,3,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
1,4_Eur,Spain,A Coruna,2014.0,27.476,15.878,19.575,"Urban, Urban, Suburban",246056.0,43.368033,-8.418233,4,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
2,4_Eur,Spain,A Coruna,2015.0,25.515,14.004,22.731,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.422900,4,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
3,4_Eur,Spain,A Coruna,2016.0,23.057,13.160,20.204,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.422900,3,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
4,4_Eur,Spain,A Coruna,2017.0,26.849,14.114,21.543,"Urban, Urban, Suburban, Suburban",246056.0,43.370375,-8.422900,4,"Suburban, Urban",4.0,4.0,"Suburban, Urban"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40093,6_Wpr,Republic of Korea,경기도,2017.0,57.335,36.457,0.029,,,37.337200,126.724100,5,unknown,,1.6,Rural
40094,6_Wpr,Republic of Korea,경기도,2018.0,50.838,31.586,0.027,,,37.337200,126.724100,5,unknown,,1.4,Urban
40095,6_Wpr,Republic of Korea,경기도,2019.0,55.568,31.013,0.028,,,37.337200,126.724100,5,unknown,,1.0,Urban
40096,6_Wpr,China,虎英公园北,2018.0,,30.649,,,,23.012778,113.794444,2,unknown,,1.4,Urban


In [6]:
 # Work on an independent copy: a plain `newdf = data` would only create a
 # second name for the same object, so the in-place column assignment on
 # `newdf` later in the notebook would silently modify `data` as well.
newdf = data.copy()

In [7]:
# Parse the year column into datetime values using the '%Y' format,
# store it back on the frame, and show the converted column.
year_as_datetime = pd.to_datetime(newdf['year'], format='%Y')
newdf['year'] = year_as_datetime
newdf["year"]

0       2013-01-01
1       2014-01-01
2       2015-01-01
3       2016-01-01
4       2017-01-01
           ...    
40093   2017-01-01
40094   2018-01-01
40095   2019-01-01
40096   2018-01-01
40097   2019-01-01
Name: year, Length: 40002, dtype: datetime64[ns]

In [8]:
# Drop the columns that are not used by the model
columns_to_drop = [
    "city",
    "pm10_concentration",
    "no2_concentration",
    "type_of_stations",
    "simplified_station_type",
    "encoded_station_type",
    "final_station_type",
    "target_class",
]
newdf = newdf.drop(columns=columns_to_drop)

In [9]:
# Keep only the rows that have a value in 'pm25_concentration'
target_column = 'pm25_concentration'
newdf = newdf.dropna(axis='index', subset=[target_column])

In [10]:
# Display the current state of the working frame
newdf

Unnamed: 0,who_region,country_name,year,pm25_concentration,population,latitude,longitude,encoded_station_type_imputed
0,4_Eur,Spain,2013-01-01,11.491,246056.0,43.367900,-8.418571,4.0
1,4_Eur,Spain,2014-01-01,15.878,246056.0,43.368033,-8.418233,4.0
2,4_Eur,Spain,2015-01-01,14.004,246056.0,43.370375,-8.422900,4.0
3,4_Eur,Spain,2016-01-01,13.160,246056.0,43.370375,-8.422900,4.0
4,4_Eur,Spain,2017-01-01,14.114,246056.0,43.370375,-8.422900,4.0
...,...,...,...,...,...,...,...,...
40093,6_Wpr,Republic of Korea,2017-01-01,36.457,,37.337200,126.724100,1.6
40094,6_Wpr,Republic of Korea,2018-01-01,31.586,,37.337200,126.724100,1.4
40095,6_Wpr,Republic of Korea,2019-01-01,31.013,,37.337200,126.724100,1.0
40096,6_Wpr,China,2018-01-01,30.649,,23.012778,113.794444,1.4


In [25]:
# Define which columns are treated as categorical and which as numerical
# by the preprocessing step.
categorycols = [
    "who_region",
    "country_name",
]
numericalcols = [
    "encoded_station_type_imputed",
    "pm25_concentration",
    "population",
    "latitude",
    "longitude",
]

In [12]:
# Instantiate the scaler and the one-hot encoder used by the preprocessing step
scaler = StandardScaler()
onehotencoder = OneHotEncoder(
    drop="first",
    sparse_output=False,
)

In [13]:
# Build the column transformer: the encoder is applied to the categorical
# columns, the scaler to the numerical ones, and every other column is dropped.
column_transformers = [
    ('cat', onehotencoder, categorycols),
    ('num', scaler, numericalcols),
]
preprocessor = ColumnTransformer(transformers=column_transformers, remainder="drop")

In [14]:
# Wrap the column transformer in a single-step pipeline named 'preprocessor'
pipeline_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

In [15]:
# Fit the preprocessing pipeline on the frame and transform it in one go.
# The 'year' column is not handled by the transformer, so keep it aside.
processed_data = pipeline.fit_transform(newdf)
year_column = newdf[['year']]

# Rebuild a DataFrame from the transformed array using the transformer's
# output feature names, then append the year column.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
processed_columns = feature_names.tolist() + ['year']
processed_df = pd.DataFrame(processed_data, columns=feature_names)

# The processed frame has a fresh 0..n-1 index, so the year column's index is
# reset before concatenating side by side.
final_df = pd.concat(
    [processed_df, year_column.reset_index(drop=True)],
    axis=1,
)
final_df

Unnamed: 0,cat__who_region_2_Amr,cat__who_region_3_Sear,cat__who_region_4_Eur,cat__who_region_5_Emr,cat__who_region_6_Wpr,cat__who_region_7_NonMS,cat__country_name_Albania,cat__country_name_Algeria,cat__country_name_Argentina,cat__country_name_Australia,...,cat__country_name_Uzbekistan,cat__country_name_Venezuela (Bolivarian Republic of),cat__country_name_Viet Nam,"cat__country_name_occupied Palestinian territory, including east Jerusalem",num__encoded_station_type_imputed,num__pm25_concentration,num__population,num__latitude,num__longitude,year
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.217056,-0.436771,-0.199658,0.254556,-0.256882,2013-01-01
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.217056,-0.185931,-0.199658,0.254565,-0.256877,2014-01-01
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.217056,-0.293082,-0.199658,0.254711,-0.256943,2015-01-01
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.217056,-0.341341,-0.199658,0.254711,-0.256943,2016-01-01
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.217056,-0.286793,-0.199658,0.254711,-0.256943,2017-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21724,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.354332,0.990738,,-0.122775,1.656776,2017-01-01
21725,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.485281,0.712223,,-0.122775,1.656776,2018-01-01
21726,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.747179,0.679460,,-0.122775,1.656776,2019-01-01
21727,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.485281,0.658647,,-1.019031,1.473689,2018-01-01


**Finding the best model**

In [49]:
# Use PyCaret's *regression* API here: the target is a continuous (scaled)
# PM2.5 value. A star import from `pycaret.classification` would replace the
# regression `setup` / `compare_models` with their classification versions.
from pycaret.regression import setup, compare_models

# Separate the features from the target column
X = final_df.drop(['num__pm25_concentration'], axis=1)
y = final_df['num__pm25_concentration']

# Split data into train and test sets (30% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a training DataFrame holding both features and target,
# since `setup` is given one frame plus the target column name
train_df = X_train.copy()
train_df['num__pm25_concentration'] = y_train

# Setup PyCaret
# NOTE(review): the name `clf_setup` is kept so any later cell that refers to
# it keeps working, even though this is a regression experiment.
clf_setup = setup(data=train_df, target='num__pm25_concentration', session_id=123, verbose=False)

# Compare models and keep the best-ranked one
best_model = compare_models()

print(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.1656,0.1673,0.3962,0.8486,0.141,2.4595,2.465
rf,Random Forest Regressor,0.1633,0.1772,0.4017,0.8419,0.1388,2.2589,2.209
lightgbm,Light Gradient Boosting Machine,0.1952,0.18,0.4137,0.8359,0.156,4.2476,68.985
gbr,Gradient Boosting Regressor,0.2448,0.2442,0.4832,0.7759,0.1845,4.6144,0.966
knn,K Neighbors Regressor,0.2503,0.2909,0.5319,0.7271,0.196,4.8703,0.133
ridge,Ridge Regression,0.3126,0.3553,0.5887,0.6684,0.2264,4.7389,0.083
br,Bayesian Ridge,0.3127,0.3553,0.5887,0.6684,0.2264,4.7379,0.257
dt,Decision Tree Regressor,0.2154,0.3606,0.5729,0.6587,0.1794,1.9334,0.106
omp,Orthogonal Matching Pursuit,0.3658,0.4355,0.6535,0.5909,0.2527,5.2142,0.081
huber,Huber Regressor,0.3768,0.5391,0.7268,0.4951,0.2589,2.9965,0.707


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='squared_error',
                    max_depth=None, max_features=1.0, max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, monotonic_cst=None,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)
