# PM2.5 prediction

### Import libraries

In [1]:
from IPython.core.display import display
import warnings
import math
import os
import ipywidgets as widgets

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random generator (`seed` is numpy.random.seed).
# NOTE(review): presumably meant to make the shuffled K-fold splits and the
# random forest repeatable, since neither receives an explicit random_state
# further down — confirm.
seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
# Install a blanket "ignore" filter so library warnings do not clutter the output.
warnings.filterwarnings(action="ignore")

# Dataset variants evaluated by the notebook: every grid resolution
# ('0_1', '0_01') combined with the 'mountains' / 'nomountains' option,
# in the order resolution-major, terrain-minor.
params = [
    resolution + '_' + terrain
    for resolution in ('0_1', '0_01')
    for terrain in ('mountains', 'nomountains')
]


### Data import, model training and cross-validation

In [2]:
# ---------------------------------------------------------------------------
# Random-forest evaluation with 5-fold cross-validation.
#
# For every dataset variant in `params` and every grid file in `geopackages`:
#   1. load and pre-process the grid,
#   2. keep the features listed in the variant's feature-selection CSV,
#   3. cross-validate a RandomForestRegressor against the TARGET column,
#   4. score the raw 'pm25_cams' column against the same held-out targets,
#   5. store the fold-averaged MAE / MSE / R2 of both in one table per variant
#      and write that table to 'assets/test/' as .xlsx and .csv.
# ---------------------------------------------------------------------------

# Upper bound on the number of feature columns (taken in CSV order, after
# removing 'pm25_cams') passed to the model.
NUMBER_OF_COVARIATES = 20
# One of: '0_1_mountains', '0_1_nomountains', '0_01_mountains', '0_01_nomountains'.
# Not read by the loop below (it iterates over every entry of `params`).
param = '0_1_nomountains'
# Suffixes of the grid GeoPackage file names.
# NOTE(review): these look like MMDD_MMDD_YYYY periods — confirm.
geopackages = ['0324_0331_2021','0418_0425_2021', '0717_0724_2021','0903_0910_2021','1007_1014_2021']
# Column used as regression target; also part of input/output file names.
TARGET = 'pm25_st'

# '*_sensor' columns hold the random-forest scores, '*_cams' the scores of the
# raw 'pm25_cams' column, both measured against TARGET on the test folds.
RESULT_COLUMNS = ['MAE_sensor', 'MSE_sensor', 'R2_sensor', 'MAE_cams', 'MSE_cams', 'R2_cams']

for par in params:
    # Start from an empty table for every variant. Re-using one table across
    # variants would carry the previous variants' rows (as duplicated grid
    # labels) into each subsequent output file.
    results = pd.DataFrame(columns=RESULT_COLUMNS)

    for grid in geopackages:
        regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

        # The variant prefix selects the grid folder and the second argument
        # of m.process_data (10 vs 30; its meaning is defined in fs.methods,
        # which is not visible here).
        if par.startswith('0_1'):
            resolution, process_arg = '0_1', 10
        else:
            resolution, process_arg = '0_01', 30
        data = gpd.read_file('assets/grids_' + resolution + '/grid_' + resolution + '_' + grid + '.gpkg')
        data = m.process_data(data, process_arg, TARGET)

        # 'nomountains' variants keep only rows whose clim_zone is above 3.
        if par.endswith('nomountains'):
            data = data[data['clim_zone'] > 3]
        # clim_zone is only used for the filter above, never as a feature.
        data = data.drop(columns='clim_zone')

        # Drop rows without a target, then columns containing any missing
        # value, then any row that still has one.
        data = data[~data[TARGET].isnull()]
        data = data.dropna(axis=1).dropna(axis=0)
        labels = pd.read_csv('assets/fs_results/'+TARGET + par + '_features_model.csv')['Features'].tolist()

        # Build the feature matrix from the selected features; selected
        # features that are absent from `data` become all-NaN columns and are
        # removed by dropna(axis=1).
        X = pd.DataFrame(data=data, columns=labels).dropna(axis=1)
        # 'pm25_cams' is scored on its own and therefore excluded from the
        # model inputs. Raises KeyError if it is not among the kept features.
        cams_model = X['pm25_cams'].to_numpy()
        X = X.drop(columns='pm25_cams')

        X = X.iloc[:, :NUMBER_OF_COVARIATES].to_numpy()
        Y = pd.DataFrame(data=data, columns=[TARGET]).values.ravel()

        skf = KFold(n_splits=5, shuffle=True)

        # Per-fold scores: random forest vs. target, and pm25_cams vs. target.
        model_mae, model_mse, model_r2 = [], [], []
        cams_mae, cams_mse, cams_r2 = [], [], []
        print('---------'+ grid +'---------')

        for fold_number, (train_index, test_index) in enumerate(skf.split(X), start=1):
            print("Iteration n°:  ", fold_number)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            cams_model_validation = cams_model[test_index]

            # Fit the scaler on the training fold only, so the test fold does
            # not influence the standardisation.
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # fit() retrains the forest from scratch on each fold.
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)

            model_mae.append(mean_absolute_error(y_test, y_pred))
            model_mse.append(mean_squared_error(y_test, y_pred))
            model_r2.append(r2_score(y_test, y_pred))

            cams_mae.append(mean_absolute_error(y_test, cams_model_validation))
            cams_mse.append(mean_squared_error(y_test, cams_model_validation))
            cams_r2.append(r2_score(y_test, cams_model_validation))

        # Average every metric over the folds.
        avg_mae1 = np.mean(model_mae)
        avg_mse1 = np.mean(model_mse)
        avg_r21 = np.mean(model_r2)

        avg_mae2 = np.mean(cams_mae)
        avg_mse2 = np.mean(cams_mse)
        avg_r22 = np.mean(cams_r2)

        print('---------VALIDATION (ARPA)  ---------')
        print('Mean Absolute Error: ',avg_mae1)
        print('Mean Squared Error: ',avg_mse1)
        print('R2 score: ',avg_r21)
        print('---------VALIDATION (CAMS) ---------')
        print('Mean Absolute Error: ',avg_mae2)
        print('Mean Squared Error: ',avg_mse2)
        print('R2 score: ',avg_r22)

        # One row per grid, labelled with the grid name.
        results.loc[grid] = [
            round(avg_mae1, 3), round(avg_mse1, 3), round(avg_r21, 3),
            round(avg_mae2, 3), round(avg_mse2, 3), round(avg_r22, 3),
        ]

    # Metrics as rows, grids as columns; one pair of files per variant.
    new = results.T
    new.to_excel('assets/test/RF'+TARGET+par+'.xlsx')
    new.to_csv('assets/test/RF'+TARGET+par+'.csv')


---------0324_0331_2021---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION (ARPA)  ---------
Mean Absolute Error:  1.1979038945219853
Mean Squared Error:  3.1726327752356998
R2 score:  0.8853403172730212
---------VALIDATION (CAMS) ---------
Mean Absolute Error:  7.800208709370421
Mean Squared Error:  86.2788155812207
R2 score:  -2.146964304028448
---------0418_0425_2021---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION (ARPA)  ---------
Mean Absolute Error:  0.8978343137301839
Mean Squared Error:  1.4001559921794364
R2 score:  0.7956425777281082
---------VALIDATION (CAMS) ---------
Mean Absolute Error:  6.556384805964745
Mean Squared Error:  57.59264305401499
R2 score:  -7.491962328401664
---------0717_0724_2021---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION (ARPA)  ---------
Mean