# PM2.5 prediction

### Import libraries

In [1]:
import warnings
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
warnings.filterwarnings("ignore")


### Data import, model training and cross-validation

In [2]:

# --- Experiment configuration -------------------------------------------
# Suffix shared by the asset paths (grids, feature lists, result files).
RESOLUTION = '0_1'
# Forwarded as the last argument of m.process_data.
NO_MOUNTAINS = True
# When True, every grid goes through m.process_data before modelling.
KNN = True
# Forwarded as the second argument of m.process_data.
knn_value = 10
# True: one shared feature list for all grids; False: one CSV per grid.
GENERAL = True

# os.listdir returns every directory entry in arbitrary order, so keep only
# the GeoPackage files (skipping strays such as '.DS_Store') and sort them:
# with the fixed NumPy seed, a stable grid order keeps reruns comparable.
geopackages = sorted(
    entry
    for entry in os.listdir('assets/grids_' + RESOLUTION)
    if entry.endswith('.gpkg')
)

# Name of the column the regressor is trained to predict.
TARGET = 'pm25_st'
# Number of leading rows kept from the 'Features' column of the feature list.
NUMBER_OF_PARAMS = 50


# For every grid: fit a random forest with 5-fold cross-validation, score it
# against TARGET, and score the raw 'pm25_cams' column against TARGET as a
# baseline. Fold-averaged metrics are collected with one row per grid:
#   results1 -> random-forest predictions
#   results2 -> 'pm25_cams' baseline
results1 = pd.DataFrame(columns=['MAE', 'MSE', 'R2'])
results2 = pd.DataFrame(columns=['MAE', 'MSE', 'R2'])

for grid in geopackages:
    regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

    data = gpd.read_file('assets/grids_' + RESOLUTION + '/' + grid)

    if KNN:
        data = m.process_data(data, knn_value, TARGET, NO_MOUNTAINS)

    # Keep only rows with an observed target, then drop every column that
    # still contains missing values.
    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)
    data.pop('geometry')

    # Feature list: shared file or per-grid file (grid[:-5] drops the
    # five-character '.gpkg' extension), truncated to the first
    # NUMBER_OF_PARAMS entries.
    if GENERAL:
        features_path = 'assets/features_' + RESOLUTION + 'general' + '.csv'
    else:
        features_path = 'assets/features_' + RESOLUTION + '/' + grid[:-5] + '.csv'
    labels = pd.read_csv(features_path)['Features'][0:NUMBER_OF_PARAMS]

    # Selected features that were dropped from `data` above come back as
    # all-NaN columns here and are removed again by dropna.
    X = pd.DataFrame(data=data, columns=labels).dropna(axis=1)
    # 'pm25_cams' serves as the baseline prediction and is therefore removed
    # from the model inputs. It must be among the selected features,
    # otherwise this raises KeyError.
    cams_model = X.pop('pm25_cams').to_numpy()

    X = X.to_numpy()
    Y = data[TARGET].to_numpy()

    skf = KFold(n_splits=5, shuffle=True)

    model_scores = []  # one (MAE, MSE, R2) tuple per fold, random forest
    cams_scores = []   # one (MAE, MSE, R2) tuple per fold, pm25_cams baseline
    print('---------'+ grid +'---------')

    for fold, (train_index, test_index) in enumerate(skf.split(X), start=1):
        print("Iteration n°:  ", fold)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        cams_model_validation = cams_model[test_index]

        # Fit the scaler on the training fold only and reuse its statistics
        # on the held-out fold.
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)

        model_scores.append((
            mean_absolute_error(y_test, y_pred),
            mean_squared_error(y_test, y_pred),
            r2_score(y_test, y_pred),
        ))
        cams_scores.append((
            mean_absolute_error(y_test, cams_model_validation),
            mean_squared_error(y_test, cams_model_validation),
            r2_score(y_test, cams_model_validation),
        ))

    # Column-wise mean over the folds -> (MAE, MSE, R2).
    avg_mae1, avg_mse1, avg_r21 = np.mean(model_scores, axis=0)
    avg_mae2, avg_mse2, avg_r22 = np.mean(cams_scores, axis=0)

    print('---------VALIDATION (ARPA)  ---------')
    print('Mean Absolute Error: ',avg_mae1)
    print('Mean Squared Error: ',avg_mse1)
    print('R2 score: ',avg_r21)
    print('---------VALIDATION (CAMS) ---------')
    print('Mean Absolute Error: ',avg_mae2)
    print('Mean Squared Error: ',avg_mse2)
    print('R2 score: ',avg_r22)

    # Store both score sets under the grid's file name. The baseline row has
    # to be written too, otherwise the RF2 spreadsheet is exported empty.
    results1.loc[grid] = [round(avg_mae1, 3), round(avg_mse1, 3), round(avg_r21, 3)]
    results2.loc[grid] = [round(avg_mae2, 3), round(avg_mse2, 3), round(avg_r22, 3)]

# Transpose so that grids become columns and metrics become rows.
results1 = results1.T
results1.to_excel('assets/test/RF1'+RESOLUTION+'.xlsx')

results2 = results2.T
results2.to_excel('assets/test/RF2'+RESOLUTION+'.xlsx')


---------grid_0_1_0418_0425_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION (ARPA)  ---------
Mean Absolute Error:  1.2619259322573928
Mean Squared Error:  2.5066900354329253
R2 score:  0.6593425993655886
---------VALIDATION (CAMS) ---------
Mean Absolute Error:  7.88298117215829
Mean Squared Error:  75.95120481433688
R2 score:  -9.56226206816449
---------grid_0_1_0903_0910_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION (ARPA)  ---------
Mean Absolute Error:  1.141092437095318
Mean Squared Error:  2.3351125839915787
R2 score:  0.7997243482827221
---------VALIDATION (CAMS) ---------
Mean Absolute Error:  3.581216838248795
Mean Squared Error:  16.922894401788774
R2 score:  -0.48349701379999627
---------grid_0_1_1007_1014_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5