# PM2.5 prediction

### Import libraries

In [3]:
import warnings
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
warnings.filterwarnings("ignore")


### Data Import

In [4]:

RESOLUTION = '0_1'
KNN = True
knn_value=10
GENERAL = False
geopackages = os.listdir('assets/grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')

TARGET = 'pm25_st'
NUMBER_OF_PARAMS = 50
#NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2


results1 = pd.DataFrame(columns=['MAE', 'MSE', 'R2'])
for index, grid in enumerate(geopackages):
    regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

    data = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)

    if KNN:
        data = m.process_data(data, knn_value, 'pm25_st')

    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)

    data.pop('geometry')

    if (GENERAL == False):
        labels = pd.read_csv('assets/features_'+RESOLUTION + '/'+grid[:-5]+'.csv')['Features']
    else :
        labels = pd.read_csv('assets/features_'+RESOLUTION+'general'+'.csv')['Features']
    labels=labels[0:NUMBER_OF_PARAMS]

    #labels = pd.read_csv('fs.csv')


    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1).to_numpy()

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()

    skf = KFold(n_splits=5, shuffle = True)

    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)
    i = 1
    mae_list1 = []
    mse_list1 = []
    r2_list1 = []
    mae_list2 = []
    mse_list2 = []
    r2_list2 = []
    print('---------'+ grid +'---------')

    for train_index, test_index in skf.split(X):
        print("Iteration n°:  ", i)
        i = i + 1

        X_train = X[train_index]
        X_test = X[test_index]
        y_train, y_test = y1[train_index], y1[test_index]

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)




        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        #print('Mean Absolute Error: ',mae)
        #print('Mean Squared Error: ',mse)
        #print('R2 score: ',r2)
        mae_list1.append(mae)
        mse_list1.append(mse)
        r2_list1.append(r2)

    avg_mae1 = np.mean(mae_list1)
    avg_mse1 = np.mean(mse_list1)
    avg_r21 = np.mean(r2_list1)



    print('---------VALIDATION ---------')
    print('Mean Absolute Error: ',avg_mae1)
    print('Mean Squared Error: ',avg_mse1)
    print('R2 score: ',avg_r21)


    mae_list1 = []
    mse_list1 = []
    r2_list1 = []
    mae_list2 = []
    mse_list2 = []
    r2_list2 = []


    results1.loc[index]= [round(avg_mae1,3), round(avg_mse1, 3), round(avg_r21, 3)]
    results1.rename(index={index: grid}, inplace=True)

    results2.rename(index={index: grid}, inplace=True)

results1 = results1.T
results1.to_excel('assets/test/RF1'+RESOLUTION+'.xlsx')

---------grid_0_1_0418_0425_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION ---------
Mean Absolute Error:  1.0417098463418877
Mean Squared Error:  1.6605395375267509
R2 score:  0.7537186602929451
---------grid_0_1_0903_0910_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION ---------
Mean Absolute Error:  0.9602601500752751
Mean Squared Error:  1.6364655440942915
R2 score:  0.8519507638473136
---------grid_0_1_1007_1014_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION ---------
Mean Absolute Error:  0.8692115844715737
Mean Squared Error:  1.7418392650179837
R2 score:  0.7933313724776359
---------grid_0_1_0717_0724_2021.gpkg---------
Iteration n°:   1
Iteration n°:   2
Iteration n°:   3
Iteration n°:   4
Iteration n°:   5
---------VALIDATION ---------