# PM2.5 prediction

### Import libraries

In [17]:
import warnings
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
warnings.filterwarnings("ignore")


### Data Import

In [18]:
"""
1-->Pearson
2-->Spearmanr
3-->Kendall
4-->Fisher
5-->Random Forest
6-->Average
"""
FOLDER='5'
RESOLUTION = '0_1'
KNN = True
knn_value=10

geopackages = os.listdir('assets/grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')

TARGET = 'pm25_st'
NUMBER_OF_PARAMS = 20
#NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2


results1 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])
results2 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])
r2_scores = []
for index, grid in enumerate(geopackages):
    regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

    data = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)

    if KNN:
        data = m.process_data(data, knn_value)

    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)

    data.pop('geometry')

    labels = pd.read_csv('assets/features_'+RESOLUTION+'/'+FOLDER+ '/'+grid[:-5]+'.csv')['Features']
    labels=labels[0:NUMBER_OF_PARAMS]

    #labels = pd.read_csv('fs.csv')


    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1)

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()


    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.30)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)


    regressor.fit(X_train, y_train)


    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print('---------'+ grid +'---------')
    print('---------RESULTS WITH RESPECT THE MODEL---------')
    print('Mean Absolute Error: ',mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error: ',mean_squared_error(y_test, y_pred))
    print('Mean (Y-test):',np.mean(y_test))
    print('Mean (Y-predicted): ',np.mean(y_pred))
    print('R2 score: ',r2_score(y_test, y_pred))

    results1.loc[index]= [round(mean_absolute_error(y_test, y_pred),3), round(mean_squared_error(y_test, y_pred), 3), round(np.mean(y_test), 3), round(np.mean(y_pred), 3), round(r2_score(y_test, y_pred), 3)]
    results1.rename(index={index: grid}, inplace=True)

    data_new = gpd.read_file('assets/grids_'+RESOLUTION+'/'+ grid)
    data_new = data_new[~data_new[TARGET].isnull()]
    pm25_obs = data_new['pm25_st']
    X_new = pd.DataFrame(data=data_new, columns=labels).dropna(axis=1)

    sc = StandardScaler()
    X_new = sc.fit_transform(X_new)
    predictions_pm25 = regressor.predict(X_new)
    predictions_pm25 = pd.Series(np.array(predictions_pm25).reshape((-1,)))
    e = pd.DataFrame()

    e['predictions_pm25'] = list(predictions_pm25)
    e['pm25_obs'] = list(pm25_obs)
    e = e.dropna(axis=0)
    print('---------RESULTS WITH RESPECT THE ENTIRE DATASET---------')
    print('Mean Absolute Error: ',mean_absolute_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean Squared Error: ',mean_squared_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean (Y-test):',np.mean(e['pm25_obs']))
    print('Mean (Y-predicted): ',np.mean(e['predictions_pm25']))
    r = r2_score(e['pm25_obs'], e['predictions_pm25'])
    print('R2 score: ',r)
    r2_scores.append(r)

    print('\n\n\n\n')
    results2.loc[index]= [round(mean_absolute_error(e['pm25_obs'], e['predictions_pm25']),3), round(mean_squared_error(e['pm25_obs'], e['predictions_pm25']),3), round(np.mean(e['pm25_obs']),3), round(np.mean(e['predictions_pm25']),3), round(r2_score(e['pm25_obs'], e['predictions_pm25']),3)]
    results2.rename(index={index: grid}, inplace=True)

results1 = results1.T
results2 = results2.T

results1.to_excel('assets/test/RF1'+RESOLUTION+'.xlsx')
results2.to_excel('assets/test/RF2'+RESOLUTION+'.xlsx')
print("MEAN R2:  ", np.mean(r2_scores))

---------grid_0_1_0418_0425_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.0677082483666593
Mean Squared Error:  1.8197224128163423
Mean (Y-test): 18.33022524919667
Mean (Y-predicted):  18.571148629687958
R2 score:  0.7425424576950516
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.6302175283967935
Mean Squared Error:  3.9192468935991402
Mean (Y-test): 18.203066539115646
Mean (Y-predicted):  18.261030333655793
R2 score:  0.6730912573074159





---------grid_0_1_0903_0910_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.2307682365733916
Mean Squared Error:  2.6444030397167135
Mean (Y-test): 15.232722387351632
Mean (Y-predicted):  15.449602944897496
R2 score:  0.8087011639338607
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.5673690597869239
Mean Squared Error:  4.93198735901308
Mean (Y-test): 15.44968665932991
Mean (Y-predicted):  