# PM2.5 prediction

### Import libraries

In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score


### Data Import

In [2]:
RESOLUTION = '0_1'

geopackages = os.listdir('grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')

TARGET = 'pm25_st'
NUMBER_OF_PARAMS = 20
#NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2


results1 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])
results2 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])

for index, grid in enumerate(geopackages):
    regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

    data = m.increase_data(gpd.read_file('grids_'+RESOLUTION+'/'+ grid), 'pm25_st', 5)


    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)


    data.pop('geometry')
    data.pop('bottom')
    data.pop('top')
    data.pop('left')
    data.pop('right')

    labels = pd.read_csv('features_'+RESOLUTION+'/'+grid[:-5]+'.csv')

    labels = pd.read_csv('fs.csv')

    labels = ml.remove_int_values(list(labels['Features']))


    #labels=labels[0:NUMBER_OF_PARAMS]
    #read variables which are not null
    score_results = pd.DataFrame()

    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1)
    X['lat_cen'] = data['lat_cen']
    X['lng_cen'] = data['lng_cen']

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()


    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.30)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)


    regressor.fit(X_train, y_train)


    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print('---------'+ grid +'---------')
    print('---------RESULTS WITH RESPECT THE MODEL---------')
    print('Mean Absolute Error: ',mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error: ',mean_squared_error(y_test, y_pred))
    print('Mean (Y-test):',np.mean(y_test))
    print('Mean (Y-predicted): ',np.mean(y_pred))
    print('R2 score: ',r2_score(y_test, y_pred))

    results1.loc[index]= [round(mean_absolute_error(y_test, y_pred),3), round(mean_squared_error(y_test, y_pred), 3), round(np.mean(y_test), 3), round(np.mean(y_pred), 3), round(r2_score(y_test, y_pred), 3)]
    results1.rename(index={index: grid}, inplace=True)

    data_new = m.increase_data(gpd.read_file('grids_'+RESOLUTION+'/'+ grid), 'pm25_st', 5)
    data_new = data_new[~data_new[TARGET].isnull()]
    pm25_obs = data_new['pm25_st']
    X_new = pd.DataFrame(data=data_new, columns=labels).dropna(axis=1)
    X_new['lat_cen'] = data_new['lat_cen']
    X_new['lng_cen'] = data_new['lng_cen']

    sc = StandardScaler()
    X_new = sc.fit_transform(X_new)
    predictions_pm25 = regressor.predict(X_new)
    predictions_pm25 = pd.Series(np.array(predictions_pm25).reshape((-1,)))
    e = pd.DataFrame()

    e['predictions_pm25'] = list(predictions_pm25)
    e['pm25_obs'] = list(pm25_obs)
    e = e.dropna(axis=0)
    print('---------RESULTS WITH RESPECT THE ENTIRE DATASET---------')
    print('Mean Absolute Error: ',mean_absolute_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean Squared Error: ',mean_squared_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean (Y-test):',np.mean(e['pm25_obs']))
    print('Mean (Y-predicted): ',np.mean(e['predictions_pm25']))
    print('R2 score: ',r2_score(e['pm25_obs'], e['predictions_pm25']))
    print('\n\n\n\n')
    results2.loc[index]= [round(mean_absolute_error(e['pm25_obs'], e['predictions_pm25']),3), round(mean_squared_error(e['pm25_obs'], e['predictions_pm25']),3), round(np.mean(e['pm25_obs']),3), round(np.mean(e['predictions_pm25']),3), round(r2_score(e['pm25_obs'], e['predictions_pm25']),3)]
    results2.rename(index={index: grid}, inplace=True)

results1 = results1.T
results2 = results2.T

results1.to_excel('test/RF1'+RESOLUTION+'.xlsx')
results2.to_excel('test/RF2'+RESOLUTION+'.xlsx')



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0418_0425_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.6150361391633397
Mean Squared Error:  4.810247076483149
Mean (Y-test): 17.993116670668634
Mean (Y-predicted):  18.71300868619147
R2 score:  0.27417124768468115



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.0862794918524445
Mean Squared Error:  2.0783282440269657
Mean (Y-test): 18.31672157449959
Mean (Y-predicted):  18.275100586368026
R2 score:  0.7068806318758304








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0903_0910_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.1411950836096532
Mean Squared Error:  2.2115996323314016
Mean (Y-test): 15.3590675119966
Mean (Y-predicted):  14.93165650156732
R2 score:  0.7897788455015057



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.8233631382562746
Mean Squared Error:  1.1859747796874593
Mean (Y-test): 15.359205870728628
Mean (Y-predicted):  15.2640818432433
R2 score:  0.8943499255409425








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_1007_1017_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.1991268738712284
Mean Squared Error:  2.1893071976953755
Mean (Y-test): 18.149625068970852
Mean (Y-predicted):  18.70391388985368
R2 score:  0.8241765136330285



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.161815372033894
Mean Squared Error:  2.405891067467592
Mean (Y-test): 17.946663233524674
Mean (Y-predicted):  17.69830836825937
R2 score:  0.8766519942309853








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0717_0724_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  0.8718373981184904
Mean Squared Error:  1.336153407825329
Mean (Y-test): 12.80764184972366
Mean (Y-predicted):  13.072642565750137
R2 score:  0.414033890248254



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.6254661450674305
Mean Squared Error:  0.7061579932137168
Mean (Y-test): 12.971714477298589
Mean (Y-predicted):  13.114088886797152
R2 score:  0.6964802401794014








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0324_0331_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.9500056069616734
Mean Squared Error:  6.434502513849222
Mean (Y-test): 30.211608206070643
Mean (Y-predicted):  30.355719071897663
R2 score:  0.6168271128472789



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.4843852766173182
Mean Squared Error:  4.086921467696527
Mean (Y-test): 30.334952857935402
Mean (Y-predicted):  30.318089237902402
R2 score:  0.8650199591983497





