# PM2.5 prediction

### Import libraries

In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score


### Data Import

In [3]:
RESOLUTION = '0_1'

geopackages = os.listdir('grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')

TARGET = 'pm25_st'
NUMBER_OF_PARAMS = 20
#NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2


results1 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])
results2 = pd.DataFrame(columns=['MAE', 'MSE', 'mean Y-Test', 'mean Y-predicted', 'R2'])

for index, grid in enumerate(geopackages):
    regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

    data = m.increase_data(gpd.read_file('grids_'+RESOLUTION+'/'+ grid), 'pm25_st', 5)


    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)


    data.pop('geometry')
    data.pop('bottom')
    data.pop('top')
    data.pop('left')
    data.pop('right')

    labels = pd.read_csv('features_'+RESOLUTION+'/'+grid[:-5]+'.csv')

    labels = pd.read_csv('fs.csv')

    labels = ml.remove_int_values(list(labels['Features']))


    #labels=labels[0:NUMBER_OF_PARAMS]
    #read variables which are not null
    score_results = pd.DataFrame()

    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1)
    X['lat_cen'] = data['lat_cen']
    X['lng_cen'] = data['lng_cen']

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()


    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.30)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)


    regressor.fit(X_train, y_train)


    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print('---------'+ grid +'---------')
    print('---------RESULTS WITH RESPECT THE MODEL---------')
    print('Mean Absolute Error: ',mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error: ',mean_squared_error(y_test, y_pred))
    print('Mean (Y-test):',np.mean(y_test))
    print('Mean (Y-predicted): ',np.mean(y_pred))
    print('R2 score: ',r2_score(y_test, y_pred))

    results1.loc[index]= [round(mean_absolute_error(y_test, y_pred),3), round(mean_squared_error(y_test, y_pred), 3), round(np.mean(y_test), 3), round(np.mean(y_pred), 3), round(r2_score(y_test, y_pred), 3)]
    results1.rename(index={index: grid}, inplace=True)

    data_new = m.increase_data(gpd.read_file('grids_'+RESOLUTION+'/'+ grid), 'pm25_st', 5)
    data_new = data_new[~data_new[TARGET].isnull()]
    pm25_obs = data_new['pm25_st']
    X_new = pd.DataFrame(data=data_new, columns=labels).dropna(axis=1)
    X_new['lat_cen'] = data_new['lat_cen']
    X_new['lng_cen'] = data_new['lng_cen']

    sc = StandardScaler()
    X_new = sc.fit_transform(X_new)
    predictions_pm25 = regressor.predict(X_new)
    predictions_pm25 = pd.Series(np.array(predictions_pm25).reshape((-1,)))
    e = pd.DataFrame()

    e['predictions_pm25'] = list(predictions_pm25)
    e['pm25_obs'] = list(pm25_obs)
    e = e.dropna(axis=0)
    print('---------RESULTS WITH RESPECT THE ENTIRE DATASET---------')
    print('Mean Absolute Error: ',mean_absolute_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean Squared Error: ',mean_squared_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean (Y-test):',np.mean(e['pm25_obs']))
    print('Mean (Y-predicted): ',np.mean(e['predictions_pm25']))
    print('R2 score: ',r2_score(e['pm25_obs'], e['predictions_pm25']))
    print('\n\n\n\n')
    results2.loc[index]= [round(mean_absolute_error(e['pm25_obs'], e['predictions_pm25']),3), round(mean_squared_error(e['pm25_obs'], e['predictions_pm25']),3), round(np.mean(e['pm25_obs']),3), round(np.mean(e['predictions_pm25']),3), round(r2_score(e['pm25_obs'], e['predictions_pm25']),3)]
    results2.rename(index={index: grid}, inplace=True)

results1 = results1.T
results2 = results2.T

results1.to_excel('test/RF1'+RESOLUTION+'.xlsx')
results2.to_excel('test/RF2'+RESOLUTION+'.xlsx')



  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0418_0425_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.3461332526466936
Mean Squared Error:  3.3501954173093154
Mean (Y-test): 18.284558260108255
Mean (Y-predicted):  18.55006188245235
R2 score:  0.592278968791695
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.930335096795101
Mean Squared Error:  1.755874053948135
Mean (Y-test): 18.31672157449959
Mean (Y-predicted):  18.350478916102592
R2 score:  0.7523583222823086








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))

  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0903_0910_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.2630535616857719
Mean Squared Error:  2.658529584469607
Mean (Y-test): 15.31500177086997
Mean (Y-predicted):  15.570984978404525
R2 score:  0.7189264429720278
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.9193541228082379
Mean Squared Error:  1.5424555089513716
Mean (Y-test): 15.359205870728628
Mean (Y-predicted):  15.309774933357192
R2 score:  0.862593587855687








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))

  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_1007_1017_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  1.3972729247020723
Mean Squared Error:  2.8852995912626525
Mean (Y-test): 17.002690362659532
Mean (Y-predicted):  17.046175525362344
R2 score:  0.8055720538314244
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.1949257824103214
Mean Squared Error:  2.190878609615679
Mean (Y-test): 17.946663233524674
Mean (Y-predicted):  18.43450548476853
R2 score:  0.8876755015917919








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))

  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0717_0724_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  0.9335480965846811
Mean Squared Error:  1.5922156201895223
Mean (Y-test): 12.889858069111753
Mean (Y-predicted):  12.824778044150117
R2 score:  0.37767582366864694
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.6788477857644989
Mean Squared Error:  0.8332093051712494
Mean (Y-test): 12.971714477298589
Mean (Y-predicted):  12.905556179932615
R2 score:  0.6418712375754028








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))

  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))


---------grid_0_1_0324_0331_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  2.6494882324634683
Mean Squared Error:  13.396952192105159
Mean (Y-test): 31.31001690205961
Mean (Y-predicted):  30.21772947714655
R2 score:  0.5098059905808967
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  1.5794694099278062
Mean Squared Error:  5.740726818240282
Mean (Y-test): 30.334952857935402
Mean (Y-predicted):  29.834715652661604
R2 score:  0.8103992097029598








  nB = np.array(list(data.geometry.centroid.apply(lambda x: (x.x, x.y))))
