# PM2.5 prediction

### Import libraries

In [3]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score


### Data Import

In [4]:
RESOLUTION = '0_1'

geopackages = os.listdir('grids_'+RESOLUTION)
#geopackages.remove('.DS_Store')

TARGET = 'nh3_st'
NUMBER_OF_PARAMS = 20
NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2

NUMBER_OF_SAMPLES = 28


regressor = RandomForestRegressor(max_depth=100, max_features=14, n_estimators=300)

for grid in geopackages:
    data = gpd.read_file('grids_'+RESOLUTION+'/'+ grid)


    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)


    data.pop('geometry')
    data.pop('bottom')
    data.pop('top')
    data.pop('left')
    data.pop('right')

    #labels = pd.read_csv('features_'+RESOLUTION+'/'+grid[:-5]+'.csv')

    labels = pd.read_csv('fs.csv')

    labels = ml.remove_int_values(list(labels['Features']))


    labels=labels[0:NUMBER_OF_PARAMS]
    #read variables which are not null
    score_results = pd.DataFrame()

    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1)
    X['lat_cen'] = data['lat_cen']
    X['lng_cen'] = data['lng_cen']

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()


    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.30)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)


    regressor.fit(X_train, y_train)


    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print('---------'+ grid +'---------')
    print('---------RESULTS WITH RESPECT THE MODEL---------')
    print('Mean Absolute Error: ',mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error: ',mean_squared_error(y_test, y_pred))
    print('Mean (Y-test):',np.mean(y_test))
    print('Mean (Y-predicted): ',np.mean(y_pred))
    print('R2 score: ',r2_score(y_test, y_pred))



    data_new = gpd.read_file('grids_'+RESOLUTION+'/'+ grid)
    data_new = data_new[~data_new[TARGET].isnull()]
    pm25_obs = data_new['nh3_cams']
    X_new = pd.DataFrame(data=data_new, columns=labels).dropna(axis=1)
    X_new['lat_cen'] = data_new['lat_cen']
    X_new['lng_cen'] = data_new['lng_cen']

    sc = StandardScaler()
    X_new = sc.fit_transform(X_new)
    predictions_pm25 = regressor.predict(X_new)
    predictions_pm25 = pd.Series(np.array(predictions_pm25).reshape((-1,)))
    e = pd.DataFrame()

    e['predictions_pm25'] = list(predictions_pm25)
    e['pm25_obs'] = list(pm25_obs)
    e = e.dropna(axis=0)
    print('---------RESULTS WITH RESPECT THE ENTIRE DATASET---------')
    print('Mean Absolute Error: ',mean_absolute_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean Squared Error: ',mean_squared_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean (Y-test):',np.mean(e['pm25_obs']))
    print('Mean (Y-predicted): ',np.mean(e['predictions_pm25']))
    print('R2 score: ',r2_score(e['pm25_obs'], e['predictions_pm25']))
    print('\n\n\n\n')


---------grid_0_1_0418_0425_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  5.0584623015873005
Mean Squared Error:  26.327498677874626
Mean (Y-test): 12.08174603174603
Mean (Y-predicted):  13.819958333333306
R2 score:  -0.5811226473832638
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  6.242950588941573
Mean Squared Error:  65.51880846734201
Mean (Y-test): 17.13021782040596
Mean (Y-predicted):  13.874380208333317
R2 score:  0.428910102922335





---------grid_0_1_0903_0910_2021.gpkg---------
---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  22.72205952380948
Mean Squared Error:  637.575826225078
Mean (Y-test): 8.3125
Mean (Y-predicted):  31.03455952380948
R2 score:  -19.364480623640443
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  10.393368463596945
Mean Squared Error:  179.3893459580656
Mean (Y-test): 13.719999922646416
Mean (Y-predicted):  24.1133683862433