# PM2.5 prediction

### Import libraries

In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import seed
from scipy.stats import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

seed(1)
from fs import methods as m
from fs import model as ml
import geopandas as gpd
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tensorflow
tensorflow.random.set_seed(1)
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score


### Data Import

In [2]:
geopackages = os.listdir('grids')
geopackages.remove('.DS_Store')

TARGET = 'pm25_cams'
NUMBER_OF_PARAMS = 20
NUMBER_OF_COVARIATES = NUMBER_OF_PARAMS + 2

NUMBER_OF_SAMPLES = 28
regressor = RandomForestRegressor(n_estimators=250, random_state=10)

for grid in geopackages:
    data = gpd.read_file('grids/'+ grid)


    data = data[~data[TARGET].isnull()]
    data = data.dropna(axis=1)


    data.pop('geometry')
    data.pop('bottom')
    data.pop('top')
    data.pop('left')
    data.pop('right')

    labels = pd.read_csv('results/'+grid[:-5]+'.csv')
    labels = ml.remove_int_values(list(labels['Features']))
    labels=labels[0:NUMBER_OF_PARAMS]
    #read variables which are not null
    score_results = pd.DataFrame()

    #Store dataset in x and y variables
    X = pd.DataFrame(data=data, columns=labels ).dropna(axis = 1)
    X['lat_cen'] = data['lat_cen']
    X['lng_cen'] = data['lng_cen']

    Y = pd.DataFrame(data=data, columns=[TARGET] )
    Y = Y.values.ravel()


    #NUMBER_OF_COVARIATES = X.shape[1]
    y1 = np.array(Y)

    X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.20)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    print('---------RESULTS WITH RESPECT THE MODEL---------')
    print('Mean Absolute Error: ',mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error: ',mean_squared_error(y_test, y_pred))
    print('Mean (Y-test):',np.mean(y_test))
    print('Mean (Y-predicted): ',np.mean(y_pred))
    print('R2 score: ',r2_score(y_test, y_pred))



    data_new = gpd.read_file('grids/'+ grid)
    data_new = data_new[~data_new[TARGET].isnull()]
    pm25_obs = data_new['pm25_cams']
    X_new = pd.DataFrame(data=data_new, columns=labels).dropna(axis=1)
    X_new['lat_cen'] = data_new['lat_cen']
    X_new['lng_cen'] = data_new['lng_cen']

    sc = StandardScaler()
    X_new = sc.fit_transform(X_new)
    predictions_pm25 = regressor.predict(X_new)
    predictions_pm25 = pd.Series(np.array(predictions_pm25).reshape((-1,)))
    e = pd.DataFrame()

    e['predictions_pm25'] = list(predictions_pm25)
    e['pm25_obs'] = list(pm25_obs)
    e = e.dropna(axis=0)
    print('---------RESULTS WITH RESPECT THE ENTIRE DATASET---------')
    print('Mean Absolute Error: ',mean_absolute_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean Squared Error: ',mean_squared_error(e['pm25_obs'], e['predictions_pm25']))
    print('Mean (Y-test):',np.mean(e['pm25_obs']))
    print('Mean (Y-predicted): ',np.mean(e['predictions_pm25']))
    print('R2 score: ',r2_score(e['pm25_obs'], e['predictions_pm25']))
    print('\n\n\n\n')


---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  0.0957345183735161
Mean Squared Error:  0.01626432594860267
Mean (Y-test): 21.342704974429708
Mean (Y-predicted):  21.34455554306675
R2 score:  0.9998735123373554
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.057730762394991725
Mean Squared Error:  0.006856324526709493
Mean (Y-test): 21.388255779038776
Mean (Y-predicted):  21.392277466221294
R2 score:  0.9999471230383941





---------RESULTS WITH RESPECT THE MODEL---------
Mean Absolute Error:  0.13499293671191562
Mean Squared Error:  0.038499747943377904
Mean (Y-test): 30.41544211750299
Mean (Y-predicted):  30.362372947182457
R2 score:  0.9997960045536131
---------RESULTS WITH RESPECT THE ENTIRE DATASET---------
Mean Absolute Error:  0.19544233234904027
Mean Squared Error:  0.05886722683219865
Mean (Y-test): 29.705532800067555
Mean (Y-predicted):  29.551277765918858
R2 score:  0.9996651962713551





---------RESULTS WITH RESPEC