# Model prediction with Random Forest Regressor

### Import libraries

In [None]:
import warnings
import numpy as np
from numpy.random import seed
from sklearn.ensemble import RandomForestRegressor
# Seed NumPy's global random generator (presumably so that the shuffled folds
# and the forests are repeatable between runs -- neither KFold nor the
# regressor below is given its own random_state).
seed(1)
# Project-local helper module; only m.process_data is used in this notebook section.
from fs import methods as m
import geopandas as gpd
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
# Silence every warning raised from here on.
warnings.filterwarnings("ignore")
# Model configurations. NOTE(review): the same list is assigned again in the
# Configuration cell, so this assignment looks redundant.
params = ['0_1_mountains', '0_1_nomountains', '0_01_mountains', '0_01_nomountains']


### Configuration
The final results are stored in a DataFrame whose columns hold the errors and the accuracy reached by the model when it is validated with:
- values interpolated from ARPA sensors (which are also used for training);
- values of the CAMS model.


In [None]:
# How many of the selected features are used as covariates for training
NUMBER_OF_COVARIATES = 20

# Model configurations
params = [
    '0_1_mountains',
    '0_1_nomountains',
    '0_01_mountains',
    '0_01_nomountains',
]

# Periods; each entry is part of the name of the geopackage that is loaded
geopackages = [
    '0324_0331_2021',
    '0418_0425_2021',
    '0717_0724_2021',
    '0903_0910_2021',
    '1007_1014_2021',
]

# Name of the chosen target variable
TARGET = 'pm25_st'

# Empty table that collects the scores of every period
results = pd.DataFrame(
    columns=[
        'MAE_sensor', 'MSE_sensor', 'R2_sensor',
        'MAE_cams', 'MSE_cams', 'R2_cams',
    ]
)

### Data Import

In [None]:
# For every configuration and every period a Random Forest is trained and
# scored with 5-fold cross-validation. On each test fold the same error
# metrics are also computed for the CAMS values, so both can be compared.
for par in params:
    # Start every configuration from an empty table. Rows used to be written
    # under a positional label and renamed afterwards, so from the second
    # configuration on the rows of the earlier configurations stayed in the
    # table (under duplicated period labels) and ended up in the exported files.
    results = pd.DataFrame(columns=results.columns)

    for grid in geopackages:

        # A new, untrained regressor for each period
        regressor = RandomForestRegressor(max_depth=100, n_estimators=300)

        # Data acquisition: the configuration prefix selects the grid folder.
        # The second argument of m.process_data differs per grid (10 vs 30);
        # its meaning is defined in fs.methods -- TODO confirm.
        if par.startswith('0_1'):
            data = gpd.read_file('assets/grids_0_1/grid_0_1_' + grid + '.gpkg')
            data = m.process_data(data, 10, TARGET)
        else:
            data = gpd.read_file('assets/grids_0_01/grid_0_01_' + grid + '.gpkg')
            data = m.process_data(data, 30, TARGET)

        # 'nomountains' configurations keep only rows with clim_zone > 3
        # (presumably zones 1-3 are the mountain ones -- verify).
        if par.endswith('nomountains'):
            data = data[data['clim_zone'] > 3]
        # clim_zone is only a filter, not a covariate; drop() returns a new
        # frame instead of mutating the filtered slice in place.
        data = data.drop(columns=['clim_zone'])
        data = data[data[TARGET].notnull()]
        data = data.dropna(axis=1).dropna(axis=0)

        # Feature names produced by the feature-selection step for this configuration
        labels = pd.read_csv('assets/fs_results/' + TARGET + par + '_features_model.csv')['Features'].tolist()

        # Store dataset in X and Y. Selected features that are missing from
        # `data` become all-NaN columns and are removed by dropna(axis=1).
        X = pd.DataFrame(data=data, columns=labels).dropna(axis=1)
        # CAMS values are kept aside as a baseline and are not used as a covariate
        cams_model = X.pop('pm25_cams').to_numpy()

        X = X.iloc[:, :NUMBER_OF_COVARIATES].to_numpy()
        Y = pd.DataFrame(data=data, columns=[TARGET]).values.ravel()

        # Per-fold scores: *1 = model prediction vs target, *2 = CAMS vs target
        mae_list1, mse_list1, r2_list1 = [], [], []
        mae_list2, mse_list2, r2_list2 = [], [], []
        print('---------' + grid + '---------')

        # K-Fold is applied
        skf = KFold(n_splits=5, shuffle=True)
        for fold, (train_index, test_index) in enumerate(skf.split(X), start=1):
            print("Iteration n°:  ", fold)

            y_train, y_test = Y[train_index], Y[test_index]
            cams_model_validation = cams_model[test_index]

            # The scaler is fitted on the training fold only and then applied
            # to the test fold, so no test statistics reach the training step.
            sc = StandardScaler()
            X_train = sc.fit_transform(X[train_index])
            X_test = sc.transform(X[test_index])

            # Training
            regressor.fit(X_train, y_train)

            # Validation with the test set
            y_pred = regressor.predict(X_test)
            mae_list1.append(mean_absolute_error(y_test, y_pred))
            mse_list1.append(mean_squared_error(y_test, y_pred))
            r2_list1.append(r2_score(y_test, y_pred))

            # Validation with CAMS model values on the same test rows
            mae_list2.append(mean_absolute_error(y_test, cams_model_validation))
            mse_list2.append(mean_squared_error(y_test, cams_model_validation))
            r2_list2.append(r2_score(y_test, cams_model_validation))

        # At the end of the K-Fold the results obtained are averaged
        avg_mae1 = np.mean(mae_list1)
        avg_mse1 = np.mean(mse_list1)
        avg_r21 = np.mean(r2_list1)

        avg_mae2 = np.mean(mae_list2)
        avg_mse2 = np.mean(mse_list2)
        avg_r22 = np.mean(r2_list2)

        print('---------VALIDATION (ARPA)  ---------')
        print('Mean Absolute Error: ', avg_mae1)
        print('Mean Squared Error: ', avg_mse1)
        print('R2 score: ', avg_r21)
        print('---------VALIDATION (CAMS) ---------')
        print('Mean Absolute Error: ', avg_mae2)
        print('Mean Squared Error: ', avg_mse2)
        print('R2 score: ', avg_r22)

        # One row per period, labelled directly with the period name
        results.loc[grid] = [
            round(avg_mae1, 3), round(avg_mse1, 3), round(avg_r21, 3),
            round(avg_mae2, 3), round(avg_mse2, 3), round(avg_r22, 3),
        ]

    # Results of this configuration are exported (metrics as rows, periods as columns)
    new = results.T
    new.to_excel('assets/test/RF' + TARGET + par + '.xlsx')
    new.to_csv('assets/test/RF' + TARGET + par + '.csv')
