In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import os
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import regularizers
import random

In [4]:
sns.set_style("whitegrid")

In [5]:
dir_gdl = '../data/processed/2016-2019_3std_preprocessed.csv'
df = pd.read_csv(dir_gdl)
df_data = df[df['PARAM']=='PM10'].fillna(-1)
df_data.drop(columns=['Unnamed: 0'], inplace=True)
df_data = df_data[df_data.CEN != -1] #Elimina valores negativos en la columna de salida
print(df_data.shape)
df_data.head(5)

(27504, 13)


Unnamed: 0,FECHA,HORA,PARAM,AGU,ATM,CEN,LDO,LPIN,MIR,OBL,SFE,TLA,VAL
4,2016-01-01 00:00:00,00:00,PM10,49.92,146.95,86.12,174.04,-1.0,69.75,197.67,115.54,143.4,17.08
13,2016-01-01 01:00:00,01:00,PM10,52.8,-1.0,46.49,115.27,-1.0,68.99,138.09,84.24,100.46,29.15
22,2016-01-01 02:00:00,02:00,PM10,52.71,113.44,63.93,99.0,-1.0,117.7,98.79,135.39,82.05,30.89
31,2016-01-01 03:00:00,03:00,PM10,51.24,73.3,60.75,83.65,-1.0,160.3,97.94,117.6,114.74,38.74
40,2016-01-01 04:00:00,04:00,PM10,58.84,52.55,108.09,49.7,-1.0,180.89,134.39,164.68,118.83,51.48


In [6]:
X,Y = df_data[['AGU','ATM','LDO','LPIN','MIR','OBL','SFE','TLA','VAL']].to_numpy(), df_data["CEN"].to_numpy()   #separate data into input and output features

Y=np.reshape(Y, (-1,1))

X_std = (X - np.nanmin(np.where(X>=0, X, np.nan),axis=0)) / (X.max(axis=0) - np.nanmin(np.where(X>=0, X, np.nan),axis=0))
xscale = X_std * (1 - 0) + 0
xscale[X==-1]=-1

scaler_y = MinMaxScaler()
scaler_y.fit(Y)
yscale=scaler_y.transform(Y)

X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size = 0.2) #split

## Testing code configurations

#### Creating layers

In [7]:
def neuron_layers(nx,nh,ny,hl,act,r, seed=None):
    
    tf.keras.regularizers.l1(l1=r)
    
    initializer = tf.keras.initializers.RandomNormal(seed=seed)
    
    model = Sequential()
    
    for i in range(1, 3+hl):
        
        if i == 1:
            model.add(Dense(nx, input_dim=9, kernel_initializer=initializer,
                            activation=act,kernel_regularizer='l1'))
            
        elif i == (2+hl):
            model.add(Dense(ny, activation='linear'))
            
        else:
            model.add(Dense(nh, activation=act))
            
    return model

## Previous model

In [17]:
nn = pd.read_csv('../output/data/RNA/NN_ModelsNotebook6b_v1.csv')
nn.rename(columns={'Unnamed: 0':'model_number'}, inplace=True)
print(nn.shape)
nn.head(5)

(1260, 18)


Unnamed: 0,model_number,epochs,hidden_neurons,hidden_layers,activation,regularization,acc_train,mse_train,mae_train,rmse_train,std_res_train,ia_train,acc_test,mse_test,mae_test,rmse_test,std_res_test,ia_test
0,model1,50,5,1,relu,1.0,0.77,153.147,8.309,12.375,12.317,0.933,0.757,155.685,8.3,12.477,12.437,0.929
1,model2,50,5,1,relu,0.75,-0.504,1000.525,20.456,31.631,25.794,0.398,-0.525,976.515,20.416,31.249,25.304,0.401
2,model3,50,5,1,relu,0.5,0.753,164.516,8.459,12.826,12.806,0.923,0.738,168.005,8.512,12.962,12.927,0.919
3,model4,50,5,1,relu,0.1,0.715,189.513,9.138,13.766,13.761,0.911,0.706,188.523,9.123,13.73,13.722,0.909
4,model5,50,5,1,relu,0.05,0.775,149.374,8.139,12.222,12.222,0.933,0.758,154.946,8.184,12.448,12.444,0.929


### Best models

In [14]:
df_bestmodels = pd.read_csv('../output/data/RNA/BestModels_Notebook6b.csv')
print(df_bestmodels.shape)
df_bestmodels.head(4)

(10, 4)


Unnamed: 0.1,Unnamed: 0,model,variable,result
0,0,model1209,mse_train,76.078
1,1,model1254,mae_train,5.864
2,2,model1209,rmse_train,8.722
3,3,model732,mse_test,130.45


## Iteration over one model

In [None]:
i = 789
nh = int(nn.iloc[:,i+1]['hidden_neurons'])
hl = int(nn.iloc[:,i+1]['hidden_layers'])
e = int(nn.iloc[:,i+1]['epochs'])
a = nn.iloc[:,i+1]['activation']

models = {}
r = 0.01

for c in range(100):
    
    X,Y = df_data[['AGU','ATM','LDO','LPIN','MIR','OBL','SFE','TLA','VAL']].to_numpy(), df_data["CEN"].to_numpy()   #separate data into input and output features

    Y=np.reshape(Y, (-1,1))

    X_std = (X - np.nanmin(np.where(X>=0, X, np.nan),axis=0)) / (X.max(axis=0) - np.nanmin(np.where(X>=0, X, np.nan),axis=0))
    xscale = X_std * (1 - 0) + 0
    xscale[X==-1]=-1

    scaler_y = MinMaxScaler()
    scaler_y.fit(Y)
    yscale=scaler_y.transform(Y)

    X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size = 0.2) #split
        
    model = neuron_layers(10,nh,1,hl,a,r, 1)

    model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

    history = model.fit(X_train, Y_train, epochs=e, batch_size=50,  verbose=0, validation_split=0.2)

    #statistics for train
    y_hat= model.predict(X_train)
    acc_train = r2_score(Y_train, y_hat)
    mse_train = mean_squared_error(Y_train, y_hat)
    mae_train = mean_absolute_error(Y_train, y_hat)
    rmse_train = mean_squared_error(Y_train, y_hat, squared=False)

    res_train = Y_train - y_hat
    std_res_train = round(res_train.std(),3)

    num_train = ((y_hat - Y_train)**2).sum()
    den_train = ((abs(y_hat - Y_train.mean()) + 
            abs(Y_train - Y_train.mean()))**2).sum()
    ia_train = round(1 - (num_train / den_train),3)

    #accuracy for test
    y_hat = model.predict(X_test)
    acc_test = r2_score(Y_test, y_hat)
    mse_test = mean_squared_error(Y_test, y_hat)
    mae_test = mean_absolute_error(Y_test, y_hat)
    rmse_test = mean_squared_error(Y_test, y_hat, squared=False)

    res_test = Y_test - y_hat
    std_res_test = round(res_test.std(),3)

    num_test = ((y_hat - Y_test)**2).sum()
    den_test = ((abs(y_hat - Y_test.mean()) + 
            abs(Y_test - Y_test.mean()))**2).sum()
    ia_test = round(1 - (num_test / den_test),3)

    models['model'+str(i+1)+'-'+str(r)] = [e, nh, hl, a, r, 
                                           acc_train, mse_train, mae_train, rmse_train, std_res_train, ia_train
                                           acc_test, mse_test, mae_test, rmse_test, std_res_test, ia_test]

    print ('\n*For model',str(i+1),'settings are:','-epochs:',str(e),'-hidden neurons:',str(nh),'-hidden layers:',str(hl),'-activation:',a,
           '-regularization cost:',r,
           '\nAccuracy for training is:', str(acc_train),'Accuracy for test is:',str(acc_test),
          '\nMSE for training is:', str(mse_train),'MSE for test is:',str(mse_test),
           '\nMAE for training is:', str(mae_train),'MAE for test is:',str(mae_test),
           '\nRMSE for training is:', str(rmse_train),'RMSE for test is:',str(rmse_test))

## Testing for different seeds in train split

In [33]:
i = 1
bm = list(df_bestmodels.model.unique())

models = {}

rndm = range(0,100,2)

for mn in bm:
    nh = int(nn.loc[nn.model_number==mn,'hidden_neurons'])
    hl = int(nn.loc[nn.model_number==mn,'hidden_layers'])
    e = int(nn.loc[nn.model_number==mn,'epochs'])
    a = 'relu'
    r = float(nn.loc[nn.model_number==mn,'regularization'])

    for rnd_st in rndm:

        X_train,X_test,Y_train,Y_test = train_test_split(X, Y, 
                                                         test_size = 0.2, 
                                                         random_state=rnd_st) #split

        model = neuron_layers(10,nh,1,hl,a,r,1)

        model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

        history = model.fit(X_train, Y_train, epochs=e, 
                            batch_size=50,  verbose=0, validation_split=0.2)

        #statistics for train
        y_hat= model.predict(X_train)
        acc_train = r2_score(Y_train, y_hat)
        mse_train = mean_squared_error(Y_train, y_hat)
        mae_train = mean_absolute_error(Y_train, y_hat)
        rmse_train = mean_squared_error(Y_train, y_hat, squared=False)

        num_train = ((y_hat - Y_train)**2).sum()
        den_train = ((abs(y_hat - Y_train.mean()) + 
                abs(Y_train - Y_train.mean()))**2).sum()
        ia_train = round(1 - (num_train / den_train),3)

        #statistics for test
        y_hat = model.predict(X_test)
        acc_test = r2_score(Y_test, y_hat)
        mse_test = mean_squared_error(Y_test, y_hat)
        mae_test = mean_absolute_error(Y_test, y_hat)
        rmse_test = mean_squared_error(Y_test, y_hat, squared=False)

        num_test = ((y_hat - Y_test)**2).sum()
        den_test = ((abs(y_hat - Y_test.mean()) + 
                abs(Y_test - Y_test.mean()))**2).sum()
        ia_test = round(1 - (num_test / den_test),3)

        models['model'+str(i)] = [mn, e, nh, hl, a, r, rnd_st,
                                               acc_train, mse_train, mae_train, rmse_train, ia_train,
                                               acc_test, mse_test, mae_test, rmse_test, ia_test]

        print ('\n*For model',str(i),'settings are:','-epochs:',str(e),'-hidden neurons:',str(nh),'-hidden layers:',str(hl),'-activation:',a,
               '-regularization cost:',r,'-random_state:',rnd_st,
               '\nAccuracy for training is:', str(acc_train),'Accuracy for test is:',str(acc_test))

        i += 1


*For model 1 settings are: -epochs: 200 -hidden neurons: 100 -hidden layers: 3 -activation: relu -regularization cost: 0.05 -random_state: 0 
Accuracy for training is: 0.8720357126067578 Accuracy for test is: 0.7702782799800847

*For model 2 settings are: -epochs: 200 -hidden neurons: 100 -hidden layers: 3 -activation: relu -regularization cost: 0.05 -random_state: 2 
Accuracy for training is: 0.8608201262861888 Accuracy for test is: 0.7787189891666119

*For model 3 settings are: -epochs: 200 -hidden neurons: 100 -hidden layers: 3 -activation: relu -regularization cost: 0.05 -random_state: 4 
Accuracy for training is: 0.8768334878105168 Accuracy for test is: 0.7514542059731786

*For model 4 settings are: -epochs: 200 -hidden neurons: 100 -hidden layers: 3 -activation: relu -regularization cost: 0.05 -random_state: 6 
Accuracy for training is: 0.8677000456059568 Accuracy for test is: 0.7780860552422286

*For model 5 settings are: -epochs: 200 -hidden neurons: 100 -hidden layers: 3 -act

In [35]:
df_models = pd.DataFrame(models)
df_models.head(5)

Unnamed: 0,model1,model2,model3,model4,model5,model6,model7,model8,model9,model10,...,model291,model292,model293,model294,model295,model296,model297,model298,model299,model300
0,model1209,model1209,model1209,model1209,model1209,model1209,model1209,model1209,model1209,model1209,...,model788,model788,model788,model788,model788,model788,model788,model788,model788,model788
1,200,200,200,200,200,200,200,200,200,200,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,relu,relu,relu,relu,relu,relu,relu,relu,relu,relu,...,relu,relu,relu,relu,relu,relu,relu,relu,relu,relu


In [36]:
df_models.to_csv('../output/data/RNA/NN_RawModels_Notebooks6c_v1.csv')

In [37]:
df_models = df_models.transpose()
df_models = df_models.rename(columns={0:'model_number', 1:'epochs',2:'hidden_neurons',3:'hidden_layers',4:'activation',
                        5:'regularization',6:'random_state_trainsplit', 7:'acc_train',8:'mse_train',9:'mae_train',
                        10:'rmse_train',11:'ia_train',12:'acc_test',
                         13:'mse_test',14:'mae_test',15:'rmse_test',16:'ia_test'})
print(df_models.shape)
df_models.head(5)

(300, 17)


Unnamed: 0,model_number,epochs,hidden_neurons,hidden_layers,activation,regularization,random_state_trainsplit,acc_train,mse_train,mae_train,rmse_train,ia_train,acc_test,mse_test,mae_test,rmse_test,ia_test
model1,model1209,200,100,3,relu,0.05,0,0.872036,84.137671,6.142833,9.172659,0.965,0.770278,154.232328,8.025615,12.419031,0.934
model2,model1209,200,100,3,relu,0.05,2,0.86082,89.543812,6.302811,9.462759,0.962,0.778719,161.08846,8.000407,12.692063,0.937
model3,model1209,200,100,3,relu,0.05,4,0.876833,81.290216,6.125698,9.016109,0.966,0.751454,164.426907,7.964257,12.822906,0.928
model4,model1209,200,100,3,relu,0.05,6,0.8677,87.738725,6.265878,9.366895,0.963,0.778086,143.987011,7.707175,11.999459,0.936
model5,model1209,200,100,3,relu,0.05,8,0.870076,85.908342,6.280808,9.268675,0.964,0.775326,147.526076,7.878047,12.146031,0.937


In [38]:
df_models.to_csv('../output/data/RNA/NN_ModelsNotebook6c_v1.csv')

In [43]:
prev_model = df_models.loc[df_models.index=='model59','model_number']
prev_model[0]

'model1254'

## Filtering best models

In [44]:
df_best_models = pd.DataFrame(columns=['model','6b_model','variable','result'])

min_cols = ['mse_train','mae_train','rmse_train',
       'mse_test','mae_test','rmse_test']
max_cols = ['acc_train','ia_train',
            'acc_test','ia_test']

i = 0
for c in min_cols:
    val = df_models[c].min()
    df_tmp = df_models.loc[df_models[c]==val].copy()
    df_tmp[['epochs']] = df_tmp[['epochs']]/200
    df_tmp[['hidden_neurons']] = df_tmp[['hidden_neurons']]/100
    df_tmp[['hidden_layers']] = df_tmp[['hidden_layers']]/10
    df_tmp[['regularization']] = df_tmp[['regularization']]/1
    model = df_tmp[['epochs','hidden_neurons',
                    'hidden_layers','regularization']].sum(axis=1).idxmin()
    prev_model = df_tmp.loc[df_tmp.index==model,'model_number']
    df_best_models.loc[i] = [model, prev_model[0], c, val]
    i += 1
    
for c in max_cols:
    val = df_models[c].max()
    df_tmp = df_models.loc[df_models[c]==val].copy()
    df_tmp[['epochs']] = df_tmp[['epochs']]/200
    df_tmp[['hidden_neurons']] = df_tmp[['hidden_neurons']]/100
    df_tmp[['hidden_layers']] = df_tmp[['hidden_layers']]/10
    df_tmp[['regularization']] = df_tmp[['regularization']]/1
    model = df_tmp[['epochs','hidden_neurons',
                    'hidden_layers','regularization']].sum(axis=1).idxmin()
    prev_model = df_tmp.loc[df_tmp.index==model,'model_number']
    df_best_models.loc[i] = [model, prev_model[0], c, val]
    i += 1

In [45]:
df_best_models

Unnamed: 0,model,6b_model,variable,result
0,model59,model1254,mse_train,69.556236
1,model59,model1254,mae_train,5.496591
2,model59,model1254,rmse_train,8.340038
3,model255,model788,mse_test,126.002045
4,model111,model732,mae_test,7.347313
5,model255,model788,rmse_test,11.225063
6,model59,model1254,acc_train,0.894781
7,model59,model1254,ia_train,0.972
8,model149,model732,acc_test,0.808412
9,model255,model788,ia_test,0.947


In [49]:
df_best_models.to_csv('../output/data/RNA/BestModels_Notebook6c.csv')

In [48]:
model_var = ['random_state_trainsplit']

for m in model_var:
    df_models.groupby([m]).agg(['mean','std']).to_csv(f'../output/data/RNA/{m}_ModelAnalysis_Notebook6c.csv')