# Deep learning model implementation: PM10
Here the deep learning model is created, trained and tested.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
#Adding the code folder to the search path of python
from sys import path
path.append('../code')

In [3]:
dframe=pd.read_csv("../data/Monterrey/imputed/data/NOROESTE.csv", 
    parse_dates=["FECHA"], infer_datetime_format=True).set_index("FECHA")

## Normalizing the data
As the variables that we have are in different numeric scales, one of the tasks needed to obtain accurate prediction. This have impact in the loss function and the optimization algortihm that set the values of the deep neural network.

In [4]:
pollutants=list(dframe.columns)
dframe_norm=dframe.copy()
norm_guide={'CO':('max', 0.1),
    'NO': ('max', 0.1),
    'NO2': ('max', 0.1),
    'NOX': ('max', 0.1),
    'O3': ('max', 0.01),
    'PM10': ('max', 0.1),
    'PM2.5': ('max', 0.1),
    'PRS': ('max', 0),
    'RAINF': ('max', 0.001),
    'RH': ('max', 0),
    'SO2': ('max', 0.1),
    'SR': ('max', 0),
    'TOUT': ('max',0),
    'WDR': ('max', 0),
    'WSR': ('max', 0.1),
    }

norm_param=dict.fromkeys(norm_guide.keys())

for p in pollutants:
    if norm_guide[p][0]=='log':
        dframe_norm[p]=np.log(dframe[p]+norm_guide[p][1])
    elif norm_guide[p][0]=='mean':
        dframe_norm[p]=(dframe[p]-dframe[p].mean())/dframe[p].std()
        norm_param[p]=(dframe[p].mean(),dframe[p].std())
    elif norm_guide[p][0]=='max':
        dframe_norm[p]=(dframe[p]-dframe[p].min())/(dframe[p].max()-dframe[p].min())
        norm_param[p]=(dframe[p].min(),dframe[p].max())
    if norm_guide[p][0]=='none':
        dframe_norm[p]=dframe[p]+norm_guide[p][1]


In [5]:
dframe_norm.shape

(52608, 15)

## importing the generator functions
With the complete data, we will use a generator to iterate over the time windows created of 72 hours and extract 24h of data to predict the next 24h.

In [46]:
from ANN_model import windows_tensor
from tensorflow import keras
class observations_generator(keras.utils.Sequence):
    """
    Observation generator, it takes windows of 72 hours and from such widows it generates random observations, to predict 24 lead time taking 24 hours before.
    
    data_source is the tensor array of predictors and target variable
    
    target is the variable index to be predicted 
    
    batch_size in the size of the batches to be generated
    """

    def __init__(self, data_source, target, 
    samples_per_window=5, batch_size=64):
        self.data_source=data_source
        self.target=target
        self.batch_size=batch_size
        self.samples_per_window=samples_per_window
        self.n = 0
        self.max = self.__len__()
        c=0
        temp=0
        self.indexes=[]
        for i in range(0, int(self.data_source.shape[0]*self.samples_per_window)):
            self.indexes.append((temp,np.random.choice(np.array(list(range(24,48))),size=1)[0]))
            c=c+1
            if c==self.samples_per_window:
                c=0
            if c==0:
                temp+=1
                
    #Generate the tensor with
    def __len__(self):
        #Total lenght of the output
        return(int(np.ceil(self.data_source.shape[0]*self.samples_per_window)))
    
    def __getitem__(self, idx):
        ##Observations generator, Neural network input
        self.x_batch = self.data_source[self.indexes[idx][0],list(range((self.indexes[idx][1]-23),self.indexes[idx][1]+1)),:]
        self.y_batch = self.data_source[self.indexes[idx][0],int(self.indexes[idx][1]+24),self.target]
        return np.array(self.x_batch), np.array(self.y_batch)
    
    def __next__(self):
        if self.n >= self.max:
           self.n = 0
        result = self.__getitem__(self.n)
        self.n += 1
        return result



In [47]:
dframe_norm.head()

Unnamed: 0_level_0,CO,NO,NO2,NOX,O3,PM10,PM2.5,PRS,RAINF,RH,SO2,SR,TOUT,WDR,WSR
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-01-01 00:00:00,0.201699,0.041417,0.396682,0.164166,0.035211,0.299725,0.01859,0.542857,1.6924129999999999e-19,0.30898,0.141463,3.812011e-17,0.450818,0.740785,0.154179
2012-01-01 01:00:00,0.504336,0.38523,0.414528,0.474127,0.056338,0.254708,0.021667,0.555555,1.6924129999999999e-19,0.339878,0.160976,3.812011e-17,0.426165,0.662217,0.054403
2012-01-01 02:00:00,0.243409,0.118263,0.350696,0.217372,0.028169,0.432777,0.023643,0.565079,1.6924129999999999e-19,0.339878,0.106341,3.8715740000000006e-17,0.41445,0.745275,0.120164
2012-01-01 03:00:00,0.322948,0.241267,0.336282,0.321575,0.042254,0.215692,0.034,0.565079,1.6924129999999999e-19,0.339878,0.096585,4.050262e-17,0.410788,0.695889,0.183658
2012-01-01 04:00:00,0.12216,0.02495,0.269704,0.108753,0.049296,0.211691,0.016913,0.571428,1.6924129999999999e-19,0.278082,0.073171,3.812011e-17,0.429583,0.711603,0.249419


In [48]:
train_win,val_win,test_win=windows_tensor(dframe_norm, dframe_norm.columns.values)

######
###Creating the generators
######

train_gen=observations_generator(test_win,5)
val_gen=observations_generator(val_win,5)
test_win=observations_generator(val_win,5)

In [49]:
dframe.columns.values

array(['CO', 'NO', 'NO2', 'NOX', 'O3', 'PM10', 'PM2.5', 'PRS', 'RAINF',
       'RH', 'SO2', 'SR', 'TOUT', 'WDR', 'WSR'], dtype=object)

In [50]:
#%% Naieve Method, redefinition
def eval_naive_method(gen, steps,var):
    batch_logmse = []
    tar = []
    pred = []
    for step in range(steps):
        samples, targets = next(gen)
        preds = samples[:, -1, var] #Last index corresponds to PM10(5), PM2.5(6)
        logmse = np.log(np.mean(np.square(preds-targets)))
        #mae = np.mean(np.abs(preds - targets))
        batch_logmse.append(logmse)
        tar.extend(targets)
        pred.extend(preds)
    print("From naive assumption that the pollutant concentration"
     "\n will be the same that 24h before: \n")
    print("log(mse)= "+str(round(np.mean(batch_logmse),3)))
    from sklearn.metrics import r2_score
    print("r2= "+str(round(r2_score(tar,pred),5)))
    return round(np.mean(batch_logmse),3), round(r2_score(tar,pred),5)

#print("\n Training set: \n")
train_naive_loss, train_naive_r2 = eval_naive_method(train_gen, 511,5)


IndexError: too many indices for array

In [40]:
type(train_gen)

__main__.observations_generator