In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from itertools import chain
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
import joblib

# Mis utilitarios
from utils import *


In [3]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Concatenate, ConvLSTM2D, Reshape, Conv2D, Lambda
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import plot_model

# Set Path

In [4]:
# Project root: the parent of the current working directory.
el_path_main = Path.cwd().parent
el_path_main

WindowsPath('e:/backup Asus G15 27_10_2021/Colegio de Mates Bourbaki/DLA2501 - Deep Learning Avanzado/Flood forecasting')

In [5]:
# En caso sea en google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Load Tensors

In [None]:
# Load the four training arrays stored together in a single joblib file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
X_gts_train, Y_train, X_gts_miss_train, X_ctry_lvl_train = joblib.load(el_path_main / '2. Datos/Processed data/TRAIN_data.pkl')
# X_gts_oot, Y_oot, X_gts_miss_oot, X_ctry_lvl_oot = joblib.load(el_path_main / '2. Datos/Processed data/OOT_data.pkl')

# The order of the inputs in X_gts_train is:
# ['cld', 'dtr', 'frs', 'pet', 'pre', 'tmn', 'tmp', 'tmx', 'vap', 'wet']

In [6]:
# Shrink the dataset so that it can be run locally: keep samples from `desde_index` onward.
# NOTE: the four names are rebound to slices of themselves, so running this cell a
# second time slices the already-reduced arrays again. Reload the data before re-running.
desde_index = 700
X_gts_train, Y_train, X_gts_miss_train, X_ctry_lvl_train = (
    arr[desde_index:, :]
    for arr in (X_gts_train, Y_train, X_gts_miss_train, X_ctry_lvl_train)
)

In [7]:
# X_gts_train.shape, Y_train.shape, X_gts_miss_oot.shape, X_ctry_lvl_oot.shape
# Display the shapes of the four training arrays.
X_gts_train.shape, Y_train.shape, X_gts_miss_train.shape, X_ctry_lvl_train.shape

((477, 10, 96, 36, 28), (477, 12, 36, 28), (477, 10, 96, 36, 28), (477, 180))

In [8]:
def format_Y(Y):
    """
    One-hot encode a two-class label array.

    `Y` is converted to a NumPy array and passed to Keras' `to_categorical`
    with `num_classes=2`. The encoded array is returned; `to_categorical`
    places the class dimension on a new trailing axis, so the result has one
    more dimension than a multi-dimensional `Y` (it is not flattened to
    `(n_samples, n_classes)`).
    """
    return to_categorical(np.array(Y), num_classes=2)

In [9]:
# One-hot encode the training target with format_Y.
Y_train_f = format_Y(Y_train)
# Y_oot_f = format_Y(Y_oot)

In [10]:
# Re-check the shapes of the four training arrays.
X_gts_train.shape, Y_train.shape, X_gts_miss_train.shape, X_ctry_lvl_train.shape

((477, 10, 96, 36, 28), (477, 12, 36, 28), (477, 10, 96, 36, 28), (477, 180))

In [11]:
def gen_X_gts_for_model(X_gts, X_gts_miss):
    """
    Build one two-channel array per variable from two parallel arrays.

    Parameters
    ----------
    X_gts : array-like, shape (n_samples, n_vars, ...)
        Values; axis 1 indexes the variables.
    X_gts_miss : array-like, same shape as `X_gts`
        Companion array (presumably missing-value indicators; verify against
        the data-preparation code).

    Returns
    -------
    list of np.ndarray
        One array per variable, in axis-1 order, each of shape
        (n_samples, ..., 2): channel 0 comes from `X_gts`, channel 1 from
        `X_gts_miss`.

    Raises
    ------
    ValueError
        If the per-variable slices of the two inputs differ in shape.
    """
    X_gts = np.asarray(X_gts)
    X_gts_miss = np.asarray(X_gts_miss)
    n_vars = X_gts.shape[1]
    # np.stack appends the channel axis directly and is available in every
    # NumPy release, whereas np.concat only exists in NumPy >= 2.0.
    return [
        np.stack((X_gts[:, var], X_gts_miss[:, var]), axis=-1)
        for var in range(n_vars)
    ]

In [12]:
# la_lista_de_arrays_oot = gen_X_gts_for_model(X_gts_oot, X_gts_miss_oot)
# Build the per-variable list of model inputs for the training set.
la_lista_de_arrays_train = gen_X_gts_for_model(X_gts_train, X_gts_miss_train)

In [13]:
# Shape of the first variable's input array (the last axis holds the two stacked channels).
la_lista_de_arrays_train[0].shape

(477, 96, 36, 28, 2)

# Version Simple

In [None]:
# Dimensions read from the training arrays.
# X_gts_train axes 1-4: N of GTS variables, N of time steps used for the LSTM,
# N of latitudes, N of longitudes.
n_vars_from_gts, n_times_for_lstm, n_lats, n_lons = X_gts_train.shape[1:5]
vars_ctry_lvl = X_ctry_lvl_train.shape[1]  # N of variables from country level data
n_codmes_fcast = Y_train.shape[1]  # N of codmes for forecast

# Set of parameters that will be tuned
recurrent_dropout = 0.2
dropout = 0.2
n_filters = 2
kernel_size = 3
include_miss_layer = True
# The length of this list is the number of layers minus one (use an empty list
# for a single layer); the values are the layer sizes.
dims_nn_ctry_lvl_tmp = [16]

# Set of parameters that are derived from the previous ones
n_channels_conv_lstm = 2 if include_miss_layer else 1
# The length of this list is the number of layers; the values are the layer sizes.
dims_nn_ctry_lvl = [*dims_nn_ctry_lvl_tmp, n_lats * n_lons * n_channels_conv_lstm]

In [15]:
def repeat_latent(x):
    """Repeat the entries of `x` along axis 1 (the time axis) `n_codmes_fcast` times.

    `n_codmes_fcast` is looked up in the notebook's global scope when the
    function is called.
    """
    return tf.repeat(x, n_codmes_fcast, axis=1)

In [16]:
# Inputs: one gridded sequence (time, lat, lon, channels) and one country-level vector.
mi_inp_gts = Input(shape=(n_times_for_lstm, n_lats, n_lons, n_channels_conv_lstm), name='input_gts')
mi_inp_ctry = Input(shape=(vars_ctry_lvl,), name='input_ctry')

# Encoder: ConvLSTM over the input sequence, keeping only the final output map.
mi_lstm = ConvLSTM2D(filters=n_filters,
                     kernel_size=(kernel_size, kernel_size),
                     padding='same',
                     data_format='channels_last',
                     return_sequences=False,
                     recurrent_dropout=recurrent_dropout,
                     dropout=dropout,
                     name='conv_lstm_1')(mi_inp_gts)

# Project the country-level features onto a single (lat, lon) map.
# Sizes come from n_lats / n_lons instead of the literals 36 and 28, so the
# model keeps building when the grid changes.
mi_dense_ctry = Dense(n_lats * n_lons, activation='relu')(mi_inp_ctry)
mi_dense_ctry = Reshape((n_lats, n_lons, 1))(mi_dense_ctry)

# Stack the encoder map (n_filters channels) with the country-level map (1 channel),
# add a length-1 time axis and repeat it once per forecast step.
# The channel count is n_filters + 1 rather than a literal 3, so tuning
# n_filters does not break the Reshape.
mi_encoded = Concatenate(axis=-1)([mi_lstm, mi_dense_ctry])
mi_encoded = Reshape((1, n_lats, n_lons, n_filters + 1))(mi_encoded)
mi_encoded = Lambda(repeat_latent)(mi_encoded)

# Decoder: ConvLSTM that returns one map per forecast step.
mi_decoder = ConvLSTM2D(filters=n_filters,
                        kernel_size=(kernel_size, kernel_size),
                        padding='same',
                        data_format='channels_last',
                        return_sequences=True,
                        recurrent_dropout=recurrent_dropout,
                        dropout=dropout,
                        name='conv_lstm_2')(mi_encoded)

# Per-step, per-cell softmax over the two classes (1x1 convolution).
output = TimeDistributed(Conv2D(2, (1, 1), activation='softmax'))(mi_decoder)

# Model
mi_simple_model = Model(inputs=[mi_inp_gts, mi_inp_ctry], outputs=output)
mi_simple_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
mi_simple_model.summary()





In [None]:
# Compile settings:
#   optimizer - finds the weights that minimise the loss function
#               ('adam': adaptive stochastic gradient descent)
#               https://www.tensorflow.org/api_docs/python/tf/keras/optimizers
#   loss      - measures how well the algorithm models the dataset
#               https://www.tensorflow.org/api_docs/python/tf/keras/losses
mi_simple_model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

# Train on the first element of the per-variable input list plus the country-level data.
mi_simple_model_history = mi_simple_model.fit(
    x=[la_lista_de_arrays_train[0], X_ctry_lvl_train],
    y=Y_train_f,
    validation_split=0.2,
    batch_size=128,
    epochs=20,  # number of passes over the data
    verbose=1,
)

Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.7760 - loss: 0.6810 - val_accuracy: 0.8446 - val_loss: 0.6561
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 996ms/step - accuracy: 0.7927 - loss: 0.6674 - val_accuracy: 0.8467 - val_loss: 0.6452
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 955ms/step - accuracy: 0.8014 - loss: 0.6565 - val_accuracy: 0.8613 - val_loss: 0.6347
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 931ms/step - accuracy: 0.8130 - loss: 0.6469 - val_accuracy: 0.8645 - val_loss: 0.6260
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 943ms/step - accuracy: 0.8197 - loss: 0.6381 - val_accuracy: 0.8744 - val_loss: 0.6180
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 956ms/step - accuracy: 0.8292 - loss: 0.6295 - val_accuracy: 0.8797 - val_loss: 0.6109
Epoch 7/20
[1m8/8[0m [32m━━━━━━━━━━━━━━

In [19]:
# Save the weights of the simple model under '4. Analisis'.
mi_simple_model.save_weights(el_path_main/'4. Analisis/mi_simple_model.weights.h5')
# Reference log at epoch 20 (presumably from an earlier run): 2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 488ms/step - accuracy: 0.7543 - loss: 0.6607 - val_accuracy: 0.7655 - val_loss: 0.6424


# Conv-LSTM

In [14]:
# Dimensions read from the training arrays (same assignments as in the
# 'Version Simple' section).
# X_gts_train axes 1-4: N of GTS variables, N of time steps used for the LSTM,
# N of latitudes, N of longitudes.
n_vars_from_gts, n_times_for_lstm, n_lats, n_lons = X_gts_train.shape[1:5]
vars_ctry_lvl = X_ctry_lvl_train.shape[1]  # N of variables from country level data
n_codmes_fcast = Y_train.shape[1]  # N of codmes for forecast

# Set of parameters that will be tuned
recurrent_dropout = 0.2
dropout = 0.2
n_filters = 2
kernel_size = 3
include_miss_layer = True
# The length of this list is the number of layers minus one (use an empty list
# for a single layer); the values are the layer sizes.
dims_nn_ctry_lvl_tmp = [16]

# Set of parameters that are derived from the previous ones
n_channels_conv_lstm = 2 if include_miss_layer else 1
# The length of this list is the number of layers; the values are the layer sizes.
dims_nn_ctry_lvl = [*dims_nn_ctry_lvl_tmp, n_lats * n_lons * n_channels_conv_lstm]

In [15]:
def repeat_latent(x):
    """Repeat the entries of `x` along axis 1 (the time axis) `n_codmes_fcast` times.

    `n_codmes_fcast` is looked up in the notebook's global scope when the
    function is called. This definition is identical to the one in the
    'Version Simple' section.
    """
    return tf.repeat(x, n_codmes_fcast, axis=1)

In [16]:
# One input per GTS variable, each shaped (time, lat, lon, channels).
map_inputs = [
    Input(
        shape=(n_times_for_lstm, n_lats, n_lons, n_channels_conv_lstm),
        name=f'input_gts_{var_idx}',
    )
    for var_idx in range(n_vars_from_gts)
]

# Encode every map sequence with its own ConvLSTM2D layer.
conv_outputs = [
    ConvLSTM2D(
        filters=n_filters,
        kernel_size=(kernel_size, kernel_size),
        padding='same',               # output keeps the spatial size of the input
        data_format='channels_last',  # 5D input: (samples, time, rows, cols, channels)
        return_sequences=False,       # return only the last output, not the whole sequence
        dropout=dropout,              # both dropouts are there to limit overfitting
        recurrent_dropout=recurrent_dropout,
    )(gts_input)
    for gts_input in map_inputs
]

# Country-level branch: a stack of Dense layers whose sizes come from dims_nn_ctry_lvl.
input_ctry_lvl = Input(shape=(vars_ctry_lvl,), name='input_ctry_lvl')
mi_dense_ctry_lvl = input_ctry_lvl
for i in dims_nn_ctry_lvl:
    mi_dense_ctry_lvl = Dense(units=i, activation='relu')(mi_dense_ctry_lvl)

# Reshape the dense output into a map so it can be concatenated with the ConvLSTM outputs.
mi_dense_ctry_lvl = Reshape((n_lats, n_lons, n_channels_conv_lstm))(mi_dense_ctry_lvl)

# Merge all encoder maps with the country-level map and compress them into the latent map.
mi_concat = Concatenate()([*conv_outputs, mi_dense_ctry_lvl])
latent = Conv2D(
    filters=n_filters,
    kernel_size=(kernel_size, kernel_size),
    padding='same',
    activation='relu',
)(mi_concat)
latent = Reshape((1, n_lats, n_lons, n_filters))(latent)

# Decoder: repeat the latent map along the time axis and decode it step by step.
decoder_input = Lambda(repeat_latent)(latent)
decoder = ConvLSTM2D(
    filters=n_filters,
    kernel_size=(kernel_size, kernel_size),
    padding='same',
    return_sequences=True,
)(decoder_input)

# Per-step, per-cell softmax over the two classes (1x1 convolution).
output = TimeDistributed(Conv2D(2, (1, 1), activation='softmax'))(decoder)

# Model
model = Model(inputs=[*map_inputs, input_ctry_lvl], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [17]:
# Compile settings:
#   optimizer - finds the weights that minimise the loss function
#               ('adam': adaptive stochastic gradient descent)
#               https://www.tensorflow.org/api_docs/python/tf/keras/optimizers
#   loss      - measures how well the algorithm models the dataset
#               https://www.tensorflow.org/api_docs/python/tf/keras/losses
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train on every per-variable input plus the country-level data
# (same order as the model's inputs).
history = model.fit(
    x=la_lista_de_arrays_train + [X_ctry_lvl_train],
    y=Y_train_f,
    validation_split=0.2,
    batch_size=128,
    epochs=20,  # number of passes over the data
    verbose=1,
)

Epoch 1/20


In [None]:
# Save the weights of the Conv-LSTM model under '4. Analisis'.
model.save_weights(el_path_main/'4. Analisis/model.weights.h5')

In [None]:
# Si es en google Colab


# Benchmark (Logit)

In [6]:
# Load the rescaled gridded dataset and preview five random rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index written
# when the CSV was saved; consider reading with index_col=0 — TODO confirm nothing uses it.
df_gts_reescal = pd.read_csv(el_path_main / '2. Datos/Main gridded data/consol_df_peru_reescal.csv')
df_gts_reescal.sample(5)

Unnamed: 0,lon,lat,cld,dtr,frs,pet,pre,tmn,tmp,tmx,vap,wet,codmes,cuadrante_v1,cuadrante_v2,cuadrante_v3,mi_y,el_th_usado
713874,-74.75,-14.25,0.771123,0.32766,0.2,0.358491,0.030681,0.445545,0.358663,0.256849,0.25,0.5,196001,0.0,0,1,0,
730313,-73.25,-8.75,0.773262,0.285106,,0.320755,0.082082,0.861386,0.854103,0.797945,0.801829,0.533333,196105,1.0,1,1,0,
341621,-71.25,-1.75,0.812834,0.26383,,0.396226,0.158408,0.883663,0.87538,0.811644,0.893293,0.733333,192903,1.0,1,1,0,
673819,-68.25,-9.75,0.679144,0.391489,,0.490566,0.028428,0.846535,0.87538,0.863014,0.79878,0.2,195609,1.0,1,1,0,
661783,-80.25,-8.25,,,,,,,,,,,195509,0.0,0,0,0,
