### Import libraries


In [2]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from tensorflow import keras
from datetime import datetime
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
%matplotlib inline

import time

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### Hiperparámetros

In [None]:
# Path of the CSV file read into `data`.
csv_path          = "regina.csv"
# Columns removed from `data` to build the model's feature table `df`.
dropped_features  = ['Hi Temp', 'Low Temp', 'Wind Chill', 'Heat Index', 'THSW Index', 'THW Index', 'Wind Run', 'Solar Energy', 'Hi Solar Rad.', 'In Heat', 'ISS Recept', 'Arc. Int']

# Fractions of the rows used for training and validation; the rows after
# train_perc + val_perc form the test split.
train_perc = .8
val_perc   = .1

# Window length handed to the Keras time-series dataset builder.
sequence_length = 90
# Extra shift (multiplied by sampling_rate) applied to the label start/end positions.
offset          = 0
# Sampling rate handed to the Keras time-series dataset builder.
sampling_rate   = 1
# Label horizon in hours; it is multiplied by 6 when labels are built because
# the data is resampled to 10-minute rows.
length          = 6 #hours
# A 'Temp Out' value at or below this threshold makes a label positive
# (units presumably degrees Celsius -- TODO confirm).
min_temp        = 0.5
batch_size      = 256

# Adam learning rate and maximum number of training epochs.
learning_rate   = 0.001
epochs          = 100

### Read CSV

In [None]:
# strptime-style layout of the timestamps stored in the CSV.
date_format = '%d.%m.%y %H:%M'


def dateparse(x):
    """Parse a single timestamp string laid out as `date_format`."""
    return datetime.strptime(x, date_format)


# Integer code assigned to every wind-direction label found in the CSV.
wind_dic = {'---' : 0,
            'E': 1,
            'W': 2,
            'N': 3,
            'S': 4,
            'NE': 5,
            'SE': 6,
            'NW': 7,
            'SW': 8,
            'ENE': 9,
            'NNE': 10,
            'WNW': 11,
            'NNW': 12,
            'ESE': 13,
            'SSE': 14,
            'WSW': 15,
            'SSW': 16}


def wind_code(label):
    """Return the integer code of a wind-direction label (KeyError if unknown)."""
    return wind_dic[label]


# `date_parser=` is deprecated since pandas 2.0, and a scalar parser is applied
# row by row. Read the index as text instead and convert it in one vectorized
# call, which works on old and new pandas alike.
# NOTE(review): assumes 'Date Time' is the first CSV column (the one used as
# index); the later resample() already requires that index to be a datetime.
data = pd.read_csv(
    csv_path,
    na_values=['---', '------'],
    converters={'Wind Dir': wind_code, 'Hi Dir': wind_code},
    index_col=0,
)
data.index = pd.to_datetime(data.index, format=date_format)

### Agregamos los datos que faltan

In [None]:
# Shift the outdoor temperature down by 2 (presumably a sensor correction --
# TODO confirm) and turn shifted readings at or below -10 into NaN.
shifted_temp = data['Temp Out'] - 2
data['Temp Out'] = shifted_temp.mask(shifted_temp <= -10)

# Put the records on a regular 10-minute grid anchored at the first timestamp;
# every bin holds the mean of the rows that fall into it.
data = data.resample('10min', origin='start').mean()

# Bins without a wind direction get code 0 (the code of '---').
for direction_column in ('Wind Dir', 'Hi Dir'):
    data[direction_column] = data[direction_column].fillna(0)

# Remaining gaps in the other columns are filled by interpolation.
data = data.interpolate()

data.info()

### Convertimos las marcas de tiempo a segundos y generamos señales periódicas (seno/coseno) diarias y anuales

In [None]:
# Period lengths expressed in seconds.
day = 24*60*60
year = 365.2425 * day

# Index timestamps converted to seconds (POSIX timestamps).
timestamp_s = data.index.map(pd.Timestamp.timestamp)

# Encode the position inside the day and inside the year as sine/cosine pairs.
for sin_column, cos_column, period in (('Day sin', 'Day cos', day),
                                       ('Year sin', 'Year cos', year)):
    phase = timestamp_s * (2 * np.pi / period)
    data[sin_column] = np.sin(phase)
    data[cos_column] = np.cos(phase)

## Correlación de los datos

In [None]:
# Pairwise correlation between all columns of `data`; as the last expression
# of the cell its result is what the notebook displays.
data.corr()

## Eliminamos las características que no tienen correlación

In [None]:
# Feature table: `data` without the columns listed in `dropped_features`.
df = data.drop(columns=dropped_features)

### Normalización de los datos

In [None]:
df_length = df.shape[0]

# Row positions where the training split and the validation split end.
train_length = int(train_perc * df_length)
val_length   = int((train_perc + val_perc) * df_length)

# Per-column statistics are taken from the training rows only and then
# applied to the whole table.
training_rows = df.values[:train_length]
train_mean = training_rows.mean(axis=0)
train_std  = training_rows.std(axis=0)

# The normalized table is rebuilt from a plain array, so it carries default
# integer row and column labels.
df_norm = pd.DataFrame((df.values - train_mean) / train_std)

### Dividimos los datos

In [None]:
# Positional split of the normalized table. The train and validation parts
# each stop one row before their boundary.
# NOTE(review): presumably so the number of windows equals the number of
# labels built for each split -- confirm.
train_df = df_norm.iloc[:train_length - 1]
val_df   = df_norm.iloc[train_length:val_length - 1]
test_df  = df_norm.iloc[val_length:]

### Create datasets

In [None]:
def create_y(df, start, end, length, min_temp):
    """Build one binary label per row position in ``[start, end)``.

    Label ``k`` is 1 when any 'Temp Out' value among the ``length`` rows that
    begin at row ``start + k`` is less than or equal to ``min_temp``, else 0.
    Windows that run past the end of ``df`` are simply shorter.

    Args:
        df: table with a 'Temp Out' column.
        start: first row position to label.
        end: row position one past the last labelled row.
        length: number of rows inspected for each label.
        min_temp: threshold compared against 'Temp Out'.

    Returns:
        Integer NumPy array with ``end - start`` entries.
    """
    # True wherever the temperature is at or below the threshold; the extra
    # `length` rows give the last labels their look-ahead window.
    at_or_below = df['Temp Out'][start:end + length] <= min_temp

    flags = [
        1 if np.any(at_or_below[pos:pos + length]) else 0
        for pos in range(end - start)
    ]
    return np.array(flags, dtype=np.int_)

In [None]:
# Labels for the training split start right after the first full input window.
start = (sequence_length + offset) * sampling_rate
end   = train_length + offset * sampling_rate

x_train = train_df
# `length` is given in hours; with 10-minute rows that is length * 6 rows.
y_train = create_y(df, start, end, length * 6, min_temp)

# Shuffled batches of (window, label) pairs used for fitting.
dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    data=x_train.to_numpy(),
    targets=y_train,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Labels for the validation split start one full input window after the
# training boundary.
start = train_length + (sequence_length + offset) * sampling_rate
end   = val_length + offset * sampling_rate

x_val = val_df
# `length` is given in hours; with 10-minute rows that is length * 6 rows.
y_val = create_y(df, start, end, length * 6, min_temp)

# Validation batches keep their chronological order (no shuffling).
dataset_val = keras.preprocessing.timeseries_dataset_from_array(
    data=x_val.to_numpy(),
    targets=y_val,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Labels for the test split start one full input window after the validation
# boundary; the features drop the same number of rows from their tail.
window_span = (sequence_length + offset) * sampling_rate
start = val_length + window_span
x_end = len(test_df) - window_span

x_test = test_df[:x_end]
# `length` is given in hours; with 10-minute rows that is length * 6 rows.
y_test = create_y(df, start, df_length, length * 6, min_temp)

# Test batches keep their chronological order (no shuffling).
dataset_test = keras.preprocessing.timeseries_dataset_from_array(
    data=x_test.to_numpy(),
    targets=y_test,
    sequence_length=sequence_length,
    sequence_stride=sampling_rate,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=False,
)

### Creamos el modelo

In [None]:
# Pull a single training batch only to read the tensor dimensions from it.
for x, y in dataset_train.take(1):
    pass

# Dimensions 1 and 2 of the feature batch (everything after the batch axis).
input_shape = (x.shape[1], x.shape[2])

In [None]:
inputs = keras.layers.Input(shape=input_shape)

# Two stacked 8-unit LSTM layers with input and recurrent dropout; the first
# one returns the whole sequence so the second one can consume it.
hidden = keras.layers.LSTM(units=8, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)(inputs)
hidden = keras.layers.LSTM(units=8, dropout=0.3, recurrent_dropout=0.3)(hidden)

# Single sigmoid unit: one value in [0, 1] per window.
output = keras.layers.Dense(units=1, activation="sigmoid")(hidden)

model = keras.Model(inputs, output)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    # The sigmoid already yields probabilities, hence from_logits=False.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
)
model.summary()

### Entrenamos el modelo

In [None]:
# File that receives the model whenever the validation loss improves.
path_checkpoint = "model_test_regina.h5"

# Stop once val_loss has not improved for 5 epochs. Without
# restore_best_weights the in-memory model keeps the weights of the last
# (worse) epoch, and the cells below evaluate `model` directly rather than the
# checkpoint file, so roll the weights back to the best epoch on stop.
es_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    restore_best_weights=True,
)

# Persist only the best model (lowest val_loss) seen so far.
modelckpt_callback = keras.callbacks.ModelCheckpoint(
    monitor="val_loss",
    filepath=path_checkpoint,
    verbose=1,
    save_best_only=True,
)

history = model.fit(
    dataset_train,
    epochs=epochs,
    validation_data=dataset_val,
    callbacks=[es_callback, modelckpt_callback],
)

In [None]:
def visualize_loss(history, title):
    """Plot the training and validation loss curves of a fit run.

    Args:
        history: object whose ``history`` mapping holds the per-epoch
            "loss" and "val_loss" sequences.
        title: text shown above the figure.
    """
    curves = history.history
    training_curve = curves["loss"]
    validation_curve = curves["val_loss"]
    epoch_axis = range(len(training_curve))

    plt.figure()
    for curve, fmt, caption in (
        (training_curve, "b", "Training loss"),
        (validation_curve, "r", "Validation loss"),
    ):
        plt.plot(epoch_axis, curve, fmt, label=caption)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


visualize_loss(history, "Training and Validation Loss")

### Mostramos los resultados

In [None]:
# Row position in `data` that corresponds to the first label of the current
# batch; it starts at the first test label.
i = start

print(data.index[start])

# True labels and predicted probabilities of every processed test sample.
real_y = np.zeros(0)
pred_y = np.zeros(0)

# Predicted probabilities and timestamps of the samples whose label is 1.
metrica = np.zeros(0)
fechas  = np.zeros(0, dtype='datetime64')

plt.rcParams['figure.figsize'] = [15, 5]

# Converted once instead of on every batch.
timestamps = np.array(data.index)

# At most 244 batches are evaluated (limit kept from the original notebook).
for x, y in dataset_test.take(244):
    # Flatten to 1-D explicitly: np.squeeze would produce a 0-d array for a
    # one-element batch (or a single positive label), which np.concatenate
    # rejects.
    y_true = np.asarray(y).reshape(-1)
    y_pred = model.predict(x).reshape(-1)
    # The last batch may be shorter than batch_size, so use its real length.
    batch_len = y_true.shape[0]

    real_y = np.concatenate((real_y, y_true))
    pred_y = np.concatenate((pred_y, y_pred))

    # Positions inside the batch whose true label is non-zero.
    indexes = np.flatnonzero(y_true)
    if indexes.size:
        metrica = np.concatenate((metrica, y_pred[indexes]))
        fechas  = np.concatenate((fechas, timestamps[indexes + i]))

    fig, ax = plt.subplots()

    batch_times = data.index[i:i + batch_len]
    ax.plot(batch_times, y_true, c='r', label="Test Data")
    ax.plot(batch_times, y_pred, c='g', label="Prediction")

    i += batch_len

    plt.ylim(0, 1.1)
    plt.legend()
    plt.show()

In [None]:
# Round the predicted probabilities to 0/1 before comparing with the labels.
predicted_labels = np.around(pred_y)
ConfusionMatrixDisplay.from_predictions(
    real_y,
    predicted_labels,
    display_labels=['No Heló', 'Heló'],
)

plt.show()

In [None]:
# First the sizes, then the contents, of the positive-label predictions and
# their timestamps.
for collected in (metrica, fechas):
    print(collected.shape)

for collected in (metrica, fechas):
    print(collected)