### Import libraries


In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from tensorflow import keras
from datetime import datetime
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
%matplotlib inline

import time

import os
# Sets KMP_DUPLICATE_LIB_OK=TRUE for this process.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime library"
# abort; it is set after TensorFlow/NumPy are already imported above — confirm it
# still takes effect here, or move it ahead of those imports.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### Hiperparámetros

In [2]:
# --- Data source -------------------------------------------------------------
csv_path = "regina.csv"

# Columns removed from the frame before normalisation and training.
dropped_features = [
    'Hi Temp',
    'Low Temp',
    'Wind Chill',
    'Heat Index',
    'THSW Index',
    'THW Index',
    'Wind Run',
    'Solar Energy',
    'Hi Solar Rad.',
    'In Heat',
    'ISS Recept',
    'Arc. Int',
]

# --- Chronological split (fractions of the total number of rows) -------------
train_perc = 0.8
val_perc = 0.1

# --- Windowing / labelling ---------------------------------------------------
sequence_length = 90   # rows per input window fed to the network
offset = 0             # extra shift applied to the label range
sampling_rate = 1      # step between consecutive rows inside a window
length = 6             # label horizon, in hours
min_temp = 0           # 'Temp Out' at or below this value counts as a freeze
batch_size = 256

# --- Optimisation ------------------------------------------------------------
learning_rate = 0.001
epochs = 100

### Read CSV

In [3]:
def dateparse(x):
    """Parse one 'Date Time' cell written as day.month.year hour:minute."""
    return datetime.strptime(x, '%d.%m.%y %H:%M')


# Integer code for every wind-direction label found in the CSV; '---' maps to 0.
wind_dic = {
    label: code
    for code, label in enumerate([
        '---',
        'E', 'W', 'N', 'S',
        'NE', 'SE', 'NW', 'SW',
        'ENE', 'NNE', 'WNW', 'NNW',
        'ESE', 'SSE', 'WSW', 'SSW',
    ])
}

data = pd.read_csv(
    csv_path,
    index_col=0,
    parse_dates=['Date Time'],
    date_parser=dateparse,
    na_values=['---', '------'],
    # Both direction columns are translated to their integer code while reading;
    # a label missing from wind_dic raises KeyError.
    converters={column: wind_dic.__getitem__ for column in ('Wind Dir', 'Hi Dir')},
)

### Agregamos los datos que faltan

In [4]:
# Shift every outdoor temperature reading down by 2, then blank out readings at
# or below -10 so that the interpolation further down fills them in.
# NOTE(review): the 2-degree shift is presumably a sensor calibration offset — confirm.
# NOTE: this cell is not idempotent; re-running it subtracts 2 again.
# Vectorised column arithmetic replaces the per-element `.apply(lambda ...)` calls;
# the resulting values are the same (NaN stays NaN in both forms).
data['Temp Out'] = data['Temp Out'] - 2
data['Temp Out'] = data['Temp Out'].mask(data['Temp Out'] <= -10)

# Put the series on a regular 10-minute grid anchored at the first timestamp;
# slots without any reading become NaN.
data = data.resample('10min', origin='start').mean()

# Missing wind directions get code 0 (the code wind_dic assigns to '---')
# instead of being interpolated between direction codes.
data['Wind Dir'] = data['Wind Dir'].fillna(0)
data['Hi Dir'] = data['Hi Dir'].fillna(0)

# Fill the remaining gaps by interpolating between neighbouring rows.
data = data.interpolate()

data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 627179 entries, 2010-01-28 14:20:00 to 2022-01-01 00:00:00
Freq: 10T
Data columns (total 31 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Temp Out       627179 non-null  float64
 1   Hi Temp        627179 non-null  float64
 2   Low Temp       627179 non-null  float64
 3   Out Hum        627179 non-null  float64
 4   Dew Pt.        627179 non-null  float64
 5   Wind Speed     627179 non-null  float64
 6   Wind Dir       627179 non-null  float64
 7   Wind Run       627179 non-null  float64
 8   Hi Speed       627179 non-null  float64
 9   Hi Dir         627179 non-null  float64
 10  Wind Chill     627179 non-null  float64
 11  Heat Index     627179 non-null  float64
 12  THW Index      627179 non-null  float64
 13  THSW Index     622656 non-null  float64
 14  Bar            627179 non-null  float64
 15  Rain           627179 non-null  float64
 16  Rain Rate      627179 non-null

### Convertimos las marcas de tiempo a segundos y añadimos señales periódicas (día y año)

In [5]:
# POSIX timestamp (seconds) of every row in the index.
timestamp_s = data.index.map(pd.Timestamp.timestamp)

# Period lengths in seconds.
day = 24 * 60 * 60
year = 365.2425 * day

# Phase angle of each timestamp within the daily and the yearly cycle.
day_phase = timestamp_s * (2 * np.pi / day)
year_phase = timestamp_s * (2 * np.pi / year)

# Sine/cosine pairs encode the position inside each cycle as continuous features.
data['Day sin'] = np.sin(day_phase)
data['Day cos'] = np.cos(day_phase)
data['Year sin'] = np.sin(year_phase)
data['Year cos'] = np.cos(year_phase)

## Correlación de los datos

In [6]:
# Pairwise correlation between all columns of `data`; as the last expression
# of the cell, the resulting table is displayed.
data.corr()

Unnamed: 0,Temp Out,Hi Temp,Low Temp,Out Hum,Dew Pt.,Wind Speed,Wind Dir,Wind Run,Hi Speed,Hi Dir,...,In Heat,ET,Wind Samp,Wind Tx,ISS Recept,Arc. Int,Day sin,Day cos,Year sin,Year cos
Temp Out,1.0,0.948278,0.945845,-0.641602,0.629661,0.212184,0.24441,0.212835,0.322092,0.246937,...,0.799596,0.202581,0.103744,-0.003166,0.102988,,-0.450316,-0.223826,0.0965,0.67225
Hi Temp,0.948278,1.0,0.995834,-0.618553,0.637439,0.203664,0.231939,0.204288,0.309187,0.23478,...,0.759106,0.195075,0.10444,-0.002983,0.104484,,-0.435216,-0.211761,0.07891,0.64732
Low Temp,0.945845,0.995834,1.0,-0.616361,0.635813,0.206264,0.233991,0.206885,0.311684,0.236768,...,0.756017,0.194955,0.103344,-0.002965,0.103364,,-0.434265,-0.209511,0.079588,0.644029
Out Hum,-0.641602,-0.618553,-0.616361,1.0,0.141228,-0.401745,-0.305096,-0.402432,-0.504793,-0.318324,...,-0.337264,-0.19601,0.022232,0.001619,0.014608,,0.452333,0.227616,0.212676,-0.284088
Dew Pt.,0.629661,0.637439,0.635813,0.141228,1.0,-0.120652,0.02554,-0.120465,-0.0856,0.014211,...,0.67409,0.057911,0.13879,-0.00253,0.131796,,-0.116473,-0.056457,0.316065,0.570534
Wind Speed,0.212184,0.203664,0.206264,-0.401745,-0.120652,1.0,0.318082,0.999991,0.918865,0.335477,...,-0.006695,0.134804,0.022525,-0.001167,0.040598,,-0.213365,-0.239211,-0.106014,-0.009716
Wind Dir,0.24441,0.231939,0.233991,-0.305096,0.02554,0.318082,1.0,0.318987,0.433014,0.70983,...,0.057501,0.099921,0.064045,-0.001784,0.078166,,-0.12532,-0.225141,-0.029331,0.088961
Wind Run,0.212835,0.204288,0.206885,-0.402432,-0.120465,0.999991,0.318987,1.0,0.919271,0.336315,...,-0.006427,0.135009,0.022511,-0.00117,0.040599,,-0.213583,-0.239824,-0.10609,-0.009435
Hi Speed,0.322092,0.309187,0.311684,-0.504793,-0.0856,0.918865,0.433014,0.919271,1.0,0.451702,...,0.082188,0.159157,0.057584,-0.001544,0.068617,,-0.239475,-0.268736,-0.111438,0.087644
Hi Dir,0.246937,0.23478,0.236768,-0.318324,0.014211,0.335477,0.70983,0.336315,0.451702,1.0,...,0.063142,0.100267,0.059237,-0.001771,0.075205,,-0.124374,-0.215857,-0.030606,0.096731


## Descartamos las características redundantes o sin correlación

In [7]:
# Working frame without the columns listed in `dropped_features`.
df = data.drop(columns=dropped_features)

### Normalización de los datos

In [8]:
df_length = len(df)

# Row positions where the training and the validation ranges end.
train_length = int(df_length * train_perc)
val_length = int(df_length * (train_perc + val_perc))

# Per-column statistics come from the training rows only, so nothing from the
# validation/test ranges leaks into the scaling.
train_values = df.values[:train_length]
train_mean = train_values.mean(axis=0)
train_std = train_values.std(axis=0)

# Standardise every row with the training statistics. The resulting frame has
# default integer row and column labels (the original labels are not carried over).
df_norm = pd.DataFrame((df.values - train_mean) / train_std)

### Dividimos los datos

In [9]:
# Chronological, positional split. The training and validation slices each stop
# one row before their boundary.
train_df = df_norm.iloc[:train_length - 1]
val_df = df_norm.iloc[train_length:val_length - 1]
test_df = df_norm.iloc[val_length:]

### Create datasets

In [10]:
def create_y_lento(df_norm, start, end, length, min_temp):
    """Reference (slow, pure-Python loop) labelling routine.

    For every row position ``i`` in ``[start, end)`` the label is 1 when any
    value of the FIRST column of ``df_norm`` within rows ``[i, i + length)``
    is less than or equal to ``min_temp``, and 0 otherwise.

    Args:
        df_norm: DataFrame whose first column holds the values to test.
            NOTE(review): the name suggests normalised data, in which case
            ``min_temp`` must be on the same normalised scale — confirm.
        start: First row position to label (inclusive).
        end: Last row position to label (exclusive).
        length: Number of rows in each look-ahead window.
        min_temp: Threshold; values at or below it mark a freeze.

    Returns:
        A list of ``end - start`` Python ints (0 or 1).

    Notes:
        Windows that run past the end of the data are truncated (matching
        ``create_y``) instead of raising ``IndexError``.
    """
    # Extract the first column positionally, once. The previous
    # `df_norm.iloc[j][0]` materialised a whole row per element and then looked
    # up the *label* 0, which only works when the columns are labelled 0..n.
    first_column = df_norm.iloc[:, 0].to_numpy()

    y_data = []
    for i in range(start, end):
        window = first_column[i:i + length]
        y_data.append(1 if np.any(window <= min_temp) else 0)

    return y_data

In [11]:
def create_y(df, start, end, length, min_temp):
    """Build the binary freeze label for every row position in ``[start, end)``.

    Label ``k`` (for row ``start + k``) is 1 when ``df['Temp Out']`` is less
    than or equal to ``min_temp`` anywhere in rows
    ``[start + k, start + k + length)``, and 0 otherwise. Windows that extend
    past the end of the data are truncated; an empty window yields 0.

    Args:
        df: DataFrame with a 'Temp Out' column.
        start: First row position to label (inclusive).
        end: Last row position to label (exclusive).
        length: Number of rows in each look-ahead window.
        min_temp: Threshold; values at or below it mark a freeze.

    Returns:
        Integer NumPy array of length ``end - start`` containing 0/1.
    """
    # Positional slice of the threshold test, covering every window we need.
    below = (df['Temp Out'].iloc[start:end + length] <= min_temp).to_numpy()
    available = len(below)

    # prefix[j] = number of freezing rows among the first j rows of `below`,
    # so any window count is a difference of two prefix values (O(n) overall
    # instead of one Series slice per label).
    prefix = np.concatenate(([0], np.cumsum(below.astype(np.int64))))

    positions = np.arange(end - start)
    window_lo = np.minimum(positions, available)
    window_hi = np.minimum(positions + length, available)

    y_data = (prefix[window_hi] - prefix[window_lo] > 0).astype(int)

    return y_data

In [12]:
# Row positions (in `df`) of the first and one-past-last training labels.
# Labelling starts one full input window (plus `offset`) into the data.
start = (sequence_length + offset) * sampling_rate
end = train_length + offset * sampling_rate

x_train = train_df
# `length` is in hours; the data sits on a 10-minute grid, hence 6 rows per hour.
y_train = create_y(df, start, end, length * 6, min_temp)

dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    data=x_train.values,
    targets=y_train,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=True,
)

In [13]:
# Row positions (in `df`) of the first and one-past-last validation labels:
# one full input window (plus `offset`) after the start of the validation range.
start = train_length + (sequence_length + offset) * sampling_rate
end = val_length + offset * sampling_rate

x_val = val_df
# `length` is in hours; the data sits on a 10-minute grid, hence 6 rows per hour.
y_val = create_y(df, start, end, length * 6, min_temp)

dataset_val = keras.preprocessing.timeseries_dataset_from_array(
    data=x_val.values,
    targets=y_val,
    sequence_length=sequence_length,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=False,
)

In [14]:
# Rows consumed by one full input window plus the configured offset.
window_rows = (sequence_length + offset) * sampling_rate

# Row position (in `df`) of the first test label. `start` keeps this value for
# the evaluation cell further down, which uses it to locate timestamps.
start = val_length + window_rows
x_end = len(test_df) - window_rows

# The trailing `window_rows` rows are left out of the test inputs.
x_test = test_df[:x_end]
# `length` is in hours; the data sits on a 10-minute grid, hence 6 rows per hour.
y_test = create_y(df, start, df_length, length * 6, min_temp)

dataset_test = keras.preprocessing.timeseries_dataset_from_array(
    data=x_test.values,
    targets=y_test,
    sequence_length=sequence_length,
    sequence_stride=sampling_rate,
    sampling_rate=sampling_rate,
    batch_size=batch_size,
    shuffle=False,
)

### Creamos el modelo

In [15]:
# Pull a single training batch just to read the tensor dimensions from it.
for batch in dataset_train.take(1):
    x, y = batch

# x is (batch, timesteps, features); the model input drops the batch axis.
_, timesteps, n_features = x.shape
input_shape = (timesteps, n_features)

In [16]:
inputs = keras.layers.Input(shape=input_shape)

# Two stacked 8-unit LSTM layers; the first one returns the full sequence so the
# second can consume it. Both apply 50% dropout to inputs and recurrent state.
lstm_layer = keras.layers.LSTM(
    units=8,
    dropout=0.5,
    recurrent_dropout=0.5,
    return_sequences=True,
)(inputs)
lstm_layer2 = keras.layers.LSTM(
    units=8,
    dropout=0.5,
    recurrent_dropout=0.5,
)(lstm_layer)

# Single sigmoid unit: the output is a probability, so the loss below is
# configured with from_logits=False.
output = keras.layers.Dense(units=1, activation="sigmoid")(lstm_layer2)

model = keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 90, 23)]          0         
_________________________________________________________________
lstm (LSTM)                  (None, 90, 8)             1024      
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense (Dense)                (None, 1)                 9         
Total params: 1,577
Trainable params: 1,577
Non-trainable params: 0
_________________________________________________________________


### Entrenamos el modelo

In [None]:
path_checkpoint = "model_test_regina.h5"

# Stop once val_loss has failed to improve for 5 consecutive epochs.
# restore_best_weights=True rolls the in-memory model back to the best epoch
# when early stopping triggers; without it `model` keeps the weights of the
# last (worse) epoch, while only the checkpoint file holds the best ones.
es_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    restore_best_weights=True,
)

# Persist the model to disk every time val_loss reaches a new minimum.
modelckpt_callback = keras.callbacks.ModelCheckpoint(
    monitor="val_loss",
    filepath=path_checkpoint,
    verbose=1,
    save_best_only=True,
)

history = model.fit(
    dataset_train,
    epochs=epochs,
    validation_data=dataset_val,
    callbacks=[es_callback, modelckpt_callback],
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.14034, saving model to model_test_regina.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.14034 to 0.13088, saving model to model_test_regina.h5
Epoch 3/100

Epoch 00003: val_loss improved from 0.13088 to 0.12593, saving model to model_test_regina.h5
Epoch 4/100

Epoch 00004: val_loss improved from 0.12593 to 0.12547, saving model to model_test_regina.h5
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.12547
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.12547
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.12547
Epoch 8/100

Epoch 00008: val_loss improved from 0.12547 to 0.12240, saving model to model_test_regina.h5
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.12240
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.12240
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.12240
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.12240
Epoch 13/100

In [None]:
def visualize_loss(history, title):
    """Plot training and validation loss against the epoch number.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the "loss" and "val_loss" sequences (as returned by ``model.fit``).
        title: Title shown above the figure.
    """
    curves = history.history
    train_curve, val_curve = curves["loss"], curves["val_loss"]
    epoch_axis = range(len(train_curve))

    plt.figure()
    for series, fmt, legend_label in (
        (train_curve, "b", "Training loss"),
        (val_curve, "r", "Validation loss"),
    ):
        plt.plot(epoch_axis, series, fmt, label=legend_label)
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

# Plot the loss curves stored in `history`, the value returned by model.fit.
visualize_loss(history, "Training and Validation Loss")

### Mostramos los resultados

In [None]:
# Row position in `data` of the first test label; `start` was last assigned in
# the cell that builds the test dataset. Used only to pick x-axis timestamps.
i = start

print(start)

# Accumulators for the true labels and predicted probabilities of every batch.
real_y = np.zeros(0)
pred_y = np.zeros(0)

plt.rcParams['figure.figsize'] = [15, 5]

# Iterate over every test batch and size everything from the batch itself,
# rather than hard-coding the batch count (244) and batch size (256): those
# constants broke on a final partial batch and on any other batch_size.
for x, y in dataset_test:
    y_true = np.asarray(y)
    # model.predict returns shape (n, 1); flatten it to (n,).
    y_pred = model.predict(x).reshape(-1)
    n_samples = len(y_true)

    real_y = np.concatenate((real_y, y_true))
    pred_y = np.concatenate((pred_y, y_pred))

    # One figure per batch: true labels vs. predicted probability over time.
    timestamps = data.index[i:i + n_samples]
    fig, ax = plt.subplots()
    ax.plot(timestamps, y_true, c='r', label="Test Data")
    ax.plot(timestamps, y_pred, c='g', label="Prediction")
    ax.set_ylim(0, 1.1)
    ax.legend()
    plt.show()

    i += n_samples

In [None]:
# Turn predicted probabilities into hard 0/1 labels by rounding, then compare
# them with the true labels collected during evaluation.
predicted_labels = np.around(pred_y)

ConfusionMatrixDisplay.from_predictions(
    real_y,
    predicted_labels,
    display_labels=['No Heló', 'Heló'],
)

plt.show()