Se importan las librerías a ocupar.

In [4]:
import seaborn as sns
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import gc

# 
import datetime
from functools import reduce
import math
from keras.models import model_from_json
import joblib
import os

# TensorFlow GPU configuration (TF 1.x-style ConfigProto/Session API).
# `allow_growth` asks TensorFlow to grow its GPU memory allocation on demand.
# NOTE(review): `sess` is not handed to Keras anywhere in the visible code
# (no `set_session` call) - confirm this configuration actually takes effect.
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Keras libs
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, Dropout, TimeDistributed, CuDNNGRU, GRU
import sklearn.metrics as metrics
from sklearn.preprocessing import RobustScaler

# Matplotlib settings
def set_chart_font():
    """Set the global matplotlib font to normal weight, size 16."""
    # Imported locally: only `matplotlib.pyplot` is imported at module level.
    import matplotlib
    matplotlib.rc('font', weight='normal', size=16)

# Notebook presentation tweaks: inject CSS into the page through IPython's HTML display.
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Widen the notebook container to 95% of the page.
display(HTML("<style>.container { width:95% !important; }</style>"))
# Larger font for paragraphs and list items in rendered markdown cells.
display(HTML("<style>.text_cell_render p, .text_cell_render li { font-size: 13pt !important; }</style>"))
# Extra bottom margin between list items in rendered markdown cells.
display(HTML("<style>.text_cell_render li { margin: 0 0 10px 0; }</style>"))

## Entrenamiento de la red

Se carga el dataset con las mediciones del interior y exterior de la casa unificadas. Se descartan las variables co2, ruido, dispositivo_id y measured_at.

In [2]:
# Load the unified indoor/outdoor measurements and parse the timestamp column.
df = pd.read_csv("./datasets/mediciones_unificadas.csv")
df["date_measured_at"] = pd.to_datetime(df["date_measured_at"])
# Discard the columns that are not used in the rest of the notebook.
df = df.drop(columns=['co2', 'ruido', 'dispositivo_id', 'measured_at'])

Se analiza el estado de las series de tiempo para determinar los conjuntos de entrenamiento y validación.

In [3]:
TARGET_DF = df

# One time series per house: house id -> its rows indexed by measurement timestamp.
HASH_OF_TIME_SERIES = {
    house_id: house_rows.set_index('date_measured_at')
    for house_id, house_rows in TARGET_DF.groupby('vivienda_id')
}

def show_stats(seried_df):
    """Print the record count and the missing-data percentage of a series.

    The series is resampled to 30-minute bins (mean per bin); the percentage
    is the share of bins whose `temperatura_interior` value is NaN.

    Args:
        seried_df: DataFrame with a datetime index and a
            `temperatura_interior` column.
    """
    record_count = seried_df.shape[0]
    print("Número de registros:", record_count)

    half_hourly = seried_df.resample('30min').mean()
    null_counts = half_hourly.isna().sum()
    missing_rate = null_counts.temperatura_interior / len(half_hourly) * 100
    print("Porcentaje de información faltante:", missing_rate)

In [4]:
# Training pool: ids of the houses that qualify for training.
# houses_stats keeps (house id, record count, missing %) for each of them.
training_houses_pool = []
houses_stats = []

for house_id, house_series in HASH_OF_TIME_SERIES.items():
    half_hourly = house_series.resample('30min').mean()
    missing_rate = (half_hourly.isna().sum() / len(half_hourly) * 100).temperatura_interior
    record_count = house_series.shape[0]

    # A house qualifies when at most 10% of its 30-minute slots lack an indoor
    # temperature and it has more than 8000 records (the original comment
    # describes this as six months of measurements).
    if not (missing_rate <= 10 and record_count > 8000):
        continue

    training_houses_pool.append(house_id)
    houses_stats.append((house_id, record_count, missing_rate))

# Validation pool: every house that did not qualify for training.
validation_houses_pool = set(HASH_OF_TIME_SERIES).difference(training_houses_pool)

print("Cantidad de casas con las que entrenar:", len(training_houses_pool))
print("Total casas:", len(HASH_OF_TIME_SERIES))

Cantidad de casas con las que entrenar: 89
Total casas: 303


Se definen las siguientes funciones para realizar todo el procedimiento asociado de tratamiento de datos y entrenamiento.

In [5]:
def generate_network_A(number_of_examples):
    """Build and compile the recurrent network (GRU followed by dense layers).

    NOTE: a function with this same name is defined again further down in the
    notebook; once that cell runs, it replaces this definition.

    Args:
        number_of_examples: first dimension of the recurrent layer's
            `input_shape`; the second dimension is fixed at 4.

    Returns:
        A compiled Keras `Sequential` model with 4 outputs, using mean squared
        error as loss and the Adam optimizer.
    """
    layers = [
        # Recurrent part
        CuDNNGRU(input_shape=(number_of_examples, 4), units=100, return_sequences=False),
        # Feed-forward part
        Dense(128, activation='relu'),
        Dense(4),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def generate_training_houses_set(array_of_houses, training_percentage, seed=None):
    """Randomly pick a fraction of the given houses, without replacement.

    Args:
        array_of_houses: sequence of house ids to sample from.
        training_percentage: fraction (0..1) of the houses to select; the
            resulting count is truncated towards zero.
        seed: optional seed for a private random generator, so that the
            selection is reproducible. When None (the default) numpy's global
            random state is used, as before.

    Returns:
        A set with the selected house ids.
    """
    train_length = int(len(array_of_houses) * training_percentage)
    # A dedicated RandomState keeps a seeded call from disturbing global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return set(rng.choice(array_of_houses, train_length, replace=False))

def initialize_robust_scalers(series_df):
    """Fit one RobustScaler per studied feature.

    Args:
        series_df: DataFrame containing every column listed in
            FEATURES_IN_STUDY.

    Returns:
        dict mapping each feature name to the RobustScaler fitted on that
        column (note: a dictionary of scalers, not a single scaler).
    """
    fitted_scalers = {}
    for feature_name in FEATURES_IN_STUDY:
        # The scaler is fitted on a 2-D (n, 1) array, so add a trailing axis.
        column_2d = np.expand_dims(series_df[feature_name], axis=1)
        fitted_scalers[feature_name] = RobustScaler().fit(column_2d)
    return fitted_scalers

def standarize_serie(serie_df, scalers_dictionary):
    """Return a scaled copy of a series; the input frame is left untouched.

    Args:
        serie_df: DataFrame containing every column in FEATURES_IN_STUDY.
        scalers_dictionary: dict mapping feature name to a fitted scaler.

    Returns:
        A copy of `serie_df` where each studied feature column has been
        replaced by its scaler's `transform` output.
    """
    scaled_df = serie_df.copy()
    for feature_name in FEATURES_IN_STUDY:
        scaler = scalers_dictionary[feature_name]
        column_2d = np.expand_dims(scaled_df[feature_name], axis=1)
        scaled_df[feature_name] = scaler.transform(column_2d)
    return scaled_df

def generate_lagged_matrix(serie_df, amoung_of_lag, features=None):
    """Build the lagged (sliding-window) samples of a series.

    For every row at position k >= amoung_of_lag, the sample input is the
    `amoung_of_lag` rows immediately before it and the target is row k itself.

    Args:
        serie_df: DataFrame (typically already scaled) containing the feature
            columns.
        amoung_of_lag: number of previous rows in each input window; must be
            a positive integer.
        features: optional list of column names to use, in order. Defaults to
            the module-level FEATURES_IN_STUDY.

    Returns:
        Tuple (X, Y) of plain lists: X[n] is a list of `amoung_of_lag` records
        and Y[n] is the record that follows them. Both are empty when the
        series has no more than `amoung_of_lag` rows.

    Raises:
        ValueError: if `amoung_of_lag` is smaller than 1.
    """
    if amoung_of_lag < 1:
        raise ValueError("amoung_of_lag must be a positive integer")
    if features is None:
        features = FEATURES_IN_STUDY

    # Extract all records in one step instead of building a Series per row.
    records = serie_df[list(features)].values.tolist()

    window_ends = range(amoung_of_lag, len(records))
    X = [records[end - amoung_of_lag:end] for end in window_ends]
    Y = [list(records[end]) for end in window_ends]
    return X, Y

def generate_training_matrix(amount_of_lag, training_ids, general_df, hash_of_time_series):
    """Fit the scalers on the training houses and build the training matrices.

    Args:
        amount_of_lag: number of previous measurements in each input window.
        training_ids: collection of house ids used for training.
        general_df: DataFrame with all measurements and a `vivienda_id`
            column; only the rows of the training houses are used to fit the
            scalers.
        hash_of_time_series: dict mapping house id to that house's time series.

    Returns:
        Tuple (X_train, Y_train, scalers): the lagged inputs and targets of
        all training houses stacked into numpy arrays, and the dict of fitted
        scalers.
    """
    # Fit the scalers only on the rows that belong to the training houses.
    training_mask = general_df['vivienda_id'].isin(list(training_ids))
    scalers = initialize_robust_scalers(general_df[training_mask])

    X_train, Y_train = [], []
    for house_id in training_ids:
        standarized_serie = standarize_serie(hash_of_time_series[house_id], scalers)
        x, y = generate_lagged_matrix(standarized_serie, amount_of_lag)
        # extend() accumulates in linear time; chaining `+` over every house
        # re-copies the whole accumulated list on each step.
        X_train.extend(x)
        Y_train.extend(y)

    return np.asarray(X_train), np.asarray(Y_train), scalers

def generate_validation_matrix(amount_of_lag, scalers, validation_series):
    """Build the lagged validation matrices of every validation series.

    Args:
        amount_of_lag: number of previous measurements in each input window.
        scalers: dict of fitted scalers, applied to each series before lagging.
        validation_series: dict mapping house id to that house's time series.

    Returns:
        dict mapping house id to a tuple (X, Y) of numpy arrays.
    """
    def _to_matrices(house_serie):
        # Scale first, then window, then convert both lists to arrays.
        scaled = standarize_serie(house_serie, scalers)
        lagged_x, lagged_y = generate_lagged_matrix(scaled, amount_of_lag)
        return np.asarray(lagged_x), np.asarray(lagged_y)

    return {
        house_id: _to_matrices(house_serie)
        for house_id, house_serie in validation_series.items()
    }

def train_model(model, X_train, Y_train, epochs=8, batch_size=256, verbose=1):
    """Fit the model on the training matrices.

    Args:
        model: object exposing a Keras-style `fit` method.
        X_train: training inputs.
        Y_train: training targets.
        epochs: number of training epochs (default 8).
        batch_size: samples per gradient update (default 256).
        verbose: verbosity level forwarded to `fit` (default 1).

    Returns:
        Whatever `model.fit` returns (the training history for Keras models).
    """
    return model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

def _longest_measured_span(temperature_series):
    """Return (start, end) of the longest gap-free run of a resampled series.

    A run is a maximal stretch of consecutive non-NaN values. Runs are ranked
    by their length in whole days and the first one wins ties, so a run that
    spans less than one full day is never selected. Returns None when no run
    qualifies.
    """
    runs = []
    run_start = None
    previous_timestamp = None
    for timestamp, temperature in zip(temperature_series.index, temperature_series.values):
        if math.isnan(temperature):
            if run_start is not None:
                # The run ended on the previous (measured) timestamp.
                runs.append((run_start, previous_timestamp))
                run_start = None
        elif run_start is None:
            run_start = timestamp
        previous_timestamp = timestamp

    # A run that reaches the end of the series is not followed by a NaN, so it
    # must be closed explicitly; otherwise it would never be considered.
    if run_start is not None:
        runs.append((run_start, previous_timestamp))

    best_span, longest_days = None, 0
    for start, end in runs:
        days = (end - start).days
        if days > longest_days:
            best_span, longest_days = (start, end), days
    return best_span


def clean_validation_data(validation_ids, hash_of_time_series, min_records=4000):
    """Keep, for each validation house, its longest continuous stretch of data.

    Each house's indoor temperature is resampled to 30-minute bins and the
    longest stretch without missing bins is located; the house's original
    (non-resampled) rows inside that stretch are kept.

    Args:
        validation_ids: iterable with the ids of the validation houses.
        hash_of_time_series: dict mapping house id to its time series
            (DataFrame with a datetime index and a `temperatura_interior`
            column).
        min_records: minimum number of records required both for the whole
            house and for the selected stretch (default 4000, described in
            the original comment as at least four months of measurements).

    Returns:
        dict mapping house id to the selected stretch. Houses that do not
        meet the thresholds are omitted.
    """
    validation_series = {}
    for house_id in validation_ids:
        house_df = hash_of_time_series[house_id]
        if house_df.shape[0] < min_records:
            continue

        half_hourly_temperature = house_df['temperatura_interior'].resample('30min').mean()
        span = _longest_measured_span(half_hourly_temperature)
        if span is None:
            continue

        best_initial_date, best_end_date = span
        longest_run = house_df.loc[best_initial_date:best_end_date]
        if longest_run.shape[0] >= min_records:
            validation_series[house_id] = longest_run

    return validation_series

def return_feature_to_original_scale(matrix, scalers, feature):
    """Extract one feature column of a matrix and undo its scaling.

    Args:
        matrix: 2-D array (m x 4) of scaled values.
        scalers: dict mapping feature name to its fitted scaler.
        feature: column index. 0, 1 and 2 select the indoor temperature,
            indoor humidity and outdoor temperature scalers; any other value
            falls back to the outdoor humidity scaler.

    Returns:
        (m x 1) array with the selected column in the original scale.
    """
    scaler_name_by_index = {
        0: 'temperatura_interior',
        1: 'humedad_interior',
        2: 'temperatura_exterior',
    }
    scaler = scalers[scaler_name_by_index.get(feature, 'humedad_exterior')]

    selected_column = matrix[:, feature]
    return scaler.inverse_transform(np.expand_dims(selected_column, axis=1))

def calculate_metrics(model, scalers, amount_of_lag, hash_of_matrix):
    """Evaluate a model on every validation series and average the results.

    For each series the overall MSE is computed on the scaled values, and the
    MAE of each of the 4 features is computed in the original scale.

    Args:
        model: object exposing `predict`.
        scalers: dict of fitted scalers used to undo the scaling.
        amount_of_lag: accepted for interface compatibility; not used here.
        hash_of_matrix: dict mapping house id to a tuple (X_val, Y_val).

    Returns:
        Tuple (mse, mae_indoor_temperature, mae_indoor_humidity,
        mae_outdoor_temperature, mae_outdoor_humidity), each one the mean over
        all validation series.
    """
    def _average(values):
        return sum(values)/len(values)

    mse_per_series = []
    # One list of per-series MAE values for each feature index 0..3.
    mae_per_feature = [[], [], [], []]

    for X_val, Y_val in hash_of_matrix.values():
        y_pred = model.predict(X_val)
        mse_per_series.append(metrics.mean_squared_error(Y_val, y_pred))

        for feature_index, feature_maes in enumerate(mae_per_feature):
            predicted = return_feature_to_original_scale(y_pred, scalers, feature_index)
            expected = return_feature_to_original_scale(Y_val, scalers, feature_index)
            feature_maes.append(metrics.mean_absolute_error(expected, predicted))

    return (_average(mse_per_series),) + tuple(_average(maes) for maes in mae_per_feature)

In [6]:
# Keep, for every house in the validation pool, the stretch selected by clean_validation_data.
validation_series_hash = clean_validation_data(validation_houses_pool, HASH_OF_TIME_SERIES)

In [18]:
def generate_network_A(input_length, n_features=4):
    """Build and compile the recurrent network (GRU followed by dense layers).

    NOTE: this redefines the function of the same name declared earlier in
    the notebook.

    Args:
        input_length: number of time steps in each input window.
        n_features: number of features per time step, which is also the
            number of outputs (default 4).

    Returns:
        A compiled Keras `Sequential` model using mean squared error as loss
        and the Adam optimizer.
    """
    model = Sequential()

    # Recurrent part
    model.add(CuDNNGRU(input_shape=(input_length, n_features), units=100, return_sequences=False))

    # Feed-forward part
    model.add(Dense(128, activation='relu'))
    model.add(Dense(n_features))

    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Features used by the model; several helper functions above read this
# module-level list when they are called.
FEATURES_IN_STUDY = ["temperatura_interior", "humedad_interior", "temperatura_exterior", "humedad_exterior"]
# Number of previous measurements fed to the network for each prediction.
LAG_30_MINUTES = 4

model_30minutes = generate_network_A(LAG_30_MINUTES)
# Randomly pick half of the training pool. No seed is passed here, so the
# selection changes between runs.
ids_30minutes = generate_training_houses_set(training_houses_pool, 0.5)
# The scalers are fitted on the selected training houses and reused for validation.
X_train, Y_train, scalers_30minutes = generate_training_matrix(LAG_30_MINUTES, ids_30minutes, df, HASH_OF_TIME_SERIES)
hash_of_validation_matrix_30minutes = generate_validation_matrix(LAG_30_MINUTES, scalers_30minutes, validation_series_hash)

# Train, then evaluate on the validation series.
train_model(model_30minutes, X_train, Y_train)
model_30minutes_metrics = calculate_metrics(model_30minutes, scalers_30minutes, LAG_30_MINUTES, hash_of_validation_matrix_30minutes)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [19]:
def show_table_metrics(metrics):
    """Display the metrics tuple as a one-row table.

    Args:
        metrics: indexable with at least 5 values, in the order MSE, indoor
            temperature MAE, indoor humidity MAE, outdoor temperature MAE,
            outdoor humidity MAE. Inside this function the name shadows the
            module-level `metrics` alias (sklearn.metrics).
    """
    column_titles = ['MSE', 'MAE Temperatura interior', 'MAE Humedad interior', 'MAE Temperatura exterior', 'MAE Humedad exterior']
    table_data = {title: [metrics[position]] for position, title in enumerate(column_titles)}
    display(pd.DataFrame(table_data, columns=column_titles))

show_table_metrics(model_30minutes_metrics)

Unnamed: 0,MSE,MAE Temperatura interior,MAE Humedad interior,MAE Temperatura exterior,MAE Humedad exterior
0,0.002614,0.142847,0.492806,0.209711,0.906668


Se procede a guardar la red obtenida. Se guardan 3 archivos, el modelo va en `model_30minutes.json`, los pesos del modelo van en `model_30minutes.h5` y finalmente los escaladores van en `scalers_30minutes.save`

In [20]:
# Persist the network architecture as JSON.
with open("model_30minutes.json", "w") as json_file:
    json_file.write(model_30minutes.to_json())

# Persist the trained weights.
model_30minutes.save_weights("model_30minutes.h5")

# Persist the fitted scalers; use the named constant instead of repeating the literal.
scaler_filename = "scalers_30minutes.save"
joblib.dump(scalers_30minutes, scaler_filename)

['scalers_30minutes.save']

## Utilizando la red

Primero hay que cargar la RNA, los pesos y los escaladores.

In [7]:
def show_table_metrics(metrics):
    """Display the metrics tuple as a one-row table.

    NOTE: this repeats the definition given earlier in the notebook.

    Args:
        metrics: indexable with at least 5 values, in the order MSE, indoor
            temperature MAE, indoor humidity MAE, outdoor temperature MAE,
            outdoor humidity MAE.
    """
    mse = metrics[0]
    mae_indoor_temperature = metrics[1]
    mae_indoor_humidity = metrics[2]
    mae_outdoor_temperature = metrics[3]
    mae_outdoor_humidity = metrics[4]

    table = pd.DataFrame(
        {
            'MSE': [mse],
            'MAE Temperatura interior': [mae_indoor_temperature],
            'MAE Humedad interior': [mae_indoor_humidity],
            'MAE Temperatura exterior': [mae_outdoor_temperature],
            'MAE Humedad exterior': [mae_outdoor_humidity],
        },
        columns=['MSE', 'MAE Temperatura interior', 'MAE Humedad interior', 'MAE Temperatura exterior', 'MAE Humedad exterior'],
    )
    display(table)

# Load the fitted scalers.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
loaded_scarler = joblib.load("scalers_30minutes.save") 

# Read the architecture JSON and rebuild the model from it.
with open("model_30minutes.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Load the trained weights into the rebuilt model.
loaded_model.load_weights("model_30minutes.h5")

# Optional sanity check that the reloaded model performs as expected (left disabled):
# show_table_metrics(calculate_metrics(loaded_model, loaded_scarler, LAG_30_MINUTES, hash_of_validation_matrix_30minutes))

Después es necesario tener un input a predecir; en este caso se utiliza la variable `example_data` a modo de ejemplo. Simplemente se tiene que invocar la función `generate_prediction`, que retornará la predicción según el modelo que corresponda.

Importante: Notar que, como el modelo es de 30 minutos, se trabaja con las últimas 4 mediciones; en el caso de 1 hora se debe invocar con 2.

In [10]:
# Example input taken from the data (the lookup used to obtain it is left disabled):
# display(validation_series_hash[2][:5])

# Four consecutive measurements; each row holds 4 values that generate_prediction
# reads as indoor temperature, indoor humidity, outdoor temperature, outdoor humidity.
example_data = [ [17.4, 74, 13.2, 88], [17.7, 73, 13.1, 89], [17.7, 73, 13.1, 89], [17.7, 72, 13.0, 90]]

def generate_prediction(model, scalers, input_array):
    """Predict the next measurement from the latest measurements.

    Args:
        model: object exposing `predict`, fed with an array of shape
            (1, lag, 4).
        scalers: dict mapping each feature name to its fitted scaler.
        input_array: sequence of `lag` measurements in the original scale;
            each one holds, in order, indoor temperature, indoor humidity,
            outdoor temperature and outdoor humidity.

    Returns:
        1-D array with the 4 predicted values, in the same feature order and
        back in the original scale.

    Raises:
        ValueError: if `input_array` is not a 2-D (lag x 4) array of numbers.
    """
    feature_names = ("temperatura_interior", "humedad_interior", "temperatura_exterior", "humedad_exterior")

    measurements = np.asarray(input_array, dtype=float)
    if measurements.ndim != 2 or measurements.shape[1] != len(feature_names):
        raise ValueError(
            "input_array must have shape (lag, {}); got {}".format(len(feature_names), measurements.shape)
        )

    # Scale each feature column with a single call to its scaler instead of
    # one call per individual value.
    scaled = measurements.copy()
    for position, name in enumerate(feature_names):
        scaled[:, position] = scalers[name].transform(measurements[:, [position]])[:, 0]

    # The model expects a batch dimension in front: (1, lag, 4).
    predicted_value = model.predict(np.expand_dims(scaled, axis=0))

    # Bring every predicted feature back to its original scale.
    final_answer = [
        scalers[name].inverse_transform(np.expand_dims(predicted_value[:, position], axis=1))
        for position, name in enumerate(feature_names)
    ]
    return np.asarray(final_answer).flatten()

# Predict the next measurement from the example data with the reloaded model and scalers.
generate_prediction(loaded_model, loaded_scarler, example_data)

array([17.643192, 71.95427 , 12.886521, 90.59765 ], dtype=float32)