In [60]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and cross-validation 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

import tensorflow as tf

import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError

import itertools
from keras.layers import LSTM

import seaborn as sns 

from scipy.stats import boxcox 
from scipy.special import inv_boxcox

from termcolor import colored

import joblib

from tqdm import tqdm

In [61]:
# Read the preprocessed readings and turn the 'date' column into real timestamps.
samples = pd.read_csv('preprocessed_lstm.csv').assign(
    date = lambda frame: pd.to_datetime(frame['date'])
)
reading_types = pd.read_csv('reading_types.csv')
# samples.info()

# One (building_id, readings) pair per building, in groupby order.
df_lst = list(samples.groupby('building_id'))

models = [[]]


In [62]:
from matplotlib import pyplot as plt
# StandardScaler lives in sklearn.preprocessing; sklearn.discriminant_analysis
# only happens to re-export it.
from sklearn.preprocessing import StandardScaler

# Column names of the reading types in the samples frame ('1' .. '12').
value_type_ids = [str(type_no) for type_no in range(1, 13)]
modelDict = {}
# Complete 5-minute grid for 2023. It does not depend on the building or the
# reading type, so it is built once instead of on every iteration.
idx = pd.date_range('01-01-2023', '12-31-2023 23:55', freq = '5min')
for building, df in df_lst:
    # Every building gets its own list of trained models.
    modelDict[building] = []
    # Align the building's readings to the full grid exactly once; rows that
    # are missing from the data become NaN. Doing this inside the reading-type
    # loop would call set_index('date') on a frame whose 'date' column already
    # holds NaT for the inserted rows, and reindex() rejects the resulting
    # duplicate index labels.
    df = df.set_index('date', drop = False).reindex(idx)
    for typeId in value_type_ids:
        print(building, '=' * 80)

        # Single-column float32 frame holding only the current reading type.
        multivariate = df[[typeId]].astype('float32')
        # info() prints its report itself and returns None, so it is not
        # wrapped in print().
        multivariate.info()

        # boxcox
        # multivariate[typeId] = multivariate[typeId].apply(lambda x: x + 1e-6)
        # df_boxcox_vals, fitted_lambda = boxcox(multivariate[typeId])
        # multivariate[typeId] = df_boxcox_vals

        # Chronological 90/10 split sizes.
        sz = len(multivariate)
        train_sz = int(sz * 0.9)
        test_sz = sz - train_sz

        # NOTE(review): the scaler is fit on the whole series, including the
        # held-out tail. Fit it on the first train_sz rows instead if `test`
        # is meant to be an unbiased hold-out.
        scaler = StandardScaler()
        scaler = scaler.fit(multivariate)
        df_scaled = scaler.transform(multivariate)

        train, test = df_scaled[:train_sz,:], df_scaled[train_sz:sz,:]
        def create_dataset(dataset, look_back = 1, mask_value = -1):
            """Build supervised (window, next value) pairs from a 2-D series.

            Parameters
            ----------
            dataset : array of shape (n_steps, n_features)
                Series to window. It is never modified.
            look_back : int
                Number of consecutive rows in each input window.
            mask_value : number
                Value written over window rows that contain NaN, so that a
                Masking layer using the same value can skip those timesteps.

            Returns
            -------
            (dataX, dataY) : tuple of arrays
                dataX has shape (n_samples, look_back, n_features) and dataY
                has shape (n_samples, n_features). Samples whose target row
                contains NaN are dropped.
            """
            dataset = np.asarray(dataset)
            n_features = dataset.shape[1]
            dataX, dataY = [], []
            # The last usable window starts at len(dataset) - look_back - 1, so
            # the range must stop at len(dataset) - look_back; subtracting one
            # more would silently drop the final sample.
            for i in range(len(dataset) - look_back):
                target = dataset[i + look_back]
                # A missing target cannot be learned from; .any() keeps the
                # test well defined for more than one feature.
                if np.isnan(target).any():
                    continue
                # Copy the slice: slicing returns a view, and writing the mask
                # value into a view would corrupt `dataset` and every window
                # that was appended earlier.
                window = dataset[i:(i + look_back), :].copy()
                gap_rows = np.isnan(window).any(axis = 1)
                # Mask only the timesteps that are actually missing.
                window[gap_rows, :] = mask_value

                dataX.append(window)
                dataY.append(target)
            if not dataX:
                # Keep the documented ranks even when no sample survives.
                return (np.empty((0, look_back, n_features), dtype = dataset.dtype),
                        np.empty((0, n_features), dtype = dataset.dtype))
            return np.array(dataX), np.array(dataY)
        
        look_back = 10
        # Window both splits into [samples, time steps, features] inputs with
        # their next-step targets.
        trainX, trainY = (np.asarray(part) for part in create_dataset(train, look_back))
        testX, testY = (np.asarray(part) for part in create_dataset(test, look_back))

        # print('trainX shape == {}.'.format(trainX.shape))
        # print('trainY shape == {}.'.format(trainY.shape)) 

        # x_train, x_test, y_train, y_test = train_test_split(df.drop(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], axis = 1), df[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']])
        # x_train = np.asarray(x_train).astype(np.float32)
        # y_train = np.asarray(y_train).astype(np.float32)

        # Masked two-layer LSTM regressor; timesteps equal to -1 are skipped.
        n_steps, n_features = trainX.shape[1], trainX.shape[2]
        model = keras.Sequential([
            layers.Masking(mask_value = -1, input_shape = (n_steps, n_features)),
            layers.LSTM(150, input_shape = (n_steps, n_features), return_sequences = True),
            layers.LSTM(50, return_sequences = False),
            # model.add(LSTM(100))
            layers.BatchNormalization(),
            layers.Dense(1, activation = 'linear'),
        ])

        model.compile(loss = tf.keras.losses.Huber(), optimizer = tf.keras.optimizers.Adam())

        # Stop once the validation loss has not improved for three epochs.
        early_stop = EarlyStopping(monitor = 'val_loss', patience = 3)

        history = model.fit(
            trainX,
            trainY,
            epochs = 1,
            batch_size = 64,
            validation_split = 0.2,
            callbacks = [early_stop],
            verbose = 1,
        )
        
        # plt.figure(figsize = (18, 10))
        # plt.plot(history.history['loss'], label='Training loss')
        # plt.plot(history.history['val_loss'], label='Validation loss')
        # plt.legend()
        # plt.show()
        # plt.clf()
        # plt.cla()
        # plt.close()

        # predict_range = 24
        # test_dates = test_dates[0:predict_range]

        # n_future = pd.date_range(test_dates.iloc[0], periods = predict_range, freq = 'H').tolist()

        # prediction = model.predict(trainX[-predict_range:])
        prevVal = -1 

        # Rolling model input: the last `look_back` standardized values, laid
        # out as one sample of shape (1, look_back, 1). It starts filled with
        # -1, the mask value used when the training windows were built, so
        # timesteps that have not been seen yet are skipped by the Masking layer.
        recent_scaled = np.full((1, look_back, 1), -1, dtype = np.float32)

        def get_predictions(val): 
            """Return `val` unchanged, or a model estimate when `val` is NaN.

            Meant for Series.apply over the readings in time order. Every call
            advances the progress bar and pushes the (observed or predicted)
            value into the rolling window, so later gaps are predicted from
            the most recent `look_back` steps.

            The network was fit on standardized windows of `look_back` steps,
            so the window is kept in standardized units and only the returned
            value is converted back to the original scale. Feeding a single
            raw-unit value, as the previous version did, matches neither the
            input shape nor the scale the model was trained with.
            """
            global prevVal 

            progress_bar.update(1)
            if np.isnan(val): 
                # One predict() call per gap; the output is in scaled units.
                scaled = float(model.predict(recent_scaled, verbose = 0)[0, 0])
                pred = scaled * scaler.scale_[0] + scaler.mean_[0]
            else: 
                pred = val
                scaled = (val - scaler.mean_[0]) / scaler.scale_[0]

            # Slide the window one step and append the newest value in place.
            recent_scaled[0, :-1, 0] = recent_scaled[0, 1:, 0]
            recent_scaled[0, -1, 0] = scaled

            # Kept for compatibility: last value returned, in original units.
            prevVal = pred
            return pred 

        df.to_csv('original.csv')
        progress_bar = tqdm(total = len(multivariate))
        df[typeId] = df[typeId].apply(get_predictions)
        df.to_csv('predictions.csv')
        break

        # col = 3

        # y_pred_future = scaler.inverse_transform(prediction)[:, 0]
        # print(y_pred_future)

        #inverse boxcox
        # y_pred_future = inv_boxcox(y_pred_future, fitted_lambda)
        # y_pred_future = np.vectorize(lambda x: x - 1e-6)(y_pred_future)
        # # y_pred_future = y_pred_future[0] - 1e-6

        # forecast_dates = []
        # for time_i in n_future:
        #     forecast_dates.append(time_i)


        # df_forecast = pd.DataFrame({'date':np.array(forecast_dates), reading_types.at[int(typeId) - 1, 'reading_type_name']:y_pred_future})
        # df_forecast['date']=pd.to_datetime(df_forecast['date'])

        # original = df[['date', typeId]]
        # original['date']=pd.to_datetime(original['date'])
        # original = original[train_sz:train_sz + predict_range]

        # print(reading_types.at[int(typeId) - 1, 'reading_type_name'], '='*100)

        # # print(trainX[-predict_range:])
        # # print(df[train_sz-predict_range:train_sz])
        # # print(df_forecast)
        # # print(original)
        # # print("original", '-'*80)
        # # print(original.head(24))

        # # print("df_forecast", '-'*80)
        # # print(df_forecast.head(24))

        # # original.set_index('date')
        # # df_forecast.set_index('date')

        # print(colored("MEAN SQUARED ERROR: ", 'red'), mean_squared_error(original[typeId], df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']]))

        # # plt.figure(figsize=(18,8))
        # # plt.plot(original[typeId],label = "original")
        # # plt.plot(df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']],label = "predicted")
        # # plt.title("Time Series Forecast")
        # # plt.xlabel("Date")
        # # plt.ylabel(reading_types.at[int(typeId) - 1, 'reading_type_name'])
        # # plt.legend()
        # # plt.show()

        # sns.lineplot(data= original, x = 'date', y = typeId)
        # sns.lineplot(data = df_forecast, x = 'date', y =  reading_types.at[int(typeId) - 1, 'reading_type_name'])
        # plt.show()
        modelDict[building].append(model)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 105120 entries, 2023-01-01 00:00:00 to 2023-12-31 23:55:00
Freq: 5T
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       87925 non-null  float32
dtypes: float32(1)
memory usage: 1.2 MB
None






 13%|█▎        | 13236/105120 [01:54<13:16, 115.36it/s]
 97%|█████████▋| 102170/105120 [21:37<02:17, 21.45it/s]  

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 105120 entries, 2023-01-01 00:00:00 to 2023-12-31 23:55:00
Freq: 5T
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Unnamed: 0     89691 non-null   float64       
 1   date           89691 non-null   datetime64[ns]
 2   building_id    89691 non-null   float64       
 3   1              105120 non-null  float64       
 4   2              88918 non-null   float64       
 5   3              87058 non-null   float64       
 6   4              88949 non-null   float64       
 7   5              88866 non-null   float64       
 8   6              89022 non-null   float64       
 9   7              87976 non-null   float64       
 10  8              88977 non-null   float64       
 11  9              88970 non-null   float64       
 12  10             88944 non-null   float64       
 13  11             77714 non-null   float64       
 14  12       

100%|██████████| 105120/105120 [21:52<00:00, 21.45it/s]

 41/568 [=>............................] - ETA: 2:09 - loss: 0.3102

KeyboardInterrupt: 

In [None]:
# joblib.dump(modelDict, 'modelDict.pkl', compress = 9)

['modelDict.pkl']