In [63]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and crass validating 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

import tensorflow as tf

import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError

import itertools
from keras.layers import LSTM

import seaborn as sns 

from scipy.stats import boxcox 
from scipy.special import inv_boxcox

from termcolor import colored

import joblib

from tqdm import tqdm

In [64]:
samples = pd.read_csv('preprocessed_lstm.csv')
samples['date'] = pd.to_datetime(samples['date'])
reading_types = pd.read_csv('reading_types.csv')
# samples.info()

df_lst = [(k, v) for k, v in samples.groupby('building_id')]

models = [[]]


In [65]:
from matplotlib import pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
value_type_ids = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
preds = pd.DataFrame()
cnt = 0
for building, df in df_lst[cnt:]:
    # modelDict[building] = []
    for typeId in value_type_ids:
        print(building, '=' * 80)

        df = df.set_index('date', drop = False) 
        idx = pd.date_range('01-01-2023', '12-31-2023 23:55', freq = '5min')
        df = df.reindex(idx, fill_value = np.nan)

        multivariate = df.drop(['Unnamed: 0', 'building_id', 'date'], axis = 1)  
        multivariate = multivariate.astype('float32')
        multivariate = multivariate[[typeId]]
        # print(multivariate.info())
        sz = len(multivariate)
        train_sz = int(sz * 0.9)
        test_sz = len(multivariate) - train_sz

        scaler = StandardScaler()  
        scaler = scaler.fit(multivariate) 
        df_scaled = scaler.transform(multivariate)

        train, test = df_scaled[:train_sz,:], df_scaled[train_sz:sz,:]

        def create_dataset(dataset, look_back = 1):
            dataX, dataY = [], []
            for i in range(len(dataset) - look_back - 1):
                if(np.isnan(dataset[i + look_back])): 
                    continue
                a = dataset[i:(i + look_back), :]
                if(np.isnan(a).any()): # for masking
                    a.fill(-1)

                dataX.append(a)
                dataY.append(dataset[i + look_back])
            return np.array(dataX), np.array(dataY)
        
        look_back = 10
        trainX, trainY = create_dataset(train, look_back)
        testX, testY = create_dataset(test, look_back)
        
        # reshape input to be [samples, time steps, features]
        trainX, trainY = np.array(trainX), np.array(trainY)
        testX, testY = np.array(testX), np.array(testY)

        model = keras.Sequential()
        model.add(keras.layers.Masking(mask_value = -1, input_shape = (trainX.shape[1], trainX.shape[2])))
        model.add(layers.LSTM(150, input_shape = (trainX.shape[1], trainX.shape[2]), return_sequences = True))
        model.add(LSTM(50, return_sequences = False))
        # model.add(LSTM(100))
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(1, activation= 'linear'))


        early_stop = EarlyStopping(monitor = 'val_loss', patience = 3)

        model.compile(optimizer= tf.keras.optimizers.Adam(), loss= tf.keras.losses.Huber())

        history = model.fit(trainX, trainY, batch_size=64, validation_split = 0.2, epochs=20, verbose = 1, callbacks = [early_stop])
        
        # plt.figure(figsize = (18, 10))
        # plt.plot(history.history['loss'], label='Training loss')
        # plt.plot(history.history['val_loss'], label='Validation loss')
        # plt.legend()
        # plt.show()
        # plt.clf()
        # plt.cla()
        # plt.close()

        # predict_range = 24
        # test_dates = test_dates[0:predict_range]

        # n_future = pd.date_range(test_dates.iloc[0], periods = predict_range, freq = 'H').tolist()

        # prediction = model.predict(trainX[-predict_range:])
        prevVal = -1 

        def get_predictions(val): 
            global prevVal 

            progress_bar.update(1)
            if(np.isnan(val)): 
                prevVal_reshaped = np.array([[prevVal]], dtype=np.float32)
                pred = model.predict(prevVal_reshaped, verbose = 0)
                pred = scaler.inverse_transform(pred)[:, 0][0]
            else: 
                pred = val
            
            prevVal = pred
            return pred 

        progress_bar = tqdm(total = len(multivariate))
        df[typeId] = df[typeId].apply(get_predictions)

    preds = preds.append(df)
    preds.to_csv('output.csv')
    cnt += 1
    print(cnt, '+' * 80)



100%|██████████| 105120/105120 [1:09:38<00:00, 25.16it/s]


KeyboardInterrupt: 



In [None]:
# joblib.dump(modelDict, 'modelDict.pkl', compress = 9)

['modelDict.pkl']