In [150]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and crass validating 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from cyclic_boosting.pipelines import pipeline_CBClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

import tensorflow as tf

import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError

import itertools
from keras.layers import LSTM

import seaborn as sns 

from scipy.stats import boxcox 
from scipy.special import inv_boxcox

from termcolor import colored


In [151]:
samples = pd.read_csv('preprocessed_lstm.csv')
samples['date'] = pd.to_datetime(samples['date'])
building_encoder = pd.get_dummies(samples['building_id'])
samples = samples.join(building_encoder.add_suffix('_b'))
reading_types = pd.read_csv('reading_types.csv')
# samples.info()
samples.info()


df_lst = [(k, v) for k, v in samples.groupby('building_id')]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1832737 entries, 0 to 1832736
Data columns (total 51 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Unnamed: 0     int64         
 1   date           datetime64[ns]
 2   building_id    int64         
 3   1              float64       
 4   2              float64       
 5   3              float64       
 6   4              float64       
 7   5              float64       
 8   6              float64       
 9   7              float64       
 10  8              float64       
 11  9              float64       
 12  10             float64       
 13  11             float64       
 14  12             float64       
 15  work_hours     bool          
 16  day type       int64         
 17  Fall           bool          
 18  Spring         bool          
 19  Summer         bool          
 20  Winter         bool          
 21  trimester_day  int64         
 22  1_b            bool          
 23  2_b    

In [153]:
from matplotlib import pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
value_type_ids = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
scaler = StandardScaler()  
scaler = scaler.fit(samples[value_type_ids]) 

trainX, trainY, train_class = np.array([[[]]]).reshape(0, 10, 12), np.array([[]]).reshape(0, 12), np.array([[]]).reshape(0, 35)


for building, df in df_lst:
    print('building:', building, '-'*80)
    train_dates = pd.to_datetime(df['date'])
    df = df.set_index('date', drop = False) 
    idx = pd.date_range('01-01-2023', '12-31-2023 23:55', freq = '5min')
    df = df.reindex(idx, fill_value = np.nan)

    # df.info()
    df = df.drop(['day type', 'Winter', 'Spring', 'Summer', 'Fall', 'work_hours', 'trimester_day'], axis = 1)

    df['date'] = df.index
    
    df['day type'] = df['date'].dt.dayofweek.map({
        0: 1,
        1: 1,
        2: 1,
        3: 1,
        4: 1,
        5: 0, 
        6: 0
    })

        
    df['season'] = df['date'].dt.month.map({
        1: 'Winter',
        2: 'Winter',
        3: 'Spring',
        4: 'Spring',
        5: 'Spring',
        6: 'Summer',
        7: 'Summer',
        8: 'Summer',
        9: 'Fall',
        10: 'Fall',
        11: 'Fall',
        12: 'Winter'
    })

    df['work_hours'] = df['date'].dt.hour.between(8, 18)
    df['work_hours'].map({True: 1, False: 0})

    df['Winter'] = df['season'].map({
        'Winter': 1,
        'Spring': 0,
        'Summer': 0,
        'Fall': 0
    })
    df['Spring'] = df['season'].map({
        'Winter': 0,
        'Spring': 1,
        'Summer': 0,
        'Fall': 0
    })
    df['Summer'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 1,
        'Fall': 0
    })
    df['Fall'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 0,
        'Fall': 1
    })


    df = df.drop('season', axis = 1)

    multivariate = df.drop(['Unnamed: 0', 'building_id', 'date'], axis = 1)  
    multivariate = multivariate.astype('float32')
    classification_info = multivariate.drop(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], axis = 1)

    classification_info.info()

    multivariate = multivariate[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']]
    multivariate = multivariate.interpolate(method = 'linear').bfill().ffill()

    multivariate.info()


    # for typeId in value_type_ids:
    #     if(multivariate[typeId].isnull().sum() <= len(multivariate) * 0.5):
    #         print(typeId, "gogogiaojsugdiadsug")
    #         print(df.info(show_counts = True))
    #         multivariate[typeId] = multivariate[typeId].fillna(0)

    sz = len(multivariate)
    train_sz = int(sz * 0.9)
    test_sz = len(multivariate) - train_sz

    df_scaled = scaler.transform(multivariate)

    train, test = df_scaled[0:train_sz,:], df_scaled[train_sz:sz,:]
    tClass, tstClass = classification_info[0: train_sz], classification_info[train_sz:sz]

    test_dates = train_dates[train_sz:sz]

    def create_dataset(dataset, classes, look_back = 1):
        dataX, dataY = [], []
        class_X = []
        for i in range(len(dataset) - look_back - 1):
            if(np.isnan(dataset[i + look_back]).any()): 
                continue
            dataY.append(dataset[i + look_back, :])
            a = dataset[i:(i + look_back), :]
            if(np.isnan(a).any()): # for masking
                a.fill(-1)
            dataX.append(a)
            class_X.append(classes.iloc[i + look_back, :])

        return np.array(dataX), np.array(dataY), np.array(class_X)
    
    look_back = 10
    tX, tY, tClass = create_dataset(train, tClass, look_back)

    print(tX.shape, tY.shape, tClass.shape)

    # testX, testY, test_class = create_dataset(test, test_class, look_back)
    
    # reshape input to be [samples, time steps, features]
    tX, tY, tClass = np.array(tX), np.array(tY), np.array(tClass)

    print(tX.shape, tY.shape, tClass.shape, trainX.shape, trainY.shape, train_class.shape)
    trainX, trainY, train_class = np.append(trainX, tX, axis = 0), np.append(trainY, tY, axis = 0), np.append(train_class, tClass, axis = 0)
    # testX, testY, test_class = np.array(testX), np.array(testY), np.array(test_class)

    # print('trainX shape == {}.'.format(trainX.shape))
    # print('trainY shape == {}.'.format(trainY.shape)) 
    # print('train_class shape == {}.'.format(train_class.shape))
    
    # plt.figure(figsize = (18, 10))
    # plt.plot(history.history['loss'], label='Training loss')
    # plt.plot(history.history['val_loss'], label='Validation loss')
    # plt.legend()
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()


    # predict_range = 1
    # train_dates = train_dates[0:train_sz]

    # n_future = pd.date_range(train_dates.iloc[look_back], periods = predict_range, freq = '5min').tolist()

    # prediction = model.predict(trainX[look_back:look_back + predict_range])

    # col = 3

    # y_pred_future = scaler.inverse_transform(prediction)[:, 10]
    # print(y_pred_future)

    # #inverse boxcox
    # # y_pred_future = inv_boxcox(y_pred_future, fitted_lambda)
    # # y_pred_future = np.vectorize(lambda x: x - 1e-6)(y_pred_future)
    # # # y_pred_future = y_pred_future[0] - 1e-6

    # forecast_dates = []
    # for time_i in n_future:
    #     forecast_dates.append(time_i.date())


    # df_forecast = pd.DataFrame({'date':np.array(forecast_dates), reading_types.at[int('11') - 1, 'reading_type_name']:y_pred_future})
    # df_forecast['date']=pd.to_datetime(df_forecast['date'])

    # original = df[['date', '11']]
    # original['date']=pd.to_datetime(original['date'])
    # original = original[look_back:look_back+predict_range]

    # print("original", '-'*80)
    # print(original.head(1))

    # print("df_forecast", '-'*80)
    # print(df_forecast.head(1))
    # print(reading_types.at[int(typeId) - 1, 'reading_type_name'], '='*100)

    # # print(trainX[-predict_range:])
    # # print(df[train_sz-predict_range:train_sz])
    # # print(df_forecast)
    # # print(original)
    # # print("original", '-'*80)
    # # print(original.head(24))

    # # print("df_forecast", '-'*80)
    # # print(df_forecast.head(24))

    # # original.set_index('date')
    # # df_forecast.set_index('date')

    # print(colored("MEAN SQUARED ERROR: ", 'red'), mean_squared_error(original[typeId], df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']]))

    # # plt.figure(figsize=(18,8))
    # # plt.plot(original[typeId],label = "original")
    # # plt.plot(df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']],label = "predicted")
    # # plt.title("Time Series Forecast")
    # # plt.xlabel("Date")
    # # plt.ylabel(reading_types.at[int(typeId) - 1, 'reading_type_name'])
    # # plt.legend()
    # # plt.show()

    # sns.lineplot(data= original, x = 'date', y = typeId)
    # sns.lineplot(data = df_forecast, x = 'date', y =  reading_types.at[int(typeId) - 1, 'reading_type_name'])
    # plt.show()

print(trainX.shape, trainY.shape, train_class.shape)


building: 1 --------------------------------------------------------------------------------


AttributeError: 'numpy.ndarray' object has no attribute 'dt'

In [None]:

value_input = keras.Input(shape = (trainX.shape[1], trainX.shape[2]), name = "values")
lstm1_value = layers.LSTM(150, return_sequences = True)(value_input)
lstm2_value = layers.LSTM(50, return_sequences = False)(lstm1_value)
# bn_value = layers.BatchNormalization()(lstm2_value)
# output_value = layers.Dense(1, activation = 'linear')(bn_value)

classification_input = keras.Input(shape = (train_class.shape[1],), name = "class")

x = layers.concatenate([lstm2_value, classification_input])
dense_layer = layers.Dense(30, activation = 'relu')(x)
# batch_norm = layers.BatchNormalization()(x)
val_pred = layers.Dense(1, activation = 'linear', name = "output")(dense_layer)

model = keras.Model(inputs = [value_input, classification_input], outputs = val_pred)


early_stop = EarlyStopping(monitor = 'val_loss', patience = 5)

model.compile(optimizer= tf.keras.optimizers.Adam(), loss = {"output": tf.keras.losses.Huber() } )

history = model.fit({
    "values": trainX, 
    "class": train_class
}, {
    "output": trainY
}, batch_size = 64, validation_split = 0.2, epochs = 20, verbose = 1, callbacks = [early_stop])

Epoch 1/20
Epoch 2/20
 2730/34292 [=>............................] - ETA: 18:17 - loss: nan

KeyboardInterrupt: 