In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import pathlib 
import os
import joblib

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller  
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from collections import Counter
from datetime import date 
import datetime


In [None]:
devices_df = pd.read_csv('devices.csv')
readings_df = pd.read_csv('sampled_readings.csv')
reading_types_df = pd.read_csv('reading_types.csv')

### Outlier Detection Using IQR

In [None]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.nanpercentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.nanpercentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        print("First Quartertile:", Q1, ". Third Quartile: ", Q3, ".Interquartile Range: ", IQR)
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v >= n )
    
    return multiple_outliers   

In [None]:
df = readings_df 
for k, v in readings_df.groupby('value_type_id'):
    outliers = detect_outliers(v, 1, ['value'])
    df = df.drop(outliers, axis = 0) 

df.info()

First Quartertile: 428.0 . Third Quartile:  564.8 .Interquartile Range:  136.79999999999995
First Quartertile: 0.0 . Third Quartile:  1.7 .Interquartile Range:  1.7
First Quartertile: 31.0 . Third Quartile:  249.0 .Interquartile Range:  218.0
First Quartertile: 0.0 . Third Quartile:  0.0 .Interquartile Range:  0.0
First Quartertile: 0.0 . Third Quartile:  0.2 .Interquartile Range:  0.2
First Quartertile: 0.0 . Third Quartile:  0.1 .Interquartile Range:  0.1
First Quartertile: 3.8 . Third Quartile:  13.2 .Interquartile Range:  9.399999999999999
First Quartertile: 0.0 . Third Quartile:  31.7 .Interquartile Range:  31.7
First Quartertile: 18.6 . Third Quartile:  20.0 .Interquartile Range:  1.3999999999999986
First Quartertile: 0.0 . Third Quartile:  23.0 .Interquartile Range:  23.0
First Quartertile: 20.4 . Third Quartile:  23.6 .Interquartile Range:  3.200000000000003
First Quartertile: 27.9 . Third Quartile:  45.1 .Interquartile Range:  17.200000000000003
<class 'pandas.core.frame.DataF

### Merging devices with sampled readings
Since devices in the same building_id are situated in the same environment we should expect that they share similar IAQ. There may be differences depending on the # of people in different rooms but we will hypothesize that the difference is minimal. Here we map device_ids to buildings to group all devices by building_id

In [None]:
#merging devices with sampled readings

df = pd.merge(df, devices_df, on='device_id', how='inner')
df = df.drop('device_id', axis = 1)

### Have aggregate value_types with the same hour
Since the data is not given in consistent time-steps we will use downsampling to aggregate data points for 5 minute time-steps. We will partition the data based on value_type_id as well as building_id

In [None]:
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].dt.floor('5min')

aggregate_function = {'value': 'mean'}
df = df.groupby(['building_id', 'date', 'value_type_id']).agg(aggregate_function)


#pivot table so that value_type_id is a column 
df = pd.pivot_table(df, values = 'value', index = ['date', 'building_id'], columns = 'value_type_id').reset_index()  
df = df.rename_axis(None).rename_axis(None, axis=1)
df.columns = df.columns.map(str)

### Interpolating Small Gaps
For small gaps (15 minutes) in data we will use interpolation to predict missing values 

In [None]:
# df = df.sort_values(by = 'date')

# def my_interp(x):
#     if x.notnull().sum() > 1:
#         return x.interpolate(method='nearest').ffill().bfill()
#     else:
#         return x.ffill().bfill()
interp = pd.DataFrame()
for building, df1 in df.groupby('building_id'):
        df1 = df1.sort_values(by = 'date')
        # resampled = df.resample('60min', on = 'date', label = 'left').mean() 
        # resampled ['date'] = resampled.index.values
        for i in range (1, 13): 
                cnt = df1[str(i)].count()
                if cnt > 1: 
                        df1[str(i)] = df1[str(i)].interpolate(method='spline', order = min(cnt - 1, 3), limit = 3, axis=0)
                elif cnt == 1: 
                        df1[str(i)] = df1[str(i)].interpolate(method='linear', limit = 3, axis=0)
                else:  
                        df1[str(i)] = df1[str(i)].fillna(0)
        interp = pd.concat([interp, df1], ignore_index = True)

ValueError: order needs to be specified and greater than 0; got order: 0

In [None]:
df = interp

# Feature Generation

### Working Hour
Since IAQ most likely decreases off-work hours or when there is a lack of personell we will add features to determine working hours and weekends

In [None]:
# hour_mapping (1 if between 8am and 6pm)

# readings_df['date'] = pd.to_datetime(readings_df['date'])

df['work_hours'] = df['date'].dt.hour.between(8, 18)
df['work_hours'].map({True: 1, False: 0})


In [None]:
# day of week mapping (1 weekday, 0 weekend)

df['day type'] = df['date'].dt.dayofweek.map({
    0: 1,
    1: 1,
    2: 1,
    3: 1,
    4: 1,
    5: 0, 
    6: 0
})


In [None]:
# season mapping, use or not depending on seasonality dicky-fuller test

df['season'] = df['date'].dt.month.map({
    1: 'Winter',
    2: 'Winter',
    3: 'Spring',
    4: 'Spring',
    5: 'Spring',
    6: 'Summer',
    7: 'Summer',
    8: 'Summer',
    9: 'Fall',
    10: 'Fall',
    11: 'Fall',
    12: 'Winter'
})

season_encoder = pd.get_dummies(df['season'])
df = df.join(season_encoder)
df = df.drop('season', axis = 1)

In [None]:
def get_trimester_day(row): 
    dt = (row['date']).date()
    if 3 <= dt.month <= 5:
        return (dt - date(year=dt.year, month=3, day=1)) # Spring
    elif 6 <= dt.month <= 8:
        return (dt - date(year=dt.year, month=6, day=1))  # Summer
    elif 9 <= dt.month <= 11:
        return (dt - date(year=dt.year, month=9, day=1))  # Autumn
    else:
        if(dt.month == 12): 
            return (dt - date(year=dt.year, month=12, day=1))
        return (dt - date(year=dt.year - 1, month=12, day=1))  # Winter
    
df['trimester_day'] = df.apply(get_trimester_day, axis = 1)
df['trimester_day'] = df['trimester_day'].dt.days


### Building Encoder

In [None]:
# final_df = resampled_df 
# building_encoder = pd.get_dummies(final_df['building_id'], prefix  = 'building')
# final_df = final_df.join(building_encoder)
# final_df = final_df.drop('building_id', axis = 1) 

# device_encoder = pd.get_dummies(mergedDf['device_id'], prefix = 'device')
# mergedDf = mergedDf.join(device_encoder) don't know if this matters as much 

#### IMPORTANT: uncomment the line highlighted if you do not have preprocessed.csv, use this in the model.ipynb (so we stop working on same file)

In [None]:
df.to_csv('preprocessed_lstm.csv') # <-- COMMENT THIS OUT IF YOU DON'T HAVE preprocessd.csv yet 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1832737 entries, 0 to 1832736
Data columns (total 21 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   building_id    int64         
 2   1              float64       
 3   2              float64       
 4   3              float64       
 5   4              float64       
 6   5              float64       
 7   6              float64       
 8   7              float64       
 9   8              float64       
 10  9              float64       
 11  10             float64       
 12  11             float64       
 13  12             float64       
 14  work_hours     bool          
 15  day type       int64         
 16  Fall           bool          
 17  Spring         bool          
 18  Summer         bool          
 19  Winter         bool          
 20  trimester_day  int64         
dtypes: bool(5), datetime64[ns](1), float64(12), int64(3)
memory usage: 232.5 MB
N

In [None]:
print(df.info(verbose = True, show_counts = True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1832737 entries, 0 to 1832736
Data columns (total 21 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   date           1832737 non-null  datetime64[ns]
 1   building_id    1832737 non-null  int64         
 2   1              1043716 non-null  float64       
 3   2              1074519 non-null  float64       
 4   3              1024387 non-null  float64       
 5   4              943762 non-null   float64       
 6   5              971948 non-null   float64       
 7   6              983870 non-null   float64       
 8   7              1043221 non-null  float64       
 9   8              440684 non-null   float64       
 10  9              1006545 non-null  float64       
 11  10             598425 non-null   float64       
 12  11             1017428 non-null  float64       
 13  12             1072212 non-null  float64       
 14  work_hours     1832737 non-null  b

In [None]:
df = pd.read_csv('preprocessed_lstm.csv')
idx = pd.date_range('01-01-2023', '12-31-2023', freq = '5min')
print(idx)

DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 00:05:00',
               '2023-01-01 00:10:00', '2023-01-01 00:15:00',
               '2023-01-01 00:20:00', '2023-01-01 00:25:00',
               '2023-01-01 00:30:00', '2023-01-01 00:35:00',
               '2023-01-01 00:40:00', '2023-01-01 00:45:00',
               ...
               '2023-12-30 23:15:00', '2023-12-30 23:20:00',
               '2023-12-30 23:25:00', '2023-12-30 23:30:00',
               '2023-12-30 23:35:00', '2023-12-30 23:40:00',
               '2023-12-30 23:45:00', '2023-12-30 23:50:00',
               '2023-12-30 23:55:00', '2023-12-31 00:00:00'],
              dtype='datetime64[ns]', length=104833, freq='5T')


In [None]:
df.set_index('date', inplace = True, drop = False)

In [None]:
df = df.reindex(idx, fill_value = np.nan)

                     Unnamed: 0                 date  building_id          1  \
date                                                                           
2023-01-01 00:00:00       89691  2023-01-01 00:00:00            2        NaN   
2023-01-01 00:10:00       89692  2023-01-01 00:10:00            2        NaN   
2023-01-01 00:15:00       89693  2023-01-01 00:15:00            2        NaN   
2023-01-01 00:20:00       89694  2023-01-01 00:20:00            2  488.30000   
2023-01-01 00:25:00       89695  2023-01-01 00:25:00            2  488.05744   
...                         ...                  ...          ...        ...   
2023-12-31 23:35:00     1832732  2023-12-31 23:35:00           41  412.00000   
2023-12-31 23:40:00     1832733  2023-12-31 23:40:00           41  415.81025   
2023-12-31 23:45:00     1832734  2023-12-31 23:45:00           41  432.00000   
2023-12-31 23:50:00     1832735  2023-12-31 23:50:00           41  432.00000   
2023-12-31 23:55:00     1832736  2023-12

In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and crass validating 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression #logistic Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from sklearn.ensemble import HistGradientBoostingRegressor

from datetime import date
from datetime import datetime

import tensorflow as tf

import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError

import itertools
from keras.layers import LSTM

import seaborn as sns 

from scipy.stats import boxcox 
from scipy.special import inv_boxcox

from termcolor import colored


In [None]:
samples = pd.read_csv('preprocessed_lstm.csv')
samples['date'] = pd.to_datetime(samples['date'])
building_encoder = pd.get_dummies(samples['building_id'])
samples = samples.join(building_encoder.add_suffix('_b'))
reading_types = pd.read_csv('reading_types.csv')
# samples.info()
samples.info()


df_lst = [(k, v) for k, v in samples.groupby('building_id')]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1832737 entries, 0 to 1832736
Data columns (total 51 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Unnamed: 0     int64         
 1   date           datetime64[ns]
 2   building_id    int64         
 3   1              float64       
 4   2              float64       
 5   3              float64       
 6   4              float64       
 7   5              float64       
 8   6              float64       
 9   7              float64       
 10  8              float64       
 11  9              float64       
 12  10             float64       
 13  11             float64       
 14  12             float64       
 15  work_hours     bool          
 16  day type       int64         
 17  Fall           bool          
 18  Spring         bool          
 19  Summer         bool          
 20  Winter         bool          
 21  trimester_day  int64         
 22  1_b            bool          
 23  2_b    

In [None]:
from matplotlib import pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
value_type_ids = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
scaler = StandardScaler()  
scaler = scaler.fit(samples[value_type_ids]) 

trainX, trainY, train_class = np.array([[[]]]).reshape(0, 10, 12), np.array([[]]).reshape(0, 12), np.array([[]]).reshape(0, 35)


for building, df in df_lst:
    print('building:', building, '-'*80)
    train_dates = pd.to_datetime(df['date'])
    df = df.set_index('date', drop = False) 
    idx = pd.date_range('01-01-2023', '12-31-2023 23:55', freq = '5min')
    df = df.reindex(idx, fill_value = 0)

    # df.info()
    df = df.drop(['day type', 'Winter', 'Spring', 'Summer', 'Fall', 'work_hours', 'trimester_day'], axis = 1)

    df['date'] = df.index
    
    df['day type'] = df['date'].dt.dayofweek.map({
        0: 1,
        1: 1,
        2: 1,
        3: 1,
        4: 1,
        5: 0, 
        6: 0
    })

        
    df['season'] = df['date'].dt.month.map({
        1: 'Winter',
        2: 'Winter',
        3: 'Spring',
        4: 'Spring',
        5: 'Spring',
        6: 'Summer',
        7: 'Summer',
        8: 'Summer',
        9: 'Fall',
        10: 'Fall',
        11: 'Fall',
        12: 'Winter'
    })

    df['work_hours'] = df['date'].dt.hour.between(8, 18)
    df['work_hours'].map({True: 1, False: 0})

    df['Winter'] = df['season'].map({
        'Winter': 1,
        'Spring': 0,
        'Summer': 0,
        'Fall': 0
    })
    df['Spring'] = df['season'].map({
        'Winter': 0,
        'Spring': 1,
        'Summer': 0,
        'Fall': 0
    })
    df['Summer'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 1,
        'Fall': 0
    })
    df['Fall'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 0,
        'Fall': 1
    })


    df = df.drop('season', axis = 1)

    multivariate = df.drop(['Unnamed: 0', 'building_id', 'date'], axis = 1)  
    multivariate = multivariate.astype('float32')
    classification_info = multivariate.drop(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'], axis = 1)

    df[str(building) + '_b'] = 1

    # classification_info.info()

    multivariate = multivariate[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']]
    multivariate = multivariate.interpolate(method = 'linear').bfill().ffill()

    # multivariate.info()


    # for typeId in value_type_ids:
    #     if(multivariate[typeId].isnull().sum() <= len(multivariate) * 0.5):
    #         print(typeId, "gogogiaojsugdiadsug")
    #         print(df.info(show_counts = True))
    #         multivariate[typeId] = multivariate[typeId].fillna(0)

    sz = len(multivariate)
    train_sz = int(sz * 0.9)
    test_sz = len(multivariate) - train_sz

    df_scaled = scaler.transform(multivariate)

    train, test = df_scaled[0:train_sz,:], df_scaled[train_sz:sz,:]
    tClass, tstClass = classification_info[0: train_sz], classification_info[train_sz:sz]

    test_dates = train_dates[train_sz:sz]

    def create_dataset(dataset, classes, look_back = 1):
        dataX, dataY = [], []
        class_X = []
        for i in range(len(dataset) - look_back - 1):
            if(np.isnan(dataset[i + look_back]).any()): 
                continue
            dataY.append(dataset[i + look_back, :])
            a = dataset[i:(i + look_back), :]
            if(np.isnan(a).any()): # for masking
                a.fill(-1)
            dataX.append(a)
            class_X.append(classes.iloc[i + look_back, :])

        return np.array(dataX), np.array(dataY), np.array(class_X)
    
    look_back = 10
    tX, tY, tClass = create_dataset(train, tClass, look_back)

    print(tX.shape, tY.shape, tClass.shape)

    # testX, testY, test_class = create_dataset(test, test_class, look_back)
    
    # reshape input to be [samples, time steps, features]
    tX, tY, tClass = np.array(tX), np.array(tY), np.array(tClass)

    print(tX.shape, tY.shape, tClass.shape, trainX.shape, trainY.shape, train_class.shape)
    trainX, trainY, train_class = np.append(trainX, tX, axis = 0), np.append(trainY, tY, axis = 0), np.append(train_class, tClass, axis = 0)
    # testX, testY, test_class = np.array(testX), np.array(testY), np.array(test_class)

    # print('trainX shape == {}.'.format(trainX.shape))
    # print('trainY shape == {}.'.format(trainY.shape)) 
    # print('train_class shape == {}.'.format(train_class.shape))
    
    # plt.figure(figsize = (18, 10))
    # plt.plot(history.history['loss'], label='Training loss')
    # plt.plot(history.history['val_loss'], label='Validation loss')
    # plt.legend()
    # plt.show()
    # plt.clf()
    # plt.cla()
    # plt.close()


    # predict_range = 1
    # train_dates = train_dates[0:train_sz]

    # n_future = pd.date_range(train_dates.iloc[look_back], periods = predict_range, freq = '5min').tolist()

    # prediction = model.predict(trainX[look_back:look_back + predict_range])

    # col = 3

    # y_pred_future = scaler.inverse_transform(prediction)[:, 10]
    # print(y_pred_future)

    # #inverse boxcox
    # # y_pred_future = inv_boxcox(y_pred_future, fitted_lambda)
    # # y_pred_future = np.vectorize(lambda x: x - 1e-6)(y_pred_future)
    # # # y_pred_future = y_pred_future[0] - 1e-6

    # forecast_dates = []
    # for time_i in n_future:
    #     forecast_dates.append(time_i.date())


    # df_forecast = pd.DataFrame({'date':np.array(forecast_dates), reading_types.at[int('11') - 1, 'reading_type_name']:y_pred_future})
    # df_forecast['date']=pd.to_datetime(df_forecast['date'])

    # original = df[['date', '11']]
    # original['date']=pd.to_datetime(original['date'])
    # original = original[look_back:look_back+predict_range]

    # print("original", '-'*80)
    # print(original.head(1))

    # print("df_forecast", '-'*80)
    # print(df_forecast.head(1))
    # print(reading_types.at[int(typeId) - 1, 'reading_type_name'], '='*100)

    # # print(trainX[-predict_range:])
    # # print(df[train_sz-predict_range:train_sz])
    # # print(df_forecast)
    # # print(original)
    # # print("original", '-'*80)
    # # print(original.head(24))

    # # print("df_forecast", '-'*80)
    # # print(df_forecast.head(24))

    # # original.set_index('date')
    # # df_forecast.set_index('date')

    # print(colored("MEAN SQUARED ERROR: ", 'red'), mean_squared_error(original[typeId], df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']]))

    # # plt.figure(figsize=(18,8))
    # # plt.plot(original[typeId],label = "original")
    # # plt.plot(df_forecast[reading_types.at[int(typeId) - 1, 'reading_type_name']],label = "predicted")
    # # plt.title("Time Series Forecast")
    # # plt.xlabel("Date")
    # # plt.ylabel(reading_types.at[int(typeId) - 1, 'reading_type_name'])
    # # plt.legend()
    # # plt.show()

    # sns.lineplot(data= original, x = 'date', y = typeId)
    # sns.lineplot(data = df_forecast, x = 'date', y =  reading_types.at[int(typeId) - 1, 'reading_type_name'])
    # plt.show()

print(trainX.shape, trainY.shape, train_class.shape)


building: 1 --------------------------------------------------------------------------------
(94597, 10, 12) (94597, 12) (94597, 35)
(94597, 10, 12) (94597, 12) (94597, 35) (0, 10, 12) (0, 12) (0, 35)
building: 2 --------------------------------------------------------------------------------
(94597, 10, 12) (94597, 12) (94597, 35)
(94597, 10, 12) (94597, 12) (94597, 35) (94597, 10, 12) (94597, 12) (94597, 35)
building: 3 --------------------------------------------------------------------------------
(94597, 10, 12) (94597, 12) (94597, 35)
(94597, 10, 12) (94597, 12) (94597, 35) (189194, 10, 12) (189194, 12) (189194, 35)
building: 6 --------------------------------------------------------------------------------
(94597, 10, 12) (94597, 12) (94597, 35)
(94597, 10, 12) (94597, 12) (94597, 35) (283791, 10, 12) (283791, 12) (283791, 35)
building: 8 --------------------------------------------------------------------------------
(94597, 10, 12) (94597, 12) (94597, 35)
(94597, 10, 12) (9459

In [None]:

value_input = keras.Input(shape = (trainX.shape[1], trainX.shape[2]), name = "values")
lstm1_value = layers.LSTM(150, return_sequences = True)(value_input)
lstm2_value = layers.LSTM(50, return_sequences = False)(lstm1_value)
# bn_value = layers.BatchNormalization()(lstm2_value)
# output_value = layers.Dense(1, activation = 'linear')(bn_value)

classification_input = keras.Input(shape = (train_class.shape[1],), name = "class")

x = layers.concatenate([lstm2_value, classification_input])
dense_layer = layers.Dense(30, activation = 'relu')(x)
# batch_norm = layers.BatchNormalization()(x)
val_pred = layers.Dense(12, activation = 'linear', name = "output")(dense_layer)

model = keras.Model(inputs = [value_input, classification_input], outputs = val_pred)


early_stop = EarlyStopping(monitor = 'val_loss', patience = 5)

model.compile(optimizer= tf.keras.optimizers.Adam(), loss = {"output": tf.keras.losses.Huber() } )

history = model.fit({
    "values": trainX, 
    "class": train_class
}, {
    "output": trainY
}, batch_size = 64, validation_split = 0.2, epochs = 50, verbose = 1, callbacks = [early_stop])

Epoch 1/50

In [None]:
model.save('model2.keras')

In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit #for data preprocessing and crass validating 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing

from datetime import date
from datetime import datetime
from datetime import timedelta

import tensorflow as tf
import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError
from keras.layers import LSTM

import itertools

from sklearn.discriminant_analysis import StandardScaler


import seaborn as sns 

from scipy.stats import boxcox 
from scipy.special import inv_boxcox

from termcolor import colored

import joblib
from tqdm.notebook import tqdm 

import swifter
from pandarallel import pandarallel

In [None]:
tf.__version__
model = tf.keras.models.load_model('model2.keras')

In [None]:
samples = pd.read_csv('preprocessed_lstm.csv')
building_id = pd.read_csv('devices.csv')
test = pd.read_csv('test.csv', header = None)
reading_types = pd.read_csv('reading_types.csv')

samples['date'] = pd.to_datetime(samples['date'])

predictions = test
value_type_ids = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

train = pd.DataFrame()

In [None]:
for building, df in samples.groupby('building_id'): 
    df = df.set_index('date', drop = False) 
    idx = pd.date_range('01-01-2023', '12-31-2023 23:55', freq = '5min')
    df = df.reindex(idx, fill_value = building)
    df.index = pd.to_datetime(df.index)
    df['date'] = df.index

    df = df.drop(['day type', 'Winter', 'Spring', 'Summer', 'Fall', 'work_hours', 'trimester_day'], axis = 1)
    
    df['day type'] = df['date'].dt.dayofweek.map({
        0: 1,
        1: 1,
        2: 1,
        3: 1,
        4: 1,
        5: 0, 
        6: 0
    })

        
    df['season'] = df['date'].dt.month.map({
        1: 'Winter',
        2: 'Winter',
        3: 'Spring',
        4: 'Spring',
        5: 'Spring',
        6: 'Summer',
        7: 'Summer',
        8: 'Summer',
        9: 'Fall',
        10: 'Fall',
        11: 'Fall',
        12: 'Winter'
    })

    df['work_hours'] = df['date'].dt.hour.between(8, 18)
    df['work_hours'].map({True: 1, False: 0})

    df['Winter'] = df['season'].map({
        'Winter': 1,
        'Spring': 0,
        'Summer': 0,
        'Fall': 0
    })
    df['Spring'] = df['season'].map({
        'Winter': 0,
        'Spring': 1,
        'Summer': 0,
        'Fall': 0
    })
    df['Summer'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 1,
        'Fall': 0
    })
    df['Fall'] = df['season'].map({
        'Winter': 0,
        'Spring': 0,
        'Summer': 0,
        'Fall': 1
    })


    df = df.drop('season', axis = 1)

    df[value_type_ids] = df[value_type_ids].interpolate(method = 'linear').bfill().ffill()

    df.info()

    train = pd.concat([train, df])

train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 105120 entries, 2023-01-01 00:00:00 to 2023-12-31 23:55:00
Freq: 5T
Data columns (total 21 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Unnamed: 0   105120 non-null  int64         
 1   date         105120 non-null  datetime64[ns]
 2   building_id  105120 non-null  int64         
 3   1            105120 non-null  float64       
 4   2            105120 non-null  float64       
 5   3            105120 non-null  float64       
 6   4            105120 non-null  float64       
 7   5            105120 non-null  float64       
 8   6            105120 non-null  float64       
 9   7            105120 non-null  float64       
 10  8            105120 non-null  float64       
 11  9            105120 non-null  float64       
 12  10           105120 non-null  float64       
 13  11           105120 non-null  float64       
 14  12           105120 non-null  float64    

In [None]:
scaler = StandardScaler()  
scaler = scaler.fit(samples[value_type_ids]) 

train[value_type_ids] = scaler.transform(train[value_type_ids])


In [None]:
predictions.columns = ['device_id', 'date', 'value_type_id']
predictions = pd.merge(predictions, building_id, on='device_id', how='inner')
predictions['date'] =  pd.to_datetime(predictions['date'])
predictions['floored_date'] = predictions['date'].dt.floor('5min')

In [None]:
building_encoder = pd.get_dummies(predictions['building_id'])
predictions = predictions.join(building_encoder.add_suffix('_b'))

predictions['day type'] = predictions['date'].dt.dayofweek.map({
    0: 1,
    1: 1,
    2: 1,
    3: 1,
    4: 1,
    5: 0, 
    6: 0
})

predictions['work_hours'] = predictions['date'].dt.hour.between(8, 18)
predictions['work_hours'].map({True: 1, False: 0})

predictions['season'] = predictions['date'].dt.month.map({
    1: 'Winter',
    2: 'Winter',
    3: 'Spring',
    4: 'Spring',
    5: 'Spring',
    6: 'Summer',
    7: 'Summer',
    8: 'Summer',
    9: 'Fall',
    10: 'Fall',
    11: 'Fall',
    12: 'Winter'
})

season_encoder = pd.get_dummies(predictions['season'])
predictions = predictions.join(season_encoder)
predictions = predictions.drop('season', axis = 1)


In [None]:
predictions.info() 
progressBar = tqdm(total = len(predictions) - 7e6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7856720 entries, 0 to 7856719
Data columns (total 40 columns):
 #   Column         Dtype         
---  ------         -----         
 0   device_id      int64         
 1   date           datetime64[ns]
 2   value_type_id  int64         
 3   building_id    int64         
 4   floored_date   datetime64[ns]
 5   1_b            bool          
 6   2_b            bool          
 7   3_b            bool          
 8   6_b            bool          
 9   8_b            bool          
 10  10_b           bool          
 11  11_b           bool          
 12  12_b           bool          
 13  13_b           bool          
 14  16_b           bool          
 15  17_b           bool          
 16  18_b           bool          
 17  19_b           bool          
 18  20_b           bool          
 19  21_b           bool          
 20  23_b           bool          
 21  24_b           bool          
 22  25_b           bool          
 23  26_b   

  0%|          | 0/856720.0 [00:00<?, ?it/s]

In [None]:
def get_predictions(row):
    type = row['value_type_id']
    date = row['floored_date'] 
    building = row['building_id']

    classes = row.drop(['device_id', 'date',  'value_type_id', 'floored_date', 'building_id'])

    start_date = date - timedelta(minutes=50)
    end_date = date - timedelta(minutes=5)
    inputs = train[(train['building_id'] == building)]
    inputs = inputs.loc[(inputs['date'] >= start_date)].head(10)

    inputs = inputs[value_type_ids]

    inputs = inputs.astype('float32')
    classes = classes.astype('float32')

    # inputs = scaler.transform(inputs)
    # print(inputs)
    # print(classes)

    dataX = []
    class_X = []

    dataX.append(inputs)  # Use iloc for DataFrame slicing
    class_X.append(classes) 

    dataX = np.array(dataX) 
    class_X = np.array(class_X)


    prediction = model.predict({
        "values": dataX,
        "class": class_X
    }, verbose = 0)


    pred = scaler.inverse_transform(prediction)
    
    progressBar.update(1)

    return pred[0][type - 1] #prob have to grab the value or smt





In [None]:
start, end  = int(6.9e6), int(7e6)
predictions = predictions.iloc[start:end]
progressBar = tqdm(total = len(predictions))
predictions['value'] = predictions.swifter.apply(get_predictions, axis=1)

  0%|          | 0/100000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/100000 [00:00<?, ?it/s]

In [None]:
predictions = predictions.drop(['floored_date', 'building_id'], axis = 1)
predictions.to_csv('thomaspc_1.csv', header = False)