In [79]:
# import
%load_ext autoreload
%autoreload 2

from time import time
from statistics import mean
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format

from IPython.display import display

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Neural Networks
import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader

#import pytorch_lightning as pl
#from pytorch_lightning import Trainer, seed_everything
#from pytorch_lightning.loggers import CSVLogger

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

#path of data
path = 'energydata_complete.csv'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
class TimeseriesDataset(Dataset):   
    '''
    Custom Dataset subclass. 
    Serves as input to DataLoader to transform X 
      into sequence data using rolling window. 
    DataLoader using this dataset will output batches 
      of `(batch_size, seq_len, n_features)` shape.
    Suitable as an input to RNNs. 
    '''
    def __init__(self, X: np.ndarray, y: np.ndarray, seq_len: int = 1):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)

    def __getitem__(self, index):
        return (self.X[index:index+self.seq_len], self.y[index+self.seq_len-1])

In [81]:
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

In [82]:
#prepare data
df=pd.read_csv(path,parse_dates = ['date']).dropna()
df = df.drop(['date'],axis = 1) # 27 features, 1 target
print(df.head(3))
target = 'Appliances'
#print(df.shape)
y_df = df[target]
x_df = df.loc[:,df.columns != target]
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, shuffle=False) #  test is 20%

#convert all column to float
x_train_float = x_train.astype(float)
y_train_float = y_train.astype(float)
x_test_float = x_test.astype(float)
y_test_float = y_test.astype(float)
#print(x_train_float.dtypes)

#normalized
scaler = MinMaxScaler(feature_range=(0, 1))
x_train_normalized = scaler.fit_transform(x_train_float.to_numpy())
y_train_normalized = scaler.fit_transform(y_train_float.to_numpy().reshape(-1, 1))
print(len(x_train_normalized))
print(len(y_train_normalized))
#convert to tensors
batch_size = 700
seq_len=4 #24
train_dataset = TimeseriesDataset(x_train_normalized, y_train_normalized, seq_len)
train_loader = DataLoader(train_dataset, batch_size, shuffle = False)



   Appliances  lights       T1     RH_1       T2     RH_2       T3     RH_3  \
0          60      30 19.89000 47.59667 19.20000 44.79000 19.79000 44.73000   
1          60      30 19.89000 46.69333 19.20000 44.72250 19.79000 44.79000   
2          50      30 19.89000 46.30000 19.20000 44.62667 19.79000 44.93333   

        T4     RH_4  ...       T9     RH_9   T_out  Press_mm_hg   RH_out  \
0 19.00000 45.56667  ... 17.03333 45.53000 6.60000    733.50000 92.00000   
1 19.00000 45.99250  ... 17.06667 45.56000 6.48333    733.60000 92.00000   
2 18.92667 45.89000  ... 17.00000 45.50000 6.36667    733.70000 92.00000   

   Windspeed  Visibility  Tdewpoint      rv1      rv2  
0    7.00000    63.00000    5.30000 13.27543 13.27543  
1    6.66667    59.16667    5.20000 18.60619 18.60619  
2    6.33333    55.33333    5.10000 28.64267 28.64267  

[3 rows x 28 columns]
15788
15788


In [83]:
#set model

model = LSTM(input_size = 27*4)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)


LSTM(
  (lstm): LSTM(108, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)


In [84]:
#train model
epochs = 15
total_loss = []
for i in range(epochs):
    epoch_loss=[]
    for idx, (x,y) in enumerate (train_loader):
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),torch.zeros(1, 1, model.hidden_layer_size))

        y_pred = model(x)

        single_loss = loss_function(y_pred, y)
        epoch_loss.append(single_loss.item())
        single_loss.backward()
        optimizer.step()
        #epoch_loss.append(single_loss.item())
    print(f'epoch: {i:3} loss: {mean(epoch_loss):10.8f}') #10digits,8 digits after decimal
    #print(f'current total loss: {total_loss:10.10f}')
    total_loss.append(mean(epoch_loss))
print(f'Total average loss: {mean(total_loss):10.10f}')
model.eval()

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch:   0 loss: 0.01615768
epoch:   1 loss: 0.01341132
epoch:   2 loss: 0.01061046
epoch:   3 loss: 0.00987257
epoch:   4 loss: 0.00970546
epoch:   5 loss: 0.00966647
epoch:   6 loss: 0.00963747
epoch:   7 loss: 0.00963704
epoch:   8 loss: 0.00965812
epoch:   9 loss: 0.00969316
epoch:  10 loss: 0.00974318
epoch:  11 loss: 0.00981009
epoch:  12 loss: 0.00989933
epoch:  13 loss: 0.01002339
epoch:  14 loss: 0.01021134
Total average loss: 0.0105158050


LSTM(
  (lstm): LSTM(108, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [85]:
#prepare test data
#path_test = 'energydata_complete_miss_all.csv'
path_test = 'energydata_complete.csv'
#df=pd.read_csv(path_test,parse_dates = ['date']).dropna()
df=pd.read_csv(path_test).dropna()
df = df.iloc[: , 1:]
#df = df.drop(['date'],axis = 1) # 27 features, 1 target
print(df.head(3))
target = 'Appliances'
#print(df.shape)
y_df = df[target]
x_df = df.loc[:,df.columns != target]
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, shuffle=False) #  test is 20%

#convert all column to float
#x_train_float = x_train.astype(float)
#y_train_float = y_train.astype(float)
x_test_float = x_test.astype(float)
y_test_float = y_test.astype(float)
#print(x_train_float.dtypes)

#normalized
scaler = MinMaxScaler(feature_range=(0, 1))
#x_train_normalized = scaler.fit_transform(x_train_float.to_numpy())
#y_train_normalized = scaler.fit_transform(y_train_float.to_numpy().reshape(-1, 1))
x_test_normalized = scaler.fit_transform(x_test_float.to_numpy())
y_test_normalized = scaler.fit_transform(y_test_float.to_numpy().reshape(-1, 1))
#print(len(x_train_normalized))
#print(len(y_train_normalized))
#convert to tensors
batch_size = 700
seq_len=4 #24
test_dataset = TimeseriesDataset(x_test_normalized, y_test_normalized, seq_len)
test_loader = DataLoader(test_dataset, batch_size, shuffle = False)
test_loss= []
with torch.no_grad():
    #for data, _ in test_loader:
    for batch_idx, (x,y) in enumerate(test_loader):
        y_pred = model(x)
        single_loss = loss_function(y_pred, y)
        test_loss.append(single_loss)
        print('batch id'+str(batch_idx)+',Test set loss: {:.8f}'.format(single_loss))
test_mean = torch.mean(torch.stack(test_loss))
print('====> Test set loss: {:.8f}'.format(test_mean))    


   Appliances  lights       T1     RH_1       T2     RH_2       T3     RH_3  \
0          60      30 19.89000 47.59667 19.20000 44.79000 19.79000 44.73000   
1          60      30 19.89000 46.69333 19.20000 44.72250 19.79000 44.79000   
2          50      30 19.89000 46.30000 19.20000 44.62667 19.79000 44.93333   

        T4     RH_4  ...       T9     RH_9   T_out  Press_mm_hg   RH_out  \
0 19.00000 45.56667  ... 17.03333 45.53000 6.60000    733.50000 92.00000   
1 19.00000 45.99250  ... 17.06667 45.56000 6.48333    733.60000 92.00000   
2 18.92667 45.89000  ... 17.00000 45.50000 6.36667    733.70000 92.00000   

   Windspeed  Visibility  Tdewpoint      rv1      rv2  
0    7.00000    63.00000    5.30000 13.27543 13.27543  
1    6.66667    59.16667    5.20000 18.60619 18.60619  
2    6.33333    55.33333    5.10000 28.64267 28.64267  

[3 rows x 28 columns]
batch id0,Test set loss: 0.01288379
batch id1,Test set loss: 0.00886956
batch id2,Test set loss: 0.01180605
batch id3,Test set loss

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
#test model
