# Timeseries pytorch-lightning

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plot
import math
from matplotlib import rc
from pylab import rcParams

import pandas as pd
import numpy as numpy
import pytorch_lightning as pl
from tqdm.notebook import tqdm
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [2]:
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter contrib nbextension install

In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retine'
# !jupyter nbextension enable --py widgetsnbextension

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ['#01BEFE', '#FFDD00', '#FF7D00', '#FF006D', '#ADFF02', '#8F00FF']
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
tqdm.pandas()
pl.seed_everything(42)

ERROR:root:supported formats are: 'png','png2x','pdf','retina','jpg','jpeg','svg' not 'retine'
Seed set to 42


42

## __Import dataset__

In [4]:
df = pd.read_csv('../datasets/household_power_consumption_v3_drop.txt', parse_dates={'datetime' : ['date','time']}, index_col='datetime', sep=';', usecols=[1,2,3,4,5,6,7,8,9,11,12,13])
df.head(5)

Unnamed: 0_level_0,global_active_power,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,16,12,2006
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,16,12,2006
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,16,12,2006
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,16,12,2006
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,16,12,2006


### Preprocessing

In [5]:
# rows = []

# for _, row in df.iterrows():
#     row_data = dict(
#         grp=row.global_reactive_power,
#         voltage=row.voltage,
#         global_intensity=row.global_intensity,
#         kitchen=row.kitchen,
#         laundry_room=row.laundry_room,
#         thermal_utilities=row.thermal_utilities,
#         day=row.day,
#         month=row.month,
#         year=row.year
#     )
#     rows.append(row_data)
# features_df = pd.DataFrame(rows)

In [6]:
cols = ['global_reactive_power', 'voltage', 'global_intensity', 'kitchen', 'laundry_room', 'thermal_utilities', 'day', 'month', 'year', 'global_active_power']
features_df = df[cols]
features_df

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,0.418,234.84,18.4,0.0,1.0,17.0,16,12,2006,4.216
2006-12-16 17:25:00,0.436,233.63,23.0,0.0,1.0,16.0,16,12,2006,5.360
2006-12-16 17:26:00,0.498,233.29,23.0,0.0,2.0,17.0,16,12,2006,5.374
2006-12-16 17:27:00,0.502,233.74,23.0,0.0,1.0,17.0,16,12,2006,5.388
2006-12-16 17:28:00,0.528,235.68,15.8,0.0,1.0,17.0,16,12,2006,3.666
...,...,...,...,...,...,...,...,...,...,...
2010-11-26 20:58:00,0.000,240.43,4.0,0.0,0.0,0.0,26,11,2010,0.946
2010-11-26 20:59:00,0.000,240.00,4.0,0.0,0.0,0.0,26,11,2010,0.944
2010-11-26 21:00:00,0.000,239.82,3.8,0.0,0.0,0.0,26,11,2010,0.938
2010-11-26 21:01:00,0.000,239.70,3.8,0.0,0.0,0.0,26,11,2010,0.934


In [93]:
cut_df = features_df[:500000]
train_size = int(len(cut_df) * .9)
train_size

450000

In [94]:
train_df, test_df = cut_df[:train_size], cut_df[train_size + 1:]
train_df.shape, test_df.shape

((450000, 10), (49999, 10))

In [95]:
# scalling data only using train df
scaler = MinMaxScaler(feature_range=(-1,1))
scaler = scaler.fit(train_df)

In [96]:
train_df = pd.DataFrame(
    scaler.transform(train_df), 
    index=train_df.index, 
    columns=train_df.columns)
train_df.head(3)

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,-0.271777,-0.195321,-0.217391,-1.0,-0.974359,0.7,0.0,1.0,-1.0,-0.219116
2006-12-16 17:25:00,-0.240418,-0.281106,-0.017391,-1.0,-0.974359,0.6,0.0,1.0,-1.0,-0.003022
2006-12-16 17:26:00,-0.132404,-0.305211,-0.017391,-1.0,-0.948718,0.7,0.0,1.0,-1.0,-0.000378


In [97]:
test_df = pd.DataFrame(
    scaler.transform(test_df), 
    index=test_df.index, 
    columns=test_df.columns)
test_df.head(3)

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2007-10-27 22:57:00,-1.0,0.453385,-0.982609,-1.0,-1.0,-1.0,0.733333,0.636364,1.0,-0.976199
2007-10-27 22:58:00,-0.850174,0.450549,-0.956522,-1.0,-1.0,-1.0,0.733333,0.636364,1.0,-0.958444
2007-10-27 22:59:00,-0.850174,0.441333,-0.965217,-1.0,-1.0,-1.0,0.733333,0.636364,1.0,-0.961844


__To sequences__

In [98]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length, offset = 1):
    sequences = []
    data_size = len(input_data)
    
    for i in range(data_size - sequence_length):
        sequence = input_data[i:i+sequence_length]
        
        label_poisition = i+sequence_length
        label = input_data.iloc[label_poisition:label_poisition+offset][target_column]
        
        sequences.append((sequence, label))
        
    return sequences

In [99]:
input_data = train_df[:100]

train_df[:500000]

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,-0.271777,-0.195321,-0.217391,-1.0,-0.974359,0.7,0.000000,1.000000,-1.0,-0.219116
2006-12-16 17:25:00,-0.240418,-0.281106,-0.017391,-1.0,-0.974359,0.6,0.000000,1.000000,-1.0,-0.003022
2006-12-16 17:26:00,-0.132404,-0.305211,-0.017391,-1.0,-0.948718,0.7,0.000000,1.000000,-1.0,-0.000378
2006-12-16 17:27:00,-0.125436,-0.273307,-0.017391,-1.0,-0.974359,0.7,0.000000,1.000000,-1.0,0.002267
2006-12-16 17:28:00,-0.080139,-0.135767,-0.330435,-1.0,-0.974359,0.7,0.000000,1.000000,-1.0,-0.323007
...,...,...,...,...,...,...,...,...,...,...
2007-10-27 22:51:00,-1.000000,0.431407,-0.982609,-1.0,-1.000000,-1.0,0.733333,0.636364,1.0,-0.976199
2007-10-27 22:52:00,-1.000000,0.463311,-0.982609,-1.0,-1.000000,-1.0,0.733333,0.636364,1.0,-0.976199
2007-10-27 22:53:00,-1.000000,0.473945,-0.982609,-1.0,-1.000000,-1.0,0.733333,0.636364,1.0,-0.976199
2007-10-27 22:54:00,-1.000000,0.461184,-0.982609,-1.0,-1.000000,-1.0,0.733333,0.636364,1.0,-0.976199


In [100]:
train_seq = create_sequences(input_data, 'global_active_power', 20, 1)

In [101]:
print(f'Train sequence shape (features): {train_seq[0][0].shape}; (labels): {train_seq[0][1].shape}')

Train sequence shape (features): (20, 10); (labels): (1,)


In [102]:
num_seq = 60 # 60 minutes
offset = 1
train_sequences = create_sequences(train_df, 'global_active_power', num_seq, offset)
test_sequences = create_sequences(test_df, 'global_active_power', num_seq, offset)


### Create pytorch dataset (time-series)

https://www.youtube.com/watch?v=ODEGJ_kh2aA

In [103]:
class PCDataset(Dataset):
    def __init__(self, sequences):
        # super().__init__()
        self.sequences = sequences
    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, index):
        sequence, label = self.sequences[index]
        return dict(
            sequence = torch.Tensor(sequence.to_numpy()),
            label = torch.Tensor(label).float()    
        )
        

In [145]:
class PCDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size=8):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size
        print('batch size', self.batch_size)
    def setup(self):
        self.train_dataset = PCDataset(self.train_sequences)
        self.test_dataset = PCDataset(self.test_sequences)
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            # num_workers=2
        )
        
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            # num_workers=2
        )
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            # num_workers=2
        )

In [146]:
# d = PCDataModule(train_sequences, test_sequences)
dl = DataLoader(
    PCDataset(train_sequences),
    batch_size=8,
    shuffle=False
)

for d in dl:
    print(d['sequence'].shape)
    break

torch.Size([8, 60, 10])


In [147]:
num_epoch = 5
batch_size = 64

data_module = PCDataModule(train_sequences, test_sequences, batch_size)
data_module.setup()

batch size 64


In [148]:
train_dataset = PCDataset(train_sequences)

In [149]:
len(train_sequences)

449940

In [153]:
# train_dataset.__len__()

for item in train_dataset:
    print(item['sequence'].shape)
    break
    print(item['sequence'].shape)
    print(item['label'].shape)
    print(item['label'])
    break

torch.Size([60, 10])


### Model

In [154]:
class Model(nn.Module):
    def __init__(self, n_features, n_hidden=128, n_layers=2) -> None:
        super().__init__()
        self.n_hidden = n_hidden
        self.lstm = nn.LSTM(
            input_size=n_features, #input feature
            hidden_size=n_hidden,
            batch_first=True,
            num_layers=n_layers,
            dropout=0.2
        )
        
        self.regressor = nn.Linear(n_hidden, 1) # final layer for prediction
        
    def forward(self, x):
        self.lstm.flatten_parameters()
        
        _, (hidden, _) = self.lstm(x) # take the hidden state
        out = hidden[-1] # take the last layer, which contains the features
        
        return self.regressor(out)

In [155]:
from typing import Any


from pytorch_lightning.utilities.types import STEP_OUTPUT, OptimizerLRScheduler


class PCPredictor(pl.LightningModule):
    def __init__(self, n_features: int):
        super().__init__()
        self.model = Model(n_features)
        self.criterion = nn.MSELoss() #loss function
    
    def forward(self,x,labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))
        return loss, output
    
    def training_step(self, batch, batch_index):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_index):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_index):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):# -> OptimizerLRScheduler:
        return optim.Adam(self.parameters(), lr=0.0001)

In [156]:
train_df.shape

(450000, 10)

In [157]:
model = PCPredictor(train_df.shape[1])

In [158]:
data_module.train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x24f8b896fa0>

In [159]:
for item in data_module.train_dataloader():
    # print(item.keys())
    # break
    print(item['sequence'].shape)
    print(item['label'].shape)
    print(item['label'])
    break

torch.Size([64, 60, 10])
torch.Size([64, 1])
tensor([[-0.3634],
        [-0.0956],
        [-0.0960],
        [-0.0963],
        [-0.4156],
        [-0.4639],
        [-0.4620],
        [-0.4654],
        [-0.5229],
        [-0.5028],
        [-0.3472],
        [ 0.1315],
        [-0.1587],
        [-0.1828],
        [-0.4654],
        [-0.5761],
        [-0.5878],
        [-0.5867],
        [-0.5890],
        [-0.6022],
        [-0.4530],
        [-0.2221],
        [-0.2214],
        [-0.2187],
        [-0.4892],
        [-0.5357],
        [-0.5440],
        [-0.5742],
        [-0.5769],
        [-0.5531],
        [-0.2036],
        [-0.2165],
        [-0.2165],
        [-0.2743],
        [-0.2187],
        [-0.2176],
        [-0.2467],
        [-0.3332],
        [-0.3623],
        [-0.3668],
        [-0.3797],
        [-0.3725],
        [-0.3767],
        [-0.3325],
        [-0.3589],
        [-0.3744],
        [-0.3733],
        [-0.3706],
        [-0.3672],
        [-0.3699],
     