# Timeseries pytorch-lightning

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plot
import math
from matplotlib import rc
from pylab import rcParams

import pandas as pd
import numpy as numpy
import pytorch_lightning as pl
from tqdm.notebook import tqdm
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [2]:
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter contrib nbextension install

In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retine'
# !jupyter nbextension enable --py widgetsnbextension

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ['#01BEFE', '#FFDD00', '#FF7D00', '#FF006D', '#ADFF02', '#8F00FF']
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
tqdm.pandas()
pl.seed_everything(42)

ERROR:root:supported formats are: 'jpg','png2x','retina','svg','pdf','jpeg','png' not 'retine'
Seed set to 42


42

## __Import dataset__

In [4]:
df = pd.read_csv('../datasets/household_power_consumption_v3_drop.txt', parse_dates={'datetime' : ['date','time']}, index_col='datetime', sep=';', usecols=[1,2,3,4,5,6,7,8,9,11,12,13])
df.head(5)

Unnamed: 0_level_0,global_active_power,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,16,12,2006
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,16,12,2006
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,16,12,2006
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,16,12,2006
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,16,12,2006


### Preprocessing

In [None]:
rows = []

for _, row in df.iterrows():
    row_data = dict(
        grp=row.global_reactive_power,
        voltage=row.voltage,
        global_intensity=row.global_intensity,
        kitchen=row.kitchen,
        laundry_room=row.laundry_room,
        thermal_utilities=row.thermal_utilities,
        day=row.day,
        month=row.month,
        year=row.year
    )
    rows.append(row_data)
features_df = pd.DataFrame(rows)

In [8]:
cols = ['global_reactive_power', 'voltage', 'global_intensity', 'kitchen', 'laundry_room', 'thermal_utilities', 'day', 'month', 'year', 'global_active_power']
features_df = df[cols]
features_df

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,0.418,234.84,18.4,0.0,1.0,17.0,16,12,2006,4.216
2006-12-16 17:25:00,0.436,233.63,23.0,0.0,1.0,16.0,16,12,2006,5.360
2006-12-16 17:26:00,0.498,233.29,23.0,0.0,2.0,17.0,16,12,2006,5.374
2006-12-16 17:27:00,0.502,233.74,23.0,0.0,1.0,17.0,16,12,2006,5.388
2006-12-16 17:28:00,0.528,235.68,15.8,0.0,1.0,17.0,16,12,2006,3.666
...,...,...,...,...,...,...,...,...,...,...
2010-11-26 20:58:00,0.000,240.43,4.0,0.0,0.0,0.0,26,11,2010,0.946
2010-11-26 20:59:00,0.000,240.00,4.0,0.0,0.0,0.0,26,11,2010,0.944
2010-11-26 21:00:00,0.000,239.82,3.8,0.0,0.0,0.0,26,11,2010,0.938
2010-11-26 21:01:00,0.000,239.70,3.8,0.0,0.0,0.0,26,11,2010,0.934


In [11]:
train_size = int(len(features_df) * .9)
train_size

1844352

In [13]:
train_df, test_df = features_df[:train_size], features_df[train_size + 1:]
train_df.shape, test_df.shape

((1844352, 10), (204927, 10))

In [14]:
# scalling data only using train df
scaler = MinMaxScaler(feature_range=(-1,1))
scaler = scaler.fit(train_df)

In [17]:
train_df = pd.DataFrame(
    scaler.transform(train_df), 
    index=train_df.index, 
    columns=train_df.columns)
train_df.head(3)

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-16 17:24:00,-1.573469,-15.439277,-1.018457,-1.022727,-1.024375,-0.993757,-1.066667,-1.0,-1004.5,-1.0591
2006-12-16 17:25:00,-1.536204,-15.44433,-1.010537,-1.022727,-1.024375,-0.997919,-1.066667,-1.0,-1004.5,-1.021596
2006-12-16 17:26:00,-1.407846,-15.44575,-1.010537,-1.022727,-1.02375,-0.993757,-1.066667,-1.0,-1004.5,-1.021137


In [18]:
test_df = pd.DataFrame(
    scaler.transform(test_df), 
    index=test_df.index, 
    columns=test_df.columns)
test_df.head(3)

Unnamed: 0_level_0,global_reactive_power,voltage,global_intensity,kitchen,laundry_room,thermal_utilities,day,month,year,global_active_power
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-06-28 21:50:00,-0.910791,0.079806,-0.834025,-1.0,-1.0,-0.935484,0.8,-0.090909,1.0,-0.833424
2010-06-28 21:51:00,-0.896403,0.055897,-0.850622,-1.0,-1.0,-0.935484,0.8,-0.090909,1.0,-0.845012
2010-06-28 21:52:00,-0.905036,0.042973,-0.834025,-1.0,-1.0,-1.0,0.8,-0.090909,1.0,-0.833424


__To sequences__

In [28]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length, offset = 1):
    sequences = []
    data_size = len(input_data)
    
    for i in range(data_size - sequence_length):
        sequence = input_data[i:i+sequence_length]
        
        label_poisition = i+sequence_length
        label = input_data.iloc[label_poisition:label_poisition+offset][target_column]
        
        sequences.append((sequence, label))
        
    return sequences

In [29]:
input_data = train_df[:100]

In [49]:
train_seq = create_sequences(input_data, 'global_active_power', 20, 1)

In [50]:
print(f'Train sequence shape (features): {train_seq[0][0].shape}; (labels): {train_seq[0][1].shape}')

Train sequence shape (features): (20, 10); (labels): (1,)


In [55]:
num_seq = 60 # 60 minutes
offset = 1
train_sequences = create_sequences(train_df, 'global_active_power', num_seq, offset)
test_sequences = create_sequences(test_df, 'global_active_power', num_seq, offset)


### Create pytorch dataset (time-series)

https://www.youtube.com/watch?v=ODEGJ_kh2aA