# Objective

* Dataset from: https://www.kaggle.com/uciml/electric-power-consumption-data-set
* Use PyTorchLightning to train and predict a model for 'Global_active_power'
    * Create a Datset for the time series
    * Create a Data Module
    * Create a model
* Use the information from the notebook 'DataAnalysis'
* The notebook has been inspired by: https://www.kaggle.com/tartakovsky/pytorch-lightning-lstm-timeseries-clean-code

# Setup

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything

# Create a Dataset

In [35]:
class PowConDataSet(Dataset):
    def __init__(self, X, y, seq_len=1):
        
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len 
        
    def __getitem__(self, idx):
        return (self.X[idx:idx+self.seq_len-1], self.y[idx+self.seq_len-1])
        
    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)       

# Create a Data Module

In [40]:
class PowConDataModule(pl.LightningDataModule):
    
    def __init__(self, seq_len = 1, batch_size = 128, num_workers=8):
        super().__init__()
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
    
    def setup(self, stage):
        '''
        * read Data
        * 'Date' and 'Time' columns are merged into 'date' index
        * convert all to float and delete nans
        * resampled to hourly intervals
        * define X (features) and y (lables)
        '''
        # read data
        filepath = 'data/household_power_consumption.txt'
        df_powcon = pd.read_csv(filepath, sep=';',
                        parse_dates={'date':['Date','Time']},
                        infer_datetime_format=True,
                        index_col='date')
        
        # change types to float (and all no number values to nan)
        for i in range(len(df_powcon.columns)):
            df_powcon.iloc[:,i] = pd.to_numeric(df_powcon.iloc[:,i], errors='coerce')
        
        # resamble to hourly means
        df_powcon = df_powcon.resample('h').mean()
        
        df_powcon.dropna(inplace=True)
        df_powcon = df_powcon.astype(float)
        
        # define features (X) and labels (y)
        y = df_powcon['Global_active_power'].values

        columns = ['Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
        X = np.zeros((len(columns), len(y)))
        X = df_powcon[columns].values
        
        # train - valid - test splits
        X_tmp, self.X_test, y_tmp, self.y_test = train_test_split(X, y, shuffle=False, test_size=.2)
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(X_tmp, y_tmp, shuffle=False, test_size=.25)
        
        # normalize each column
        scaler = StandardScaler()
        scaler.fit(self.X_train)
            
        self.X_train = scaler.transform(self.X_train)
        self.X_val = scaler.transform(self.X_val)
        self.X_test = scaler.transform(self.X_test)
        self.y_train = self.y_train.reshape(-1,1)
        self.y_val = self.y_val.reshape(-1,1)
        self.y_test = self.y_test.reshape(-1,1)
        
    def train_dataloader(self):
        '''
        * no further transformation necessary
        * wrap dataset in dataloader
        '''
        # create dataset
        train_dataset = PowConDataSet(self.X_train, self.y_train, seq_len=self.seq_len)
        
        # wrap dataset in dataloader
        train_dataloader = DataLoader(train_dataset, batch_size = self.batch_size, shuffle = False, 
                                      num_workers = self.num_workers)
        
        return train_dataloader
    
    def val_dataloader(self):
        # create dataset
        val_dataset = PowConDataSet(self.X_val, self.y_val, seq_len=self.seq_len)
        
        # wrap dataset in dataloader
        val_dataloader = DataLoader(val_dataset, batch_size = self.batch_size, shuffle = False,
                                   num_workers = self.num_workers)
        
        return val_dataloader
    
    def test_dataloader(self):
        # create dataset
        test_dataset = PowConDataSet(self.X_test, self.y_test, seq_len=self.seq_len)
        
        # wrap dataset in dataloader
        val_dataloader = DataLoader(test_dataset, batch_size = self.batch_size, shuffle = False,
                                   num_workers = self.num_workers)
        
        return test_dataloader

# Create a Model

In [41]:
class PowConModel(pl.LightningModule):
    
    def __init__(self, n_features, hidden_size, seq_len,
                 batch_size, num_layers, dropout,
                 learning_rate, criterion):
        super().__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.criterion = criterion
        
        self.lstm = nn.LSTM(input_size=n_features,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            dropout=dropout,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        lstm_out, _ = self.lstm(x) # lstm_out = (batch_size, seq_len, hidden_size)
        x = self.fc(lstm_out[:,-1])
        return x
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    
    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        #result = pl.TrainResult(minimize=loss)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        #result = pl.EvalResult(checkpoint_on=loss)
        self.log('val_loss', loss)
        return loss
    
    def test_step(self, test_batch, batch_idx):
        x, y = test_batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat,y)
        #result = pl.EvalResult()
        self.log('test_loss', loss)
        return loss  

# Set Parameters

In [42]:
p = dict(
    seq_len = 24,
    batch_size = 128, 
    criterion = nn.MSELoss(),
    max_epochs = 1,
    n_features = 6,
    hidden_size = 10,
    num_layers = 1,
    dropout = 0.2,
    learning_rate = 0.01,
)

# Train 

In [43]:
data_module = PowConDataModule(seq_len = p['seq_len'],
                           batch_size = p['batch_size'])

model = PowConModel(n_features = p['n_features'],
                    hidden_size = p['hidden_size'],
                    seq_len = p['seq_len'],
                    batch_size = p['batch_size'],
                    criterion = p['criterion'],
                    num_layers = p['num_layers'],
                    dropout = p['dropout'],
                    learning_rate = p['learning_rate'])

trainer = pl.Trainer(max_epochs=p['max_epochs'])
trainer.fit(model, data_module)

  "num_layers={}".format(dropout, num_layers))
GPU available: False, used: False
TPU available: None, using: 0 TPU cores
  self.trainer.call_setup_hook(model)

  | Name      | Type    | Params
--------------------------------------
0 | criterion | MSELoss | 0     
1 | lstm      | LSTM    | 720   
2 | fc        | Linear  | 11    
--------------------------------------
731       Trainable params
0         Non-trainable params
731       Total params


Epoch 0:  75%|███████▍  | 160/214 [00:03<00:01, 42.79it/s, loss=0.502, v_num=8, train_loss=0.557] 
Validating: 0it [00:00, ?it/s][A
Epoch 0:  78%|███████▊  | 166/214 [00:03<00:01, 42.10it/s, loss=0.502, v_num=8, train_loss=0.557]
Epoch 0:  86%|████████▌ | 183/214 [00:04<00:00, 45.20it/s, loss=0.502, v_num=8, train_loss=0.557]
Epoch 0:  94%|█████████▍| 202/214 [00:04<00:00, 48.67it/s, loss=0.502, v_num=8, train_loss=0.557]
Epoch 0: 100%|██████████| 214/214 [00:04<00:00, 50.36it/s, loss=0.502, v_num=8, train_loss=0.386]
Epoch 0: 100%|██████████| 214/214 [00:04<00:00, 50.21it/s, loss=0.502, v_num=8, train_loss=0.386]


1