# Project 2

## Load libs

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Model definitions

In [None]:
class LSTMModel(nn.Module):
    """LSTM followed by a linear read-out of the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced by the linear layer.
        num_layers: Number of stacked LSTM layers. Defaults to 1, the value
            that used to be fixed inside the constructor.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear read-out of the LSTM output at the last step.

        ``self.lstm`` is created without ``batch_first``, so the first axis
        of its output is the sequence axis and ``[-1]`` picks the final step.
        """
        lstm_out, _ = self.lstm(x)
        # Only the last output in the sequence feeds the linear layer.
        return self.fc(lstm_out[-1])

## Load data

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd

In [None]:
class BikeDataset(Dataset):
    """Sliding-window dataset over the rows of a CSV file.

    Sample ``i`` pairs the feature columns (every column but the last) of
    rows ``i .. i + seq_length - 1`` with the last-column value of row
    ``i + seq_length``.

    Args:
        csv_file: Path of the CSV file, read with ``pandas.read_csv``.
        seq_length: Number of consecutive rows in each input window (>= 1).

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, csv_file, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.df = pd.read_csv(csv_file)
        self.seq_length = seq_length
        # Convert once up front: slicing the DataFrame with ``iloc`` on every
        # ``__getitem__`` call is far slower than slicing a tensor.
        self._features = torch.tensor(self.df.iloc[:, :-1].values, dtype=torch.float32)
        self._targets = torch.tensor(self.df.iloc[:, -1].values, dtype=torch.float32)

    def __len__(self):
        # Clamp at zero: a file with fewer rows than ``seq_length`` has no
        # complete window, and ``len()`` must never be negative.
        return max(0, len(self.df) - self.seq_length)

    def __getitem__(self, index):
        """Return ``(input_features, target_label)`` for window ``index``.

        ``input_features`` is a float32 tensor with ``seq_length`` rows and
        ``target_label`` is a float32 scalar tensor. Negative indices count
        from the end.

        Raises:
            IndexError: If ``index`` lies outside the dataset.
        """
        size = len(self)
        position = index + size if index < 0 else index
        if not 0 <= position < size:
            raise IndexError(f"index {index} out of range for dataset of size {size}")
        end_idx = position + self.seq_length
        return self._features[position:end_idx], self._targets[end_idx]

In [None]:
# Data source and experiment settings.
csv_file = 'data/Bike-Sharing-Dataset/hour.csv'

seq_length = 1      # number of consecutive rows fed to the model per sample
train_ratio = 0.8
test_ratio = 1-train_ratio
batch_size = 1
split_seed = 0      # fixes the train/test split so separate runs are comparable

dataset = BikeDataset(csv_file, seq_length)

# Split sizes: the test part takes whatever the rounded-down train part leaves.
num_samples = len(dataset)
num_train_samples = int(train_ratio * num_samples)
num_test_samples = num_samples - num_train_samples
train_dataset, test_dataset = random_split(
    dataset,
    [num_train_samples, num_test_samples],
    generator=torch.Generator().manual_seed(split_seed),
)

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # batch_size 1
# The whole test set is served as a single batch, so shuffling it adds nothing.
test_dl = DataLoader(test_dataset, batch_size=num_test_samples, shuffle=False) # batch_size ALL

In [None]:
# Pull a single batch from the training loader and report its shapes.
for batch in train_dl:
    input_features, target_label = batch
    # Shape after swapping the first two axes, which is how the batch is fed to the model.
    print(torch.transpose(input_features, 0, 1).shape)
    print(target_label.shape)
    break  # one batch is enough for the check

## Fit models

### LSTM

In [None]:
# BikeDataset uses every column except the last as a feature, so the input
# width follows from the loaded data rather than from a fixed number.
input_dim = dataset.df.shape[1] - 1
hidden_dim = 5
output_dim = 1

model = LSTMModel(input_dim, hidden_dim, output_dim)
loss_fn = nn.MSELoss()

In [None]:
# Adam over all model parameters, plus a plateau scheduler that is stepped
# with the validation loss ('min' mode: lower is better).
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

In [None]:
from tqdm.notebook import tqdm # status bar

num_epochs=10

# Number of batches per loader, used to average the summed batch losses.
# They get their own names so the split size stored in num_train_samples
# is not overwritten by a batch count.
num_train_batches = len(train_dl)
num_val_batches = len(test_dl)

for epoch in range(num_epochs):

    # training
    model.train()
    train_loss = 0.0

    for samples, labels in tqdm(train_dl):

        # forward pass (the first two axes are swapped before the model call)
        prediction = model(samples.transpose(0,1))
        # labels are reshaped to a column so they line up with the prediction
        loss = loss_fn(prediction, labels.view(-1,1))

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # training stats
        train_loss += loss.item()

    # validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for samples, labels in test_dl:

            # Forward pass
            prediction = model(samples.transpose(0,1))
            loss = loss_fn(prediction, labels.view(-1,1))

            # validation stats
            val_loss += loss.item()

    # lr scheduler: driven by the summed validation loss of this epoch
    scheduler.step(val_loss)

    # show training stats for the epoch
    print('Epoch [{}/{}]\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}'
          .format(epoch + 1, num_epochs, train_loss/num_train_batches, val_loss/num_val_batches))

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# optimizer's repr, which allows inspecting its current settings.
optimizer

### Results

In [None]:
from scipy.interpolate import UnivariateSpline

# Recorded results, one row per run: (seq_len, train_loss, val_loss)
results = [
    (1, 14563, 14894),
    (2, 8508, 8249),
    (4, 8027, 7617),
    (8, 7534, 7392),
    (12, 9304, 9353),
    (24, 15075, 14823),
    (48, 12119, 12023),
]

# Sample data: validation loss as a function of sequence length
x = np.array([seq_len for seq_len, _, _ in results])
y = np.array([val for _, _, val in results])

# Fit a smoothing spline and evaluate it on a dense grid for a smooth curve
spl = UnivariateSpline(x, y)
x_smooth = np.linspace(x.min(), x.max(), 1000)
y_smooth = spl(x_smooth)

# Line plot of the smoothed curve
plt.plot(x_smooth, y_smooth)
plt.xlabel('seq_length')
plt.ylabel('val loss')
plt.title('seq_length vs val loss')
plt.show()

In [None]:
from sklearn.manifold import TSNE

# One batch holding the whole dataset; a fixed order keeps the printed
# tensors reproducible between runs.
vis_dl = DataLoader(dataset, batch_size=len(dataset), shuffle=False) # batch_size ALL

# Testing
model.eval()
with torch.no_grad():
    # The loader yields exactly one batch, so take it directly.
    samples, labels = next(iter(vis_dl))

    # Run the LSTM once and keep its final hidden/cell states for plotting.
    lstm_out, (hn, cn) = model.lstm(samples.transpose(0,1))

    # Same read-out as LSTMModel.forward (linear layer on the last step),
    # reusing lstm_out instead of pushing the data through the LSTM again.
    prediction = model.fc(lstm_out[-1])
    loss = loss_fn(prediction, labels.view(-1,1))

print(prediction.squeeze())
print(labels.view(-1,1).squeeze())

In [None]:
# Every point is coloured by its target value.
point_colours = labels.tolist()

if hidden_dim == 2:
    # A 2-D hidden state can be plotted directly. hn's first axis indexes the
    # LSTM layers; [-1] takes the last one.
    plt.scatter(hn[-1, :, 0].tolist(), hn[-1, :, 1].tolist(), c=point_colours)
    plt.title('Scatter plot of hidden state')
    # To plot the cell state instead, use cn in place of hn together with the
    # title 'Scatter plot of cell state'.
else:
    # Apply t-SNE for dimension reduction to 2-D
    tsne = TSNE(n_components=2, random_state=42)
    # Index the layer axis explicitly: a bare squeeze() would also collapse a
    # batch or hidden axis of size 1 and hand t-SNE a non-2-D array.
    coord_2d = tsne.fit_transform(cn[-1].numpy())
    plt.scatter(coord_2d[:,0],coord_2d[:,1],c=point_colours)
    plt.title('t-SNE of cell state')