In [None]:
import os
import sys

# Put the parent of the current working directory on the import path;
# presumably that is where the project packages imported below live.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))

# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
import torch
import numpy as np
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from torch.nn.utils import rnn as rnn_utils
from generators.data_generation import generate_sequences
from model_managers.DeepLearningManager import DeepLearningManager
torch.cuda.empty_cache()

In [None]:
import matplotlib.pyplot as plt

def plot_sequence(points, direction):
    """Plot a sequence of 2-D points and, for a known direction, an arrow.

    Args:
        points: Sequence of (x, y) pairs; converted with ``np.array`` and
            indexed as ``[:, 0]`` / ``[:, 1]``.
        direction: ``1`` (clockwise) draws the arrow from the first point to
            the last; ``0`` (counterclockwise) draws it from the last point to
            the first. Any other value draws no arrow.
    """
    fig, ax = plt.subplots(figsize=(3, 2))

    # Draw the points connected in the order they were given.
    coords = np.array(points)
    ax.plot(coords[:, 0], coords[:, 1], marker='o', linestyle='-')

    # Pick which ends of the sequence the arrow runs between.
    if direction == 1:  # Clockwise
        tail_idx, head_idx = 0, -1
    elif direction == 0:  # Counterclockwise
        tail_idx, head_idx = -1, 0
    else:
        tail_idx = head_idx = None

    if tail_idx is not None:
        tail = coords[tail_idx]
        head = coords[head_idx]
        ax.arrow(
            tail[0],
            tail[1],
            head[0] - tail[0],
            head[1] - tail[1],
            head_width=0.1,
            head_length=0.1,
            fc='k',
            ec='k',
        )

    # Axis labels and title
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_title('Sequence of Points with Direction')

    # Render the figure
    plt.grid()
    plt.show()


In [None]:
points, directions = generate_sequences(n=128, seed=13)

In [None]:

for i in range(3):
    plot_sequence(points[i], directions[i])

# Build a Recurrent Neural Network

In [None]:
n_features = 2
n_hidden_dim = 2

torch.manual_seed(101)
rnn_cell = nn.RNNCell(input_size=n_features, hidden_size=n_hidden_dim)
rnn_state = rnn_cell.state_dict()
rnn_state

##### To understand the RNN architecture, we utilize states generated by nn.RNNCell. This allows us to build the architecture from scratch, beginning with linear layers.

In [None]:
# Build the two linear layers that make up an RNN cell by hand, then replace
# their randomly initialised parameters with the ones the RNNCell generated.
linear_input = nn.Linear(in_features=n_features, out_features=n_hidden_dim)
linear_hidden = nn.Linear(in_features=n_hidden_dim, out_features=n_hidden_dim)

with torch.no_grad():
    # (layer, weight key, bias key): input-to-hidden first, then hidden-to-hidden
    for layer, weight_key, bias_key in (
        (linear_input, 'weight_ih', 'bias_ih'),
        (linear_hidden, 'weight_hh', 'bias_hh'),
    ):
        layer.weight = nn.Parameter(rnn_state[weight_key])
        layer.bias = nn.Parameter(rnn_state[bias_key])

In [None]:
# Initial hidden state set to 0 with dims 1 x 2
initial_hidden = torch.zeros(1, n_hidden_dim)
initial_hidden

In [None]:
# We can now generate the first hidden state, this is a simple linear transformation without any activ func
th = linear_hidden(initial_hidden)
th

In [None]:
# Now take the first sequence: 4 points with 2 features each, i.e. shape 4 x 2
X = torch.as_tensor(points[0]).float()
X, X.shape

In [None]:
tx = linear_input(X[0:1])
tx

In [None]:
# Add the linear transformations to replicate the RNN
adds = th + tx
# Then use the tanh activation function
torch.tanh(adds)

# What we get is the updated hidden state

In [None]:
rnn_cell(X[0:1])

In [None]:
X[0:1]

## RNN Layer

In [None]:
# Single layer RNN
n_features = 2
n_hidden_dim = 2

torch.manual_seed(101)
# Use names distinct from the earlier nn.RNNCell demo (`rnn_cell`, `rnn_state`)
# so those cells keep working if they are re-run after this one: an nn.RNN is
# not a cell, and its state dict uses different keys.
rnn_layer = nn.RNN(input_size=n_features, hidden_size=n_hidden_dim)
rnn_layer_state = rnn_layer.state_dict()

# As you can see, the weights and biases now carry an `_l0` suffix that indicates layer 0
rnn_layer_state

### RNN Input Dimension
In PyTorch, if you set the batch_first argument to True when using the nn.RNN class, it adjusts the expected input tensor layout to have the batch dimension first. Therefore, if batch_first is set to True, the input tensor should have dimensions (batch_size, sequence_length, input_size). This is useful for compatibility with certain data formats or personal preference in organizing data.

However, by default, PyTorch's nn.RNN class assumes the sequence dimension comes first. So, if batch_first is not specified or set to False, the input tensor should have dimensions (sequence_length, batch_size, input_size).

In [None]:
batch = torch.as_tensor(points[:3]).float()
batch.shape

In [None]:
# Convert from B S F -> S B F
permuted_batch = batch.permute(1,0,2)

# RNN friendly dimensions: Sequence - batch - Features
permuted_batch.shape

In [None]:
# Batch second
torch.manual_seed(101)
rnn = nn.RNN(input_size=n_features, hidden_size=n_hidden_dim)
out, final_hidden = rnn(permuted_batch)
out.shape, final_hidden.shape

In [None]:
# Or use batch_first argument
torch.manual_seed(101)
rnn = nn.RNN(input_size=n_features, batch_first=True ,hidden_size=n_hidden_dim)
out, final_hidden = rnn(batch)
out.shape, final_hidden.shape

#### Remember that Datasets and DataLoaders produce tensors with the batch dimension first, which is the layout `batch_first=True` expects!

In [None]:
# RNN Layers stacked
torch.manual_seed(101)
rnn_stacked = nn.RNN(input_size=2, hidden_size=2, batch_first=True, num_layers=2)
rnn_stacked_state = rnn_stacked.state_dict()
rnn_stacked_state

In [None]:
# RNN Bidirectional
torch.manual_seed(101)
rnn_bidirect = nn.RNN(input_size=2, hidden_size=2, batch_first=True, bidirectional=True)
state = rnn_bidirect.state_dict()
state

In [None]:
# Create forward RNN and backward RNN and pass the parameters to the models
torch.manual_seed(19)
forward_rnn = nn.RNN(input_size=2, hidden_size=2, batch_first=True)
backward_rnn = nn.RNN(input_size=2, hidden_size=2, batch_first=True)
state

In [None]:
[(k[:-8], v) for k, v in list(state.items())[4:]]

In [None]:
forward_rnn.load_state_dict(dict(list(state.items())[:4]))
backward_rnn.load_state_dict(dict([(k[:-8], v) for k, v in list(state.items())[4:]]))

In [None]:
# Build the backward RNN's state dict from the bidirectional layer's
# parameters. Entries are selected by their '_reverse' suffix (not by their
# position in the dict) and the suffix is stripped so the keys match the
# parameter names of a plain, unidirectional nn.RNN.
reverse_suffix = '_reverse'
modified_state_dict = {
    key[:-len(reverse_suffix)]: value
    for key, value in state.items()
    if key.endswith(reverse_suffix)
}

# Load the modified state dictionary into the backward RNN model
backward_rnn.load_state_dict(modified_state_dict)


In [None]:
# Add a leading batch dimension of size 1 (batch, sequence, features); the RNNs
# used below were created with batch_first=True.
X = X.reshape(1,4,2)

In [None]:
# Reverse the sequence input to the backward_rnn
x_rev = torch.flip(X, dims=[1])
x_rev, X

In [None]:
out, h = forward_rnn(X)
out, h

In [None]:
out_rev, h_rev = backward_rnn(x_rev)
out_rev, h_rev

In [None]:
# The backward RNN consumed the reversed sequence, so its per-step outputs are
# in reversed time order. Flip them back along the sequence dimension before
# concatenating, so each time step pairs the forward and backward hidden
# states for the same input point (this is what the bidirectional layer returns).
out_rev_back = torch.flip(out_rev, dims=[1])
torch.cat([out, out_rev_back], dim=2), torch.cat([h, h_rev])

In [None]:
rnn_bidirect(X)

# Sequence Training

In [None]:
test_points, test_directions = generate_sequences(seed=101)

In [None]:
# Prepare data
train_data = TensorDataset(torch.as_tensor(points).float(),
                           torch.as_tensor(directions).view(-1,1).float())
test_data = TensorDataset(torch.as_tensor(test_points).float(),
                           torch.as_tensor(test_directions).view(-1,1).float())

In [None]:
# Build Dataloaders
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

In [None]:
train_loader.dataset[0]

In [None]:
from models.SimpleRNN import SquareModel, SquareModelGRU

In [None]:
model = SquareModel(n_features=2, hidden_dim=2, n_outputs=1)
loss = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
model_manager = DeepLearningManager(model, loss, optimizer)

In [None]:
# model_manager.set_data_loaders(train_loader=train_loader, val_loader=test_loader)
# model_manager.train(n_epochs=1000)

In [None]:
# fig = model_manager.plot_losses()

In [None]:
# model_manager.loader_apply(test_loader, model_manager.correct)

## GRU Journey

### Why GRU?
GRU, short for Gated Recurrent Unit, addresses a fundamental challenge encountered in simple RNNs: their inability to effectively manage the flow of information from previous hidden states and current inputs. In traditional RNNs, this lack of control often leads to difficulties in retaining relevant information over long sequences, hindering performance in tasks such as sequence prediction and language modeling.

GRU tackles this issue by introducing "gates" into the network architecture. These gates, governed by sigmoid functions, enable precise control over the flow of information within the hidden state computation process. Specifically, GRUs employ two key gates: the update gate and the reset gate.

The update gate regulates the extent to which information from previous time steps should be retained or updated in the current hidden state.
The reset gate determines the degree to which past information should be forgotten or reset, allowing the model to adapt dynamically to changing input patterns.

By incorporating these gating mechanisms, GRU architectures empower neural networks to selectively process and retain relevant information, thereby mitigating the vanishing gradient problem and enhancing the model's ability to capture long-range dependencies in sequential data.

In [None]:
# Seed before constructing the cell so the randomly initialised parameters
# inspected below are identical on every run.
torch.manual_seed(101)
gru_cell = nn.GRUCell(input_size=2, hidden_size=2)
gru_state = gru_cell.state_dict()

The weights and biases hold the parameters of the two gates and of the candidate hidden state. The state_dict() method shows these parameters concatenated into single tensors.

In [None]:
# Let's get each parameter
wx, bx = gru_state['weight_ih'], gru_state['bias_ih']
wh, bh = gru_state['weight_hh'], gru_state['bias_hh']

# Each stacked tensor holds three equally sized groups (named r, z and n here).
# Chunking by the number of groups, rather than splitting by a hard-coded
# hidden size, keeps this correct for any hidden_size.
wxr, wxz, wxn = wx.chunk(3, dim=0)
whr, whz, whn = wh.chunk(3, dim=0)
bxr, bxz, bxn = bx.chunk(3, dim=0)
bhr, bhz, bhn = bh.chunk(3, dim=0)

# One (heading, labelled tensors) entry per parameter family; printed in order.
gru_report = (
    ("Weight matrices for input-to-hidden connections:",
     (("wxr", wxr), ("wxz", wxz), ("wxn", wxn))),
    ("\nWeight matrices for hidden-to-hidden connections:",
     (("whr", whr), ("whz", whz), ("whn", whn))),
    ("\nBiases for input-to-hidden connections:",
     (("bxr", bxr), ("bxz", bxz), ("bxn", bxn))),
    ("\nBiases for hidden-to-hidden connections:",
     (("bhr", bhr), ("bhz", bhz), ("bhn", bhn))),
)
for heading, entries in gru_report:
    print(heading)
    for label, tensor in entries:
        print(f"{label}:", tensor)

## LSTM - Long Short-Term Memory

The primary distinction between an LSTM and a simple RNN or GRU cell lies in the presence of an additional state in the LSTM called the cell state. This cell state is crucial in retaining sequential information over extended distances.

In a simple RNN, the network's hidden state is responsible for capturing and propagating information across time steps. However, as sequences grow longer, simple RNNs struggle to maintain relevant information over distant past states due to issues like vanishing gradients.

LSTMs address this limitation by introducing a separate cell state alongside the hidden state. This cell state serves as a conveyor belt for preserving crucial information across multiple time steps. By selectively updating, forgetting, and outputting information through specialized gating mechanisms, LSTMs can effectively capture and retain long-term dependencies in sequential data.

In summary, while both GRUs and LSTMs utilize gating mechanisms to regulate information flow, LSTMs further augment their capacity for modeling long-range dependencies by incorporating an additional cell state, enabling them to excel in tasks that require retaining context over extended sequences. 

In [None]:
# Seed before constructing the cell so the randomly initialised parameters
# inspected below are identical on every run.
torch.manual_seed(101)
lstm_cell = nn.LSTMCell(input_size=2, hidden_size=2)
lstm_state = lstm_cell.state_dict()

In [None]:
# Pull the stacked LSTM parameters out of the state dict
wx, bx = lstm_state['weight_ih'], lstm_state['bias_ih']
wh, bh = lstm_state['weight_hh'], lstm_state['bias_hh']

# Each stacked tensor holds four equally sized groups, named i, f, g and o here
wxi, wxf, wxg, wxo = torch.chunk(wx, 4, dim=0)
whi, whf, whg, who = torch.chunk(wh, 4, dim=0)
bxi, bxf, bxg, bxo = torch.chunk(bx, 4, dim=0)
bhi, bhf, bhg, bho = torch.chunk(bh, 4, dim=0)

# One (heading, labelled tensors) entry per parameter family; printed in order.
lstm_report = (
    ("Weight matrices for input-to-hidden connections:",
     (("wxi", wxi), ("wxf", wxf), ("wxg", wxg), ("wxo", wxo))),
    ("\nWeight matrices for hidden-to-hidden connections:",
     (("whi", whi), ("whf", whf), ("whg", whg), ("who", who))),
    ("\nBiases for input-to-hidden connections:",
     (("bxi", bxi), ("bxf", bxf), ("bxg", bxg), ("bxo", bxo))),
    ("\nBiases for hidden-to-hidden connections:",
     (("bhi", bhi), ("bhf", bhf), ("bhg", bhg), ("bho", bho))),
)
for heading, entries in lstm_report:
    print(heading)
    for label, tensor in entries:
        print(f"{label}:", tensor)


In [None]:
from models.SimpleRNN import SquareModelLSTM

In [None]:
model = SquareModelLSTM(n_features=2, hidden_dim=2, n_outputs=1)
loss = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
model_manager = DeepLearningManager(model, loss, optimizer)
model_manager.set_data_loaders(train_loader=train_loader, val_loader=test_loader)
model_manager.train(n_epochs=100)

In [None]:
fig = model_manager.plot_losses()

In [None]:
model_manager.loader_apply(test_loader, model_manager.correct)

### Sequence Packing

In [None]:
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pack_padded_sequence

In [None]:
seq1 = torch.tensor([1, 2, 3, 4])
seq2 = torch.tensor([5, 6])
seq3 = torch.tensor([7, 8, 9])

# Create a list of sequences
sequences = [seq1, seq2, seq3]

In [None]:
packed = pack_sequence(sequences=sequences, enforce_sorted=False)
packed

In [None]:
# With batch sizes [3, 3, 2, 1], the time steps start at offsets 0, 3, 6 and 8
# of the packed data. The first entry of each step comes from the longest
# sequence, so these indices pick out the elements of seq1.
packed.data[[0,3,6,8]]

At time step 1, the data considered consists of the elements from the original sequences that were active (non-padded) at that time step. The number of active sequences at each time step is indicated by the corresponding value in the batch_sizes tensor.

In the provided batch_sizes tensor [3, 3, 2, 1], each value represents the number of active sequences at the corresponding time step. Therefore, at time step 1, there are 3 active sequences. The data tensor [1, 7, 5, 2, 8, 6, 3, 9, 4] contains the packed data from all sequences, and the batch sizes tensor indicates how many sequences are active at each time step.

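So, at time step 1, the data considered would be [1, 7, 5], representing the first elements of the three original sequences in the batch. Note the order: inside the packed data the sequences are arranged from longest to shortest (seq1, seq3, seq2). These values come from the packed data tensor and correspond to the first time step of each sequence; packing stores only the real elements, so no padding values are involved.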

## Training on Variable-Length Sequences

In [None]:
from data_preparation.sequences.diff_size_sequences import VariableSizeDataset
from models.SimpleRNN import SquareModelPacked

In [None]:
var_points, var_directions = generate_sequences(variable_len=True)
var_points[:3]

In [None]:
train_var_data = VariableSizeDataset(var_points, var_directions)

In [None]:
train_var_loader = DataLoader(
    train_var_data,
    batch_size=16,
    shuffle=True,
    collate_fn=VariableSizeDataset.pack_collate
)

In [None]:
x_batch, y_batch = next(iter(train_var_loader))

In [None]:
model = SquareModelPacked(n_features=2, hidden_dim=2, n_outputs=1)
loss = nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
mm = DeepLearningManager(model, loss, optim)
mm.set_data_loaders(train_var_loader)
mm.train(n_epochs=100)

## 1-D Convolution

In [None]:
import numpy as np

# Moving average written as a convolution: a kernel of `size` equal weights
# (0.2 each) slides over the 1-D signal `arr`.
arr = np.array([5, 10, 4, 3, 5, 4, 11, 4, 0, 4, 2])
size = 5
weight = torch.ones(size) * 0.2

# Reshape both operands to the three dimensions conv1d works with: the signal
# becomes (1, 1, length) and the kernel becomes (1, 1, size).
signal = torch.as_tensor(arr).float().view(1, 1, -1)
kernel = weight.view(1, 1, -1)
out_tensor = F.conv1d(signal, weight=kernel)
out_tensor

In [None]:
# Check the first output against the hand-computed window average. Compare
# with a tolerance: the convolution ran in float32, so an exact `==` against
# the float64 NumPy result can fail on rounding alone.
torch.isclose(
    out_tensor[0, 0, 0],
    torch.tensor(sum(arr[:size]) * 0.2, dtype=out_tensor.dtype),
)

### Conv 1-D on Multiple Features/Channels
Default shape conv1d N, F=C, L 

N = Number of sequences (batch_size)

F = C = Features or Channels

L = Sequence length

Convolution of a sequence with 2 features and length of 4 and a single filter 2x2.

1x2x4 (sequence) * 1x2x2 (filter) = 1x1x3 output 

In [None]:
conv_seq = nn.Conv1d(in_channels=2, out_channels=1, kernel_size=2, bias=False)
conv_seq.weight, conv_seq.weight.shape

## Dilation

Dilated convolutions in 1D convolutional neural networks (Conv1D) expand the receptive field without increasing the number of parameters. By introducing gaps between elements in the convolutional filter, a dilation rate greater than 1 allows the filter to cover more input units per convolution, capturing wider-range features without additional computational cost. For instance, with a dilation rate of 2, the filter skips every other input element, effectively broadening its perspective on the input sequence while maintaining the same computational footprint. This technique is especially valuable in processing time series or sequential data, where understanding broader context or longer-range dependencies is crucial.
Dilation applies the filter to the sequence while skipping input elements according to the dilation number. Dilation = 1 means no skipping (a regular convolution); the dilation must be 2 or more for any elements to be skipped.

In [None]:
conv_dilated = nn.Conv1d(in_channels=2, out_channels=1, kernel_size=2, dilation=2, bias=False)
conv_dilated.weight, conv_dilated.weight.shape