In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import h5py
import random

In sequential models, current hidden state is a function of the current input and previous hidden state:


### h(t) = f(h(t-1), x(t); W)

W are the parameters of function (in our case NN)
 

For RNN:

a(t) = W * h(t-1) + U * x(t) + b1
h(t) = tanh(a(t))
o(t) = V * h(t) + b2

In [2]:
# Initialise device

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Primary device set to GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Primary device set to CPU.")

CUDA is available. Primary device set to GPU.


In [3]:
#  Class for a single RNN Cell
class RNNCell(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size

        self.hidden_layer = nn.Linear(hidden_size, hidden_size)
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, input_tensor, prev_hidden_state):
        inner_tensor = self.hidden_layer(prev_hidden_state) + self.input_layer(input_tensor)
        hidden_tensor = torch.tanh(inner_tensor)
        output_tensor = self.output_layer(hidden_tensor)

        return output_tensor, hidden_tensor


# Class for RNN model composed of one or more RNN cells
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, device='cpu'):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.device = device

        self.rnn_cells = nn.ModuleList([RNNCell(input_size, hidden_size, output_size) for _ in range(self.num_layers)])
        self.to(self.device)

    def forward(self, input_sequence):

        batch_size = input_sequence.size(0)
        sequence_size = input_sequence.size(1)
        # Different initialization techniques can be tried. For now, I am sticking to zeros
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(self.device)
        outputs = torch.zeros(sequence_size, batch_size, self.output_size).to(self.device)

        # iterate over each time step
        for seq in range(sequence_size):
            token_tensor = input_sequence[:, seq, :]
            # iterate over each layer in rnn
            for i, rnn_cell in enumerate(self.rnn_cells):
                y_out, h_out = rnn_cell(token_tensor, hidden_state[i])
                token_tensor = y_out

                # update hidden state of rnn cells of layer
                hidden_state[i] = h_out

            outputs[seq] = token_tensor

        return outputs.view(outputs.shape[1], outputs.shape[0], outputs.shape[2])

In [12]:
def sequence_generator(input_df, window_size, stride=1, batch_size=20000):
    df_array = input_df.values
    start = 0
    total_length = len(df_array)
    sequences = []
    outputs = []
    while start < total_length - window_size:
        sequences.append(df_array[start : start + window_size])
        outputs.append(df_array[start+window_size])
        start += stride
        
        if len(sequences) >= batch_size:
            yield np.array(sequences), np.array(outputs)
            sequences = []
            outputs = []
    
    if len(sequences) > 0:
        yield np.array(sequences), np.array(outputs)

In [25]:
def write_to_hdf5(input_df, window_size, stride_size, batch_size, storage_path, dataset_name, 
                  label_name = 'label'):
    sequence_data_size = int(np.floor((len(input_df) - window_size) / stride_size + 1))
    num_features = input_df.shape[1]  # Number of features (columns) in the DataFrame
    
    gen = sequence_generator(input_df, window_size, stride_size, batch_size)
    
    with h5py.File(storage_path, 'w') as f:
        # Create a dataset with preallocated memory for sequences and features
        dset = f.create_dataset(dataset_name, (sequence_data_size, window_size, num_features), dtype='float32')
        y_set = f.create_dataset(label_name, sequence_data_size)
        count = 0
        
        for batch in gen:
            features = batch[0]
            y = batch[1]
            num_data = features.shape[0]
            dset[count:count + num_data] = features
            y_set[count: count + num_data] = np.squeeze(y)
            count += num_data

In [26]:
# class HDF5SequenceDataset(torch.utils.data.Dataset):
#     def __init__(self, file_path, sequence_length, start_idx=None, end_idx=None):
#         self.file_path = file_path
#         self.start_idx = start_idx
#         self.end_idx = end_idx
#         self.sequence_length = sequence_length
#         self.length = end_idx - start_idx
#     
#         if self.length is None:
#             with h5py.File(file_path, 'r') as file:
#                 self.length = len(file['data/value']) - sequence_length
# arr = [1,2,3, 4, 5, 6]

df = pd.read_csv('data/daily-minimum-temperatures-in-me.csv')

window_size = 5
stride_size = 1
batch_size = 100

df['Daily minimum temperatures'] = pd.to_numeric(df['Daily minimum temperatures'], errors='coerce')
df['Daily minimum temperatures'].fillna(method='ffill', inplace=True)
# Fill any remaining NaNs with a default value, e.g., 0
df['Daily minimum temperatures'].fillna(0, inplace=True)
df['temp2'] = df['Daily minimum temperatures'] + 5
# display(df[['Daily minimum temperatures']])
arr = df['Daily minimum temperatures'].to_numpy()

gen = sequence_generator(df[['Daily minimum temperatures']], window_size, stride_size, batch_size)
arr_x = []
arr_y = []
for batch in gen:
    x = batch[0]
    y = batch[1]
    arr_x.append(x)
    arr_y.append(y)
    
sequence_x = np.concatenate(arr_x, axis=0)
sequence_y = np.concatenate(arr_y, axis=0)


print(sequence_x.shape)
print(sequence_y.shape)

# print(sequence_x)
# print(sequence_y)
# write_to_hdf5(arr, window_size, stride_size, batch_size, 'meta/sequence.h5')

write_to_hdf5(df[['Daily minimum temperatures']], window_size, stride_size, batch_size, 'meta/sequence.h5', 'sequences')
input_size = len(arr) 

# with h5py.File('meta/sequence.h5', 'r') as f:
#     data = f['sequences'][:]
#     print(data.shape)
#     
#     # Sample 5 random sequences
#     indices = np.random.randint(0, data.shape[0], 500)
#     for idx in indices:
#         print(f"Sample data at index {idx}: {data[idx]}")
#         print(data[idx] == sequence[idx])
# sequence_data_size = np.floor( (input_size - window_size) / stride_size + 1 )
# 
# input_size = len(arr)
# gen = sequence_generator(arr, 5, 3, 2)



(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(100, 5, 1) (100, 1)
(45, 5, 1) (45, 1)
(3645, 5, 1)
(3645, 1)


In [30]:
class HDF5Dataset(torch.utils.data.Dataset):
    
    def __init__(self, file_path, dataset_name, label_name = 'label', transform = None):
        self.file_path = file_path
        self.dataset_name = dataset_name
        self.label_name = label_name
        self.transform = transform
        self.file = None
        self.data_len = None
        
    def __len__(self):
        if self.data_len is None:
            with h5py.File(self.file_path, 'r') as file:
                self.data_len = len(file[self.dataset_name])
            return self.data_len
    
    def __getitem__(self, index):
        if self.file is None:
            self.file = h5py.File(self.file_path, 'r')
        x_in = self.file[self.dataset_name][index]
        label = self.file[self.label_name][index]
        if self.transform is not None:
            x_in = self.transform(x_in)
        return x_in, label
    
    def close(self):
        if self.file is not None:
            self.file.close()
            self.file = None
    
    def __del__(self):
        self.close()

In [34]:
hdf_dataset = HDF5Dataset('meta/sequence.h5', 'sequences', 'label')
hdf_dataset.__getitem__(2)

(array([[18.8],
        [14.6],
        [15.8],
        [15.8],
        [15.8]], dtype=float32),
 17.4)

In [24]:
input_size = 1
hidden_size = 4
output_size = 1
num_layers = 2

model = RNN(input_size, hidden_size, output_size, num_layers, device).to(device)

In [33]:
df = pd.read_csv('data/daily-minimum-temperatures-in-me.csv')
print(df.dtypes)
print(df.shape)
print(len(df))
df.head(10)

Date                          object
Daily minimum temperatures    object
dtype: object
(3650, 2)
3650


Unnamed: 0,Date,Daily minimum temperatures
0,1/1/1981,20.7
1,1/2/1981,17.9
2,1/3/1981,18.8
3,1/4/1981,14.6
4,1/5/1981,15.8
5,1/6/1981,15.8
6,1/7/1981,15.8
7,1/8/1981,17.4
8,1/9/1981,21.8
9,1/10/1981,20.0


In [26]:
df = pd.read_csv('data/daily-minimum-temperatures-in-me.csv')
df.to_hdf('meta/temperature.h5', mode='w', key='temperature' , format='table')
display(df)
print(df.dtypes)
df['Daily minimum temperatures'] = pd.to_numeric(df['Daily minimum temperatures'], errors='coerce').astype('float32')
arr = df['Daily minimum temperatures'].tolist()

window_size = 14
sequenceArr = [arr[i:i + window_size + 1] for i in range(len(arr) - window_size)]

assert len(sequenceArr) + window_size == len(arr)


Unnamed: 0,Date,Daily minimum temperatures
0,1/1/1981,20.7
1,1/2/1981,17.9
2,1/3/1981,18.8
3,1/4/1981,14.6
4,1/5/1981,15.8
...,...,...
3645,12/27/1990,14
3646,12/28/1990,13.6
3647,12/29/1990,13.5
3648,12/30/1990,15.7


Date                          object
Daily minimum temperatures    object
dtype: object


In [27]:

# Not the most optimal way to perform train test split on time series
def generate_train_test_split(arr, train_ratio = 0.8):
    # Don't shuffle since this is a time series
    # random.shuffle(arr)
    
    train_size = round(len(arr) * train_ratio)
    train_arr = arr[:train_size]
    test_arr = arr[train_size:]
    
    return train_arr, test_arr

### First train raw data without any preprocessing whatsoever

In [28]:
train_arr, test_arr = generate_train_test_split(sequenceArr, 0.9)

len(train_arr), len(test_arr), len(sequenceArr)

(3272, 364, 3636)

In [29]:
input_size = 1
hidden_size = 4
output_size = 1
num_layers = 1

model = RNN(input_size, hidden_size, output_size, num_layers)

In [14]:
moduleList = nn.ModuleList([RNNCell(input_size, hidden_size, output_size) for _ in range(5)])


In [32]:
input_sample = torch.Tensor(train_arr[0:3])
input_sample = input_sample.view(1, input_sample.shape[0], 1)
model_out = model(input_sample)
model_out

RuntimeError: shape '[1, 3, 1]' is invalid for input of size 45

In [35]:
for batch in gen:
    print(batch.shape)

In [17]:
for num in torch.Tensor(sequence[0]):
    print(num)

tensor([20.7000, 25.7000])
tensor([17.9000, 22.9000])
tensor([18.8000, 23.8000])
tensor([14.6000, 19.6000])
tensor([15.8000, 20.8000])


In [47]:
filePath = 'meta/sequence.h5'

with h5py.File(filePath, 'r') as file:
    data = file['sequences']
    dataArray = data[:3]

    print(model(torch.Tensor(dataArray)))

tensor([[[0.9110],
         [0.9160],
         [0.9141],
         [0.9934],
         [0.9789]],

        [[1.0464],
         [0.9601],
         [1.0233],
         [0.9816],
         [1.0301]],

        [[0.9887],
         [1.0026],
         [0.9866],
         [1.0002],
         [0.9957]]], grad_fn=<ViewBackward0>)


In [26]:
a = torch.full([2], 1)
b = torch.full([2], 3)
torch.stack((a,b), dim=1)

tensor([[1, 3],
        [1, 3]])

In [12]:
int(2.5/1.2)

2