In [3]:
# Execute preprocessing.ipynb inside this kernel so that the names it defines
# are available to the cells below.
# NOTE(review): which variables it provides is not visible from this notebook — confirm.
%run preprocessing.ipynb


In [None]:
%temporal_preprocess.ipynb

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [22]:
# Sanity check: pull the first group out of the mapping and show how its
# second-to-last column parses as datetimes.
first_gid = next(iter(transformer_input))
first_grp = transformer_input[first_gid]
print(pd.to_datetime(first_grp[:, -2]))  # eyeball the parsed dates for gaps / duplicates


DatetimeIndex(['2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01',
               '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01',
               '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
               '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01',
               '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-01-01', '2020-02-01', '2020-02-01',
               '2020-03-01', '2020-03-01', '2020-04-01', '2020-04-01',
               '2020-05-01', '2020-05-01', '2020-06-01', '2020-06-01',
               '2020-07-01', '2020-07-01', '2020-08-01', '2020-08-01',
               '2020-09-01', '2020-09-01', '2020-10-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01'],
     

In [23]:
class TransformerDataset(Dataset):
    """Map-style dataset that yields one whole group (sequence) per index.

    Args:
        data: Mapping from group id to a 2-D array. The last column is read as
            the target, the second-to-last column is skipped (the inspection
            cell parses it as dates), and all preceding columns are features.
        date_encodings: Mapping from group id to that group's date encoding,
            returned alongside the features.
    """

    def __init__(self, data, date_encodings):
        self.data = data
        self.date_encodings = date_encodings
        # Snapshot the key order once; rebuilding list(data.keys()) on every
        # __getitem__ call costs O(n) per lookup.
        self._group_ids = list(data.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self._group_ids)

    def __getitem__(self, idx):
        """Return ``(features, dates, target)`` tensors for the ``idx``-th group.

        Raises:
            IndexError: if ``idx`` is outside the range of known groups.
        """
        gid = self._group_ids[idx]
        group = self.data[gid]
        # All columns except the last two are model inputs.
        features = np.array(group[:, :-2], dtype=np.float32)
        # The last column is the regression target.
        target = np.array(group[:, -1], dtype=np.float32)
        dates = self.date_encodings[gid]
        return torch.tensor(features), torch.tensor(dates), torch.tensor(target)

In [24]:
# Wrap the per-group arrays in a Dataset, then split group positions 80/20.
dataset = TransformerDataset(transformer_input, date_encodings)

# random_state pins the split so train/val membership is the same on every run.
train_set, val_set = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    random_state=42,
)

# Each loader is fed a pre-materialised list of (features, dates, target) tuples.
train_loader = DataLoader(
    [dataset[idx] for idx in train_set],
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    [dataset[idx] for idx in val_set],
    batch_size=32,
    shuffle=False,
)

In [25]:
# Define the transformer model
class TemporalTransformerEncoder(nn.Module):
    """Transformer encoder that predicts one scalar per time step.

    Input features are linearly projected to ``d_model``, a learned embedding
    looked up by integer date index is added, the sum is passed through a
    stack of ``nn.TransformerEncoderLayer`` and a final linear layer maps each
    step to a single value.

    Args:
        input_size: Number of input features per time step.
        d_model: Width of the transformer representation.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability inside the encoder layers.
        num_positions: Number of rows in the date embedding table, i.e. one
            more than the largest date index that will be passed to
            ``forward``. When ``None`` (the default) it is derived from the
            notebook-level ``date_encodings`` mapping, as before.
    """

    def __init__(self, input_size, d_model, nhead, num_layers, dim_feedforward=512, dropout=0.1,
                 num_positions=None):
        super().__init__()
        if num_positions is None:
            # Backward-compatible default: size the table from the global
            # date_encodings so every encoded date has an embedding row.
            num_positions = int(np.max([np.max(dates) for dates in date_encodings.values()])) + 1
        self.input_embedding = nn.Linear(input_size, d_model)
        self.positional_encoder = nn.Embedding(num_positions, d_model)
        # batch_first=True: DataLoader batches are (batch, seq, feature). With
        # the default (False) dim 0 is treated as the sequence axis, so
        # attention would mix different groups instead of time steps.
        # Unbatched 2-D (seq, feature) input is handled the same either way.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(d_model, 1)

    def forward(self, x, dates):
        """Run the encoder.

        Args:
            x: Float tensor shaped ``(seq, input_size)`` or
                ``(batch, seq, input_size)``.
            dates: Integer-valued tensor of date indices shaped like ``x``
                without its last dimension; values must be smaller than the
                embedding table size.

        Returns:
            Tensor of per-step predictions shaped like ``dates``.
        """
        x = self.input_embedding(x)
        pos_encoding = self.positional_encoder(dates.long())
        # Out-of-place add keeps autograd clear of in-place modification.
        x = x + pos_encoding
        x = self.transformer_encoder(x)
        output = self.output_layer(x)
        return output.squeeze(-1)

In [26]:
# Build the network, the loss and the optimiser used by the training cells.
model = TemporalTransformerEncoder(
    input_size=len(top_50_features),  # one input per selected feature
    d_model=4,
    nhead=1,
    num_layers=1,
)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(model.parameters(), lr=1e-4)



In [None]:
# Training loop: 100 epochs, each followed by a full pass over the validation set.
for epoch in tqdm(range(100)):
    model.train()
    train_losses = []
    training_failed = False
    for i, (features, dates, targets) in enumerate(train_loader):
        try:
            optimizer.zero_grad()
            # squeeze(0) drops the batch axis only when the batch holds a
            # single group; targets are squeezed the same way so the loss
            # always compares tensors of identical shape.
            outputs = model(features.squeeze(0), dates.squeeze(0))
            loss = criterion(outputs, targets.squeeze(0))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        except Exception as e:
            print(f"Error at iteration {i} of epoch {epoch}: {str(e)}")
            training_failed = True
            break
    if training_failed:
        # Stop instead of validating and reporting a stale (or undefined)
        # loss for every remaining epoch.
        break
    # Validation loop
    model.eval()
    val_losses = []
    with torch.no_grad():
        for features, dates, targets in val_loader:
            outputs = model(features.squeeze(0), dates.squeeze(0))
            val_loss = criterion(outputs, targets.squeeze(0))
            val_losses.append(val_loss.item())
    val_loss_avg = np.mean(val_losses)
    # Report the epoch mean so the train figure is comparable to the
    # validation mean rather than reflecting only the last batch.
    train_loss_avg = np.mean(train_losses) if train_losses else float("nan")
    print(f'Epoch {epoch+1}: Train Loss: {train_loss_avg:.4f}, Val Loss: {val_loss_avg:.4f}')


In [None]:
# Validation-only pass over val_loader, repeated for 100 iterations.
# NOTE(review): no optimiser step happens in this cell, so every iteration
# evaluates the same weights; it also prints `loss` left over from the
# training cell. Confirm whether this cell is still needed.
for epoch in tqdm(range(100)):
    model.eval()  # was indented 3 spaces, which made the cell an IndentationError
    val_losses = []
    with torch.no_grad():
        for features, dates, targets in val_loader:
            outputs = model(features.squeeze(0), dates.squeeze(0))
            # Squeeze targets like the inputs so shapes match for single-group batches.
            val_loss = criterion(outputs, targets.squeeze(0))
            val_losses.append(val_loss.item())
    val_loss_avg = np.mean(val_losses)
    print(f'Epoch {epoch+1}: Train Loss: {loss.item():.4f}, Val Loss: {val_loss_avg:.4f}')