---
### Imports

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

---
### Data Loading and prep

In [2]:
# Load the training data
raw_training_data = pd.read_pickle(r"data\training_data\training_data_v1.0.pkl")

In [3]:
#eliminate rows woth missing values in the policies
print("Number of rows before dropping missing values: ", len(raw_training_data))
nona_training_data = raw_training_data.dropna(subset=['C', 'E', 'G', 'S'])
print("Number of rows after dropping missing values: ", len(nona_training_data))

Number of rows before dropping missing values:  50644
Number of rows after dropping missing values:  48048


In [4]:
# Get the unique countries in the dataset
countries = nona_training_data.location.unique()
#Define the feature columns
feat_columns = ['C', 'E', 'G', 'S']
#Empty df to append the cleaned data country by country
nozero_training_data = pd.DataFrame()

# Here we remove the first rows for each country where all the features are zero
# This is done to remove the initial period where the policies were not yet implemented
for country in countries:
    temp = nona_training_data.loc[nona_training_data['location'] == country]
    temp = temp.reset_index(drop=True)
    first_non_zero_index = temp.loc[(temp[feat_columns] != 0).any(axis=1)].index[0]
    filtered_data = temp.iloc[first_non_zero_index:]
    nozero_training_data = pd.concat([nozero_training_data, filtered_data], ignore_index=True)

In [5]:
#Preparing data for the model
feat_policy_columns = ['C', 'E', 'G', 'S']
feat_totals = ['total_cases', 'total_deaths']
location = ['location']

target_cases = ['new_cases']
target_deaths = ['new_deaths']

model_data = nozero_training_data[location + feat_policy_columns + feat_totals + target_cases + target_deaths].copy()

scaler = MinMaxScaler()
model_data[feat_policy_columns] = scaler.fit_transform(model_data[feat_policy_columns])

In [6]:
model_data

Unnamed: 0,location,C,E,G,S,total_cases,total_deaths,new_cases,new_deaths
0,Algeria,0.039667,0.0,0.017701,0.0556,1,0,0.0,0.0
1,Algeria,0.039667,0.0,0.017701,0.0556,1,0,0.0,0.0
2,Algeria,0.039667,0.0,0.017701,0.0556,1,0,0.0,0.0
3,Algeria,0.039667,0.0,0.017701,0.0556,19,0,18.0,0.0
4,Algeria,0.039667,0.0,0.017701,0.0556,19,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
46886,United States,0.473000,0.0,0.404970,0.2872,99019493,1079976,0.0,0.0
46887,United States,0.473000,0.0,0.404970,0.2872,99019493,1079976,0.0,0.0
46888,United States,0.473000,0.0,0.404970,0.2872,99019493,1079976,0.0,0.0
46889,United States,0.473000,0.0,0.404970,0.2871,99019493,1079976,0.0,0.0


In [27]:
column_name = 'location'  # Replace with the desired column name
value_counts = nona_training_data[column_name].value_counts()

In [28]:
# Function to create sequences per country
def create_sequences_per_country(df, countries, feature_columns, seq_length):
    X_list, y_list = [], []
    
    for country in countries:
        country_data = df[df["location"] == country].drop(columns=["location"]).reset_index(drop=True)
        features = country_data[feature_columns].values
        target = country_data["new_cases"].values
        
        X, y = [], []
        for i in range(len(features) - seq_length):
            X.append(features[i : i + seq_length])
            y.append(target[i + seq_length])
        
        X_list.append(np.array(X))
        y_list.append(np.array(y))
    
    return np.vstack(X_list), np.hstack(y_list)

In [29]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Take the last time step output
        return out

In [43]:
class COVIDTimeSeriesDataset(Dataset):
    def __init__(self, df, countries, feature_columns, seq_length):
        """
        Args:
            df (pd.DataFrame): The full dataset containing all countries' data.
            countries (list): List of country names to include.
            feature_columns (list): List of feature column names.
            seq_length (int): Number of time steps per sequence.
        """
        # Generate sequences using the provided function
        X, y = create_sequences_per_country(df, countries, feature_columns, seq_length)

        # Convert to PyTorch tensors
        self.X = torch.tensor(X, dtype=torch.float32)  # Shape: (num_samples, seq_length, num_features)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(-1)  # Shape: (num_samples, 1)

    def __len__(self):
        """Return the number of samples"""
        return len(self.X)

    def __getitem__(self, idx):
        """Return one sample (X sequence and corresponding y)"""
        return self.X[idx], self.y[idx]
    def get_number_features(self):
        """Return the number of features"""
        return self.X.shape[2]


In [None]:
def create_dataloaders(dataset, train_split=0.8, batch_size=32):
    """Creates sequential DataLoaders for training and testing"""
    # countries = df["location"].unique().tolist()  # Get all unique countries

    # Create dataset
    # dataset = COVIDTimeSeriesDataset(df, countries, feature_columns, seq_length)

    # Split into train/test using time order
    train_size = int(len(dataset) * train_split)
    # Ensure that the split is sequential
    train_dataset = torch.utils.data.Subset(dataset, range(train_size))  # First 80%
    test_dataset = torch.utils.data.Subset(dataset, range(train_size, len(dataset)))  # Last 20%

    # DataLoader (no shuffle for sequential ordering)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [None]:
batch_size = 32

dataset = COVIDTimeSeriesDataset(df=model_data, countries=countries, 
                                 feature_columns=feat_policy_columns+feat_totals, seq_length=14)
train_loader, test_loader = create_dataloaders(dataset=dataset,batch_size=batch_size)

In [None]:
# Model initialization
input_size = dataset.get_number_features()  # Number of features
hidden_size = 32
num_layers = 1

model = RNNModel(input_size, hidden_size, num_layers)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [51]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)  # RNN output
        out = self.fc(out[:, -1, :])  # Take the last time step's output
        return out

In [52]:
# Model parameters
input_size = dataset.get_number_features()  # Number of features
hidden_size = 64
num_layers = 2
output_size = 1  # Predicting one value (e.g., next time step)

# Initialize model, loss function, and optimizer
model = RNNModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()  # For regression tasks
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()  # Reset gradients
        y_pred = model(X_batch)  # Forward pass
        loss = criterion(y_pred, y_batch)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

print("Training complete!")


Epoch [1/20], Loss: 2960119169.9462
Epoch [2/20], Loss: 2959835260.3413
Epoch [3/20], Loss: 2959552808.9649
Epoch [4/20], Loss: 2959271799.2958
Epoch [5/20], Loss: 2958992614.1894
Epoch [6/20], Loss: 2958714634.4956
Epoch [7/20], Loss: 2958439394.7562
Epoch [8/20], Loss: 2958166565.3967
Epoch [9/20], Loss: 2957908527.3801
Epoch [10/20], Loss: 2957638625.3972
Epoch [11/20], Loss: 2957364341.9657
Epoch [12/20], Loss: 2957092844.8316
Epoch [13/20], Loss: 2956828194.1578
Epoch [14/20], Loss: 2956567616.9306
Epoch [15/20], Loss: 2956916606.9582
Epoch [16/20], Loss: 2956124266.4580
Epoch [17/20], Loss: 2956024279.1218
Epoch [18/20], Loss: 2955591032.9083
Epoch [19/20], Loss: 2955802588.3032
Epoch [20/20], Loss: 2956741175.9724
Training complete!


In [None]:
model_data.shape

(46891, 9)

In [None]:
X.shape

torch.Size([46275, 14, 6])

---
### BDHFVHJSBJ

Using a target size (torch.Size([16])) that is different to the input size (torch.Size([16, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.

/usr/local/lib/python3.11/dist-packages/torch/nn/modules/loss.py:610: UserWarning:

Using a target size (torch.Size([10])) that is different to the input size (torch.Size([10, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


In [9]:
spain_data = model_data.loc[model_data.location == 'Spain']
spain_data = spain_data[['new_cases']].reset_index(drop=True)
import plotly.express as px

fig = px.line(spain_data, y='new_cases', title='New Cases in Spain Over Time')
fig.show()

In [10]:
scaler = MinMaxScaler()
spain_data["new_cases_scaled"] = scaler.fit_transform(spain_data[["new_cases"]])
fig = px.line(spain_data, y='new_cases_scaled', title='New Cases in Spain Over Time')
fig.show()