In [None]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [None]:
# Name the regression target, then load the event-level time series.
target = "time_to_home_goal"
df = pd.read_csv('all_time_series.csv')

# Rows without a target value cannot supervise training: report the count, then drop them.
print(f'dropped {df[target].isna().sum()} rows without target: {target}')
df.dropna(subset=[target], inplace=True)

# Every column that is not a target, a timestamp, or a segment key is used as a model input.
features = [
    col
    for col in df.columns
    if col not in ('time_to_home_goal', 'time_to_away_goal', 'event_time', 'match_id', 'period')
]

# Standardize the input columns.
# NOTE(review): the scaler is fit on every remaining row, i.e. before the train/test
# split performed further down, so held-out rows contribute to the scaling statistics.
# Confirm whether that leakage is acceptable for this experiment.
scaler = StandardScaler()
df[features] = scaler.fit(df[features]).transform(df[features])

In [None]:
# One row per (match, period) segment. Splitting on segments rather than on rows keeps
# every event of a segment on the same side of the train/test boundary.
segments = df[["match_id", "period"]].drop_duplicates()
print("Total segments:", len(segments))

train_segments, test_segments = train_test_split(
    segments,
    test_size=0.2,
    random_state=42,
)
print("Train:", len(train_segments), "Test:", len(test_segments))

# Join the event rows back onto each side's segment keys (default inner join).
train_df = pd.merge(df, train_segments, on=["match_id", "period"])
test_df = pd.merge(df, test_segments, on=["match_id", "period"])

In [None]:
def create_sequences(df, target, features, seq_length):
    """Build fixed-length sliding windows over each (match_id, period) segment.

    Rows are grouped by ``match_id`` and ``period`` and ordered by ``event_time``
    inside each group. Every run of ``seq_length`` consecutive rows becomes one
    sample; its label is the ``target`` value of the last row in the window.
    Groups with fewer than ``seq_length`` rows contribute no samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``match_id``, ``period``, ``event_time``, ``target`` and
        all ``features`` columns.
    target : str
        Name of the label column.
    features : list of str
        Names of the input columns, in the order they should appear on the
        last axis of ``X``.
    seq_length : int
        Number of rows per window; must be at least 1.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, seq_length, len(features))``.
    y : numpy.ndarray
        Shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        # With seq_length == 0 the label index (end - 1) would wrap around to the
        # last row of the group and silently yield wrong targets.
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    X, y = [], []
    for _, group in df.groupby(["match_id", "period"]):
        group = group.sort_values("event_time")
        feature_array = group[features].to_numpy()
        target_array = group[target].to_numpy()

        # `end` is the exclusive end of the window; the label is the row at end - 1.
        for end in range(seq_length, len(group) + 1):
            X.append(feature_array[end - seq_length:end])
            y.append(target_array[end - 1])

    if not X:
        # np.array([]) would have shape (0,); keep the documented rank so that
        # downstream tensor/model code still sees (N, seq_length, n_features).
        return np.empty((0, seq_length, len(features))), np.empty((0,))
    return np.array(X), np.array(y)

# Turn each split into windows of 60 rows, built independently for train and test.
X_train, y_train = create_sequences(df=train_df, target=target, features=features, seq_length=60)
X_test, y_test = create_sequences(df=test_df, target=target, features=features, seq_length=60)

In [None]:
class SoccerDataset(Dataset):
    """Map-style dataset pairing each input array with its target value.

    Both inputs are converted to ``float32`` tensors once, at construction time.
    """

    def __init__(self, X, y):
        # The generator is consumed left to right, so X is converted before y.
        self.X, self.y = (
            torch.tensor(values, dtype=torch.float32) for values in (X, y)
        )

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(inputs, target)`` pair stored at position ``idx``."""
        inputs = self.X[idx]
        label = self.y[idx]
        return inputs, label

# Wrap the windowed arrays as tensor datasets.
train_ds = SoccerDataset(X=X_train, y=y_train)
test_ds = SoccerDataset(X=X_test, y=y_test)

# Shuffle only the training batches; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)


In [None]:
class GoalRNN(nn.Module):
    """LSTM encoder followed by a small MLP head producing one value per sequence.

    Parameters
    ----------
    input_dim : int
        Number of features per timestep.
    hidden_dim : int, default 64
        Size of the LSTM hidden state.
    num_layers : int, default 2
        Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Regression head: hidden_dim -> 128 -> 64 -> 1, with dropout after the first layer.
        head = [
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_dim) to predictions of shape (batch, 1)."""
        sequence_out, _state = self.lstm(x)
        # Only the output at the final timestep is passed to the head.
        final_step = sequence_out[:, -1]
        return self.fc_layers(final_step)
    
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network and move it to the chosen device before creating the optimizer.
model = GoalRNN(input_dim=len(features), hidden_dim=64, num_layers=2)
model = model.to(device)

# Mean-squared-error objective, optimized with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
def _run_epoch(model, loader, loss_fn, run_device, opt=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``opt`` is given the model is put in training mode and updated after
    every batch; otherwise it is put in eval mode and gradients are disabled.
    """
    training = opt is not None
    model.train(training)

    total_loss, n_seen = 0.0, 0
    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)

            if training:
                opt.zero_grad()

            # Drop only the trailing size-1 output axis. A bare .squeeze() would also
            # remove the batch axis when a batch holds a single sample, leaving a 0-d
            # prediction whose shape no longer matches the (1,) target.
            y_pred = model(X_batch).squeeze(-1)
            loss = loss_fn(y_pred, y_batch)

            if training:
                loss.backward()
                opt.step()

            batch_size = X_batch.size(0)
            total_loss += loss.item() * batch_size
            n_seen += batch_size

    if n_seen == 0:
        raise ValueError("loader yielded no samples; cannot compute an epoch loss")
    # Normalize by the samples actually processed, which stays correct even if the
    # loader drops the last batch or iterates over a subset of its dataset.
    return total_loss / n_seen


def train_model(model, train_loader, test_loader, epochs=20,
                loss_fn=None, opt=None, run_device=None):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; its output is expected to have a trailing axis of size 1.
    train_loader, test_loader : iterable of (inputs, targets) batches
        Batches used for optimization and for per-epoch evaluation respectively.
    epochs : int, default 20
        Number of full passes over ``train_loader``.
    loss_fn : callable, optional
        Loss taking ``(predictions, targets)``. Defaults to the module-level ``criterion``.
    opt : torch.optim.Optimizer, optional
        Optimizer bound to ``model``'s parameters. Defaults to the module-level ``optimizer``.
    run_device : torch.device, optional
        Device batches are moved to. Defaults to the module-level ``device``.

    Returns
    -------
    train_losses : list of float
        Sample-weighted mean training loss for each epoch.
    test_losses : list of float
        Sample-weighted mean evaluation loss for each epoch.
    model : torch.nn.Module
        The same ``model`` object, left in eval mode.

    Raises
    ------
    ValueError
        If either loader yields no samples.
    """
    # Fall back to the notebook-level objects so existing four-argument calls
    # behave as before; the globals are only looked up when no override is given.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    run_device = device if run_device is None else run_device

    train_losses, test_losses = [], []

    for epoch in range(epochs):
        epoch_train_loss = _run_epoch(model, train_loader, loss_fn, run_device, opt=opt)
        train_losses.append(epoch_train_loss)

        epoch_test_loss = _run_epoch(model, test_loader, loss_fn, run_device)
        test_losses.append(epoch_test_loss)

        # simple logging per epoch
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {epoch_train_loss:.4f} | Test Loss: {epoch_test_loss:.4f}")

    return train_losses, test_losses, model


# Train for 20 epochs, keeping the per-epoch loss histories and rebinding `model`
# to the module returned by train_model.
train_losses, test_losses, model = train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    epochs=20,
)
