In [31]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

In [32]:
# Load the match time series and report how much labelled data is available.
df = pd.read_csv("all_time_series.csv")

# Outer double quotes: reusing the same quote type inside an f-string
# expression is only legal from Python 3.12 onwards.
print(f"{df['match_id'].nunique()} matches")

# Rows with a non-null target are counted and converted with // 60; the
# "minutes" wording assumes one row per second of play -- TODO confirm.
print(f"{df['time_to_home_goal'].notna().sum() // 60} minutes of home goal data")
print(f"{df['time_to_away_goal'].notna().sum() // 60} minutes of away goal data")

345 matches
11059 minutes of home goal data
9254 minutes of away goal data


In [33]:
# Pre-process inputs: drop unlabelled rows, select feature columns, standardise them.
target = "time_to_home_goal"
print(f'dropped {df[target].isna().sum()} rows without target: {target}')
# Rebind rather than mutate in place, so other references to the frame are not
# changed behind their back.
df = df.dropna(subset=[target])

# Every column that is not a target, a timestamp or a grouping key is a model input.
non_feature_cols = {'time_to_home_goal', 'time_to_away_goal', 'event_time', 'match_id', 'period'}
features = [col for col in df.columns if col not in non_feature_cols]
print(f'{len(features)} features: {features}')


# Must be 0: StandardScaler keeps NaNs in its output, so a missing feature value
# would reach the model unnoticed. Fail here instead.
n_missing = int(df[features].isna().any(axis=1).sum())
print(f'{n_missing} rows with missing feature value')
if n_missing:
    raise ValueError(f'{n_missing} rows have missing feature values; impute or drop them before scaling')

# Normalize numerical features
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# happens later in the notebook, so test-set statistics influence the training
# features. Consider fitting on the training rows only.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

dropped 345344 rows without target: time_to_home_goal
14 features: ['cumulative_possession_away', 'cumulative_possession_home', 'avg_pass_length_home', 'avg_pass_duration_home', 'cumulative_num_passes_home', 'avg_pass_length_away', 'avg_pass_duration_away', 'cumulative_num_passes_away', 'cumulative_shots_attempted_home', 'cumulative_shots_attempted_away', 'location_x_home', 'location_y_home', 'location_x_away', 'location_y_away']
0 rows with missing feature value


In [34]:
# Split on whole (match_id, period) segments so that rows of one segment never
# end up on both sides of the train/test boundary.
segment_keys = ["match_id", "period"]
segments = df.loc[:, segment_keys].drop_duplicates()
print("Total segments:", len(segments))

train_segments, test_segments = train_test_split(
    segments,
    test_size=0.2,
    random_state=42,
)
print("Train:", len(train_segments), "Test:", len(test_segments))

# An inner join against each key subset keeps only the rows of that subset's segments.
train_df, test_df = (
    df.merge(subset, on=segment_keys, how="inner")
    for subset in (train_segments, test_segments)
)

Total segments: 386
Train: 308 Test: 78


In [35]:
def create_sequences(df, target, features, seq_length):
    """Build fixed-length sliding windows of features with one target per window.

    Rows are grouped by ``(match_id, period)`` and ordered by ``event_time``
    inside each group, so a window never spans two groups. Each window covers
    ``seq_length`` consecutive rows and is labelled with the target value of
    its last row.

    Args:
        df: DataFrame holding ``match_id``, ``period``, ``event_time``, the
            ``target`` column and every column named in ``features``.
        target: Name of the target column.
        features: List of feature column names.
        seq_length: Number of consecutive rows per window (must be >= 1).

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_windows, seq_length, len(features))`` and ``y`` has shape
        ``(n_windows,)``. When no group has at least ``seq_length`` rows both
        arrays are empty but keep those shapes.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    X_parts, y_parts = [], []
    for _, group in df.groupby(["match_id", "period"]):
        # Groups shorter than one window contribute nothing.
        if len(group) < seq_length:
            continue
        group = group.sort_values("event_time")
        feature_array = group[features].to_numpy()
        target_array = group[target].to_numpy()

        # sliding_window_view puts the window axis last: (n_windows, n_features, seq_length).
        # Swap the last two axes to get the (n_windows, seq_length, n_features) layout.
        windows = np.lib.stride_tricks.sliding_window_view(feature_array, seq_length, axis=0)
        X_parts.append(windows.transpose(0, 2, 1))
        # The label of window i is the target of its last row, i + seq_length - 1.
        y_parts.append(target_array[seq_length - 1:])

    if not X_parts:
        return np.empty((0, seq_length, len(features))), np.empty((0,))
    return np.concatenate(X_parts), np.concatenate(y_parts)

# Build 60-row windows for each split, then show the resulting array shapes
# (train X, train y, test X, test y -- one per line).
X_train, y_train = create_sequences(train_df, target, features, seq_length=60)
X_test, y_test = create_sequences(test_df, target, features, seq_length=60)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")

(500563, 60, 14)
(500563,)
(140269, 60, 14)
(140269,)


In [36]:
class SoccerDataset(Dataset):
    """In-memory dataset pairing samples with regression targets.

    Both inputs are copied into ``float32`` tensors at construction time.

    Args:
        X: Array-like of samples, indexed along its first axis.
        y: Array-like of targets with the same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # __len__ follows X only, so surplus targets would be silently ignored
        # and missing ones would only surface as an IndexError mid-epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split in a dataset, then batch it. Only the training loader shuffles.
train_ds, test_ds = SoccerDataset(X_train, y_train), SoccerDataset(X_test, y_test)

train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)


In [37]:
class GoalRNN(nn.Module):
    """LSTM regressor that maps a feature sequence to one scalar per sample.

    The LSTM output at the last timestep is fed through a small fully
    connected head. The layer order inside ``fc_layers`` is fixed so that
    state-dict keys stay stable across saved checkpoints.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied after the first dense layer of
            the head. Defaults to 0.3.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # regression output
        )

    def forward(self, x):
        """Return predictions of shape ``(batch, 1)`` for ``x`` of shape ``(batch, seq_len, input_dim)``."""
        # out: (batch, seq_len, hidden_dim) because the LSTM is batch_first.
        out, _ = self.lstm(x)
        out = out[:, -1, :]   # keep only the last timestep
        out = self.fc_layers(out)
        return out
    
# Seed torch's global RNG so weight initialisation -- and, when the cells are
# run in order, the DataLoader shuffle order -- is repeatable from run to run.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GoalRNN(input_dim=len(features), hidden_dim=64, num_layers=2).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [38]:
def train_model(model, train_loader, test_loader, epochs=20):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    Uses the module-level ``device``, ``criterion`` and ``optimizer`` defined
    earlier in the notebook. ``optimizer`` must have been built from
    ``model``'s parameters, otherwise the updates do not reach ``model``.

    Args:
        model: Network whose output has a trailing dimension of size 1.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        test_loader: Iterable of ``(X_batch, y_batch)`` evaluation batches.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, test_losses, model)``. Each list holds one
        sample-weighted mean loss per epoch.
    """
    train_losses, test_losses = [], []

    for epoch in range(epochs):
        # ---- TRAIN ----
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            # squeeze(-1) drops only the trailing output dim: (batch, 1) -> (batch,).
            # A bare squeeze() would also collapse a final batch of size 1 to a
            # 0-d tensor, which no longer matches y_batch's shape.
            y_pred = model(X_batch).squeeze(-1)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller last batch does not skew the mean.
            running_loss += loss.item() * X_batch.size(0)

        epoch_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)

        # ---- EVAL ----
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch).squeeze(-1)
                loss = criterion(y_pred, y_batch)
                running_loss += loss.item() * X_batch.size(0)

        epoch_test_loss = running_loss / len(test_loader.dataset)
        test_losses.append(epoch_test_loss)

        # simple logging per epoch
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {epoch_train_loss:.4f} | Test Loss: {epoch_test_loss:.4f}")

    return train_losses, test_losses, model


# Run the full training loop; the returned model is the same object, rebound for convenience.
train_losses, test_losses, model = train_model(
    model,
    train_loader,
    test_loader,
    epochs=20,
)


Epoch 1/20 | Train Loss: 268386.3128 | Test Loss: 474078.8880
Epoch 2/20 | Train Loss: 170434.0689 | Test Loss: 532184.3030
Epoch 3/20 | Train Loss: 106154.7185 | Test Loss: 564613.2242
Epoch 4/20 | Train Loss: 65658.0566 | Test Loss: 534455.4346
Epoch 5/20 | Train Loss: 48049.5779 | Test Loss: 547213.6632
Epoch 6/20 | Train Loss: 32286.1319 | Test Loss: 544684.7588
Epoch 7/20 | Train Loss: 24344.1837 | Test Loss: 556707.5353
Epoch 8/20 | Train Loss: 19785.1560 | Test Loss: 566941.5578
Epoch 9/20 | Train Loss: 17456.8563 | Test Loss: 563916.2226
Epoch 10/20 | Train Loss: 15102.8052 | Test Loss: 546099.7909
Epoch 11/20 | Train Loss: 14373.0420 | Test Loss: 555156.9070
Epoch 12/20 | Train Loss: 11746.2593 | Test Loss: 549604.1413
Epoch 13/20 | Train Loss: 10524.0190 | Test Loss: 564226.2588
Epoch 14/20 | Train Loss: 9306.5787 | Test Loss: 550000.4805
Epoch 15/20 | Train Loss: 10304.7629 | Test Loss: 560027.4071
Epoch 16/20 | Train Loss: 8387.6166 | Test Loss: 564177.8511
Epoch 17/20 | Tr

In [39]:
# Save
# Only the weights (state_dict) are written, so the architecture arguments
# must be repeated when the model is re-created for loading.
torch.save(model.state_dict(), 'model_state.pth')

# # Load
# model = GoalRNN(input_dim=len(features), hidden_dim=64, num_layers=2)  # re-instantiate model
# # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
# model.load_state_dict(torch.load('model_state.pth', map_location=device))
# model.eval()  # set to evaluation mode
