In [75]:
import json
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import Dataset, DataLoader


In [None]:
# initial model only running on pass events, uses each event as a input data point
'''
Future ideas:
1. Create an input per minute of the game with cumulative and recent data (e.g. total passes, average pass duration in last 5 minutes,
home:away pass ratio)
2. (done) Also include non-numerical data (event name, under pressure Boolean, outcome name, etc.). Will require label encoding
3. Include more types of events, not just passes. This will require careful treatment for missing testues (e.g. non-pass events will not have pass angle feature)
4. (done) We can classify all events as home or away. This allows us to combine data from multiple matches in the same competition.
'''

df_match = pd.read_csv('3754039_features.csv', index_col = 'index')
df_match = df_match[df_match['type.name']=='Pass']
cols = [#numeric values
        "event_time", "possession", "total_away_possession", "total_home_possession",
        "pass.length", "pass.angle", "duration", 'location_x', 'location_y', 'pass.end_location_x', 'pass.end_location_y', 'home_possession',
        #categorical values
        "position.name", "player.name", "pass.height.name",
        #targets
        "time_to_home_goal", #"time_to_away_goal"
        ]
df_match = df_match[cols]
df_match.head(10)

Unnamed: 0_level_0,event_time,possession,total_away_possession,total_home_possession,pass.length,pass.angle,duration,location_x,location_y,pass.end_location_x,pass.end_location_y,home_possession,position.name,player.name,pass.height.name,time_to_home_goal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4,0,2,0.0,0.0,1.216552,-1.405648,0.631389,61.0,40.1,61.2,38.9,False,Center Forward,Odion Jude Ighalo,Ground Pass,2694.0
7,1,2,2.0,0.0,18.36001,-2.893972,1.754082,60.8,38.9,43.0,34.4,False,Left Wing,Nordin Amrabat,Ground Pass,2693.0
10,3,2,9.0,0.0,76.760666,0.287963,5.445957,42.6,35.9,116.2,57.7,False,Center Defensive Midfield,Ben Watson,High Pass,2691.0
14,13,3,9.0,6.0,39.85097,1.121738,2.243769,12.9,28.8,30.2,64.7,True,Goalkeeper,Wayne Hennessey,Low Pass,2681.0
17,17,3,9.0,9.0,14.599315,0.116709,1.215215,35.4,66.9,49.9,68.6,True,Right Back,Joel Ward,Ground Pass,2677.0
25,23,3,9.0,15.0,19.86001,-1.037285,1.324835,57.6,55.5,67.7,38.4,True,Right Midfield,Wilfried Zaha,Ground Pass,2671.0
29,26,3,9.0,19.0,23.839882,-1.018866,2.689562,72.1,26.7,84.6,6.4,True,Left Center Midfield,Jordon Mutch,Ground Pass,2668.0
32,29,3,9.0,21.0,23.5,0.927295,0.88722,89.5,5.7,103.6,24.5,True,Left Back,Pape N''Diaye SouarÃ©,Low Pass,2665.0
33,30,4,10.0,21.0,12.163059,1.191817,2.226119,16.5,55.6,21.0,66.9,False,Center Defensive Midfield,Ben Watson,Low Pass,2664.0
37,33,4,13.0,21.0,10.807867,-1.959894,1.129465,19.1,66.0,15.0,56.0,False,Right Wing,Troy Deeney,Low Pass,2661.0


In [100]:
# Pre-process inputs and split dataset into train/test
target = "time_to_home_goal"
df_match.dropna(subset=[target],inplace=True)

categorical_cols = df_match.drop(columns=[target]).select_dtypes(include="object").columns.tolist()
numerical_cols = [col for col in df_match.columns if col not in categorical_cols + [target]]
print(f"{len(categorical_cols)} categorical columns: {categorical_cols}")
print(f"{len(numerical_cols)} numerical columns: {numerical_cols}")

#should print 0 (empty dataframe)
print(f'{len(df_match[df_match.isna().any(axis=1)])} rows with empty value')

# Normalize numerical features
scaler = StandardScaler()
df_match[numerical_cols] = scaler.fit_transform(df_match[numerical_cols])

# Use One-Hot encoding on categorical features
enc = OneHotEncoder(handle_unknown='error')
one_hot_array = enc.fit_transform(df_match[categorical_cols]).toarray()
one_hot_df = pd.DataFrame(one_hot_array, columns=enc.get_feature_names_out(categorical_cols), index=df_match.index)

#concat new one-hot encoding columns with original df, drop original categorical columns
df_match = pd.concat([df_match.drop(columns=categorical_cols), one_hot_df], axis=1)

X = df_match.drop(columns=target, inplace=False).values
y = df_match[target].values

# Split df into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

3 categorical columns: ['position.name', 'player.name', 'pass.height.name']
12 numerical columns: ['event_time', 'possession', 'total_away_possession', 'total_home_possession', 'pass.length', 'pass.angle', 'duration', 'location_x', 'location_y', 'pass.end_location_x', 'pass.end_location_y', 'home_possession']
0 rows with empty value


In [101]:
# Create dataset class for DataLoader, which efficiently handles batching, shuffling, and loading data in parallel
class SoccerDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)  # shape: (N, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = SoccerDataset(X_train, y_train)
val_ds = SoccerDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(val_ds, batch_size=32)

In [102]:
#Create a simple neural network with 5 hidden layers
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        return self.model(x)

model = SimpleNN(input_dim=X.shape[1])

In [103]:
#Train the model with Mean Square Error
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

#(increase epoch if not converging)
for epoch in range(100):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

Epoch 1, Loss: 2376842.1023
Epoch 2, Loss: 2424852.5909
Epoch 3, Loss: 2442551.9318
Epoch 4, Loss: 2453732.1932
Epoch 5, Loss: 2521884.5455
Epoch 6, Loss: 2443836.1364
Epoch 7, Loss: 2385426.1932
Epoch 8, Loss: 2365727.0568
Epoch 9, Loss: 2434993.6136
Epoch 10, Loss: 2467088.1250
Epoch 11, Loss: 2441308.5000
Epoch 12, Loss: 2309887.0227
Epoch 13, Loss: 2425791.3295
Epoch 14, Loss: 2244773.8068
Epoch 15, Loss: 2211543.8523
Epoch 16, Loss: 2071230.1818
Epoch 17, Loss: 2019738.0568
Epoch 18, Loss: 1887471.8295
Epoch 19, Loss: 1770040.0795
Epoch 20, Loss: 1605032.8693
Epoch 21, Loss: 1474408.9034
Epoch 22, Loss: 1452496.6023
Epoch 23, Loss: 1283093.5170
Epoch 24, Loss: 1135609.7500
Epoch 25, Loss: 986151.0795
Epoch 26, Loss: 853010.8011
Epoch 27, Loss: 706665.7869
Epoch 28, Loss: 621114.4062
Epoch 29, Loss: 505441.5881
Epoch 30, Loss: 416455.4176
Epoch 31, Loss: 326753.6335
Epoch 32, Loss: 257993.9119
Epoch 33, Loss: 197404.9822
Epoch 34, Loss: 159687.5739
Epoch 35, Loss: 126125.2273
Epoch

In [104]:
# Look at performance on test dataset
model.eval()
with torch.no_grad():
    test_preds = []
    test_targets = []
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)
        test_preds.append(preds)
        test_targets.append(batch_y)

    test_preds = torch.cat(test_preds).squeeze()
    test_targets = torch.cat(test_targets).squeeze()
    test_mse = nn.functional.mse_loss(test_preds, test_targets)
    print(f"Testing MSE: {test_mse.item():.4f}")

Testing MSE: 5626.9702
