prepping code

In [23]:
from itertools import count
import numpy as np
import pandas as pd
import psycopg2
import dotenv
import os
from matplotlib import animation
from matplotlib import pyplot as plt
from mplsoccer import Pitch
from scipy.interpolate import interp1d

dotenv.load_dotenv()

PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_USER = os.getenv("PG_USER")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DATABASE = os.getenv("PG_DB")

conn = psycopg2.connect(
    host=PG_HOST,
    database=PG_DATABASE,
    user=PG_USER,
    password=PG_PASSWORD,
    port=PG_PORT,
    sslmode="require",
)


matches_1 = """
SELECT  
    me.*, 
    et.name AS event_name,  
    et.description AS event_description  
FROM public.matchevents me
INNER JOIN public.eventtypes et USING(eventtype_id)
INNER JOIN public.matches m USING(match_id)
WHERE m.match_id = '5pcyhm34h5c948yji4oryevpw'
"""

df_matches = pd.read_sql_query(matches_1,conn)


# Convert timedelta-like format to seconds
df_matches["timestamp"] = df_matches["timestamp"].astype(str).str.extract(r'(\d+\.\d+)')[0].astype(float)
df_matches["end_timestamp"] = df_matches["end_timestamp"].astype(str).str.extract(r'(\d+\.\d+)')[0].astype(float)

# Check the result
print(df_matches[["timestamp", "end_timestamp"]].head())


df_matches.head()

   timestamp  end_timestamp
0      0.087          1.627
1      1.627          4.178
2      4.178            NaN
3      6.575            NaN
4      6.576            NaN


Unnamed: 0,match_id,event_id,eventtype_id,result,success,period_id,timestamp,end_timestamp,ball_state,ball_owning_team,team_id,player_id,x,y,end_coordinates_x,end_coordinates_y,receiver_player_id,event_name,event_description
0,5pcyhm34h5c948yji4oryevpw,2704102295,e319ac55-ffaf-4e6d-87f7-7601d91bcd33,COMPLETE,True,1,0.087,1.627,alive,cyrrlv6l1onld5x247w1q1jlr,cyrrlv6l1onld5x247w1q1jlr,6g9x1y7xmjzwhk646v1iyuup5,50.0,50.0,37.0,49.3,cfxzvlgvt9jj17qxq41t9sxcl,PASS,StatsPerform/Opta event type: PASS
1,5pcyhm34h5c948yji4oryevpw,2704102881,e319ac55-ffaf-4e6d-87f7-7601d91bcd33,COMPLETE,True,1,1.627,4.178,alive,cyrrlv6l1onld5x247w1q1jlr,cyrrlv6l1onld5x247w1q1jlr,cfxzvlgvt9jj17qxq41t9sxcl,39.1,48.0,32.3,38.9,2ky2kn7gpjorkyg9zyg68pk6i,PASS,StatsPerform/Opta event type: PASS
2,5pcyhm34h5c948yji4oryevpw,2704102909,e319ac55-ffaf-4e6d-87f7-7601d91bcd33,INCOMPLETE,False,1,4.178,,alive,cyrrlv6l1onld5x247w1q1jlr,cyrrlv6l1onld5x247w1q1jlr,2ky2kn7gpjorkyg9zyg68pk6i,36.3,36.4,69.7,16.7,,PASS,StatsPerform/Opta event type: PASS
3,5pcyhm34h5c948yji4oryevpw,2704103595,2464af85-9671-4c4c-90ae-100af0a5e2ee,WON,True,1,6.575,,alive,cyrrlv6l1onld5x247w1q1jlr,bw9wm8pqfzcchumhiwdt2w15c,9fvwcsajeousbo5o0e84cj2sp,22.0,91.6,,,,DUEL,StatsPerform/Opta event type: DUEL
4,5pcyhm34h5c948yji4oryevpw,2704103597,2464af85-9671-4c4c-90ae-100af0a5e2ee,LOST,False,1,6.576,,alive,cyrrlv6l1onld5x247w1q1jlr,cyrrlv6l1onld5x247w1q1jlr,8ozlo6qrdavr86guhclwvh84q,78.0,8.4,,,,DUEL,StatsPerform/Opta event type: DUEL


testing code

In [24]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader

# Load tokenizer (Using BERT tokenizer for event names/descriptions)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Convert event names to tokenized inputs
df_matches["event_tokens"] = df_matches["event_name"].apply(lambda x: tokenizer(x, padding="max_length", truncation=True, return_tensors="pt")["input_ids"])

# Label Encoding for categorical variables
event_encoder = LabelEncoder()
df_matches["event_label"] = event_encoder.fit_transform(df_matches["event_name"])

# Normalize timestamps
scaler = StandardScaler()
df_matches[["timestamp", "end_timestamp"]] = scaler.fit_transform(df_matches[["timestamp", "end_timestamp"]])

# Show processed data
print(df_matches[["event_tokens", "event_label", "timestamp"]].head())


                                        event_tokens  event_label  timestamp
0  [[tensor(101), tensor(3413), tensor(102), tens...           21  -1.676841
1  [[tensor(101), tensor(3413), tensor(102), tens...           21  -1.590584
2  [[tensor(101), tensor(3413), tensor(102), tens...           21  -1.447699
3  [[tensor(101), tensor(14216), tensor(102), ten...            3  -1.313441
4  [[tensor(101), tensor(14216), tensor(102), ten...            3  -1.313385


In [25]:
class MatchDataset(Dataset):
    def __init__(self, df):
        self.events = df["event_tokens"].values
        self.timestamps = df[["timestamp", "end_timestamp"]].values
        self.labels = df["event_label"].values  # Event type classification

    def __len__(self):
        return len(self.events)

    def __getitem__(self, idx):
        return {
            "events": self.events[idx].squeeze(0),  # Tokenized text
            "timestamps": torch.tensor(self.timestamps[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create dataset and dataloader
dataset = MatchDataset(df_matches)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Check sample batch
sample_batch = next(iter(dataloader))
print(sample_batch)


{'events': tensor([[  101, 12391,  1024,  ...,     0,     0,     0],
        [  101,  3413,   102,  ...,     0,     0,     0],
        [  101,  3413,   102,  ...,     0,     0,     0],
        ...,
        [  101,  3413,   102,  ...,     0,     0,     0],
        [  101, 14216,   102,  ...,     0,     0,     0],
        [  101, 12391,  1024,  ...,     0,     0,     0]]), 'timestamps': tensor([[ 1.4978,     nan],
        [ 0.3323,  0.4839],
        [-0.5543,     nan],
        [ 1.0559,     nan],
        [ 1.2265,     nan],
        [-0.9098, -0.7552],
        [-0.7752,     nan],
        [ 1.3769,     nan]]), 'labels': tensor([ 9, 21, 21,  9,  8, 21,  3,  6])}


In [26]:
import torch.nn as nn

class MatchEventTransformer(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size + 2, num_classes)  # +2 for timestamp features
        self.relu = nn.ReLU()

    def forward(self, events, timestamps):
        bert_output = self.bert(events).pooler_output  # Get last hidden state
        x = torch.cat((bert_output, timestamps), dim=1)  # Concatenate timestamps
        x = self.relu(self.fc(x))
        return x

# Define model
num_classes = len(event_encoder.classes_)
model = MatchEventTransformer(num_classes)

# Check model architecture
print(model)


MatchEventTransformer(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [28]:
import torch.optim as optim

# Loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        
        outputs = model(batch["events"], batch["timestamps"])
        loss = criterion(outputs, batch["labels"])
        
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")


KeyboardInterrupt: 

In [None]:
def predict_event(model, input_text, timestamp):
    tokens = tokenizer(input_text, padding="max_length", truncation=True, return_tensors="pt")["input_ids"]
    timestamp = torch.tensor(timestamp, dtype=torch.float).unsqueeze(0)
    
    with torch.no_grad():
        output = model(tokens, timestamp)
        predicted_label = torch.argmax(output, dim=1).item()

    return event_encoder.inverse_transform([predicted_label])[0]

# Example Prediction
example_event = "Pass"
example_time = [50.0, 55.0]  # Example timestamp (normalized)
predicted_event = predict_event(model, example_event, example_time)
print(f"Predicted event: {predicted_event}")


In [None]:
from mplsoccer import Pitch
import matplotlib.pyplot as plt

# Create pitch
pitch = Pitch(pitch_type='statsbomb', line_color='black')

# Filter events with valid x, y positions
df_events = df_matches.dropna(subset=['x', 'y'])

fig, ax = pitch.draw(figsize=(10, 6))

# Plot event locations
scatter = pitch.scatter(df_events['x'], df_events['y'], ax=ax, alpha=0.7, s=50, label="Event Locations")

plt.title("🔥 Heatmap of Event Locations in Match")
plt.legend()
plt.show()


In [None]:
import seaborn as sns

plt.figure(figsize=(12, 6))
sns.countplot(y=df_matches["event_name"], order=df_matches["event_name"].value_counts().index)
plt.xlabel("Number of Events")
plt.ylabel("Event Type")
plt.title("📊 Event Frequency in Match")
plt.show()
