# 06. Pre-match Model Training

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import joblib

In [2]:
# Load the processed dataset
df = pd.read_csv('../data/processed/dataset_processed.csv')
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (380, 39)


Unnamed: 0,Round,Day,Venue,Result,Home_Goals,Away_Goals,Away_Team,Home_xG,Away_xG,Home_Poss,...,PK_scaled,PKatt_scaled,GD_scaled,xGD_scaled,Round_scaled,Away_Poss,Away_Shots,Away_SoT,Away_xG.1,Away_xGA
0,2,Sat,Home,H,1,0,Newcastle United,1.0,0.3,59.0,...,-0.351991,-0.37359,0.479899,0.486837,-1.595863,41.0,11.2,3.2,0.3,1.0
1,4,Sat,Home,H,5,1,Fulham,2.2,1.4,68.0,...,2.434604,2.279944,1.919596,0.556385,-1.413478,32.0,6.0,4.0,1.4,2.2
2,6,Sat,Home,H,2,0,Nott'ham Forest,1.3,1.0,57.0,...,-0.351991,-0.37359,0.959798,0.208644,-1.231094,43.0,5.6,3.2,1.0,1.3
3,9,Sat,Home,H,2,1,Brighton and Hove Albion,0.8,0.8,55.0,...,-0.351991,-0.37359,0.479899,0.0,-0.957518,45.0,8.0,4.0,0.8,0.8
4,11,Sat,Home,H,6,1,AFC Bournemouth,1.9,0.9,65.0,...,-0.351991,-0.37359,2.399495,0.695481,-0.775133,35.0,5.0,1.0,0.9,1.9


In [3]:
def create_prematch_features(df):
    # Initialize team statistics
    team_stats = {}
    all_teams = pd.concat([df['Home_Team'], df['Away_Team']]).unique()
    
    for team in all_teams:
        team_stats[team] = {
            'games': 0,
            'wins': 0, 
            'draws': 0,
            'losses': 0,
            'goals_for': [],
            'goals_against': [],
            'points': [],
            'home_games': 0,
            'away_games': 0,
            'home_wins': 0,
            'away_wins': 0
        }
    
    features_list = []
    
    for idx, row in df.iterrows():
        home_team = row['Home_Team']
        away_team = row['Away_Team']
        day = row['Day']
        round_num = row['Round']
        
        # Get historical stats before this match
        home_stats = team_stats[home_team]
        away_stats = team_stats[away_team]
        
        # Calculate features based on historical data
        # Win rates
        home_win_rate = home_stats['wins'] / max(1, home_stats['games'])
        away_win_rate = away_stats['wins'] / max(1, away_stats['games'])
        
        # Home/Away specific performance
        home_home_win_rate = home_stats['home_wins'] / max(1, home_stats['home_games'])
        away_away_win_rate = away_stats['away_wins'] / max(1, away_stats['away_games'])
        
        # Goal averages
        home_avg_goals = np.mean(home_stats['goals_for']) if home_stats['goals_for'] else 1.3
        away_avg_goals = np.mean(away_stats['goals_for']) if away_stats['goals_for'] else 1.3
        
        home_avg_conceded = np.mean(home_stats['goals_against']) if home_stats['goals_against'] else 1.3
        away_avg_conceded = np.mean(away_stats['goals_against']) if away_stats['goals_against'] else 1.3
        
        # Recent form (last 5 games)
        home_recent_points = sum(home_stats['points'][-5:]) if len(home_stats['points']) >= 5 else home_stats['games'] * 1.5
        away_recent_points = sum(away_stats['points'][-5:]) if len(away_stats['points']) >= 5 else away_stats['games'] * 1.5
        
        # Points per game
        home_ppg = sum(home_stats['points']) / max(1, home_stats['games'])
        away_ppg = sum(away_stats['points']) / max(1, away_stats['games'])
        
        features = {
            'home_team': home_team,
            'away_team': away_team,
            'day': day,
            'round': round_num,
            'is_weekend': 1 if day in ['Sat', 'Sun'] else 0,
            'home_win_rate': home_win_rate,
            'away_win_rate': away_win_rate,
            'home_home_win_rate': home_home_win_rate,
            'away_away_win_rate': away_away_win_rate,
            'home_avg_goals': home_avg_goals,
            'away_avg_goals': away_avg_goals,
            'home_avg_conceded': home_avg_conceded,
            'away_avg_conceded': away_avg_conceded,
            'home_recent_form': home_recent_points,
            'away_recent_form': away_recent_points,
            'home_ppg': home_ppg,
            'away_ppg': away_ppg,
            'games_played_diff': home_stats['games'] - away_stats['games'],
            'goal_diff_advantage': (home_avg_goals - home_avg_conceded) - (away_avg_goals - away_avg_conceded),
            'result': row['Result']  # Target variable
        }
        
        features_list.append(features)
        
        # Update team statistics after this match
        home_goals = row['Home_Goals']
        away_goals = row['Away_Goals']
        
        if row['Result'] == 'H':
            team_stats[home_team]['wins'] += 1
            team_stats[away_team]['losses'] += 1
            team_stats[home_team]['points'].append(3)
            team_stats[away_team]['points'].append(0)
            team_stats[home_team]['home_wins'] += 1
        elif row['Result'] == 'A':
            team_stats[away_team]['wins'] += 1
            team_stats[home_team]['losses'] += 1
            team_stats[away_team]['points'].append(3)
            team_stats[home_team]['points'].append(0)
            team_stats[away_team]['away_wins'] += 1
        else:  # Draw
            team_stats[home_team]['draws'] += 1
            team_stats[away_team]['draws'] += 1
            team_stats[home_team]['points'].append(1)
            team_stats[away_team]['points'].append(1)
            
        # Update goal statistics
        team_stats[home_team]['goals_for'].append(home_goals)
        team_stats[home_team]['goals_against'].append(away_goals)
        team_stats[away_team]['goals_for'].append(away_goals)
        team_stats[away_team]['goals_against'].append(home_goals)
        
        # Update game counts
        team_stats[home_team]['games'] += 1
        team_stats[away_team]['games'] += 1
        team_stats[home_team]['home_games'] += 1
        team_stats[away_team]['away_games'] += 1
    
    return pd.DataFrame(features_list)

print("Creating pre-match features...")
features_df = create_prematch_features(df)
print(f"Features created: {features_df.shape}")
features_df.head()

Creating pre-match features...
Features created: (380, 20)


Unnamed: 0,home_team,away_team,day,round,is_weekend,home_win_rate,away_win_rate,home_home_win_rate,away_away_win_rate,home_avg_goals,away_avg_goals,home_avg_conceded,away_avg_conceded,home_recent_form,away_recent_form,home_ppg,away_ppg,games_played_diff,goal_diff_advantage,result
0,Manchester City,Newcastle United,Sat,2,1,0.0,0.0,0.0,0.0,1.3,1.3,1.3,1.3,0.0,0.0,0.0,0.0,0,0.0,H
1,Manchester City,Fulham,Sat,4,1,1.0,0.0,1.0,0.0,1.0,1.3,0.0,1.3,1.5,0.0,3.0,0.0,1,1.0,H
2,Manchester City,Nott'ham Forest,Sat,6,1,1.0,0.0,1.0,0.0,3.0,1.3,0.5,1.3,3.0,0.0,3.0,0.0,2,2.5,H
3,Manchester City,Brighton and Hove Albion,Sat,9,1,1.0,0.0,1.0,0.0,2.666667,1.3,0.333333,1.3,4.5,0.0,3.0,0.0,3,2.333333,H
4,Manchester City,AFC Bournemouth,Sat,11,1,1.0,0.0,1.0,0.0,2.5,1.3,0.5,1.3,6.0,0.0,3.0,0.0,4,2.0,H


In [4]:
# Encode categorical variables
le_home = LabelEncoder()
le_away = LabelEncoder() 
le_day = LabelEncoder()
le_result = LabelEncoder()

features_df['home_team_encoded'] = le_home.fit_transform(features_df['home_team'])
features_df['away_team_encoded'] = le_away.fit_transform(features_df['away_team'])
features_df['day_encoded'] = le_day.fit_transform(features_df['day'])
features_df['result_encoded'] = le_result.fit_transform(features_df['result'])

print("Teams:", le_home.classes_)
print("Days:", le_day.classes_)
print("Results:", le_result.classes_)

Teams: ['AFC Bournemouth' 'Arsenal' 'AstonVilla' 'Brentford'
 'BrightonandHoveAlbion' 'Burnley' 'Chelsea' 'CrystalPalace' 'Everton'
 'Fulham' 'Liverpool' 'Luton Town' 'Manchester City' 'Manchester United'
 'NewcastleUnited' 'NottinghamForest' 'SheffieldUnited' 'TottenhamHotspur'
 'WestHamUnited' 'WolverhamptonWanderers']
Days: ['Fri' 'Mon' 'Sat' 'Sun' 'Thu' 'Tue' 'Wed']
Results: ['A' 'D' 'H']


In [5]:
# Select features for training
feature_columns = [
    'home_team_encoded', 'away_team_encoded', 'day_encoded', 'round',
    'is_weekend', 'home_win_rate', 'away_win_rate', 'home_home_win_rate',
    'away_away_win_rate', 'home_avg_goals', 'away_avg_goals',
    'home_avg_conceded', 'away_avg_conceded', 'home_recent_form', 
    'away_recent_form', 'home_ppg', 'away_ppg', 'games_played_diff',
    'goal_diff_advantage'
]

X = features_df[feature_columns]
y = features_df['result_encoded']

# Skip first 50 matches to have some historical data
X = X.iloc[50:]
y = y.iloc[50:]

print(f"Training data shape: {X.shape}")
print(f"Feature columns: {feature_columns}")

Training data shape: (330, 19)
Feature columns: ['home_team_encoded', 'away_team_encoded', 'day_encoded', 'round', 'is_weekend', 'home_win_rate', 'away_win_rate', 'home_home_win_rate', 'away_away_win_rate', 'home_avg_goals', 'away_avg_goals', 'home_avg_conceded', 'away_avg_conceded', 'home_recent_form', 'away_recent_form', 'home_ppg', 'away_ppg', 'games_played_diff', 'goal_diff_advantage']


In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Train set: (264, 19)
Test set: (66, 19)


In [7]:
# Train model
model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)

print("Training model...")
model.fit(X_train, y_train)
print("Training complete!")

Training model...
Training complete!


In [8]:
# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Pre-match Model Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le_result.classes_))

Pre-match Model Accuracy: 0.379

Classification Report:
              precision    recall  f1-score   support

           A       0.36      0.33      0.35        24
           D       0.00      0.00      0.00        15
           H       0.47      0.63      0.54        27

    accuracy                           0.38        66
   macro avg       0.28      0.32      0.30        66
weighted avg       0.33      0.38      0.35        66



In [9]:
# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.head(10))

Top 10 Most Important Features:
                feature  importance
18  goal_diff_advantage    0.095654
4            is_weekend    0.059268
15             home_ppg    0.055894
14     away_recent_form    0.055453
16             away_ppg    0.054534
12    away_avg_conceded    0.053510
7    home_home_win_rate    0.052582
5         home_win_rate    0.052501
6         away_win_rate    0.051715
8    away_away_win_rate    0.049772


In [11]:
# Save model and encoders
print("Saving model and encoders...")

joblib.dump(model, "../models/prematch_model.pkl")

encoders = {
    'home': le_home,
    'away': le_away, 
    'day': le_day,
    'result': le_result
}
joblib.dump(encoders, "../models/prematch_encoders.pkl")

# Also save feature columns for reference
joblib.dump(feature_columns, "../models/prematch_features.pkl")

print("Model and encoders saved successfully!")
print(f"Model accuracy: {accuracy:.3f}")

Saving model and encoders...
Model and encoders saved successfully!
Model accuracy: 0.379


In [12]:
# Test a sample prediction
sample_home = "Manchester City"
sample_away = "Arsenal"
sample_day = "Sat"

# Create sample features (using average values for unknown teams)
sample_features = np.array([[
    le_home.transform([sample_home])[0] if sample_home in le_home.classes_ else 0,
    le_away.transform([sample_away])[0] if sample_away in le_away.classes_ else 0,
    le_day.transform([sample_day])[0],
    20,  # round
    1,   # is_weekend
    0.5, 0.4,  # win rates
    0.6, 0.3,  # home win rates
    1.8, 1.2,  # avg goals
    1.0, 1.5,  # avg conceded
    8, 6,      # recent form
    1.8, 1.2,  # ppg
    0,         # games diff
    0.3        # goal diff advantage
]])

prediction = model.predict(sample_features)[0]
probabilities = model.predict_proba(sample_features)[0]

result = le_result.inverse_transform([prediction])[0]

print(f"\nSample Prediction:")
print(f"{sample_home} vs {sample_away} on {sample_day}")
print(f"Prediction: {result}")
print(f"Probabilities: Away={probabilities[0]:.3f}, Draw={probabilities[1]:.3f}, Home={probabilities[2]:.3f}")


Sample Prediction:
Manchester City vs Arsenal on Sat
Prediction: H
Probabilities: Away=0.267, Draw=0.287, Home=0.446
