In [11]:
import pandas as pd
import joblib

# Load the previously trained classifier from disk (the file name suggests a
# random forest — confirm against the training notebook).
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# model files that come from a trusted source.
model = joblib.load("../../models/random_forest_model.pkl")

In [12]:
# Read the 2025-26 season match file. The column names used below (FTHG, HST,
# B365H, ...) look like a football-data.co.uk export — TODO confirm the source.
future_matches = pd.read_csv('../../data/pl25-26.csv')

In [13]:
# Keep only the columns the rest of the notebook uses, in their original order:
# fixture identity, full-time score/result, per-side match stats, B365 odds.
fixture_cols = ["Date", "HomeTeam", "AwayTeam"]
full_time_cols = ["FTHG", "FTAG", "FTR"]
match_stat_cols = [
    "HS", "AS", "HST", "AST",
    "HF", "AF", "HC", "AC",
    "HY", "AY", "HR", "AR",
]
odds_cols = ["B365H", "B365D", "B365A"]

future_matches = future_matches[
    fixture_cols + full_time_cols + match_stat_cols + odds_cols
]

In [14]:
# Translate the source file's abbreviated headers into descriptive snake_case
# names. Headers that are not present in the frame (the half-time ones are not
# selected above) are ignored by DataFrame.rename.
fixture_names = {
    "Date": "date",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
}
full_time_names = {
    "FTHG": "home_goals",
    "FTAG": "away_goals",
    "FTR": "result",
}
half_time_names = {
    "HTHG": "home_ht_goals",
    "HTAG": "away_ht_goals",
    "HTR": "ht_result",
}
match_stat_names = {
    "HS": "home_shots",
    "AS": "away_shots",
    "HST": "home_sot",
    "AST": "away_sot",
    "HF": "home_fouls",
    "AF": "away_fouls",
    "HC": "home_corners",
    "AC": "away_corners",
    "HY": "home_yellow",
    "AY": "away_yellow",
    "HR": "home_red",
    "AR": "away_red",
}
odds_names = {
    "B365H": "odds_home_win",
    "B365D": "odds_draw",
    "B365A": "odds_away_win",
}

rename_map = {
    **fixture_names,
    **full_time_names,
    **half_time_names,
    **match_stat_names,
    **odds_names,
}
future_matches = future_matches.rename(columns=rename_map)

In [15]:
# Load the expected-goals table; later cells join it onto the fixtures by
# date, home_team and away_team.
future_matchesXG = pd.read_csv('../../data/pl_all_seasonsXG.csv')

In [16]:
# The xG file spells some clubs differently from the fixtures table; map them
# onto the fixtures spelling so the later merge on team names can match.
team_name_map = {
    "Newcastle Utd": "Newcastle",
    "Manchester Utd": "Man United",
    "Manchester City": "Man City",
    "Leicester City": "Leicester",
    "Leeds United": "Leeds",
    "Ipswich Town": "Ipswich",
    "Luton Town": "Luton",
    "Norwich City": "Norwich",
    "Sheffield Utd": "Sheffield United",
    "Nott'ham Forest": "Nott'm Forest",
}

for side_col in ('home_team', 'away_team'):
    future_matchesXG[side_col] = future_matchesXG[side_col].replace(team_name_map)

In [17]:
# NOTE(review): the fixtures file appears to store dates day-first (DD/MM/YYYY);
# parsing this column without saying so emitted a pandas warning. Being explicit
# silences it and prevents day/month swaps on ambiguous dates such as 01/09 —
# confirm the format against the CSV.
future_matches['date'] = pd.to_datetime(future_matches['date'], dayfirst=True)
# The xG file's dates parsed without a warning, so default parsing is kept.
future_matchesXG['date'] = pd.to_datetime(future_matchesXG['date'])

  future_matches['date'] = pd.to_datetime(future_matches['date'])


In [18]:
# Left join: every fixture row is kept; fixtures with no row in the xG table
# for the same (date, home_team, away_team) get NaN in the xG-side columns.
merged = future_matches.merge(future_matchesXG, on=['date', 'home_team', 'away_team'], how='left')

In [19]:
# Inspect columns, dtypes and non-null counts of the merged frame (a quick way
# to see whether the xG columns were matched for every fixture).
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           130 non-null    datetime64[ns]
 1   home_team      130 non-null    object        
 2   away_team      130 non-null    object        
 3   home_goals     130 non-null    int64         
 4   away_goals     130 non-null    int64         
 5   result         130 non-null    object        
 6   home_shots     130 non-null    int64         
 7   away_shots     130 non-null    int64         
 8   home_sot       130 non-null    int64         
 9   away_sot       130 non-null    int64         
 10  home_fouls     130 non-null    int64         
 11  away_fouls     130 non-null    int64         
 12  home_corners   130 non-null    int64         
 13  away_corners   130 non-null    int64         
 14  home_yellow    130 non-null    int64         
 15  away_yellow    130 non-

In [20]:
import pandas as pd

def add_rolling_team_stats(df, window=5):
    """Add per-team rolling averages of match statistics for both sides.

    Each match is viewed once from the home team's perspective and once from
    the away team's perspective; the two views are stacked into one
    team-by-date table, and for every statistic the mean over that team's
    previous ``window`` matches is computed (the current match is excluded via
    ``shift(1)``). The averages are merged back as ``home_<stat>_rolling`` and
    ``away_<stat>_rolling`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match with ``date``, ``home_team``, ``away_team`` and the
        home/away columns named in ``stat_map``. The xG pair may be named
        either ``home_xG``/``away_xG`` or ``xG``/``xG.1`` (home first).
    window : int, default 5
        Number of previous matches averaged (fewer are used while a team has
        less history, because ``min_periods=1``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by date with a fresh RangeIndex, the rolling
        columns appended, and every NaN in the frame replaced by 0.

    Raises
    ------
    KeyError
        If a required source column (or its alias) is missing from ``df``.
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    # The merges below rebuild a RangeIndex anyway; resetting here keeps the
    # per-side views index-aligned even if the caller's index has duplicates.
    df = df.sort_values("date").reset_index(drop=True)

    # stat name -> (column when the team is at home, column when it is away)
    stat_map = {
        "goals_scored": ("home_goals", "away_goals"),
        "goals_conceded": ("away_goals", "home_goals"),
        "shots": ("home_shots", "away_shots"),
        "shots_on_target": ("home_sot", "away_sot"),
        "fouls": ("home_fouls", "away_fouls"),
        "corners": ("home_corners", "away_corners"),
        "yellow_cards": ("home_yellow", "away_yellow"),
        "red_cards": ("home_red", "away_red"),
        "xG": ("home_xG", "away_xG"),
        "xG.1": ("away_xG", "home_xG"),
    }

    # The merged xG data in this notebook carries the home/away xG columns as
    # "xG"/"xG.1". Resolving them explicitly ensures the away-side view really
    # swaps the pair; a rename-based pivot left the away view unswapped, so the
    # away team's rolling xG was built from the home team's numbers.
    column_aliases = {"home_xG": "xG", "away_xG": "xG.1"}

    def _source_column(name):
        """Return the column of df that holds `name`, honouring the aliases."""
        if name in df.columns:
            return name
        alias = column_aliases.get(name)
        if alias is not None and alias in df.columns:
            return alias
        raise KeyError(f"add_rolling_team_stats: required column '{name}' not found")

    def _team_view(team_col, side):
        """Build one (date, team, stats...) view; side 0 = home, 1 = away."""
        data = {"date": df["date"], "team": df[team_col]}
        for stat, sources in stat_map.items():
            data[stat] = df[_source_column(sources[side])]
        return pd.DataFrame(data)

    # Build long format: one row per team per match
    long_df = pd.concat(
        [_team_view("home_team", 0), _team_view("away_team", 1)],
        ignore_index=True,
    )
    long_df = long_df.sort_values(["team", "date"]).reset_index(drop=True)

    # Rolling mean of the previous `window` matches; shift(1) keeps the
    # current match out of its own feature.
    for stat in stat_map:
        long_df[f"{stat}_rolling_avg"] = (
            long_df.groupby("team")[stat]
                   .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean())
        )

    # Merge the per-team averages back, once for each side of the fixture
    for side in ("home", "away"):
        side_cols = {f"{stat}_rolling_avg": f"{side}_{stat}_rolling"
                     for stat in stat_map}
        df = df.merge(
            long_df[["date", "team"] + list(side_cols)],
            left_on=["date", f"{side}_team"],
            right_on=["date", "team"],
            how="left"
        ).drop(columns=["team"])
        df = df.rename(columns=side_cols)

    # NOTE(review): this fills every NaN in the frame (including unmatched
    # xG values), not only the rolling columns; kept as-is because downstream
    # cells were built on that behaviour.
    df = df.fillna(0)

    return df


In [21]:
# Append home_*/away_* rolling averages over each team's previous 5 matches
# (the current match is excluded from its own average).
match_stats = add_rolling_team_stats(merged, window=5)

In [22]:
# Check that the rolling-average columns were added and contain no nulls.
match_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 44 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          130 non-null    datetime64[ns]
 1   home_team                     130 non-null    object        
 2   away_team                     130 non-null    object        
 3   home_goals                    130 non-null    int64         
 4   away_goals                    130 non-null    int64         
 5   result                        130 non-null    object        
 6   home_shots                    130 non-null    int64         
 7   away_shots                    130 non-null    int64         
 8   home_sot                      130 non-null    int64         
 9   away_sot                      130 non-null    int64         
 10  home_fouls                    130 non-null    int64         
 11  away_fouls                    13

In [23]:
import numpy as np
import pandas as pd

def expected_result(elo_a, elo_b):
    """Return the Elo expected score of side A against side B.

    Uses the logistic Elo curve on a 400-point scale: equal ratings give 0.5,
    and the value rises towards 1 as ``elo_a`` exceeds ``elo_b``.
    """
    rating_gap = elo_b - elo_a
    return 1 / (1 + 10 ** (rating_gap / 400))

def update_elo(elo_a, elo_b, score_a, score_b, k=20):
    """Return both sides' Elo ratings after one match.

    Parameters
    ----------
    elo_a, elo_b : float
        Ratings before the match (A is the home side in this notebook).
    score_a, score_b : number
        Goals scored by A and B; only their ordering matters.
    k : float, default 20
        Update step size (K-factor).

    Returns
    -------
    tuple of float
        ``(new_elo_a, new_elo_b)``.
    """
    exp_a = expected_result(elo_a, elo_b)
    exp_b = 1 - exp_a

    # Outcome from A's point of view: 1 win, 0 loss, 0.5 otherwise (draw).
    if score_a > score_b:
        outcome_a = 1
    elif score_a < score_b:
        outcome_a = 0
    else:
        outcome_a = 0.5
    outcome_b = 1 - outcome_a

    return (
        elo_a + k * (outcome_a - exp_a),
        elo_b + k * (outcome_b - exp_b),
    )

def add_elo_features(df):
    """Add each side's pre-match Elo rating to every fixture.

    Matches are processed in date order. Every team starts at 1500 the first
    time it appears; the rating recorded for a match is the one held *before*
    that match, and ratings are then updated from the full-time score via
    ``update_elo``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date``, ``home_team``, ``away_team``, ``home_goals`` and
        ``away_goals`` columns.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``df`` (original index labels kept) with float
        columns ``home_elo_before`` and ``away_elo_before``.
    """
    df = df.sort_values("date").copy()

    team_elos = {}
    base_elo = 1500

    # Collect the ratings positionally and assign them in one go. Writing with
    # df.at[label, ...] inside the loop addresses rows by index label, which
    # overwrites several rows at once when labels repeat (e.g. after
    # concatenating seasons without resetting the index).
    home_before = []
    away_before = []

    fixtures = zip(df["home_team"], df["away_team"],
                   df["home_goals"], df["away_goals"])
    for home, away, home_goals, away_goals in fixtures:
        # If new team, initialize
        home_elo = team_elos.setdefault(home, base_elo)
        away_elo = team_elos.setdefault(away, base_elo)

        # Record ELO before match
        home_before.append(float(home_elo))
        away_before.append(float(away_elo))

        # Update after match
        new_home, new_away = update_elo(home_elo, away_elo, home_goals, away_goals)
        team_elos[home] = new_home
        team_elos[away] = new_away

    # Explicit float64 keeps the dtype stable even for an empty frame.
    df["home_elo_before"] = pd.Series(home_before, dtype="float64").to_numpy()
    df["away_elo_before"] = pd.Series(away_before, dtype="float64").to_numpy()

    return df


In [24]:
# Append each side's pre-match Elo rating (teams start at 1500 and are updated
# match by match in date order).
match_stats = add_elo_features(match_stats)

In [25]:
def add_bookmaker_features(df):
    """Convert decimal odds into normalised implied probabilities.

    For each outcome the raw implied probability is ``1 / odds``; the three
    values are then divided by their row sum so they add up to 1 (removing the
    bookmaker's overround).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``odds_home_win``, ``odds_draw`` and ``odds_away_win`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``prob_home``, ``prob_draw`` and ``prob_away``.
    """
    out = df.copy()

    odds_to_prob = {
        "odds_home_win": "prob_home",
        "odds_draw": "prob_draw",
        "odds_away_win": "prob_away",
    }

    # Raw implied probabilities
    for odds_col, prob_col in odds_to_prob.items():
        out[prob_col] = 1 / out[odds_col]

    # Row-wise sum exceeds 1 by the bookmaker margin; rescale to sum to 1
    overround = out[list(odds_to_prob.values())].sum(axis=1)
    for prob_col in odds_to_prob.values():
        out[prob_col] = out[prob_col] / overround

    return out

In [26]:
# Append prob_home / prob_draw / prob_away: implied probabilities from the
# odds columns, normalised to sum to 1 per match.
match_stats = add_bookmaker_features(match_stats)

In [27]:
match_stats = match_stats.sort_values('date')

# Mean of up to the 5 previous values in a group; shift() keeps the current
# match out of its own feature, so a team's first row in a group is NaN.
prior_five_mean = lambda s: s.shift().rolling(5, min_periods=1).mean()

# Overwrites the home_xG_rolling / away_xG_rolling columns produced by
# add_rolling_team_stats: here the home value only looks at the team's earlier
# home matches ('xG') and the away value at its earlier away matches ('xG.1').
for side, xg_col in (('home', 'xG'), ('away', 'xG.1')):
    match_stats[f'{side}_xG_rolling'] = (
        match_stats.groupby(f'{side}_team')[xg_col].transform(prior_five_mean)
    )

# Home-minus-away difference features, added in this order.
diff_features = {
    'xG_diff_rolling': ('home_xG_rolling', 'away_xG_rolling'),
    'elo_diff': ('home_elo_before', 'away_elo_before'),
    'goals_diff_rolling': ('home_goals_scored_rolling', 'away_goals_scored_rolling'),
    'conceded_diff_rolling': ('home_goals_conceded_rolling', 'away_goals_conceded_rolling'),
}
for diff_col, (home_col, away_col) in diff_features.items():
    match_stats[diff_col] = match_stats[home_col] - match_stats[away_col]

In [28]:
# Assemble the table handed to the model. Column order is significant: after
# predict_match drops the identifier/target columns, the remaining columns are
# passed to the model in exactly this order.
id_cols = ['date', 'home_team', 'away_team']
odds_cols = ['odds_home_win', 'odds_draw', 'odds_away_win']
rolling_stats = [
    'goals_scored', 'goals_conceded', 'shots', 'shots_on_target',
    'fouls', 'corners', 'yellow_cards',
]
# All home rolling columns first, then all away rolling columns
rolling_cols = [
    f'{side}_{stat}_rolling'
    for side in ('home', 'away')
    for stat in rolling_stats
]
elo_cols = ['home_elo_before', 'away_elo_before']
prob_cols = ['prob_home', 'prob_draw', 'prob_away']
derived_cols = [
    'home_xG_rolling', 'away_xG_rolling', 'xG_diff_rolling', 'elo_diff',
    'goals_diff_rolling', 'conceded_diff_rolling',
]

predict_matches = match_stats[
    id_cols + odds_cols + rolling_cols + elo_cols + prob_cols + derived_cols + ['result']
]

In [29]:
# Verify the final table's columns, dtypes and non-null counts before predicting.
predict_matches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 130 entries, 0 to 129
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          130 non-null    datetime64[ns]
 1   home_team                     130 non-null    object        
 2   away_team                     130 non-null    object        
 3   odds_home_win                 130 non-null    float64       
 4   odds_draw                     130 non-null    float64       
 5   odds_away_win                 130 non-null    float64       
 6   home_goals_scored_rolling     130 non-null    float64       
 7   home_goals_conceded_rolling   130 non-null    float64       
 8   home_shots_rolling            130 non-null    float64       
 9   home_shots_on_target_rolling  130 non-null    float64       
 10  home_fouls_rolling            130 non-null    float64       
 11  home_corners_rolling          130 non

In [30]:
# Preview the first ten fixtures of the final table.
predict_matches.head(10)

Unnamed: 0,date,home_team,away_team,odds_home_win,odds_draw,odds_away_win,home_goals_scored_rolling,home_goals_conceded_rolling,home_shots_rolling,home_shots_on_target_rolling,...,prob_home,prob_draw,prob_away,home_xG_rolling,away_xG_rolling,xG_diff_rolling,elo_diff,goals_diff_rolling,conceded_diff_rolling,result
0,2025-08-15,Liverpool,Bournemouth,1.3,6.0,8.5,0.0,0.0,0.0,0.0,...,0.730136,0.158196,0.111668,,,,0.0,0.0,0.0,H
1,2025-08-16,Aston Villa,Newcastle,2.25,3.5,2.9,0.0,0.0,0.0,0.0,...,0.413442,0.265784,0.320774,,,,0.0,0.0,0.0,D
2,2025-08-16,Brighton,Fulham,1.91,3.6,4.0,0.0,0.0,0.0,0.0,...,0.497994,0.264214,0.237792,,,,0.0,0.0,0.0,D
3,2025-08-16,Sunderland,West Ham,3.25,3.4,2.25,0.0,0.0,0.0,0.0,...,0.294089,0.281115,0.424796,,,,0.0,0.0,0.0,H
4,2025-08-16,Tottenham,Burnley,1.38,4.75,8.5,0.0,0.0,0.0,0.0,...,0.688288,0.199966,0.111746,,,,0.0,0.0,0.0,H
5,2025-08-16,Wolves,Man City,6.25,4.75,1.45,0.0,0.0,0.0,0.0,...,0.150918,0.198576,0.650507,,,,0.0,0.0,0.0,A
8,2025-08-17,Chelsea,Crystal Palace,1.62,4.33,4.5,0.0,0.0,0.0,0.0,...,0.576657,0.215747,0.207596,,,,0.0,0.0,0.0,D
6,2025-08-17,Nott'm Forest,Brentford,2.2,3.3,3.5,0.0,0.0,0.0,0.0,...,0.435685,0.290456,0.273859,,,,0.0,0.0,0.0,H
7,2025-08-17,Man United,Arsenal,3.6,3.6,1.95,0.0,0.0,0.0,0.0,...,0.26,0.26,0.48,,,,0.0,0.0,0.0,A
9,2025-08-18,Leeds,Everton,2.38,3.3,3.0,0.0,0.0,0.0,0.0,...,0.397686,0.286816,0.315498,,,,0.0,0.0,0.0,H


In [31]:
def predict_match(model, match_row):
    """Print and return the model's outcome probabilities for one fixture.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    match_row : pandas.DataFrame
        Single-row frame holding ``home_team``, ``away_team`` and the model's
        feature columns (in training order). ``date`` and ``result`` are
        dropped when present, so fixtures that have not been played yet (no
        ``result`` column) can be scored too. If several rows are passed, only
        the first one is reported.

    Returns
    -------
    numpy.ndarray
        Class probabilities for the first row, in the model's class order.

    Raises
    ------
    IndexError
        If ``match_row`` is empty (no fixture matched the caller's filter).
    """
    if len(match_row) == 0:
        # Same exception type as indexing an empty selection, but with a
        # message that points at the actual cause.
        raise IndexError("predict_match received an empty selection: no fixture matched")

    home = match_row['home_team'].values[0]
    away = match_row['away_team'].values[0]

    # Identifier/target columns are not model inputs; ignore the ones absent.
    non_feature_cols = ['home_team', 'away_team', 'result', 'date']
    features = match_row.drop(columns=non_feature_cols, errors='ignore')
    probs = model.predict_proba(features)[0]

    # NOTE(review): the indices below assume the model's class order is
    # [draw, away win, home win] — confirm against the label encoding used in
    # training (model.classes_).
    print(f"Predicting match: {home} vs {away}")
    print('Probabilities:')
    print(f'  Home win: {probs[2]:.2%}')
    print(f'  Draw:     {probs[0]:.2%}')
    print(f'  Away win: {probs[1]:.2%}')
    return probs


In [32]:
# Select the Sunderland (home) v Bournemouth (away) row and score it
home_side, away_side = 'Sunderland', 'Bournemouth'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Sunderland vs Bournemouth
Probabilities:
  Home win: 23.47%
  Draw:     31.61%
  Away win: 44.92%
[0.31609914 0.44921907 0.23468179]


In [33]:
# Select the Brentford (home) v Burnley (away) row and score it
home_side, away_side = 'Brentford', 'Burnley'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Brentford vs Burnley
Probabilities:
  Home win: 66.50%
  Draw:     22.04%
  Away win: 11.46%
[0.22040061 0.11461658 0.66498282]


In [34]:
# Select the Man City (home) v Leeds (away) row and score it
home_side, away_side = 'Man City', 'Leeds'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Man City vs Leeds
Probabilities:
  Home win: 63.70%
  Draw:     26.42%
  Away win: 9.89%
[0.264184   0.09885134 0.63696465]


In [35]:
# Select the Everton (home) v Newcastle (away) row and score it
home_side, away_side = 'Everton', 'Newcastle'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Everton vs Newcastle
Probabilities:
  Home win: 28.54%
  Draw:     41.99%
  Away win: 29.47%
[0.41986517 0.29471938 0.28541545]


In [36]:
# Select the Tottenham (home) v Fulham (away) row and score it
home_side, away_side = 'Tottenham', 'Fulham'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Tottenham vs Fulham
Probabilities:
  Home win: 33.78%
  Draw:     32.73%
  Away win: 33.49%
[0.32726107 0.33492865 0.33781028]


In [37]:
# Select the Crystal Palace (home) v Man United (away) row and score it
home_side, away_side = 'Crystal Palace', 'Man United'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Crystal Palace vs Man United
Probabilities:
  Home win: 47.14%
  Draw:     29.16%
  Away win: 23.70%
[0.29155537 0.23701114 0.47143349]


In [38]:
# Select the Nott'm Forest (home) v Brighton (away) row and score it
home_side, away_side = "Nott'm Forest", 'Brighton'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Nott'm Forest vs Brighton
Probabilities:
  Home win: 34.22%
  Draw:     27.04%
  Away win: 38.73%
[0.27040984 0.38734684 0.34224332]


In [39]:
# Select the Aston Villa (home) v Wolves (away) row and score it
home_side, away_side = "Aston Villa", 'Wolves'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Aston Villa vs Wolves
Probabilities:
  Home win: 60.33%
  Draw:     32.23%
  Away win: 7.45%
[0.32225573 0.07448781 0.60325646]


In [40]:
# Select the West Ham (home) v Liverpool (away) row and score it
home_side, away_side = "West Ham", 'Liverpool'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: West Ham vs Liverpool
Probabilities:
  Home win: 19.96%
  Draw:     18.27%
  Away win: 61.76%
[0.18273585 0.61761846 0.19964569]


In [41]:
# Select the Chelsea (home) v Arsenal (away) row and score it
home_side, away_side = "Chelsea", 'Arsenal'
fixture_mask = (predict_matches['home_team'] == home_side) & (predict_matches['away_team'] == away_side)
probas = predict_match(model, predict_matches[fixture_mask])
print(probas)

Predicting match: Chelsea vs Arsenal
Probabilities:
  Home win: 32.14%
  Draw:     23.65%
  Away win: 44.21%
[0.2365422  0.44210292 0.32135488]
