In [None]:
matches_df.drop(columns=['Attendance', 'Referee', 'Match Report', 'Notes', 'Venue', 'Time', 'Day'], axis=1, inplace=True)

In [88]:
currmatches = pd.read_csv('../data/premier_league_matches_2025_2026.csv')

In [89]:
currmatches["Date"] = pd.to_datetime(currmatches["Date"])

In [90]:
currmatches = currmatches[currmatches["Date"] < pd.Timestamp.today() - pd.Timedelta(days=1)]

In [91]:
currmatches

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley
...,...,...,...,...,...,...,...,...,...
125,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest
126,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford
127,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City
128,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa


In [92]:
import pandas as pd

def clean_match_df(df):
    # Split score "4–2" into home_goals, away_goals
    df["home_goals"] = df["Score"].str.extract(r"(\d+)[–-](\d+)")[0].astype(int)
    df["away_goals"] = df["Score"].str.extract(r"(\d+)[–-](\d+)")[1].astype(int)

    # Rename xG columns
    df = df.rename(columns={
        "Home": "home_team",
        "Away": "away_team",
        "xG": "home_xg",
        "xG.1": "away_xg"
    })

    # Create label: win/draw/loss from home perspective
    df["result"] = df.apply(
        lambda row: 
            "W" if row.home_goals > row.away_goals
            else "L" if row.home_goals < row.away_goals
            else "D",
        axis=1
    )

    return df


In [93]:
clean_matches = clean_match_df(currmatches)
clean_matches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,away_goals,result
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,2,W
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,0,D
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,0,W
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,1,D
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,0,W
...,...,...,...,...,...,...,...,...,...,...,...,...
125,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,3,L
126,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,1,W
127,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,1,W
128,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,2,L


In [94]:
def add_rolling_features(df, window=5):

    df = df.sort_values("Date")

    # Team-based dictionaries for rolling stats
    team_stats = {}

    home_stats = []
    away_stats = []

    for _, row in df.iterrows():
        home = row.home_team
        away = row.away_team

        # Initialize if needed
        for t in [home, away]:
            if t not in team_stats:
                team_stats[t] = {
                    "gf": [], "ga": [], "pts": []
                }

        # Compute home rolling
        home_form = team_stats[home]
        away_form = team_stats[away]

        home_stats.append({
            "home_avg_gf_5": pd.Series(home_form["gf"][-window:]).mean(),
            "home_avg_ga_5": pd.Series(home_form["ga"][-window:]).mean(),
            "home_avg_pts_5": pd.Series(home_form["pts"][-window:]).mean(),
            "home_form_score": sum(home_form["pts"][-window:])
        })

        # Compute away rolling
        away_stats.append({
            "away_avg_gf_5": pd.Series(away_form["gf"][-window:]).mean(),
            "away_avg_ga_5": pd.Series(away_form["ga"][-window:]).mean(),
            "away_avg_pts_5": pd.Series(away_form["pts"][-window:]).mean(),
            "away_form_score": sum(away_form["pts"][-window:])
        })

        # Update team stats after computing features
        home_pts = 3 if row.home_goals > row.away_goals else 1 if row.home_goals == row.away_goals else 0
        away_pts = 3 if row.away_goals > row.home_goals else 1 if row.home_goals == row.away_goals else 0

        home_form["gf"].append(row.home_goals)
        home_form["ga"].append(row.away_goals)
        home_form["pts"].append(home_pts)

        away_form["gf"].append(row.away_goals)
        away_form["ga"].append(row.home_goals)
        away_form["pts"].append(away_pts)

    home_df = pd.DataFrame(home_stats)
    away_df = pd.DataFrame(away_stats)

    return pd.concat([df.reset_index(drop=True), home_df, away_df], axis=1)


In [95]:
currmatches = add_rolling_features(clean_matches, window=5)
currmatches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,away_goals,result,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,2,W,,,,0,,,,0
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,0,D,,,,0,,,,0
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,0,W,,,,0,,,,0
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,1,D,,,,0,,,,0
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,0,W,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,3,L,1.2,2.0,0.6,3,1.0,2.0,0.8,4
115,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,1,W,1.6,1.2,1.6,8,1.6,1.2,1.8,9
116,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,1,W,1.4,1.8,1.2,6,1.8,0.4,2.4,12
117,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,2,L,0.8,2.2,0.6,3,1.8,0.8,2.4,12


In [96]:
def update_elo(df, base_elo=1500, k=20):

    elos = {}
    home_elos = []
    away_elos = []

    for _, row in df.iterrows():
        h = row.home_team
        a = row.away_team

        if h not in elos: elos[h] = base_elo
        if a not in elos: elos[a] = base_elo

        home_elos.append(elos[h])
        away_elos.append(elos[a])

        # Determine actual score
        if row.home_goals > row.away_goals:
            score_h, score_a = 1, 0
        elif row.home_goals < row.away_goals:
            score_h, score_a = 0, 1
        else:
            score_h, score_a = 0.5, 0.5

        # Expected score
        expected_h = 1 / (1 + 10 ** ((elos[a] - elos[h]) / 400))
        expected_a = 1 - expected_h

        # Update
        elos[h] += k * (score_h - expected_h)
        elos[a] += k * (score_a - expected_a)

    df["home_elo"] = home_elos
    df["away_elo"] = away_elos
    return df


In [97]:
currmatches = update_elo(currmatches)
currmatches

Unnamed: 0,Wk,Day,Date,Time,home_team,home_xg,Score,away_xg,away_team,home_goals,...,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,home_elo,away_elo
0,1.0,Fri,2025-08-15,20:00,Liverpool,2.2,4–2,1.7,Bournemouth,4,...,,,,0,,,,0,1500.000000,1500.000000
1,1.0,Sat,2025-08-16,12:30,Aston Villa,0.2,0–0,1.4,Newcastle Utd,0,...,,,,0,,,,0,1500.000000,1500.000000
2,1.0,Sat,2025-08-16,15:00,Sunderland,0.7,3–0,0.6,West Ham,3,...,,,,0,,,,0,1500.000000,1500.000000
3,1.0,Sat,2025-08-16,15:00,Brighton,1.5,1–1,0.7,Fulham,1,...,,,,0,,,,0,1500.000000,1500.000000
4,1.0,Sat,2025-08-16,15:00,Tottenham,2.3,3–0,0.9,Burnley,3,...,,,,0,,,,0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,12.0,Sat,2025-11-22,15:00,Liverpool,1.9,0–3,1.6,Nott'ham Forest,0,...,1.2,2.0,0.6,3,1.0,2.0,0.8,4,1505.360738,1465.526210
115,12.0,Sat,2025-11-22,15:00,Brighton,1.2,2–1,2.0,Brentford,2,...,1.6,1.2,1.6,8,1.6,1.2,1.8,9,1510.250649,1502.713825
116,12.0,Sat,2025-11-22,17:30,Newcastle Utd,2.3,2–1,2.5,Manchester City,2,...,1.4,1.8,1.2,6,1.8,0.4,2.4,12,1479.040137,1538.852471
117,12.0,Sun,2025-11-23,14:00,Leeds United,1.6,1–2,1.6,Aston Villa,1,...,0.8,2.2,0.6,3,1.8,0.8,2.4,12,1467.932302,1522.926536


In [98]:
currmatches.drop(columns=['Score', 'Wk', 'Day', 'Time', 'Score', 'home_goals', 'away_goals', ], axis=1, inplace=True)

In [100]:
rolling_cols = [
    "home_avg_gf_5", "home_avg_ga_5", "home_avg_pts_5", "home_form_score",
    "away_avg_gf_5", "away_avg_ga_5", "away_avg_pts_5", "away_form_score"
]
currmatches[rolling_cols] = currmatches[rolling_cols].fillna(currmatches[rolling_cols].mean())

In [101]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
currmatches['result'] = le.fit_transform(currmatches['result']) # W=2, D=0, L=1

In [102]:
currmatches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       119 non-null    datetime64[ns]
 1   home_xg                    119 non-null    float64       
 2   away_xg                    119 non-null    float64       
 3   result                     119 non-null    int64         
 4   home_avg_gf_5              119 non-null    float64       
 5   home_avg_ga_5              119 non-null    float64       
 6   home_avg_pts_5             119 non-null    float64       
 7   home_form_score            119 non-null    int64         
 8   away_avg_gf_5              119 non-null    float64       
 9   away_avg_ga_5              119 non-null    float64       
 10  away_avg_pts_5             119 non-null    float64       
 11  away_form_score            119 non-null    int64         
 12  home_elo