In [2]:
import pandas as pd

In [30]:
all_matches = pd.DataFrame()
seasons = ['2021-2022', '2022-2023', '2023-2024', '2024-2025', '2025-2026']

for season in seasons:
    all_matches = pd.concat([all_matches, pd.read_csv(f'../../data/Advanced/pl{season}.csv')])
    all_matches['season'] = season

all_matches.reset_index(drop=True, inplace=True)

In [31]:
all_matches = all_matches[[
    "Date", "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",
    "HS", "AS", "HST", "AST",
    "HF", "AF", "HC", "AC",
    "HY", "AY", "HR", "AR",
    "B365H", "B365D", "B365A"
]]


In [32]:
rename_map = {
    "Date": "date",
    "HomeTeam": "home_team",
    "AwayTeam": "away_team",
    "FTHG": "home_goals",
    "FTAG": "away_goals",
    "FTR":  "result",

    "HTHG": "home_ht_goals",
    "HTAG": "away_ht_goals",
    "HTR":  "ht_result",

    "HS": "home_shots",
    "AS": "away_shots",
    "HST": "home_sot",
    "AST": "away_sot",
    "HF": "home_fouls",
    "AF": "away_fouls",
    "HC": "home_corners",
    "AC": "away_corners",
    "HY": "home_yellow",
    "AY": "away_yellow",
    "HR": "home_red",
    "AR": "away_red",

    "B365H": "odds_home_win",
    "B365D": "odds_draw",
    "B365A": "odds_away_win"
}
all_matches.rename(columns=rename_map, inplace=True)

In [33]:
all_matches["date"] = pd.to_datetime(all_matches["date"], dayfirst=True)
all_matches.sort_values(by="date", inplace=True)

In [34]:
all_matches["result"] = all_matches["result"].map({"H": 2, "D": 0, "A": 1})

In [68]:
all_matches.head().to_clipboard(index=False)

In [35]:
import pandas as pd

def add_rolling_team_stats(df, window=5):

    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")

    stat_map = {
        "goals_scored": ("home_goals", "away_goals"),
        "goals_conceded": ("away_goals", "home_goals"),
        "shots": ("home_shots", "away_shots"),
        "shots_on_target": ("home_sot", "away_sot"),
        "fouls": ("home_fouls", "away_fouls"),
        "corners": ("home_corners", "away_corners"),
        "yellow_cards": ("home_yellow", "away_yellow"),
        "red_cards": ("home_red", "away_red"),
    }

    # Build long format
    home_df = df.rename(columns={v[0]: k for k, v in stat_map.items()})
    away_df = df.rename(columns={v[1]: k for k, v in stat_map.items()})

    home_df["team"] = home_df["home_team"]
    away_df["team"] = away_df["away_team"]

    cols = ["date", "team"] + list(stat_map.keys())
    long_df = pd.concat([home_df[cols], away_df[cols]], ignore_index=True)
    long_df = long_df.sort_values(["team", "date"]).reset_index(drop=True)

    # Compute rolling stats (fixed)
    for stat in stat_map.keys():
        long_df[f"{stat}_rolling_avg"] = (
            long_df.groupby("team")[stat]
                   .transform(lambda x: x.shift(1).rolling(window, min_periods=1).mean())
        )

    # HOME merge
    home_merge_cols = {f"{stat}_rolling_avg": f"home_{stat}_rolling"
                       for stat in stat_map.keys()}

    df = df.merge(
        long_df[["date", "team"] + list(home_merge_cols.keys())],
        left_on=["date", "home_team"],
        right_on=["date", "team"],
        how="left"
    ).drop(columns=["team"])
    df = df.rename(columns=home_merge_cols)

    # AWAY merge
    away_merge_cols = {f"{stat}_rolling_avg": f"away_{stat}_rolling"
                       for stat in stat_map.keys()}

    df = df.merge(
        long_df[["date", "team"] + list(away_merge_cols.keys())],
        left_on=["date", "away_team"],
        right_on=["date", "team"],
        how="left"
    ).drop(columns=["team"])
    df = df.rename(columns=away_merge_cols)

    # Fill NAN
    df = df.fillna(0)

    return df


In [36]:
match_stats = add_rolling_team_stats(all_matches, window=5)

In [37]:
match_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1640 entries, 0 to 1639
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   date                          1640 non-null   datetime64[ns]
 1   home_team                     1640 non-null   object        
 2   away_team                     1640 non-null   object        
 3   home_goals                    1640 non-null   int64         
 4   away_goals                    1640 non-null   int64         
 5   result                        1640 non-null   int64         
 6   home_shots                    1640 non-null   int64         
 7   away_shots                    1640 non-null   int64         
 8   home_sot                      1640 non-null   int64         
 9   away_sot                      1640 non-null   int64         
 10  home_fouls                    1640 non-null   int64         
 11  away_fouls                    

In [38]:
import numpy as np
import pandas as pd

def expected_result(elo_a, elo_b):
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

def update_elo(elo_a, elo_b, score_a, score_b, k=20):
    expected_a = expected_result(elo_a, elo_b)
    expected_b = 1 - expected_a

    if score_a > score_b:   # home win
        actual_a, actual_b = 1, 0
    elif score_a < score_b: # away win
        actual_a, actual_b = 0, 1
    else:
        actual_a, actual_b = 0.5, 0.5

    new_a = elo_a + k * (actual_a - expected_a)
    new_b = elo_b + k * (actual_b - expected_b)

    return new_a, new_b

def add_elo_features(df):
    df = df.sort_values("date").copy()

    team_elos = {}
    base_elo = 1500

    df["home_elo_before"] = 0.0
    df["away_elo_before"] = 0.0

    for i, row in df.iterrows():
        home, away = row["home_team"], row["away_team"]

        # If new team, initialize
        team_elos.setdefault(home, base_elo)
        team_elos.setdefault(away, base_elo)

        # Assign ELO before match
        df.at[i, "home_elo_before"] = team_elos[home]
        df.at[i, "away_elo_before"] = team_elos[away]

        # Update after match
        new_home, new_away = update_elo(
            team_elos[home], team_elos[away],
            row["home_goals"], row["away_goals"]
        )

        team_elos[home] = new_home
        team_elos[away] = new_away

    return df


In [39]:
match_stats = add_elo_features(match_stats)

In [40]:
match_stats.head()

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,result,home_shots,away_shots,home_sot,away_sot,...,away_goals_scored_rolling,away_goals_conceded_rolling,away_shots_rolling,away_shots_on_target_rolling,away_fouls_rolling,away_corners_rolling,away_yellow_cards_rolling,away_red_cards_rolling,home_elo_before,away_elo_before
0,2021-08-13,Brentford,Arsenal,2,0,2,8,22,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500.0,1500.0
1,2021-08-14,Man United,Leeds,5,1,2,16,10,8,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500.0,1500.0
2,2021-08-14,Burnley,Brighton,1,2,1,14,14,3,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500.0,1500.0
3,2021-08-14,Chelsea,Crystal Palace,3,0,2,13,4,6,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500.0,1500.0
4,2021-08-14,Everton,Southampton,3,1,2,14,6,6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1500.0,1500.0


In [41]:
def add_bookmaker_features(df):
    df = df.copy()

    # Convert to implied probabilities
    df["prob_home"] = 1 / df["odds_home_win"]
    df["prob_draw"] = 1 / df["odds_draw"]
    df["prob_away"] = 1 / df["odds_away_win"]

    # Normalize to remove overround
    total = df[["prob_home", "prob_draw", "prob_away"]].sum(axis=1)

    df["prob_home"] /= total
    df["prob_draw"] /= total
    df["prob_away"] /= total

    return df


In [42]:
match_stats = add_bookmaker_features(match_stats)

In [43]:
match_stats

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,result,home_shots,away_shots,home_sot,away_sot,...,away_shots_on_target_rolling,away_fouls_rolling,away_corners_rolling,away_yellow_cards_rolling,away_red_cards_rolling,home_elo_before,away_elo_before,prob_home,prob_draw,prob_away
0,2021-08-13,Brentford,Arsenal,2,0,2,8,22,3,4,...,0.0,0.0,0.0,0.0,0.0,1500.000000,1500.000000,0.236532,0.278273,0.485194
1,2021-08-14,Man United,Leeds,5,1,2,16,10,8,3,...,0.0,0.0,0.0,0.0,0.0,1500.000000,1500.000000,0.622631,0.211695,0.165674
2,2021-08-14,Burnley,Brighton,1,2,1,14,14,3,8,...,0.0,0.0,0.0,0.0,0.0,1500.000000,1500.000000,0.306250,0.306250,0.387500
3,2021-08-14,Chelsea,Crystal Palace,3,0,2,13,4,6,1,...,0.0,0.0,0.0,0.0,0.0,1500.000000,1500.000000,0.761299,0.165500,0.073202
4,2021-08-14,Everton,Southampton,3,1,2,14,6,6,3,...,0.0,0.0,0.0,0.0,0.0,1500.000000,1500.000000,0.495575,0.269027,0.235398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1632,2025-11-22,Brighton,Brentford,2,1,2,14,10,5,4,...,5.0,9.6,5.8,2.4,0.0,1574.848479,1549.696128,0.497994,0.264214,0.237792
1636,2025-11-22,Newcastle,Man City,2,1,2,9,17,5,4,...,5.8,10.4,7.2,2.0,0.0,1567.527380,1705.026494,0.263607,0.249733,0.486660
1638,2025-11-23,Arsenal,Tottenham,4,1,2,17,3,8,2,...,3.0,11.0,5.2,2.8,0.0,1724.408513,1495.393444,0.671916,0.200868,0.127216
1637,2025-11-23,Leeds,Aston Villa,1,2,1,14,14,5,3,...,4.6,8.6,5.4,1.8,0.0,1399.260224,1620.404341,0.316547,0.287770,0.395683


In [44]:
final_df = match_stats.drop(columns=["home_goals", "away_goals",
        "home_shots", "away_shots",
        "home_sot", "away_sot",
        "home_fouls", "away_fouls",
        "home_corners", "away_corners",
        "home_yellow", "away_yellow",
        "home_red", "away_red"])

In [79]:
final_df = pd.get_dummies(final_df, columns=["home_team", "away_team"], drop_first=True)

In [80]:
final_df.to_csv('../../data/Advanced/pl_all_seasons_enhanced.csv', index=False)

In [12]:
more_data = pd.read_clipboard()
more_data

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,1,Fri,2021-08-13,20:00 (14:00),Brentford,1.2,2–0,1.3,Arsenal,16479,Brentford Community Stadium,Michael Oliver,Match Report,
1,1,Sat,2021-08-14,12:30 (06:30),Manchester Utd,1.5,5–1,0.5,Leeds United,72732,Old Trafford,Paul Tierney,Match Report,
2,1,Sat,2021-08-14,15:00 (09:00),Everton,2.4,3–1,0.8,Southampton,38487,Goodison Park,Andy Madley,Match Report,
3,1,Sat,2021-08-14,15:00 (09:00),Leicester City,0.5,1–0,1.3,Wolves,31983,King Power Stadium,Craig Pawson,Match Report,
4,1,Sat,2021-08-14,15:00 (09:00),Chelsea,0.7,3–0,0.2,Crystal Palace,38965,Stamford Bridge,Jonathan Moss,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,38,Sun,2022-05-22,16:00 (10:00),Norwich City,0.5,0–5,3.4,Tottenham,27022,Carrow Road,Chris Kavanagh,Match Report,
387,38,Sun,2022-05-22,16:00 (10:00),Leicester City,3.4,4–1,1.1,Southampton,32003,King Power Stadium,Jonathan Moss,Match Report,
388,38,Sun,2022-05-22,16:00 (10:00),Liverpool,2.7,3–1,1.3,Wolves,53097,Anfield,Anthony Taylor,Match Report,
389,38,Sun,2022-05-22,16:00 (10:00),Brighton,1.6,3–1,0.2,West Ham,31604,The American Express Community Stadium,Kevin Friend,Match Report,


In [16]:
more_data = pd.concat([more_data, pd.read_clipboard()])
more_data

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,1,Fri,2021-08-13,20:00 (14:00),Brentford,1.2,2–0,1.3,Arsenal,16479,Brentford Community Stadium,Michael Oliver,Match Report,
1,1,Sat,2021-08-14,12:30 (06:30),Manchester Utd,1.5,5–1,0.5,Leeds United,72732,Old Trafford,Paul Tierney,Match Report,
2,1,Sat,2021-08-14,15:00 (09:00),Everton,2.4,3–1,0.8,Southampton,38487,Goodison Park,Andy Madley,Match Report,
3,1,Sat,2021-08-14,15:00 (09:00),Leicester City,0.5,1–0,1.3,Wolves,31983,King Power Stadium,Craig Pawson,Match Report,
4,1,Sat,2021-08-14,15:00 (09:00),Chelsea,0.7,3–0,0.2,Crystal Palace,38965,Stamford Bridge,Jonathan Moss,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,12,Sat,2025-11-22,15:00 (09:00),Brighton,1.2,2–1,2.0,Brentford,31458,The American Express Stadium,Chris Kavanagh,Match Report,
119,12,Sat,2025-11-22,17:30 (11:30),Newcastle Utd,2.3,2–1,2.5,Manchester City,52181,St James' Park,Samuel Barrott,Match Report,
120,12,Sun,2025-11-23,14:00 (08:00),Leeds United,1.6,1–2,1.6,Aston Villa,36819,Elland Road,Robert Jones,Match Report,
121,12,Sun,2025-11-23,16:30 (10:30),Arsenal,1.9,4–1,0.1,Tottenham,60345,Emirates Stadium,Michael Oliver,Match Report,


In [20]:
more_data = more_data[more_data["Home"] != 'Home']

In [21]:
more_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1640 entries, 0 to 122
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Wk            1640 non-null   object
 1   Day           1640 non-null   object
 2   Date          1640 non-null   object
 3   Time          1640 non-null   object
 4   Home          1640 non-null   object
 5   xG            1640 non-null   object
 6   Score         1640 non-null   object
 7   xG.1          1640 non-null   object
 8   Away          1640 non-null   object
 9   Attendance    1639 non-null   object
 10  Venue         1640 non-null   object
 11  Referee       1640 non-null   object
 12  Match Report  1640 non-null   object
 13  Notes         0 non-null      object
dtypes: object(14)
memory usage: 192.2+ KB


In [22]:
more_data.rename(columns={"Home": "home_team", "Away": "away_team", 'Date': 'date'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_data.rename(columns={"Home": "home_team", "Away": "away_team", 'Date': 'date'}, inplace=True)


In [27]:
more_data.drop(columns=["Score"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_data.drop(columns=["Score"], inplace=True)


In [45]:
more_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1640 entries, 0 to 122
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Wk         1640 non-null   object
 1   date       1640 non-null   object
 2   home_team  1640 non-null   object
 3   xG         1640 non-null   object
 4   xG.1       1640 non-null   object
 5   away_team  1640 non-null   object
dtypes: object(6)
memory usage: 89.7+ KB


In [61]:
final_df['home_team'].sort_values().value_counts()

home_team
Arsenal             82
Everton             82
West Ham            82
Tottenham           82
Newcastle           82
Man United          82
Man City            82
Aston Villa         82
Liverpool           82
Wolves              82
Brighton            82
Brentford           82
Crystal Palace      82
Chelsea             82
Bournemouth         63
Nott'm Forest       63
Fulham              63
Southampton         57
Leicester           57
Burnley             44
Leeds               44
Ipswich             19
Luton               19
Sheffield United    19
Watford             19
Norwich             19
Sunderland           6
Name: count, dtype: int64

In [65]:
more_data['home_team'].sort_values().value_counts()

home_team
Arsenal            82
Everton            82
West Ham           82
Tottenham          82
Newcastle Utd      82
Manchester Utd     82
Manchester City    82
Aston Villa        82
Liverpool          82
Wolves             82
Brighton           82
Brentford          82
Crystal Palace     82
Chelsea            82
Bournemouth        63
Nott'ham Forest    63
Fulham             63
Southampton        57
Leicester City     57
Burnley            44
Leeds United       44
Ipswich Town       19
Luton Town         19
Sheffield Utd      19
Watford            19
Norwich City       19
Sunderland          6
Name: count, dtype: int64

In [68]:
team_name_map = {
        "Newcastle Utd": "Newcastle",
        "Manchester Utd": "Man United",
        "Manchester City": "Man City",
        "Leicester City": "Leicester",
        "Leeds United": "Leeds",
        "Ipswich Town": "Ipswich",
        "Luton Town": "Luton",
        "Norwich City": "Norwich",
        "Sheffield Utd": "Sheffield United",
        "Nott'ham Forest": "Nott'm Forest"
    }

more_data['home_team'] = more_data['home_team'].replace(team_name_map)
more_data['away_team'] = more_data['away_team'].replace(team_name_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_data['home_team'] = more_data['home_team'].replace(team_name_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_data['away_team'] = more_data['away_team'].replace(team_name_map)


In [67]:
final_df['date'] = pd.to_datetime(final_df['date'])
more_data['date'] = pd.to_datetime(more_data['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_data['date'] = pd.to_datetime(more_data['date'])


In [69]:
merged = final_df.merge(more_data, on=['date', 'home_team', 'away_team'], how='left')

In [71]:
merged.columns

Index(['date', 'home_team', 'away_team', 'result', 'odds_home_win',
       'odds_draw', 'odds_away_win', 'home_goals_scored_rolling',
       'home_goals_conceded_rolling', 'home_shots_rolling',
       'home_shots_on_target_rolling', 'home_fouls_rolling',
       'home_corners_rolling', 'home_yellow_cards_rolling',
       'home_red_cards_rolling', 'away_goals_scored_rolling',
       'away_goals_conceded_rolling', 'away_shots_rolling',
       'away_shots_on_target_rolling', 'away_fouls_rolling',
       'away_corners_rolling', 'away_yellow_cards_rolling',
       'away_red_cards_rolling', 'home_elo_before', 'away_elo_before',
       'prob_home', 'prob_draw', 'prob_away', 'Wk', 'xG', 'xG.1'],
      dtype='object')

In [72]:
merged.drop(columns=['Wk'], inplace=True)

In [73]:
merged.to_csv('../../data/Advanced/pl_final.csv', index=False)