In [9]:
import pandas as pd
import numpy as np

In [10]:
pl_matches = pd.read_csv('../data/pl_matches.csv')
print(pl_matches.info())
pl_matches.value_counts('match_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830 entries, 0 to 829
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          830 non-null    object 
 1   time          830 non-null    object 
 2   match_id      830 non-null    object 
 3   league_name   830 non-null    object 
 4   league_id     830 non-null    int64  
 5   opponent      830 non-null    object 
 6   opponent_id   830 non-null    object 
 7   home_away     830 non-null    object 
 8   result        830 non-null    object 
 9   gf            830 non-null    float64
 10  ga            830 non-null    float64
 11  attendance    830 non-null    object 
 12  captain       830 non-null    object 
 13  formation     830 non-null    object 
 14  referee       830 non-null    object 
 15  home_team     830 non-null    object 
 16  home_team_id  830 non-null    object 
dtypes: float64(2), int64(1), object(14)
memory usage: 110.4+ KB
None


match_id
0050df89    1
ada09dda    1
ab8543fb    1
ab9827f2    1
abd525b4    1
           ..
573f2f77    1
57b35843    1
57c49bae    1
58636a1e    1
fff671a9    1
Name: count, Length: 830, dtype: int64

In [11]:
def expand_matches(df):
    rows = []

    for _, r in df.iterrows():
        # Row 1: as-is
        rows.append(r)

        # Row 2: flipped perspective
        flipped = r.copy()

        flipped['home_team'], flipped['opponent'] = r['opponent'], r['home_team']
        flipped['home_team_id'], flipped['opponent_id'] = r['opponent_id'], r['home_team_id']

        # Flip H/A
        flipped['home_away'] = 'Home' if r['home_away'] == 'Away' else 'Away'

        # Flip gf / ga
        flipped['gf'], flipped['ga'] = r['ga'], r['gf']

        # Flip result
        if r['result'] == 'W':
            flipped['result'] = 'L'
        elif r['result'] == 'L':
            flipped['result'] = 'W'
        else:
            flipped['result'] = 'D'

        # Remove player-specific info for opponent (optional)
        flipped['captain'] = None
        flipped['formation'] = None

        rows.append(flipped)

    return pd.DataFrame(rows)


In [12]:
pl_matches = expand_matches(pl_matches)   

In [13]:
print(pl_matches.info())
pl_matches.value_counts('match_id')

<class 'pandas.core.frame.DataFrame'>
Index: 1660 entries, 0 to 829
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          1660 non-null   object 
 1   time          1660 non-null   object 
 2   match_id      1660 non-null   object 
 3   league_name   1660 non-null   object 
 4   league_id     1660 non-null   int64  
 5   opponent      1660 non-null   object 
 6   opponent_id   1660 non-null   object 
 7   home_away     1660 non-null   object 
 8   result        1660 non-null   object 
 9   gf            1660 non-null   float64
 10  ga            1660 non-null   float64
 11  attendance    1660 non-null   object 
 12  captain       830 non-null    object 
 13  formation     830 non-null    object 
 14  referee       1660 non-null   object 
 15  home_team     1660 non-null   object 
 16  home_team_id  1660 non-null   object 
dtypes: float64(2), int64(1), object(14)
memory usage: 233.4+ KB
None


match_id
0050df89    2
ada09dda    2
ab8543fb    2
ab9827f2    2
abd525b4    2
           ..
573f2f77    2
57b35843    2
57c49bae    2
58636a1e    2
fff671a9    2
Name: count, Length: 830, dtype: int64

In [14]:
# Derive new columns
pl_matches['goal_diff'] = pl_matches['gf'] - pl_matches['ga']
pl_matches['points'] = pl_matches['result'].map({'W': 3, 'D': 1, 'L': 0})

In [15]:
pl_matches = pl_matches.sort_values(by=['home_team', 'date'], ascending=[True, True])
pl_matches.reset_index(drop=True, inplace=True)
pl_matches.head()

Unnamed: 0,date,time,match_id,league_name,league_id,opponent,opponent_id,home_away,result,gf,ga,attendance,captain,formation,referee,home_team,home_team_id,goal_diff,points
0,2023-08-12,12:30,26a7f90c,Premier League,9,Nott'ham Forest,e4a775cb,Home,W,2.0,1.0,59984,,,Michael Oliver,Arsenal,18bb7c10,1.0,3
1,2023-08-21,20:00,3b5ecd36,Premier League,9,Crystal Palace,47c64c55,Away,W,1.0,0.0,24189,Martin Ødegaard,4-3-3,David Coote,Arsenal,18bb7c10,1.0,3
2,2023-08-26,15:00,d8f8f8ad,Premier League,9,Fulham,fd962109,Home,D,2.0,2.0,59961,Martin Ødegaard,4-3-3,Paul Tierney,Arsenal,18bb7c10,0.0,1
3,2023-09-03,16:30,74125d47,Premier League,9,Manchester Utd,19538871,Home,W,3.0,1.0,60192,,,Anthony Taylor,Arsenal,18bb7c10,2.0,3
4,2023-09-17,16:30,b1278924,Premier League,9,Everton,d3fd31cc,Away,W,1.0,0.0,39217,,,Simon Hooper,Arsenal,18bb7c10,1.0,3


In [16]:
# Create rolling stats
pl_matches['avg_goals_last_5'] = pl_matches.groupby('home_team')['gf'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())
pl_matches['avg_goals_against_last_5'] = pl_matches.groupby('home_team')['ga'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())
pl_matches['avg_goals_points_last_5'] = pl_matches.groupby('home_team')['points'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())

In [17]:
def weighted_form_window(x):
    # x is already the rolling window Series
    n = len(x)
    weights = np.arange(1, n+1)  # increasing weights for older → newer
    return np.average(x, weights=weights)

pl_matches['form_score'] = (
    pl_matches.groupby('home_team')['points']
      .transform(lambda x: x.shift().rolling(5, min_periods=1).apply(weighted_form_window, raw=True))
)

In [18]:
# reorder columns
pl_matches = pl_matches[['date', 'match_id', 'home_team', 'home_team_id', 'opponent', 'opponent_id', 'avg_goals_last_5', 'avg_goals_against_last_5', 'avg_goals_points_last_5', 'form_score', 'result']]
pl_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      1660 non-null   object 
 1   match_id                  1660 non-null   object 
 2   home_team                 1660 non-null   object 
 3   home_team_id              1660 non-null   object 
 4   opponent                  1660 non-null   object 
 5   opponent_id               1660 non-null   object 
 6   avg_goals_last_5          1635 non-null   float64
 7   avg_goals_against_last_5  1635 non-null   float64
 8   avg_goals_points_last_5   1635 non-null   float64
 9   form_score                1535 non-null   float64
 10  result                    1660 non-null   object 
dtypes: float64(4), object(7)
memory usage: 142.8+ KB


In [19]:
pl_matches.to_csv('../data/pl_matches_no_dummies.csv', index=False)

In [20]:
df = pl_matches.copy()

df = df.rename(columns={
    "home_team": "team",
    "home_team_id": "team_id",
    "opponent": "opp",
    "opponent_id": "opp_id"
})

df.head()

Unnamed: 0,date,match_id,team,team_id,opp,opp_id,avg_goals_last_5,avg_goals_against_last_5,avg_goals_points_last_5,form_score,result
0,2023-08-12,26a7f90c,Arsenal,18bb7c10,Nott'ham Forest,e4a775cb,,,,,W
1,2023-08-21,3b5ecd36,Arsenal,18bb7c10,Crystal Palace,47c64c55,2.0,1.0,3.0,,W
2,2023-08-26,d8f8f8ad,Arsenal,18bb7c10,Fulham,fd962109,1.5,0.5,3.0,,D
3,2023-09-03,74125d47,Arsenal,18bb7c10,Manchester Utd,19538871,1.666667,1.0,2.333333,,W
4,2023-09-17,b1278924,Arsenal,18bb7c10,Everton,d3fd31cc,2.0,1.0,2.5,,W


In [21]:
# Home team rows
home = df.copy()
home = home.rename(columns={
    "team": "home_team",
    "team_id": "home_team_id",
    "opp": "away_team",
    "opp_id": "away_team_id",
    "avg_goals_last_5": "home_avg_gf_5",
    "avg_goals_against_last_5": "home_avg_ga_5",
    "avg_goals_points_last_5": "home_avg_pts_5",
    "form_score": "home_form_score",
    "result": "home_result"
})

# Away team rows
away = df.copy()
away = away.rename(columns={
    "team": "away_team",
    "team_id": "away_team_id",
    "opp": "home_team",
    "opp_id": "home_team_id",
    "avg_goals_last_5": "away_avg_gf_5",
    "avg_goals_against_last_5": "away_avg_ga_5",
    "avg_goals_points_last_5": "away_avg_pts_5",
    "form_score": "away_form_score",
    "result": "away_result"
})

away.head()

Unnamed: 0,date,match_id,away_team,away_team_id,home_team,home_team_id,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,away_result
0,2023-08-12,26a7f90c,Arsenal,18bb7c10,Nott'ham Forest,e4a775cb,,,,,W
1,2023-08-21,3b5ecd36,Arsenal,18bb7c10,Crystal Palace,47c64c55,2.0,1.0,3.0,,W
2,2023-08-26,d8f8f8ad,Arsenal,18bb7c10,Fulham,fd962109,1.5,0.5,3.0,,D
3,2023-09-03,74125d47,Arsenal,18bb7c10,Manchester Utd,19538871,1.666667,1.0,2.333333,,W
4,2023-09-17,b1278924,Arsenal,18bb7c10,Everton,d3fd31cc,2.0,1.0,2.5,,W


In [22]:
merged = pd.merge(
    home,
    away,
    on=["match_id", "home_team_id", "away_team_id"],
    suffixes=("_home", "_away")
)
merged.head()

Unnamed: 0,date_home,match_id,home_team_home,home_team_id,away_team_home,away_team_id,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,home_result,date_away,away_team_away,home_team_away,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,away_result
0,2023-08-12,26a7f90c,Arsenal,18bb7c10,Nott'ham Forest,e4a775cb,,,,,W,2023-08-12,Nott'ham Forest,Arsenal,,,,,L
1,2023-08-21,3b5ecd36,Arsenal,18bb7c10,Crystal Palace,47c64c55,2.0,1.0,3.0,,W,2023-08-21,Crystal Palace,Arsenal,1.0,0.0,3.0,,L
2,2023-08-26,d8f8f8ad,Arsenal,18bb7c10,Fulham,fd962109,1.5,0.5,3.0,,D,2023-08-26,Fulham,Arsenal,0.5,1.5,1.5,,D
3,2023-09-03,74125d47,Arsenal,18bb7c10,Manchester Utd,19538871,1.666667,1.0,2.333333,,W,2023-09-03,Manchester Utd,Arsenal,1.333333,1.333333,2.0,,L
4,2023-09-17,b1278924,Arsenal,18bb7c10,Everton,d3fd31cc,2.0,1.0,2.5,,W,2023-09-17,Everton,Arsenal,0.5,2.0,0.25,,L


In [23]:
final = merged[[
    "match_id",
    "date_home",
    "home_team_home", "away_team_away",

    # Home rolling stats
    "home_avg_gf_5",
    "home_avg_ga_5",
    "home_avg_pts_5",
    "home_form_score",

    # Away rolling stats
    "away_avg_gf_5",
    "away_avg_ga_5",
    "away_avg_pts_5",
    "away_form_score",

    # Final match label
    "home_result"
]]


In [24]:
final = final.rename(columns={
    "date_home": "date",
    "home_team_home": "home_team",
    "away_team_away": "away_team",
})
final.head()

Unnamed: 0,match_id,date,home_team,away_team,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,home_result
0,26a7f90c,2023-08-12,Arsenal,Nott'ham Forest,,,,,,,,,W
1,3b5ecd36,2023-08-21,Arsenal,Crystal Palace,2.0,1.0,3.0,,1.0,0.0,3.0,,W
2,d8f8f8ad,2023-08-26,Arsenal,Fulham,1.5,0.5,3.0,,0.5,1.5,1.5,,D
3,74125d47,2023-09-03,Arsenal,Manchester Utd,1.666667,1.0,2.333333,,1.333333,1.333333,2.0,,W
4,b1278924,2023-09-17,Arsenal,Everton,2.0,1.0,2.5,,0.5,2.0,0.25,,W


In [25]:
#print(final['home_team'].value_counts())
print(final['away_team'].value_counts())

away_team
Nott'ham Forest    83
Newcastle Utd      83
Arsenal            83
West Ham           83
Liverpool          83
Brighton           83
Aston Villa        83
Wolves             83
Crystal Palace     83
Brentford          83
Chelsea            83
Manchester City    83
Bournemouth        83
Tottenham          83
Everton            83
Manchester Utd     83
Fulham             83
Burnley            45
Sheffield Utd      38
Luton Town         38
Leicester City     38
Southampton        38
Ipswich Town       38
Leeds United        7
Sunderland          7
Name: count, dtype: int64


In [26]:
final.info()
final = pd.get_dummies(final, columns=['home_team', 'away_team'], drop_first=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   match_id         1660 non-null   object 
 1   date             1660 non-null   object 
 2   home_team        1660 non-null   object 
 3   away_team        1660 non-null   object 
 4   home_avg_gf_5    1635 non-null   float64
 5   home_avg_ga_5    1635 non-null   float64
 6   home_avg_pts_5   1635 non-null   float64
 7   home_form_score  1535 non-null   float64
 8   away_avg_gf_5    1635 non-null   float64
 9   away_avg_ga_5    1635 non-null   float64
 10  away_avg_pts_5   1635 non-null   float64
 11  away_form_score  1535 non-null   float64
 12  home_result      1660 non-null   object 
dtypes: float64(8), object(5)
memory usage: 168.7+ KB


In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
final['home_result'] = le.fit_transform(final['home_result']) # W=2, D=0, L=1
final

Unnamed: 0,match_id,date,home_avg_gf_5,home_avg_ga_5,home_avg_pts_5,home_form_score,away_avg_gf_5,away_avg_ga_5,away_avg_pts_5,away_form_score,...,away_team_Manchester City,away_team_Manchester Utd,away_team_Newcastle Utd,away_team_Nott'ham Forest,away_team_Sheffield Utd,away_team_Southampton,away_team_Sunderland,away_team_Tottenham,away_team_West Ham,away_team_Wolves
0,26a7f90c,2023-08-12,,,,,,,,,...,False,False,False,True,False,False,False,False,False,False
1,3b5ecd36,2023-08-21,2.000000,1.0,3.000000,,1.000000,0.000000,3.00,,...,False,False,False,False,False,False,False,False,False,False
2,d8f8f8ad,2023-08-26,1.500000,0.5,3.000000,,0.500000,1.500000,1.50,,...,False,False,False,False,False,False,False,False,False,False
3,74125d47,2023-09-03,1.666667,1.0,2.333333,,1.333333,1.333333,2.00,,...,False,True,False,False,False,False,False,False,False,False
4,b1278924,2023-09-17,2.000000,1.0,2.500000,,0.500000,2.000000,0.25,,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,3c399f5e,2025-08-30,0.600000,2.4,0.200000,0.200000,1.600000,0.400000,2.40,2.200000,...,False,False,False,False,False,False,False,False,False,False
1656,55fbbbe6,2025-09-13,1.000000,2.6,0.200000,0.133333,0.400000,1.000000,0.40,0.533333,...,False,False,True,False,False,False,False,False,False,False
1657,45650a33,2025-09-20,0.600000,2.0,0.200000,0.066667,0.250000,1.500000,1.00,,...,False,False,False,False,False,False,False,False,False,False
1658,116cff7d,2025-09-27,0.600000,2.4,0.000000,0.000000,2.000000,0.600000,2.00,1.733333,...,False,False,False,False,False,False,False,True,False,False


In [28]:
final.to_csv('../data/pl_matches_final.csv', index=False)

In [29]:
pl_matches = pl_matches[['date', 'home_team', 'opponent', 'avg_goals_last_5', 'avg_goals_against_last_5', 'avg_goals_points_last_5', 'form_score', 'result']]
pl_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      1660 non-null   object 
 1   home_team                 1660 non-null   object 
 2   opponent                  1660 non-null   object 
 3   avg_goals_last_5          1635 non-null   float64
 4   avg_goals_against_last_5  1635 non-null   float64
 5   avg_goals_points_last_5   1635 non-null   float64
 6   form_score                1535 non-null   float64
 7   result                    1660 non-null   object 
dtypes: float64(4), object(4)
memory usage: 103.9+ KB


In [None]:
pl_matches.fillna(0, inplace=True)
pl_matches = pd.get_dummies(pl_matches, columns=['home_team', 'opponent'], drop_first=True)

In [31]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
pl_matches['result'] = le.fit_transform(pl_matches['result']) # W=2, D=0, L=1
pl_matches

Unnamed: 0,date,avg_goals_last_5,avg_goals_against_last_5,avg_goals_points_last_5,form_score,result,home_team_Aston Villa,home_team_Bournemouth,home_team_Brentford,home_team_Brighton,...,opponent_Manchester City,opponent_Manchester Utd,opponent_Newcastle Utd,opponent_Nott'ham Forest,opponent_Sheffield Utd,opponent_Southampton,opponent_Sunderland,opponent_Tottenham,opponent_West Ham,opponent_Wolves
0,2023-08-12,0.000000,0.0,0.000000,0.000000,2,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,2023-08-21,2.000000,1.0,3.000000,0.000000,2,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2023-08-26,1.500000,0.5,3.000000,0.000000,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2023-09-03,1.666667,1.0,2.333333,0.000000,2,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,2023-09-17,2.000000,1.0,2.500000,0.000000,2,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,2025-08-30,0.600000,2.4,0.200000,0.200000,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1656,2025-09-13,1.000000,2.6,0.200000,0.133333,1,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1657,2025-09-20,0.600000,2.0,0.200000,0.066667,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1658,2025-09-27,0.600000,2.4,0.000000,0.000000,0,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [32]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../data/pl_matches_final.csv")

# Filter to recent data
df = df.sort_values("date")

# Fill rolling NaNs with team mean or 0
rolling_cols = [
    "home_avg_gf_5", "home_avg_ga_5", "home_avg_pts_5", "home_form_score",
    "away_avg_gf_5", "away_avg_ga_5", "away_avg_pts_5", "away_form_score"
]
df[rolling_cols] = df[rolling_cols].fillna(df[rolling_cols].mean())

# Ensure one-hot columns are ints 0/1
one_hot_cols = [c for c in df.columns if c.startswith("home_team_") or c.startswith("away_team_")]
df[one_hot_cols] = df[one_hot_cols].astype(int)

In [33]:
teams = pd.read_csv('../data/pl_teams.csv')
all_team_names = teams['home'].tolist()
import numpy as np

elo = {team: 1500 for team in all_team_names}
K = 20

elos = []

for _, row in df.iterrows():
    home = row.filter(like="home_team_").idxmax()[10:]  # team name
    away = row.filter(like="away_team_").idxmax()[10:]
    
    Rh = elo[home]
    Ra = elo[away]

    # Expected score
    Eh = 1 / (1 + 10 ** ((Ra - Rh) / 400))
    Ea = 1 - Eh

    result = row["home_result"]      # 0 draw, 1 loss, 2 win
    if result == 2: Sh, Sa = 1, 0
    elif result == 1: Sh, Sa = 0, 1
    else: Sh, Sa = 0.5, 0.5

    # Update ratings
    elo[home] = Rh + K * (Sh - Eh)
    elo[away] = Ra + K * (Sa - Ea)

    elos.append([elo[home], elo[away]])

df["home_elo"] = [h for h, a in elos]
df["away_elo"] = [a for h, a in elos]
df["diff_elo"] = df["home_elo"] - df["away_elo"]


In [34]:
df.to_csv("../data/pl_matches_final_cleaned.csv", index=False)

In [35]:
pl_matches.to_csv('../data/pl_clean.csv', index=False)