In [64]:
import pandas as pd
import numpy as np

In [65]:
pl_matches = pd.read_csv('../data/pl_matches.csv')
print(pl_matches.info())
pl_matches.value_counts('match_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          140 non-null    object 
 1   time          140 non-null    object 
 2   match_id      140 non-null    object 
 3   league_name   140 non-null    object 
 4   league_id     140 non-null    int64  
 5   opponent      140 non-null    object 
 6   opponent_id   140 non-null    object 
 7   home_away     140 non-null    object 
 8   result        140 non-null    object 
 9   gf            140 non-null    float64
 10  ga            140 non-null    float64
 11  attendance    140 non-null    object 
 12  captain       140 non-null    object 
 13  formation     140 non-null    object 
 14  referee       140 non-null    object 
 15  home_team     140 non-null    object 
 16  home_team_id  140 non-null    object 
dtypes: float64(2), int64(1), object(14)
memory usage: 18.7+ KB
None


match_id
0050df89    2
0701e218    2
0bad74e2    2
0e30cf4b    2
1163f46d    2
           ..
f7e2c9c7    2
f96a49ea    2
fa03feeb    2
fc45a8b2    2
fed27aaa    2
Name: count, Length: 70, dtype: int64

In [66]:
# pl_matches = pl_matches[pl_matches['league_id'] == 9]
# pl_matches.to_csv('../data/pl_matches.csv', index=False)

# Remove duplicates
pl_matches = pl_matches.drop_duplicates(subset='match_id', keep='first')
pl_matches.reset_index(drop=True, inplace=True)
print(pl_matches.info())
pl_matches.value_counts('match_id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          70 non-null     object 
 1   time          70 non-null     object 
 2   match_id      70 non-null     object 
 3   league_name   70 non-null     object 
 4   league_id     70 non-null     int64  
 5   opponent      70 non-null     object 
 6   opponent_id   70 non-null     object 
 7   home_away     70 non-null     object 
 8   result        70 non-null     object 
 9   gf            70 non-null     float64
 10  ga            70 non-null     float64
 11  attendance    70 non-null     object 
 12  captain       70 non-null     object 
 13  formation     70 non-null     object 
 14  referee       70 non-null     object 
 15  home_team     70 non-null     object 
 16  home_team_id  70 non-null     object 
dtypes: float64(2), int64(1), object(14)
memory usage: 9.4+ KB
None


match_id
0050df89    1
0701e218    1
0bad74e2    1
0e30cf4b    1
1163f46d    1
           ..
f7e2c9c7    1
f96a49ea    1
fa03feeb    1
fc45a8b2    1
fed27aaa    1
Name: count, Length: 70, dtype: int64

In [67]:
# Derive new columns
pl_matches['goal_diff'] = pl_matches['gf'] - pl_matches['ga']
pl_matches['points'] = pl_matches['result'].map({'W': 3, 'D': 1, 'L': 0})

In [68]:
pl_matches = pl_matches.sort_values(by=['home_team', 'date'], ascending=[True, True])
pl_matches.reset_index(drop=True, inplace=True)
pl_matches.head()

Unnamed: 0,date,time,match_id,league_name,league_id,opponent,opponent_id,home_away,result,gf,ga,attendance,captain,formation,referee,home_team,home_team_id,goal_diff,points
0,2025-08-23,17:30,e8724659,Premier League,9,Leeds United,5bfb9659,Home,W,5.0,0.0,60110,Martin Ødegaard,4-3-3,Jarred Gillett,Arsenal,18bb7c10,5.0,3
1,2025-09-13,12:30,4fe3e679,Premier League,9,Nott'ham Forest,e4a775cb,Home,W,3.0,0.0,60167,Martin Ødegaard,4-3-3,Darren England,Arsenal,18bb7c10,3.0,3
2,2025-09-21,16:30,2045939c,Premier League,9,Manchester City,b8fd03ef,Home,D,1.0,1.0,60161,Gabriel Magalhães,4-3-3,Stuart Attwell,Arsenal,18bb7c10,0.0,1
3,2025-08-16,12:30,bbdf4739,Premier League,9,Newcastle Utd,b2b47a98,Home,D,0.0,0.0,42526,John McGinn,4-2-3-1,Craig Pawson,Aston Villa,8602292d,0.0,1
4,2025-09-21,14:00,4e615d10,Premier League,9,Sunderland,8ef52968,Away,D,1.0,1.0,46261,John McGinn,4-2-3-1,Samuel Barrott,Aston Villa,8602292d,0.0,1


In [69]:
# Create rolling stats
pl_matches['avg_goals_last_5'] = pl_matches.groupby('home_team')['gf'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())
pl_matches['avg_goals_against_last_5'] = pl_matches.groupby('home_team')['ga'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())
pl_matches['avg_goals_points_last_5'] = pl_matches.groupby('home_team')['points'].transform(lambda x: x.shift().rolling(window=5, min_periods=1).mean())

In [70]:
def weighted_form_window(x):
    # x is already the rolling window Series
    n = len(x)
    weights = np.arange(1, n+1)  # increasing weights for older → newer
    return np.average(x, weights=weights)

pl_matches['form_score'] = (
    pl_matches.groupby('home_team')['points']
      .transform(lambda x: x.shift().rolling(3, min_periods=1).apply(weighted_form_window, raw=True))
)

In [71]:
# reorder columns
pl_matches = pl_matches[['home_team', 'opponent','gf', 'ga', 'avg_goals_last_5', 'avg_goals_against_last_5', 'avg_goals_points_last_5', 'form_score', 'result']]
pl_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   home_team                 70 non-null     object 
 1   opponent                  70 non-null     object 
 2   gf                        70 non-null     float64
 3   ga                        70 non-null     float64
 4   avg_goals_last_5          50 non-null     float64
 5   avg_goals_against_last_5  50 non-null     float64
 6   avg_goals_points_last_5   50 non-null     float64
 7   form_score                16 non-null     float64
 8   result                    70 non-null     object 
dtypes: float64(6), object(3)
memory usage: 5.1+ KB


In [72]:
pl_matches.fillna(0, inplace=True)
pl_matches = pd.get_dummies(pl_matches, columns=['home_team', 'opponent'], drop_first=True)
pl_matches

Unnamed: 0,gf,ga,avg_goals_last_5,avg_goals_against_last_5,avg_goals_points_last_5,form_score,result,home_team_Aston Villa,home_team_Bournemouth,home_team_Brentford,...,opponent_Leeds United,opponent_Liverpool,opponent_Manchester City,opponent_Manchester Utd,opponent_Newcastle Utd,opponent_Nott'ham Forest,opponent_Sunderland,opponent_Tottenham,opponent_West Ham,opponent_Wolves
0,5.0,0.0,0.000000,0.000000,0.0,0.0,W,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,3.0,0.0,5.000000,0.000000,3.0,0.0,W,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1.0,1.0,4.000000,0.000000,3.0,0.0,D,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,0.0,0.0,0.000000,0.000000,0.0,0.0,D,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,1.0,1.0,0.000000,0.000000,1.0,0.0,D,True,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0.0,2.0,0.333333,3.666667,0.0,0.0,L,False,False,False,...,False,False,False,False,False,False,False,False,False,False
66,0.0,4.0,0.000000,0.000000,0.0,0.0,L,False,False,False,...,False,False,True,False,False,False,False,False,False,False
67,2.0,3.0,0.000000,4.000000,0.0,0.0,L,False,False,False,...,False,False,False,False,False,False,False,False,False,False
68,1.0,3.0,1.000000,3.500000,0.0,0.0,L,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [73]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
pl_matches['result'] = le.fit_transform(pl_matches['result']) # W=2, D=0, L=1

In [None]:
# pl_matches.to_csv('../data/pl_clean.csv', index=False)