In [None]:
import pandas as pd
import numpy as np
from dataset_versioning import save_dataset_version 

In [12]:
# Read the cleaned dataset from disk (point file_path at your own copy if it lives elsewhere)
file_path = 'datasets_versions/EPL_dataset_7_20250226.csv'
data = pd.read_csv(file_path)

# Preview the first five rows to confirm the load worked
print(data.head(n=5))

    HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  AS  ...  \
0    Arsenal        Everton     2     1    0     1     1    1  26   5  ...   
1  Brentford      Newcastle     2     4    2     0     3    2  10  12  ...   
2   Brighton     Man United     0     2    2     0     0    1  17  11  ...   
3    Burnley  Nott'm Forest     1     2    2     0     2    2  20  12  ...   
4    Chelsea    Bournemouth     2     1    0     1     0    0  16  22  ...   

        HAS       HDS       AAS       ADS  Home_Overall  Away_Overall  \
0  1.319865  0.827642  0.763359  1.070707            82            77   
1  0.579125  0.586581  0.996384  1.077441            77            81   
2  0.861953  0.964243  1.076738  0.895623            77            82   
3  0.505051  0.948172  0.265167  0.545455            74            76   
4  1.090909  0.811571  0.498192  0.787879            81            74   

   TravelDistance  HomeProb  DrawProb  AwayProb  
0      288.384207  0.818569  0.119120  0.0

In [13]:
### Assign Points Based on FTR
def assign_points(row):
    """Translate one match row's full-time result into league points.

    Args:
        row: A mapping-like match record (e.g. a DataFrame row) exposing an
            'FTR' entry, where 0 marks a home win and 2 marks an away win.

    Returns:
        A ``(home_points, away_points)`` tuple: ``(3, 0)`` for a home win,
        ``(0, 3)`` for an away win, and ``(1, 1)`` for any other value,
        which is treated as a draw.
    """
    result = row['FTR']
    if result == 0:
        # Home side takes all three points
        return (3, 0)
    if result == 2:
        # Away side takes all three points
        return (0, 3)
    # Anything else is scored as a draw: one point each
    return (1, 1)

# Run assign_points on every row (axis=1). result_type='expand' splits each returned
# (home, away) tuple into two columns, which are stored as HomePoints and AwayPoints.
data[['HomePoints', 'AwayPoints']] = data.apply(assign_points, axis=1, result_type='expand')

In [14]:
# Recent form: average points over each team's previous (up to) 5 matches.
# HomeMomentum is built from the home team's earlier home matches only, and
# AwayMomentum from the away team's earlier away matches only.
#
# shift(1) drops the current match from its own window. The points for a row are
# derived from that row's FTR, so without the shift the feature would contain the
# very result it is meant to help predict (target leakage).
#
# A team's first home/away match has no history, so its momentum is NaN; handle
# those rows (impute or drop) before modelling.
# NOTE(review): assumes rows are in chronological order within each team - confirm.
data['HomeMomentum'] = data.groupby('HomeTeam')['HomePoints'].transform(
    lambda pts: pts.shift(1).rolling(5, min_periods=1).mean()
)
data['AwayMomentum'] = data.groupby('AwayTeam')['AwayPoints'].transform(
    lambda pts: pts.shift(1).rolling(5, min_periods=1).mean()
)

In [15]:
# Remove the intermediate HomePoints / AwayPoints columns from the frame, in place
data.drop(['HomePoints', 'AwayPoints'], axis='columns', inplace=True)

In [16]:
# Show the first 30 rows to inspect the frame after the momentum columns were added
print(data.head(n=30))

            HomeTeam          AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  \
0            Arsenal           Everton     2     1    0     1     1    1  26   
1          Brentford         Newcastle     2     4    2     0     3    2  10   
2           Brighton        Man United     0     2    2     0     0    1  17   
3            Burnley     Nott'm Forest     1     2    2     0     2    2  20   
4            Chelsea       Bournemouth     2     1    0     1     0    0  16   
5     Crystal Palace       Aston Villa     5     0    0     2     0    0  15   
6          Liverpool            Wolves     2     0    0     2     0    0  36   
7              Luton            Fulham     2     4    2     1     2    2  15   
8           Man City          West Ham     3     1    0     2     1    0  28   
9   Sheffield United         Tottenham     0     3    2     0     1    2   6   
10          Brighton           Chelsea     1     2    2     0     1    2  12   
11        Man United         Newcastle  

In [17]:
# Save the dataset, now including the new HomeMomentum and AwayMomentum columns, as version "8"
save_dataset_version(data, "8")

Dataset saved as: datasets_versions\EPL_dataset_8_20250227.csv
