In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from dataset_versioning import save_dataset_version 

In [None]:
# Load the cleaned dataset (Change to your specific file path)
file_path = 'datasets_versions/EPL_dataset_5_20250207.csv'
data = pd.read_csv(file_path)

# Check the first few rows
print(data.head())


    HomeTeam       AwayTeam  FTHG  FTAG  FTR  HTHG  HTAG  HTR  HS  AS  ...  \
0    Arsenal        Everton     2     1    0     1     1    1  26   5  ...   
1  Brentford      Newcastle     2     4    2     0     3    2  10  12  ...   
2   Brighton     Man United     0     2    2     0     0    1  17  11  ...   
3    Burnley  Nott'm Forest     1     2    2     0     2    2  20  12  ...   
4    Chelsea    Bournemouth     2     1    0     1     0    0  16  22  ...   

   AR  AvgH  AvgD   AvgA       HAS       HDS       AAS       ADS  \
0   0  1.17  8.04  15.37  1.319865  0.827642  0.763359  1.070707   
1   0  2.94  3.96   2.22  0.579125  0.586581  0.996384  1.077441   
2   0  2.17  4.11   2.95  0.861953  0.964243  1.076738  0.895623   
3   0  2.90  3.74   2.32  0.505051  0.948172  0.265167  0.545455   
4   0  1.45  5.32   5.99  1.090909  0.811571  0.498192  0.787879   

   Home_Overall  Away_Overall  
0            82            77  
1            77            81  
2            77           

In [8]:
# Coordinates for each EPL team's stadium (latitude, longitude)
stadium_coordinates = {
    "Arsenal": (51.554888, -0.108438),
    "Aston Villa": (52.509026, -1.884605),
    "Bournemouth": (50.735159, -1.838127),
    "Brentford": (51.490630, -0.288768),
    "Brighton": (50.861822, -0.083278),
    "Burnley": (53.789159, -2.230020),
    "Chelsea": (51.481663, -0.190956),
    "Crystal Palace": (51.398330, -0.085491),
    "Everton": (53.438787, -3.037248),
    "Fulham": (51.474774, -0.221551),
    "Leeds": (53.777637, -1.572243),
    "Leicester": (52.620370, -1.142196),
    "Liverpool": (53.430829, -2.960830),
    "Luton": (51.884327, -0.422142),
    "Man City": (53.483135, -2.200394),
    "Man United": (53.463059, -2.291340),
    "Newcastle": (54.975556, -1.621667),
    "Nott'm Forest": (52.939000, -1.132778),
    "Sheffield United": (53.370280, -1.470010),
    "Tottenham": (51.604301, -0.066880),
    "West Ham": (51.538709, -0.016575),
    "Wolves": (52.590144, -2.130939),
    "Southampton": (50.905875, -1.391195),
    "Watford": (51.650774, -0.401572),
    "West Brom": (52.509030, -1.964349),
    "Norwich": (52.622014, 1.308868)
}

In [None]:
# Haversine formula to calculate distance between two coordinates
def haversine(coord1, coord2):
    R = 6371  # earth radius in kms
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

# Calculate travel distance for each match
def calculate_travel_distance(row):
    home_team = row['HomeTeam']
    away_team = row['AwayTeam']
    if home_team in stadium_coordinates and away_team in stadium_coordinates:
        home_coord = stadium_coordinates[home_team]
        away_coord = stadium_coordinates[away_team]
        return haversine(home_coord, away_coord)
    else:
        return np.nan  # Return NaN if coordinates are not found

# Apply the function to calculate travel distance
data['TravelDistance'] = data.apply(calculate_travel_distance, axis=1)

# Check if the new feature is added
print(data[['HomeTeam', 'AwayTeam', 'TravelDistance']].head())


    HomeTeam       AwayTeam  TravelDistance
0    Arsenal        Everton      288.384207
1  Brentford      Newcastle      397.511501
2   Brighton     Man United      326.069081
3    Burnley  Nott'm Forest      119.316643
4    Chelsea    Bournemouth      141.818451


In [10]:
# Save the dataset with the new TravelDistance column
save_dataset_version(data, "travel_distance")

Dataset saved as: datasets_versions\EPL_dataset_travel_distance_20250224.csv
