# 02. Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw match data from the project's raw-data folder into `df`,
# the working frame that the cells below clean step by step.
raw_matches_path = '../data/raw/matches.csv'
df = pd.read_csv(raw_matches_path)

In [6]:
# Columns excluded from the cleaned dataset.
# Keys are column names; values record the rationale for dropping each one
# (documentation only -- the values are never read by the code below).

columns_to_remove = {
    'Season': 'Only one value (2023-24) - no variance',
    'Date': 'Specific dates not needed for prediction',
    'Time': 'Kickoff time has minimal impact',
    'Comp': 'All Premier League - single value',
    'Captain': 'Too granular, changes frequently',
    'Formation': 'Too complex for simple model',
    'Referee': 'Not a strong predictor',
    'Match Report': 'Just a URL/link',
    'Notes': 'Unstructured text field',
    'Attendance': 'Weak correlation with results'
}

# Report any requested name that is not a column of the frame (most likely a
# typo in the dict above), then drop all the names that do exist in one call.
unknown = [name for name in columns_to_remove if name not in df.columns]
for name in unknown:
    print(f"Incorrect name: {name}")

df = df.drop(columns=[name for name in columns_to_remove if name not in unknown])

In [8]:
# Canonical club names.
# Each key is a spelling that may appear in the data; its value is the single
# full name used from here on. Values that are not keys of this dict are left
# untouched by `replace`, so the identity entries (e.g. "Arsenal") only
# document that the name is already canonical.

team_name_map = {
    # --- Short forms / spelling variants -> full club name ---
    "Man Utd": "Manchester United",
    "ManchesterUnited": "Manchester United",
    "Man City": "Manchester City",
    "ManchesterCity": "Manchester City",
    "Spurs": "Tottenham Hotspur",
    "Tottenham": "Tottenham Hotspur",
    "Arsenal": "Arsenal",
    "Chelsea": "Chelsea",
    "Liverpool": "Liverpool",
    "Everton": "Everton",
    "Aston Villa": "Aston Villa",
    "Newcastle": "Newcastle United",
    "Newcastle Utd": "Newcastle United",
    "West Ham": "West Ham United",
    "West Ham Utd": "West Ham United",
    "Brighton": "Brighton and Hove Albion",
    "Brighton & Hove Albion": "Brighton and Hove Albion",
    "Bournemouth": "AFC Bournemouth",
    "Sheffield Utd": "Sheffield United",
    "Sheffield United": "Sheffield United",
    "Wolves": "Wolverhampton Wanderers",
    "Wolverhampton": "Wolverhampton Wanderers",
    "Nottingham Forest": "Nottingham Forest",
    "Nottm Forest": "Nottingham Forest",
    "Fulham": "Fulham",
    "Brentford": "Brentford",
    "Crystal Palace": "Crystal Palace",

    # --- Promoted clubs, each mapped onto a relegated club ---
    # NOTE(review): these entries rewrite one club's name into a *different*
    # club's name. Presumably a deliberate stand-in so promoted sides reuse a
    # relegated side's history -- confirm this is intended before relying on it.
    "Ipswich": "Luton Town",
    "Ipswich Town": "Luton Town",
    "Leicester": "Burnley",
    "Leicester City": "Burnley",
    "Southampton": "Sheffield United",

    # --- Relegated clubs / remaining variants ---
    "Luton": "Luton Town",
    "LutonTown": "Luton Town",
    "Burnley": "Burnley"
}

# Apply the same mapping to both sides of every fixture so a club is spelled
# identically whether it appears as the team or as the opponent.
for side in ("Team", "Opponent"):
    df[side] = df[side].replace(team_name_map)

In [11]:
# Remove the "Unnamed: 0" column (presumably a row index that was written
# into the CSV by an earlier export -- it carries no match information).
# Rebinding `df` with errors="ignore" instead of dropping in place keeps this
# cell safe to re-run: once the column is gone, a second run is a no-op
# rather than a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [15]:
# Extracting number from round
#
# Strip the literal "Matchweek " prefix and store the remaining number as an
# integer.
#   * astype(str) first makes the cell safe to re-run: after one run the
#     column is already integer-typed and would no longer have a .str accessor.
#   * regex=False pins literal matching; the default for this argument differs
#     between pandas major versions.
# A value that is not a plain number after stripping (including a missing
# value) still fails loudly in the final astype(int).

df["Round"] = (
    df["Round"]
    .astype(str)
    .str.replace("Matchweek ", "", regex=False)
    .astype(int)
)

In [16]:
# Preview the first five rows of the cleaned frame as a sanity check.
df.head(n=5)

Unnamed: 0,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Sh,SoT,Dist,FK,PK,PKatt,Team
0,1,Fri,Away,W,3,0,Burnley,1.9,0.3,65.0,17.0,8.0,13.9,0.0,0,0,Manchester City
1,2,Sat,Home,W,1,0,Newcastle United,1.0,0.3,59.0,14.0,4.0,17.9,0.0,0,0,Manchester City
2,3,Sun,Away,W,2,1,Sheffield United,3.5,0.7,79.0,29.0,9.0,17.3,2.0,0,1,Manchester City
3,4,Sat,Home,W,5,1,Fulham,2.2,1.4,68.0,6.0,4.0,14.8,0.0,1,1,Manchester City
4,5,Sat,Away,W,3,1,West Ham United,3.6,0.9,68.0,29.0,13.0,16.4,1.0,0,0,Manchester City


In [17]:
# Persist the cleaned frame for later steps. index=False keeps the row index
# out of the file, so no extra unnamed column appears when it is read back.

cleaned_dataset_path = '../data/processed/dataset_cleaned.csv'
df.to_csv(cleaned_dataset_path, index=False)