Nettoyage : suppression ou traitement des valeurs manquantes et incohérentes.


Standardisation : harmonisation des unités, formats de date et noms de colonnes pour garantir la cohérence et la qualité des données.


In [1]:
import os
import pandas as pd
import numpy as np

# === Paths ===
base_path = "../Data/Bronze"
cleaned_path = "../Data/Silver"
os.makedirs(cleaned_path, exist_ok=True)

# Chemin d'acces au dossier
teams = os.listdir(base_path)


# === Fonction pour netoyer les espaces et  espaces spéciaux par espaces normales===
def clean_string(text):
    if isinstance(text, str):
        return text.strip().replace("\n", " ").replace("\xa0", " ")
    return text


# === boucle sur chaque equipe===
for team_folder in teams:
    team_path = os.path.join(base_path, team_folder)
    if not os.path.isdir(team_path):
        continue

    print(f"\n Cleaning data for team: {team_folder}")

    #  1. Nettoyade des match

    matches_path = os.path.join(team_path, "matches.csv")
    if os.path.exists(matches_path):
        matches = pd.read_csv(matches_path)

        # --- Nettoyer les noms de colonne ---
        matches.columns = [c.strip().replace("\n", "_").replace(" ", "_") for c in matches.columns]
        matches = matches.applymap(clean_string)

        # --- Remplacer les valeurs vides par np.nan = NaN---
        matches.replace(["", "NaN", "None", "-", "--"], np.nan, inplace=True)

        # --- Laisse juste la premier data  eneleve la data qui est entre ()
        if "Time" in matches.columns:
            matches["Time"] = matches["Time"].astype(str).str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()

        #  Combine Date + Time dans  datetime 
        if {"Date", "Time"}.issubset(matches.columns):
            matches["datetime"] = pd.to_datetime(
                matches["Date"].astype(str) + " " + matches["Time"].astype(str),
                errors="coerce"
            )
            matches.drop(["Date", "Time"], axis=1, inplace=True)

        # --- Standardize remplace matchweek par week round proper par round ---
        if "Round" in matches.columns:
            matches["Round"] = matches["Round"].str.replace("round proper", "Round", case=False)
            matches["Round"] = matches["Round"].str.replace("Matchweek", "Week", case=False)
            matches["Round"] = matches["Round"].str.strip()

        # --- Nettoyer les colonnes GF et GA en supprimant le contenu entre parenthèses remplace les valeur manquantes par 0 convert valeur en entier ---
        for col in ["GF", "GA"]:
            if col in matches.columns:
                matches[col] = matches[col].astype(str).str.extract(r"^(\d+)")[0]
                matches[col] = pd.to_numeric(matches[col], errors="coerce").fillna(0).astype(int)

        # nettoie la colonne "Attendance" en enlevant les virgules, en transformant les valeurs en nombres entiers, et en remplaçant les valeurs manquantes par 0.
        if "Attendance" in matches.columns:
            matches["Attendance"] = (
                matches["Attendance"]
                .astype(str)
                .str.replace(",", "", regex=False)
                .replace("nan", np.nan)
            )
            matches["Attendance"] = pd.to_numeric(matches["Attendance"], errors="coerce").fillna(0).astype(int)

        # --- Renomer opp formation column  ---
        for col in matches.columns:
            if "opp formation" in col.lower():  
                matches.rename(columns={col: "Opp_Formation"}, inplace=True)

        # --- Convertir au numeric column et renplacer manquante par 0 ---
        numeric_cols = ["xG", "xGA" , "Poss"]
        for col in numeric_cols:
            if col in matches.columns:
                matches[col] = pd.to_numeric(matches[col], errors="coerce").fillna(0)

        # --- Handle les colonne categorical ---
        for cat_col in ["Referee", "Captain", "Opponent", "Venue", "Result", "Comp", "Round"]:
            if cat_col in matches.columns:
                matches[cat_col] = matches[cat_col].fillna("Unknown")

        # --- Enregistrer les match ---
        clean_team_path = os.path.join(cleaned_path, team_folder)
        os.makedirs(clean_team_path, exist_ok=True)
        matches.to_csv(os.path.join(clean_team_path, "matches_cleaned.csv"), index=False)
        print(f" Matches cleaned for {team_folder}")

    # 2. Nettoyer Equipes

    players_path = os.path.join(team_path, "players.csv")
    if os.path.exists(players_path):
        players = pd.read_csv(players_path)

        # --- Nettoyage les noms colonnes  ---
        players.columns = [c.strip().replace("\n", "_").replace(" ", "_") for c in players.columns]
        players = players.applymap(clean_string)

        # --- Gerer les valeurs manquantes ---
        players.replace(["", "NaN", "None", "-", "--"], np.nan, inplace=True)

        #  Extracter depuis nation les 3lettres majuscules
        if "Nation" in players.columns:
            players["Nation"] = players["Nation"].str.extract(r"([A-Z]{3})")


        
        # --- Convertire les numeriques columns ---
        numeric_cols = ["Age", "MP", "Starts", "Min", "90s", "Gls", "Ast", "G-PK", "PK", "PKatt", "CrdY", "CrdR"]
        for col in numeric_cols:
            if col in players.columns:
                players[col] = pd.to_numeric(players[col], errors="coerce").fillna(0)

        # --- Gerer les valeurs categorical manquantes  ---
        for cat_col in ["Player", "Nation", "Pos"]:
            if cat_col in players.columns:
                players[cat_col] = players[cat_col].fillna("Unknown")

        # --- Enregistrer les equipes nettoyer ---
        clean_team_path = os.path.join(cleaned_path, team_folder)
        os.makedirs(clean_team_path, exist_ok=True)
        players.to_csv(os.path.join(clean_team_path, "players_cleaned.csv"), index=False)
        print(f" Players cleaned for {team_folder}")

print("\n All teams cleaned and saved successfully to:", cleaned_path)



 Cleaning data for team: Arsenal
 Matches cleaned for Arsenal
 Players cleaned for Arsenal

 Cleaning data for team: Aston Villa
 Matches cleaned for Aston Villa
 Players cleaned for Aston Villa

 Cleaning data for team: Bournemouth
 Matches cleaned for Bournemouth
 Players cleaned for Bournemouth

 Cleaning data for team: Brentford


  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)


 Matches cleaned for Brentford
 Players cleaned for Brentford

 Cleaning data for team: Brighton
 Matches cleaned for Brighton
 Players cleaned for Brighton

 Cleaning data for team: Chelsea
 Matches cleaned for Chelsea
 Players cleaned for Chelsea

 Cleaning data for team: Crystal Palace


  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)


 Matches cleaned for Crystal Palace
 Players cleaned for Crystal Palace

 Cleaning data for team: Everton
 Matches cleaned for Everton
 Players cleaned for Everton

 Cleaning data for team: Fulham
 Matches cleaned for Fulham


  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)


 Players cleaned for Fulham

 Cleaning data for team: Ipswich Town
 Matches cleaned for Ipswich Town
 Players cleaned for Ipswich Town

 Cleaning data for team: Leicester City
 Matches cleaned for Leicester City
 Players cleaned for Leicester City

 Cleaning data for team: Liverpool
 Matches cleaned for Liverpool


  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)


 Players cleaned for Liverpool

 Cleaning data for team: Manchester City
 Matches cleaned for Manchester City
 Players cleaned for Manchester City

 Cleaning data for team: Manchester Utd
 Matches cleaned for Manchester Utd
 Players cleaned for Manchester Utd

 Cleaning data for team: Newcastle Utd
 Matches cleaned for Newcastle Utd
 Players cleaned for Newcastle Utd

 Cleaning data for team: Nott'ham Forest


  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)


 Matches cleaned for Nott'ham Forest
 Players cleaned for Nott'ham Forest

 Cleaning data for team: Southampton
 Matches cleaned for Southampton
 Players cleaned for Southampton

 Cleaning data for team: Tottenham
 Matches cleaned for Tottenham
 Players cleaned for Tottenham

 Cleaning data for team: West Ham
 Matches cleaned for West Ham
 Players cleaned for West Ham

 Cleaning data for team: Wolves
 Matches cleaned for Wolves
 Players cleaned for Wolves

 All teams cleaned and saved successfully to: ../Data/Silver


  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
