Bout de code pour fusionner les fichiers players_from_clubs.csv et players_profiles.csv en un seul fichier players_tm.csv

In [4]:
from pathlib import Path
import pandas as pd

# On accède au dossier du projet / Accessing the project folder / Se accede a la carpeta del proyecto
def _get_script_dir():
    try:
        return Path(__file__).resolve().parent
    except NameError:
        return Path.cwd()
        
# On crée players_tm.csv en joignant players_from_clubs.csv et players_profiles.csv sur player_id / We create players_tm.csv by joining players_from_clubs.csv and players_profiles.csv on player_id.
# Se crea players_tm.csv uniendo players_from_clubs.csv y players_profiles.csv en player_id
def merge_tm(join_type: str = "inner"):
    """Build ``players_tm.csv`` by joining club rosters with player profiles.

    Reads ``players_from_clubs.csv`` and ``players_profiles.csv`` from
    ``<script dir>/../../data/player``, joins them on ``player_id``, removes
    the rows whose ``club_name`` is in the exclusion list below, and writes
    the result to ``players_tm.csv`` in that same directory.

    Args:
        join_type: Join strategy passed to ``pandas.merge`` as ``how``
            (for example ``"inner"``, ``"left"`` or ``"outer"``).

    Returns:
        None. The merged table is written to disk and a short summary
        (confirmation line and row count) is printed.
    """
    # File locations.
    data_player_dir = _get_script_dir().parent.parent / "data" / "player"
    clubs_path = data_player_dir / "players_from_clubs.csv"
    profiles_path = data_player_dir / "players_profiles.csv"
    out_path = data_player_dir / "players_tm.csv"

    # Columns to keep from each input, and the column order of the output.
    keep_clubs = ["club_id", "player_id", "player_name", "position", "dateOfBirth", "age", "nationality", "height", "foot",
                  "joinedOn", "contract", "marketValue", "status"]
    keep_profiles = ["imageUrl", "position_other", "shirtNumber", "club_name", "agent_name", "outfitter"]
    final_order = ["player_id", "player_name", "nationality", "dateOfBirth", "age", "club_name", "club_id", "position",
                   "position_other", "height", "foot", "shirtNumber", "joinedOn", "contract", "marketValue", "imageUrl",
                   "agent_name", "outfitter", "status"]

    # Loading: every column is read as text.
    clubs_df = pd.read_csv(clubs_path, dtype=str, encoding="utf-8", low_memory=False)
    profiles_df = pd.read_csv(profiles_path, dtype=str, encoding="utf-8", low_memory=False)

    # Normalise the join key. The columns are already text, so only the
    # surrounding whitespace needs trimming. Casting with astype(str) would
    # turn missing ids into the literal string "nan", which would then be
    # treated as a real (and shared) player id.
    for df in (clubs_df, profiles_df):
        if "player_id" in df.columns:
            df["player_id"] = df["player_id"].str.strip()

    def ensure_and_select(df, wanted_cols):
        """Return a copy of ``df`` restricted to ``wanted_cols``, creating absent columns as NA."""
        absent = [col for col in wanted_cols if col not in df.columns]
        return df.assign(**{col: pd.NA for col in absent})[wanted_cols]

    clubs_df = ensure_and_select(clubs_df, keep_clubs)
    profiles_df = ensure_and_select(profiles_df, keep_profiles + ["player_id"])

    # Profiles without a usable id cannot be attributed to any player, and
    # pandas matches null keys against each other during a merge, so they are
    # removed before joining.
    has_id = profiles_df["player_id"].fillna("").ne("")
    profiles_df = profiles_df.loc[has_id]

    # Keep a single profile per player (the first one encountered).
    profiles_df = profiles_df.drop_duplicates(subset=["player_id"], keep="first")

    # Join.
    merged = pd.merge(clubs_df, profiles_df, on="player_id", how=join_type)

    # Final column order. Every name in final_order comes from keep_clubs or
    # keep_profiles, so all of them exist after the merge.
    merged = merged[final_order]

    # Clubs whose players are excluded from the output.
    EXCLUDE_CLUBS = {
        "1.FC Nuremberg", "A. Bielefeld II", "Arm. Bielefeld", "Benevento", "Bournemouth U21",
        "Cádiz CF", "Cardiff", "Carpi", "Catania", "CD Leganés", "Cesena", "Chievo Verona",
        "Córdoba CF", "Crotone", "Darmstadt 98", "Dep. La Coruña", "E. Braunschweig",
        "E. Frankfurt II", "F. Düsseldorf", "FC Bayern II", "FC Bayern U19", "FC Empoli",
        "FC Ingolstadt", "FC Schalke 04", "Frosinone", "FSV Mainz 05 II", "Granada CF",
        "Greuther Fürth", "Hannover 96", "Hannover 96 II", "Hannover 96 U19", "Hertha BSC",
        "Holst. Kiel II", "Holstein Kiel", "Huddersfield", "Hull City", "Ingolstadt II",
        "Ipswich", "Leicester", "Livorno", "Luton", "Málaga CF", "Middlesbrough", "Monza",
        "Norwich", "Nuremberg II", "Palermo", "Pescara", "QPR", "RB Leipzig U19", "Reading",
        "Real Sociedad B", "Real Valladolid", "Real Zaragoza", "Salernitana", "Sampdoria",
        "SC Paderborn", "SC Paderborn II", "SD Eibar", "SD Huesca", "Sevilla Atl.", "Sheff Utd",
        "Siena", "Southampton", "Spezia Calcio", "Sporting Atlco.", "Sporting Gijón",
        "Stoke City", "Stuttgart II", "Swansea", "UD Almería", "UD Las Palmas", "Venezia",
        "VfL Bochum", "VfL Bochum U19", "W. Bremen II", "Watford", "West Brom", "Wigan",
    }

    # Rows with no club_name are compared as "" and therefore kept.
    club_names = merged["club_name"].fillna("").str.strip()
    merged = merged.loc[~club_names.isin(EXCLUDE_CLUBS)].copy()

    # Save.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(out_path, index=False, encoding="utf-8")

    print("✅ Fichier créé")
    print(f"Nombre de lignes : {len(merged):,}")

# Run the merge with its default join type when this file is executed as a script.
if __name__ == "__main__":
    merge_tm()


✅ Fichier créé
Nombre de lignes : 2,061


Nombre de joueurs après pré-traitement (en retirant les joueurs hors Big 5 et ceux des équipes de jeunes) :
Avant : 3730
Après : 2061