In [122]:
# ======================================
# Merge Standard + Misc Cleaned CSVs
# ======================================

import pandas as pd
import glob
import os
from tqdm import tqdm  # simple progress bar

In [123]:
!where python

c:\Users\eoind\anaconda3\envs\fbref_analytics\python.exe
C:\Users\eoind\AppData\Local\Microsoft\WindowsApps\python.exe


In [124]:
# --- SETTINGS ---
clean_folder = "../data/clean"   # Adjust if needed
save_folder = "../data/cleaned_and_merged"   # Where merged files will go

# --- Create merged folder if it doesn't exist ---
os.makedirs(save_folder, exist_ok=True)

In [125]:
# --- Find all cleaned CSVs ---
csv_files = glob.glob(os.path.join(clean_folder, "**", "*.csv"), recursive=True)
print(f"Found {len(csv_files)} cleaned CSVs.")


Found 106 cleaned CSVs.


In [126]:
# --- Group CSVs by league+season ---
# (Assuming filenames like: 'EPL_2020-2021_standard.csv' and 'EPL_2020-2021_misc.csv')
file_groups = dict()

for file in csv_files:
    filename = os.path.basename(file)
    key = filename.replace("_standard.csv", "").replace("_misc.csv", "")
    if key not in file_groups:
        file_groups[key] = {}
    if "standard" in filename:
        file_groups[key]["standard"] = file
    elif "misc" in filename:
        file_groups[key]["misc"] = file

print(f"Detected {len(file_groups)} seasons ready for merging.")

Detected 53 seasons ready for merging.


In [127]:
# --- Merge standard + misc ---
for key, files in tqdm(file_groups.items(), desc="Merging"):
    try:
        # Load both files
        df_standard = pd.read_csv(files.get("standard"))
        df_misc = pd.read_csv(files.get("misc"))

        # Merge on player and team
        df_merged = pd.merge(
            df_standard, df_misc,
            on=["player", "team"],
            how="outer",
            suffixes=("_standard", "_misc")
        )

        # Save
        save_path = os.path.join(save_folder, f"{key}_merged.csv")
        df_merged.to_csv(save_path, index=False)

    except Exception as e:
        print(f"⚠️ Failed merging {key}: {e}")

print("\n✅ All files merged and saved into 'merged/' folder!")

Merging: 100%|██████████| 53/53 [00:02<00:00, 17.70it/s]


✅ All files merged and saved into 'merged/' folder!





In [128]:
input_folder = input_folder = os.path.join("..", "data", "cleaned_and_merged")
csv_files = glob.glob(os.path.join(input_folder, "*.csv"))

print(f"Found {len(csv_files)} files:")
print(csv_files)
print(os.getcwd())

Found 53 files:
['..\\data\\cleaned_and_merged\\Bundesliga_2016-2017_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2017-2018_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2018-2019_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2019-2020_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2020-2021_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2021-2022_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2022-2023_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2023-2024_merged.csv', '..\\data\\cleaned_and_merged\\Bundesliga_2024-2025_merged.csv', '..\\data\\cleaned_and_merged\\Champions League_2016-2017_merged.csv', '..\\data\\cleaned_and_merged\\Champions League_2017-2018_merged.csv', '..\\data\\cleaned_and_merged\\Champions League_2018-2019_merged.csv', '..\\data\\cleaned_and_merged\\Champions League_2019-2020_merged.csv', '..\\data\\cleaned_and_merged\\Champions League_2020-2021_merged.csv', '..\\data\\cleaned_and_merged\\Champions Le

In [129]:
# Set your input and output folder paths
input_folder = os.path.join("..", "data", "cleaned_and_merged")
output_folder = os.path.join("..", "data", "cleaned_and_merged", "big5_standard_misc_merged")

# Make sure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Initialize an empty list to hold DataFrames
dfs = []

# Loop through all CSVs in the merged folder
for file_path in glob.glob(os.path.join(input_folder, "*.csv")):
    # Extract filename
    filename = os.path.basename(file_path)  # example: 'Bundesliga_2016-2017_merged.csv'
    parts = filename.split("_")  # ['Bundesliga', '2016-2017', 'merged.csv']
    
    # Extract league and season
    league = parts[0]
    season = parts[1]
    
    # Read CSV
    df = pd.read_csv(file_path)
    
    # Add new columns
    df['comp'] = league
    df['season'] = season
    
    # Add to list
    dfs.append(df)

# Concatenate everything
big5_standard_misc = pd.concat(dfs, ignore_index=True)

# Save the final DataFrame
output_path = os.path.join(output_folder, "big5_standard_misc.csv")
big5_standard_misc.to_csv(output_path, index=False)

print(f"Done! Combined shape: {big5_standard_misc.shape}")

Done! Combined shape: (30875, 37)


In [130]:
# checks
print(big5_standard_misc.isnull().sum())

player                              0
nationality_standard             1653
position_standard                   6
team                                0
age_standard                       14
birth_year_standard                14
matches_played                      0
starts                              0
minutes                             1
goals                               0
assists                             0
goals_and_assists                   0
non_penalty_goals                   0
pens_scored                         0
pens_attempted                      0
goals_per90                         1
assists_per90                       1
goals_and_assists_per90             1
non_penalty_goals_per90             1
non_penalty_goals_and_assists       1
country_code_standard              28
nationality_misc                 1653
position_misc                       6
age_misc                           14
birth_year_misc                    14
fouls_drawn                        21
offsides    

In [132]:
print(big5_standard_misc.duplicated(subset=["player", "team", "comp", "season"]).sum())
duplicates = big5_standard_misc[big5_standard_misc.duplicated(subset=["player", "team", "comp", "season"], keep=False)]

3


In [133]:
big5_standard_misc = big5_standard_misc.drop_duplicates()

In [134]:
print(duplicates)
print(duplicates["season"])

                 player nationality_standard position_standard   team  \
26209  Emanuele Torrasi                Italy                MF  Milan   
26210  Emanuele Torrasi                Italy                MF  Milan   
26211  Emanuele Torrasi                Italy                MF  Milan   
26212  Emanuele Torrasi                Italy                MF  Milan   

       age_standard  birth_year_standard  matches_played  starts  minutes  \
26209          18.0               1999.0               1       0      6.0   
26210          18.0               1999.0               1       0      6.0   
26211          18.0               1999.0               1       0      6.0   
26212          18.0               1999.0               1       0      6.0   

       goals  ...  pkwon  country_code_misc     comp     season  \
26209      0  ...    NaN                ITA  Serie A  2017-2018   
26210      0  ...    0.0                ITA  Serie A  2017-2018   
26211      0  ...    NaN                ITA  Se

In [136]:
big5_standard_misc = big5_standard_misc.drop_duplicates(subset=["player", "team", "comp", "season"], keep="first")
print(duplicates)

                 player nationality_standard position_standard   team  \
26209  Emanuele Torrasi                Italy                MF  Milan   
26210  Emanuele Torrasi                Italy                MF  Milan   
26211  Emanuele Torrasi                Italy                MF  Milan   
26212  Emanuele Torrasi                Italy                MF  Milan   

       age_standard  birth_year_standard  matches_played  starts  minutes  \
26209          18.0               1999.0               1       0      6.0   
26210          18.0               1999.0               1       0      6.0   
26211          18.0               1999.0               1       0      6.0   
26212          18.0               1999.0               1       0      6.0   

       goals  ...  pkwon  country_code_misc     comp     season  \
26209      0  ...    NaN                ITA  Serie A  2017-2018   
26210      0  ...    0.0                ITA  Serie A  2017-2018   
26211      0  ...    NaN                ITA  Se

In [137]:
dupe_rows = big5_standard_misc[
    (big5_standard_misc['player'] == 'Emanuele Torrasi') &
    (big5_standard_misc['team'] == 'Milan') &
    (big5_standard_misc['comp'] == 'Serie A') &
    (big5_standard_misc['season'] == '2017-2018')
]

print(dupe_rows)

                 player nationality_standard position_standard   team  \
26209  Emanuele Torrasi                Italy                MF  Milan   

       age_standard  birth_year_standard  matches_played  starts  minutes  \
26209          18.0               1999.0               1       0      6.0   

       goals  ...  pkwon  country_code_misc     comp     season  \
26209      0  ...    NaN                ITA  Serie A  2017-2018   

       xg_expected_goals  npxg_non_penalty_xg  xag_expected_assisted_goals  \
26209                NaN                  NaN                          NaN   

       npxg+xag  progressive_carries  prgp  
26209       NaN                  NaN   NaN  

[1 rows x 37 columns]


In [138]:
torrasi_dupes = big5_standard_misc[
    (big5_standard_misc['player'] == 'Emanuele Torrasi') &
    (big5_standard_misc['team'] == 'Milan') &
    (big5_standard_misc['comp'] == 'Serie A') &
    (big5_standard_misc['season'] == '2017-2018')
]
print(torrasi_dupes)
torrasi_indexes = torrasi_dupes.index
# Keep only the first, drop others
big5_standard_misc = big5_standard_misc.drop(index=torrasi_indexes[1:])

                 player nationality_standard position_standard   team  \
26209  Emanuele Torrasi                Italy                MF  Milan   

       age_standard  birth_year_standard  matches_played  starts  minutes  \
26209          18.0               1999.0               1       0      6.0   

       goals  ...  pkwon  country_code_misc     comp     season  \
26209      0  ...    NaN                ITA  Serie A  2017-2018   

       xg_expected_goals  npxg_non_penalty_xg  xag_expected_assisted_goals  \
26209                NaN                  NaN                          NaN   

       npxg+xag  progressive_carries  prgp  
26209       NaN                  NaN   NaN  

[1 rows x 37 columns]


In [None]:
print(duplicates)

                 player nationality_standard position_standard   team  \
20484  Emanuele Torrasi                Italy                MF  Milan   
20485  Emanuele Torrasi                Italy                MF  Milan   
20486  Emanuele Torrasi                Italy                MF  Milan   
20487  Emanuele Torrasi                Italy                MF  Milan   

       age_standard  birth_year_standard  matches_played  starts  minutes  \
20484          18.0               1999.0               1       0        6   
20485          18.0               1999.0               1       0        6   
20486          18.0               1999.0               1       0        6   
20487          18.0               1999.0               1       0        6   

       goals  ...  pkwon  country_code_misc   league     season  \
20484      0  ...    NaN                ITA  Serie A  2017-2018   
20485      0  ...    0.0                ITA  Serie A  2017-2018   
20486      0  ...    NaN                ITA  Se

In [None]:
# for project: Paths are always relative to where your script or notebook runs.

#If you move the whole project folder (e.g., "penalties_won"), it still works.

# ✅ Best for personal projects or version control (like GitHub).

In [139]:
big5_standard_misc.sample(10)
print(big5_standard_misc['comp'].unique())
print(big5_standard_misc['comp'].unique())
print(big5_standard_misc.dtypes)
print(big5_standard_misc.describe())

['Bundesliga' 'Champions League' 'EPL' 'La Liga' 'Ligue 1' 'Serie A']
['Bundesliga' 'Champions League' 'EPL' 'La Liga' 'Ligue 1' 'Serie A']
player                            object
nationality_standard              object
position_standard                 object
team                              object
age_standard                     float64
birth_year_standard              float64
matches_played                     int64
starts                             int64
minutes                          float64
goals                              int64
assists                            int64
goals_and_assists                  int64
non_penalty_goals                  int64
pens_scored                        int64
pens_attempted                     int64
goals_per90                      float64
assists_per90                    float64
goals_and_assists_per90          float64
non_penalty_goals_per90          float64
non_penalty_goals_and_assists    float64
country_code_standard             object

In [140]:
big5_standard_misc.head(5)

Unnamed: 0,player,nationality_standard,position_standard,team,age_standard,birth_year_standard,matches_played,starts,minutes,goals,...,pkwon,country_code_misc,comp,season,xg_expected_goals,npxg_non_penalty_xg,xag_expected_assisted_goals,npxg+xag,progressive_carries,prgp
0,Aaron Hunt,Germany,"FW,MF",Hamburger SV,29.0,1986.0,22,14,1302.0,4,...,1.0,GER,Bundesliga,2016-2017,,,,,,
1,Aaron Seydel,Germany,FW,Mainz 05,20.0,1996.0,6,1,176.0,1,...,0.0,GER,Bundesliga,2016-2017,,,,,,
2,Admir Mehmedi,Switzerland,"FW,MF",Leverkusen,25.0,1991.0,22,16,1258.0,3,...,0.0,SUI,Bundesliga,2016-2017,,,,,,
3,Adrián Ramos,Colombia,FW,Dortmund,30.0,1986.0,7,4,357.0,2,...,0.0,COL,Bundesliga,2016-2017,,,,,,
4,Albin Ekdal,Sweden,MF,Hamburger SV,27.0,1989.0,21,13,1247.0,1,...,0.0,SWE,Bundesliga,2016-2017,,,,,,
