In [None]:
import pandas as pd
import glob
import os 
from pathlib import Path

In [45]:
### if '+' present, add numbers, if '-' present, subtract numbers

def safe_eval(x):
    try:
        if '+' in str(x):
            return sum(int(i) for i in str(x).split('+'))
        elif '-' in str(x):
            return int(str(x).split('-')[0]) - sum(int(i) for i in str(x).split('-')[1:])
        else:
            return int(x)
    except:
        return x


In [46]:
def clean_player(ex):
    ex = ex.copy()
    
    ex = ex.loc[:, ~ex.columns.str.contains('^Unnamed')]
    
    ex.loc[:, 'Name'] = ex['Name'].str.replace(r'[A-Z]+$', '', regex=True)
    ex.loc[:, 'date'] = pd.to_datetime(ex['date']).dt.date
    
    pat = r'(\w+)\((\d+)\)(\d{4}) ~ (\d{4})'
    ex[['position', 'jersey_num', 'contract_start',
        'contract_end']] = ex.loc[:, 'Team & Contract'].astype(str).str.extract(pat)

    ex.loc[:, 'height_cm'] = ex['Height'].astype(str).str.split('cm ').str[0].astype('Int64')
    ex.loc[:, 'weight_kg'] = ex['Weight'].astype(str).str.split('kg ').str[0].astype('Int64')

    ex.loc[:, 'wage_eur'] = ex['Wage'].astype(str).str.replace('€', '').str.replace(
        ',', '').str.replace('K', '000').str.replace('M',
                                                     '000000').astype('Int64')

    ex.loc[:, 'value_eur'] = ex['Value'].astype(str).str.replace('€', '').str.replace(
        ',', '').str.replace('.', '').str.replace('K', '000').str.replace(
            'M', '000000').astype('Int64')
    
    
    ex.drop(columns=['Height', 'Weight', 'Team & Contract', 'Value', 'Wage'],
            inplace=True)
    
    ## safe eval everything but date
    for col in ex.columns:
        if col != 'date':
            ex[col] = ex[col].apply(safe_eval)

    
    main_cols = [
        'ID', 'date', 'Name', 'Age', 'height_cm', 'weight_kg', 'team',
        'contract_start', 'contract_end', 'position', 'foot', 'jersey_num',
        'wage_eur', 'value_eur'
    ]
    
    rest_cols = [col for col in ex.columns if col not in main_cols]
    ex = ex[main_cols + rest_cols]
    
    for c in ex.columns:
        try:
            ex[c] = pd.to_numeric(ex[c])
        except (ValueError, TypeError):
            pass 
    
    ex.columns = ex.columns.str.lower().str.replace(' ', '_')
    
    return ex


In [47]:
csv = pd.read_csv('../../data_files/scraped_raw/players/Atlanta-United-Apr-1,-2019.csv')

clean_csv = clean_player(csv)

clean_csv

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,total_defending,marking,standing_tackle,sliding_tackle,total_goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,164505,2019-04-01,B. Guzan,33,193,95,Atlanta United,2017,2023,GK,...,41,12,15,14,361,73,68,73,72,75
1,237223,2019-04-01,J. Gressel,24,186,84,Atlanta United,2017,2019,RWB,...,180,59,63,58,46,10,6,7,13,10
2,164839,2019-04-01,M. Parkhurst,34,180,72,Atlanta United,2016,2023,RCB,...,215,74,71,70,42,12,7,7,6,10
3,237255,2019-04-01,M. Robinson,21,188,84,Atlanta United,2017,2020,CB,...,163,46,59,58,39,13,7,9,5,5
4,199669,2019-04-01,L. González Pirez,26,185,80,Atlanta United,2017,2019,LCB,...,222,74,74,74,55,14,8,8,12,13
5,186536,2019-04-01,B. Shea,28,191,86,Atlanta United,2019,2019,LWB,...,171,61,51,59,57,16,7,13,12,9
6,164610,2019-04-01,J. Larentowicz,34,185,79,Atlanta United,2016,2019,RCM,...,197,65,67,65,68,16,16,12,10,14
7,228838,2019-04-01,E. Remedi,23,169,72,Atlanta United,2018,2023,LCM,...,200,65,71,64,58,14,14,11,9,10
8,226377,2019-04-01,G. Martínez,25,172,75,Atlanta United,2019,2020,RW,...,105,47,28,30,55,6,9,13,12,15
9,207877,2019-04-01,J. Martínez,25,170,69,Atlanta United,2017,2023,ST,...,55,20,20,15,60,12,14,14,12,8


In [48]:
### select if file is named "San-Diego-FC-*.csv"

files = glob.glob('../../data_files/scraped_raw/players/*.csv')
sd_files = [f for f in files if 'San-Diego-FC-' in f]



In [49]:
for file in sd_files:
    df = pd.read_csv(file)
    cleaned_df = clean_player(df)
    filename = os.path.basename(file)
    cleaned_df.to_csv(
        f"../../data_files/cleaned/players2/cleaned_{filename}.csv",
        index=False)

OSError: Cannot save file into a non-existent directory: '..\..\data_files\cleaned\players2'

In [None]:
### combine all cleaned team files into one master file 

input_path = Path('../../data_files/cleaned/players/')
output_path = Path('../../data_files/cleaned/players/master_players.csv')

csv_files = [f for f in os.listdir(input_path) if f.endswith('.csv')]
+9+

In [58]:
all_dataframes = []
for file in csv_files:
    file_path = os.path.join(input_path, file)
    try:
        df = pd.read_csv(file_path, engine="pyarrow")
        all_dataframes.append(df)
    except Exception as e:
        print(f"Skipping {file} due to: {e}")

master = pd.concat(all_dataframes, ignore_index=True)

Skipping cleaned_CF-Montréal-Jul-30,-2021.csv.csv due to: Empty CSV file
Skipping cleaned_Real-Salt-Lake-May-4,-2017.csv.csv due to: [Errno 22] Invalid argument


In [59]:
master.to_csv(output_path, index=False)

In [2]:
df = pd.read_csv('../../data_files/cleaned/players/master_players.csv')

df.head()

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,total_goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,marking,tactical_awareness,positioning,tackling
0,238919,2023-01-18,J. Pantemis,25.0,188,79,CF Montréal,2018.0,2023.0,GK,...,315,66,63,54,66,66,,,,
1,255956,2023-01-18,J. Waterman,26.0,185,75,CF Montréal,2020.0,2023.0,RCB,...,45,6,7,10,11,11,,,,
2,206485,2023-01-18,R. Camacho,31.0,185,79,CF Montréal,2022.0,2023.0,CB,...,49,8,6,6,15,14,,,,
3,247648,2023-01-18,K. Miller,25.0,183,88,CF Montréal,2021.0,2023.0,LCB,...,50,8,11,12,7,12,,,,
4,243574,2023-01-18,Z. Brault-Guillard,23.0,171,65,CF Montréal,2020.0,2022.0,RM,...,50,11,6,11,11,11,,,,


In [4]:
clubs = [
    "Atlanta United",
    "Austin FC",
    "CF Montréal",
    "Charlotte FC",
    "Chicago Fire FC",
    "FC Cincinnati",
    "Colorado Rapids",
    "Columbus Crew",
    "D.C. United",
    "FC Dallas",
    "Houston Dynamo FC",
    "Sporting Kansas City",
    "LA Galaxy",
    "Los Angeles Football Club",
    "Inter Miami CF",
    "Minnesota United FC",
    "Minnesota United",
    "Nashville SC",
    "New England Revolution",
    "New York City Football Club",
    "New York City FC",
    "New York Red Bulls",
    "Orlando City",
    "Philadelphia Union",
    "Portland Timbers",
    "Real Salt Lake",
    "San Diego FC",
    "San Jose Earthquakes",
    "Seattle Sounders FC",
    "St. Louis CITY SC",
    "Toronto FC",
    "Vancouver Whitecaps FC"
]

team_map = {
    "Atlanta United": "ATL",
    "Austin FC": "ATX",
    "CF Montréal": "MTL",
    "Charlotte FC": "CLT",
    "Chicago Fire FC": "CHI",
    "FC Cincinnati": "CIN",
    "Colorado Rapids": "COL",
    "Columbus Crew": "CLB",
    "D.C. United": "DC",
    "FC Dallas": "DAL",
    "Houston Dynamo FC": "HOU",
    "Sporting Kansas City": "SKC",
    "LA Galaxy": "LA",
    "Los Angeles Football Club": "LAFC",
    "Inter Miami CF": "MIA",
    "Minnesota United": "MIN",
    "Minnesota United FC": "MIN",
    "Nashville SC": "NSH",
    "New England Revolution": "NE",
    "New York City Football Club": "NYC",
    "New York City FC": "NYC",
    "New York Red Bulls": "RBNY",
    "Orlando City": "ORL",
    "Philadelphia Union": "PHI",
    "Portland Timbers": "POR",
    "Real Salt Lake": "RSL",
    "San Diego FC": "SD",
    "San Jose Earthquakes": "SJ",
    "Seattle Sounders FC": "SEA",
    "St. Louis CITY SC": "STL",
    "Toronto FC": "TOR",
    "Vancouver Whitecaps FC": "VAN"
}

In [5]:
### unique team names

df['team'].unique()

array(['CF Montréal', 'New York City FC', 'Colorado Rapids',
       'Inter Miami', 'DC United', 'Real Salt Lake',
       'Sporting Kansas City', 'FC Dallas', 'Chicago Fire', 'Toronto FC',
       'Nashville SC', 'Orlando City SC', 'New York Red Bulls',
       'Columbus Crew', 'FC Cincinnati', 'St. Louis CITY SC', 'LA Galaxy',
       'Minnesota United FC', 'Philadelphia Union',
       'Vancouver Whitecaps FC', 'Austin FC', 'Houston Dynamo',
       'New England Revolution', 'San Jose Earthquakes',
       'Portland Timbers', 'Atlanta United', 'Charlotte FC',
       'Los Angeles FC', 'Seattle Sounders FC', 'San Diego FC'],
      dtype=object)

In [6]:
### Rmove cf, fc, united, sc, football club, and strip from periods from team names and team mapping

df['team'] = df['team'].str.replace(r'\bCF\b', '', regex=True)
df['team'] = df['team'].str.replace(r'\bFC\b', '', regex=True)
df['team'] = df['team'].str.replace(r'\bUnited\b', '', regex=True)
df['team'] = df['team'].str.replace(r'\bSC\b', '', regex=True)
df['team'] = df['team'].str.replace(r'\bFootball Club\b', '', regex=True)
df['team'] = df['team'].str.replace(r'\.', '', regex=True)
df['team'] = df['team'].str.strip()
df['team'] = df['team'].str.lower()

df['team'].unique()

array(['montréal', 'new york city', 'colorado rapids', 'inter miami',
       'dc', 'real salt lake', 'sporting kansas city', 'dallas',
       'chicago fire', 'toronto', 'nashville', 'orlando city',
       'new york red bulls', 'columbus crew', 'cincinnati',
       'st louis city', 'la galaxy', 'minnesota', 'philadelphia union',
       'vancouver whitecaps', 'austin', 'houston dynamo',
       'new england revolution', 'san jose earthquakes',
       'portland timbers', 'atlanta', 'charlotte', 'los angeles',
       'seattle sounders', 'san diego'], dtype=object)

In [None]:
import re

def clean_team_name(s):
    if s is None:
        return s

    s = re.sub(r'\bCF\b', '', s)
    s = re.sub(r'\bFC\b', '', s)
    s = re.sub(r'\bUnited\b', '', s)
    s = re.sub(r'\bSC\b', '', s)
    s = re.sub(r'\bFootball Club\b', '', s)
    s = re.sub(r'\.', '', s)
    s = s.strip().lower()

    # collapse extra spaces created by removals
    s = re.sub(r'\s+', ' ', s)

    return s

team_map_clean = {
    clean_team_name(k): v
    for k, v in team_map.items()
}

In [8]:
### map team names to abbreviations

df['team_abbr'] = df['team'].map(team_map_clean)

df.head()

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,marking,tactical_awareness,positioning,tackling,team_abbr
0,238919,2023-01-18,J. Pantemis,25.0,188,79,montréal,2018.0,2023.0,GK,...,66,63,54,66,66,,,,,MTL
1,255956,2023-01-18,J. Waterman,26.0,185,75,montréal,2020.0,2023.0,RCB,...,6,7,10,11,11,,,,,MTL
2,206485,2023-01-18,R. Camacho,31.0,185,79,montréal,2022.0,2023.0,CB,...,8,6,6,15,14,,,,,MTL
3,247648,2023-01-18,K. Miller,25.0,183,88,montréal,2021.0,2023.0,LCB,...,8,11,12,7,12,,,,,MTL
4,243574,2023-01-18,Z. Brault-Guillard,23.0,171,65,montréal,2020.0,2022.0,RM,...,11,6,11,11,11,,,,,MTL


In [9]:
### nan in team_abbr

df['team_abbr'].isna().sum()

### change team_abbr to team

df['team'] = df['team_abbr']

## remove team_abbr

df.drop(columns=['team_abbr'], inplace=True)

df.head()

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,total_goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,marking,tactical_awareness,positioning,tackling
0,238919,2023-01-18,J. Pantemis,25.0,188,79,MTL,2018.0,2023.0,GK,...,315,66,63,54,66,66,,,,
1,255956,2023-01-18,J. Waterman,26.0,185,75,MTL,2020.0,2023.0,RCB,...,45,6,7,10,11,11,,,,
2,206485,2023-01-18,R. Camacho,31.0,185,79,MTL,2022.0,2023.0,CB,...,49,8,6,6,15,14,,,,
3,247648,2023-01-18,K. Miller,25.0,183,88,MTL,2021.0,2023.0,LCB,...,50,8,11,12,7,12,,,,
4,243574,2023-01-18,Z. Brault-Guillard,23.0,171,65,MTL,2020.0,2022.0,RM,...,50,11,6,11,11,11,,,,


In [10]:
### find nan in team 

df[df['team'].isna()]

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,total_goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,marking,tactical_awareness,positioning,tackling


In [12]:
### find unique team names again

df['team'].unique()

array(['MTL', 'NYC', 'COL', 'MIA', 'DC', 'RSL', 'SKC', 'DAL', 'CHI',
       'TOR', 'NSH', 'ORL', 'RBNY', 'CLB', 'CIN', 'STL', 'LA', 'MIN',
       'PHI', 'VAN', 'ATX', 'HOU', 'NE', 'SJ', 'POR', 'ATL', 'CLT',
       'LAFC', 'SEA', 'SD'], dtype=object)

In [11]:
### save df 

df.to_csv('../../data_files/cleaned/players/master_players_w_team_abbr.csv', index=False)