In [1]:
import pandas as pd
import glob
import os

In [2]:
def safe_eval(x):
    """Evaluate a simple integer sum/difference written as text, best-effort.

    Rules, in order:
      * if the text form of ``x`` contains '+', the '+'-separated parts are
        converted with ``int`` and summed ('70+2' -> 72);
      * otherwise, if it contains '-', the parts after the first are summed
        and subtracted from the first ('70-2-3' -> 65);
      * otherwise ``int(x)`` is returned (so a float such as 7.9 becomes 7).

    Parameters
    ----------
    x : any
        Typically a cell value from a scraped table.

    Returns
    -------
    int or the original ``x``
        ``x`` is returned unchanged whenever a conversion fails, e.g. for
        names, missing values, or a value with a leading minus sign such as
        '-5' (its first operand is the empty string, which ``int`` rejects).
    """
    try:
        text = str(x)
        if '+' in text:
            return sum(int(part) for part in text.split('+'))
        if '-' in text:
            first, *rest = text.split('-')
            return int(first) - sum(int(part) for part in rest)
        return int(x)
    # Only conversion failures mean "not an expression". A bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit, which made a long
    # column-wise ``apply`` impossible to interrupt.
    except (ValueError, TypeError, OverflowError):
        return x


In [3]:
# Multipliers for the magnitude suffix on scraped euro amounts ('' = plain number).
_EUR_SUFFIX_MULTIPLIER = {'': 1, 'K': 1_000, 'M': 1_000_000}


def _leading_int(series):
    """Return the integer that starts each value (e.g. '193cm / ...' -> 193) as Int64.

    Values with no leading integer, including missing ones, become <NA>.
    """
    digits = series.astype(str).str.extract(r'^\s*(\d+)', expand=False)
    return pd.to_numeric(digits, errors='coerce').astype('Int64')


def _parse_eur(series):
    """Convert amounts such as '€500', '€5K', '€1.5M' or '€1,200' to whole euros (Int64).

    Values that do not look like ``<number>[K|M]`` become <NA>.
    """
    text = (series.astype(str)
            .str.replace('€', '', regex=False)
            .str.replace(',', '', regex=False)
            .str.strip())
    parts = text.str.extract(r'^(\d+(?:\.\d+)?)([KM]?)$')
    amount = pd.to_numeric(parts[0], errors='coerce')
    multiplier = pd.to_numeric(parts[1].map(_EUR_SUFFIX_MULTIPLIER),
                               errors='coerce')
    # Scale the numeric part instead of pasting zeros onto the text, so that
    # '1.5M' is 1_500_000 rather than 15_000_000. round() removes float noise
    # (1.1 * 1e6 == 1100000.0000000002) before the integer cast.
    return (amount * multiplier).round().astype('Int64')


def clean_player(ex):
    """Clean one scraped player table and return it as a new DataFrame.

    The input frame is not modified. Steps performed:
      * drop columns whose name starts with 'Unnamed';
      * strip the trailing run of capital letters from 'Name';
      * convert 'date' to ``datetime.date`` objects;
      * split 'Team & Contract' into position, jersey_num, contract_start
        and contract_end using the pattern ``POS(NUM)YYYY ~ YYYY``;
      * parse 'Height' / 'Weight' into height_cm / weight_kg and
        'Wage' / 'Value' into wage_eur / value_eur, then drop the raw columns;
      * run ``safe_eval`` over every column except 'date' so text such as
        '70+2' becomes a number;
      * put the identifying columns first, convert columns to numeric where
        the whole column allows it, and snake_case the column names.

    Parameters
    ----------
    ex : pandas.DataFrame
        Must contain 'Name', 'date', 'Team & Contract', 'Height', 'Weight',
        'Wage', 'Value', 'ID', 'Age', 'team' and 'foot'.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # .copy() after the column filter gives an independent frame, so the
    # assignments below never write into (or warn about) a slice of the input.
    ex = ex.loc[:, ~ex.columns.str.contains('^Unnamed')].copy()

    ex['Name'] = ex['Name'].str.replace(r'[A-Z]+$', '', regex=True)
    ex['date'] = pd.to_datetime(ex['date']).dt.date

    pat = r'(\w+)\((\d+)\)(\d{4}) ~ (\d{4})'
    contract = ex['Team & Contract'].astype(str).str.extract(pat)
    contract_cols = ['position', 'jersey_num', 'contract_start', 'contract_end']
    for new_name, group in zip(contract_cols, contract.columns):
        ex[new_name] = contract[group]

    ex['height_cm'] = _leading_int(ex['Height'])
    ex['weight_kg'] = _leading_int(ex['Weight'])
    ex['wage_eur'] = _parse_eur(ex['Wage'])
    ex['value_eur'] = _parse_eur(ex['Value'])

    ex = ex.drop(columns=['Height', 'Weight', 'Team & Contract', 'Value', 'Wage'])

    # 'date' is skipped: its ISO text form contains '-' and must not be
    # treated as a subtraction.
    for col in ex.columns:
        if col != 'date':
            ex[col] = ex[col].apply(safe_eval)

    main_cols = [
        'ID', 'date', 'Name', 'Age', 'height_cm', 'weight_kg', 'team',
        'contract_start', 'contract_end', 'position', 'foot', 'jersey_num',
        'wage_eur', 'value_eur'
    ]
    rest_cols = [col for col in ex.columns if col not in main_cols]
    ex = ex[main_cols + rest_cols].copy()

    # Best-effort: columns that are not entirely numeric (names, dates, ...)
    # are deliberately left as they are.
    for c in ex.columns:
        try:
            ex[c] = pd.to_numeric(ex[c])
        except (ValueError, TypeError):
            pass

    ex.columns = ex.columns.str.lower().str.replace(' ', '_', regex=False)

    return ex


In [4]:
# Load a single scraped player file, run it through clean_player, and display
# the result.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that drive layout exists. `csv` is also the name of a
# standard-library module; a different variable name would avoid confusion.
csv = pd.read_csv('G:\\My Drive\\GitHubProjects\\MLS\\data\\scraping\\players\\Atlanta-United-Apr-1,-2025.csv')

clean_csv = clean_player(csv)

# Bare last expression so the notebook renders the cleaned frame.
clean_csv

Unnamed: 0,id,date,name,age,height_cm,weight_kg,team,contract_start,contract_end,position,...,total_defending,defensive_awareness,standing_tackle,sliding_tackle,total_goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,164505,2025-04-01,B. Guzan,39,193,94,Atlanta United,2017,2025,GK,...,41,12,15,14,340,69,68,65,69,69
1,237662,2025-04-01,B. Lennon,26,177,70,Atlanta United,2019,2025,RB,...,191,63,65,63,37,6,6,10,8,7
2,216730,2025-04-01,S. Gregersen,29,192,83,Atlanta United,2024,2027,RCB,...,209,68,71,70,47,8,10,10,7,12
3,203398,2025-04-01,D. Williams,31,180,75,Atlanta United,2024,2025,LCB,...,200,67,66,67,42,7,9,6,10,10
4,256044,2025-04-01,Pedro Amador,25,181,73,Atlanta United,2024,2026,LB,...,199,64,68,67,48,8,10,11,12,7
5,243222,2025-04-01,B. Slisz,25,180,76,Atlanta United,2024,2028,RCM,...,198,66,67,65,44,14,5,12,8,5
6,192955,2025-04-01,M. Klich,34,183,84,Atlanta United,2024,2025,LCM,...,198,66,69,63,64,11,16,12,11,14
7,214092,2025-04-01,A. Miranchuk,28,182,74,Atlanta United,2024,2028,CAM,...,79,22,28,29,52,8,8,9,12,15
8,230977,2025-04-01,M. Almirón,30,174,70,Atlanta United,2025,2028,RW,...,169,61,59,49,53,6,9,13,13,12
9,237295,2025-04-01,E. Latte Lath,25,176,72,Atlanta United,2025,2029,ST,...,113,27,41,45,34,7,9,5,8,5


In [5]:
# List the scraped player CSVs using a pattern relative to the current working
# directory.
# NOTE(review): `files` is not used by the export loop in the following cell,
# which globs an absolute path instead — confirm whether this cell is needed.
files = glob.glob('data/scraping/players/*.csv')

In [6]:
# Clean every scraped player CSV and write it to the data_clean/players folder
# as 'cleaned_<original file name>'.
# NOTE(review): both paths are absolute and machine-specific; the output
# folder is assumed to exist already.
for file in glob.glob('G:/My Drive/GitHubProjects/MLS/data/scraping/players/*.csv'):
    df = pd.read_csv(file)
    cleaned_df = clean_player(df)
    # basename() keeps the '.csv' extension, so it must not be appended again
    # (previously the output was named 'cleaned_<name>.csv.csv').
    filename = os.path.basename(file)
    cleaned_df.to_csv(
        f"G:/My Drive/GitHubProjects/MLS/data/data_clean/players/cleaned_{filename}",
        index=False)