In [None]:
import glob
import os
import re

import pandas as pd

In [None]:
# safe_eval: if a value contains '+', add the numbers; if it contains '-', subtract them; anything that cannot be parsed is returned unchanged.

# A whole string made of integer terms joined by single '+'/'-' operators,
# with an optional leading sign and optional whitespace around each token.
_INT_EXPRESSION = re.compile(r'\s*[+-]?\s*\d+(?:\s*[+-]\s*\d+)*\s*')
# One term together with the sign in front of it (whitespace already removed).
_SIGNED_TERM = re.compile(r'[+-]?\d+')


def safe_eval(x):
    """Evaluate a simple integer add/subtract expression, else return ``x``.

    Strings
        A string consisting only of integer terms joined by ``+`` / ``-``
        (for example ``'70+2'``, ``'70-2'``, ``'70+2-1'``, ``'-5'``, ``'12'``)
        is evaluated and returned as an ``int``. Any other string is returned
        unchanged.
        NOTE: a string such as ``'2025-04-01'`` is itself a valid subtraction
        and is evaluated (to 2020); keep date-like text away from this
        function if it must survive.

    Non-strings
        The value is returned as an ``int`` only when the conversion is
        lossless (``70.0`` -> ``70``, integer-like scalars -> ``int``).
        Everything else (non-integral floats, NaN, ``None``, ``pd.NA``,
        ``datetime.date`` objects, ...) is returned unchanged. The value is
        never converted to text first, so objects whose string form happens
        to contain ``-`` (dates, negative numbers) are not mistaken for
        arithmetic.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        The evaluated ``int`` or the original value.
    """
    if isinstance(x, str):
        if _INT_EXPRESSION.fullmatch(x) is None:
            return x
        # Removing whitespace glues each sign to its term ('70 - 2' -> '70-2'),
        # so summing the signed terms evaluates the expression.
        compact = ''.join(x.split())
        return sum(int(term) for term in _SIGNED_TERM.findall(compact))

    try:
        as_int = int(x)
        # Only accept the conversion when nothing was lost (3.7 stays 3.7).
        return as_int if as_int == x else x
    except (TypeError, ValueError, OverflowError):
        # No integer form exists (None, NaN, inf, pd.NA, dates, ...).
        return x


In [None]:
# Multiplier for the optional magnitude suffix of a euro amount.
_EUR_MULTIPLIER = {'': 1, 'K': 1_000, 'M': 1_000_000}


def _require_parsed(raw, parsed, label):
    """Return ``parsed``; raise ValueError if a non-missing raw value failed."""
    failed = raw.notna() & parsed.isna()
    if failed.any():
        examples = raw[failed].astype(str).unique()[:5].tolist()
        raise ValueError(f"Unparseable {label} value(s): {examples}")
    return parsed


def _measure_to_int(series, unit, label):
    """Parse '180cm / ...' style text (or a bare integer) into nullable Int64."""
    pattern = r'^\s*(\d+)\s*(?:' + unit + r'.*)?$'
    digits = series.astype(str).str.extract(pattern, expand=False)
    return _require_parsed(series, pd.to_numeric(digits).astype('Int64'), label)


def _eur_to_int(series, label):
    """Parse euro amounts such as '€500K', '€1.5M', '€1,200' into Int64."""
    text = (series.astype(str)
            .str.replace('€', '', regex=False)
            .str.replace(',', '', regex=False)
            .str.strip())
    parts = text.str.extract(r'^(\d+(?:\.\d+)?)([KM]?)$')
    # Scale the numeric part instead of appending zeros to the text, so a
    # decimal amount keeps its magnitude ('1.5M' -> 1_500_000).
    amount = pd.to_numeric(parts[0]) * parts[1].map(_EUR_MULTIPLIER)
    return _require_parsed(series, amount.round().astype('Int64'), label)


def clean_player(ex):
    """Clean one scraped player table and return a new, tidy DataFrame.

    Steps performed on a copy of ``ex`` (the input is not modified):

    * drop columns whose name starts with ``Unnamed``;
    * strip a trailing run of uppercase letters from ``Name``
      (presumably a position code glued to the name - TODO confirm);
    * convert ``date`` to ``datetime.date`` objects;
    * split ``Team & Contract`` into ``position``, ``jersey_num``,
      ``contract_start`` and ``contract_end``;
    * derive ``height_cm`` / ``weight_kg`` from ``Height`` / ``Weight`` and
      ``wage_eur`` / ``value_eur`` from ``Wage`` / ``Value`` (nullable
      ``Int64``; missing inputs become ``<NA>``), then drop the source columns;
    * run ``safe_eval`` over every cell except the ``date`` column, so text
      such as ``'70+2'`` becomes ``72``;
    * put the identifying columns first, convert columns to numeric dtypes
      where possible, and normalise column names to lower snake case.

    Parameters
    ----------
    ex : pandas.DataFrame
        Raw table. Must contain ``ID``, ``date``, ``Name``, ``Age``, ``team``,
        ``foot``, ``Team & Contract``, ``Height``, ``Weight``, ``Wage`` and
        ``Value``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If a non-missing Height, Weight, Wage or Value cannot be parsed.
    """
    # Explicit copy of the column selection: later assignments then never
    # write into a view of the caller's frame.
    ex = ex.loc[:, ~ex.columns.str.contains('^Unnamed')].copy()

    ex['Name'] = ex['Name'].str.replace(r'[A-Z]+$', '', regex=True)
    ex['date'] = pd.to_datetime(ex['date']).dt.date

    pat = r'(\w+)\((\d+)\)(\d{4}) ~ (\d{4})'
    ex[['position', 'jersey_num', 'contract_start',
        'contract_end']] = ex['Team & Contract'].astype(str).str.extract(pat)

    ex['height_cm'] = _measure_to_int(ex['Height'], 'cm', 'Height')
    ex['weight_kg'] = _measure_to_int(ex['Weight'], 'kg', 'Weight')
    ex['wage_eur'] = _eur_to_int(ex['Wage'], 'Wage')
    ex['value_eur'] = _eur_to_int(ex['Value'], 'Value')

    ex = ex.drop(
        columns=['Height', 'Weight', 'Team & Contract', 'Value', 'Wage'])

    # Keep the dates out of the element-wise evaluation: the text form of a
    # date ('2025-04-01') looks like a subtraction.
    dates = ex['date']
    ex = ex.map(safe_eval)
    ex['date'] = dates

    main_cols = [
        'ID', 'date', 'Name', 'Age', 'height_cm', 'weight_kg', 'team',
        'contract_start', 'contract_end', 'position', 'foot', 'jersey_num',
        'wage_eur', 'value_eur'
    ]
    rest_cols = [col for col in ex.columns if col not in main_cols]
    ex = ex[main_cols + rest_cols]

    # Best-effort numeric conversion; columns that are not numeric
    # (names, dates, ...) are deliberately left as they are.
    for c in ex.columns:
        try:
            ex[c] = pd.to_numeric(ex[c])
        except (ValueError, TypeError):
            pass

    ex.columns = ex.columns.str.lower().str.replace(' ', '_')

    return ex


In [None]:
# Trial run: clean a single scraped team file and display the result so the
# output of clean_player can be inspected before the batch run.
sample_path = 'G:\\My Drive\\GitHubProjects\\MLS\\data\\scraping\\players\\Atlanta-United-Apr-1,-2025.csv'
csv = pd.read_csv(sample_path)
clean_csv = clean_player(csv)

clean_csv

In [None]:
# Collect the scraped player CSVs using a path relative to the current working
# directory.
# NOTE(review): `files` is not referenced again in the visible cells; the
# batch loop globs an absolute path instead - confirm whether this is still needed.
files = glob.glob('data/scraping/players/*.csv')

In [None]:
# Clean every scraped player CSV and write the result into the
# data_clean/players folder as "cleaned_<original name>.csv".
for file in glob.glob('G:/My Drive/GitHubProjects/MLS/data/scraping/players/*.csv'):
    df = pd.read_csv(file)
    cleaned_df = clean_player(df)
    # Drop the source extension before ".csv" is appended below; using the
    # full basename produced files named "cleaned_<name>.csv.csv".
    filename = os.path.splitext(os.path.basename(file))[0]
    cleaned_df.to_csv(
        f"G:/My Drive/GitHubProjects/MLS/data/data_clean/players/cleaned_{filename}.csv",
        index=False)
    print(f"Cleaned {file} and saved the result.")