In [None]:
import pandas as pd
import glob
import re  
import os

In [7]:
def clean_teams(filename, df):
    """Normalise one raw scraped teams table into the cleaned schema.

    Steps, in order:

    1. Drop the first and the last column of ``df`` by position.
    2. Rename the raw headers (``Name``, ``ID``, ``Formation``, ...) to
       snake_case names; ``Club worth`` / ``Club.worth`` becomes
       ``worth_euro``.
    3. Split ``team_formation`` on spaces into ``lineup`` (first token) and
       ``style`` (second token), then drop ``team_formation``.
    4. Remove the literal text ``"Major League Soccer"`` from ``team_name``.
    5. Strip the euro sign and the ``M`` suffix from ``worth_euro`` and
       coerce the score/worth/player-count columns to numbers (values that
       cannot be parsed become NaN).
    6. Add a ``date`` column parsed from the ``teams_<date>.csv`` file name.
    7. Use ``team_id`` as the index when that column exists.

    Parameters
    ----------
    filename : str
        Path or file name of the CSV the frame was read from. Only its
        basename is inspected, and only to derive the ``date`` column.
    df : pandas.DataFrame
        Raw table as read from the CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Columns that are absent from the input are
        simply skipped rather than raising.
    """
    # Positional trim: the first and last raw columns are not part of the
    # cleaned schema. ``.copy()`` keeps the caller's frame untouched.
    df = df.iloc[:, 1:-1].copy()

    rename_map = {
        "Name": "team_name",
        "ID": "team_id",
        "Formation": "team_formation",
        "Overall": "overall_score",
        "Attack": "attack",
        "Midfield": "midfield",
        "Defence": "defense",
        "Players": "num_players",
    }

    # The worth header appears under two spellings. Map only the first one
    # found so the frame never ends up with two "worth_euro" columns.
    for possible in ("Club worth", "Club.worth"):
        if possible in df.columns:
            rename_map[possible] = "worth_euro"
            break

    df = df.rename(columns=rename_map)

    if "team_formation" in df.columns:
        tokens = (
            df["team_formation"]
            .fillna("")
            .astype(str)
            .str.split(" ", expand=True)
            # Always expose exactly two token columns so every output file
            # has both "lineup" and "style", even if no row had a space.
            .reindex(columns=[0, 1])
        )
        df["lineup"] = tokens[0]
        df["style"] = tokens[1]
        df = df.drop(columns=["team_formation"])

    if "team_name" in df.columns:
        df["team_name"] = (
            df["team_name"]
            .str.replace("Major League Soccer", "", regex=False)
            # Removing the league text can leave dangling whitespace.
            .str.strip()
        )

    if "worth_euro" in df.columns:
        df["worth_euro"] = (
            df["worth_euro"]
            .astype(str)
            # Euro sign as it looks when UTF-8 bytes were decoded as cp1252.
            .str.replace("\u00e2\u201a\u00ac", "", regex=False)
            # Correctly decoded euro sign.
            .str.replace("\u20ac", "", regex=False)
            # NOTE(review): only the "M" suffix is handled; the remaining
            # number is presumably millions of euros - confirm no other
            # suffixes occur in the raw data.
            .str.replace("M", "", regex=False)
        )

    numeric_cols = ["overall_score", "attack", "midfield", "defense", "worth_euro", "num_players"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Snapshot date comes from the file name, e.g. "teams_2020-01-15.csv".
    match = re.search(r"teams_(.+)\.csv", os.path.basename(filename))
    if match:
        # "coerce" makes an unparseable suffix degrade to NaT, matching the
        # behaviour of the no-match branch instead of aborting the run.
        df["date"] = pd.to_datetime(match.group(1), errors="coerce")
    else:
        df["date"] = pd.NaT

    if "team_id" in df.columns:
        df = df.set_index("team_id")

    return df

In [None]:
# Clean every raw teams snapshot and write the result under the same file
# name in the cleaned directory.
output_dir = '../../data_files/cleaned/teams/'

# to_csv does not create missing directories, so make sure the target exists
# before the first write (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# sorted() makes the processing order independent of the filesystem.
for filepath in sorted(glob.glob('../../data_files/scraped_raw/teams/*.csv')):

    # Read the raw scraped file
    df = pd.read_csv(filepath)

    # Rename/clean the columns and attach the snapshot date taken from the
    # file name
    df = clean_teams(filepath, df)

    # Save the cleaned file (the frame's index is written as the first column)
    df.to_csv(os.path.join(output_dir, os.path.basename(filepath)))