In [10]:
import pandas as pd
import re
import glob
import os

In [11]:
# Initial load of one match's cleaned stats. `df` is reassigned by the next
# cell's read before anything else uses this frame.
df = pd.read_csv('cleaned_stats_sdvsatx-05-31-2025.csv')

In [12]:
# Working example for the cells below. The filename is kept in `csv` because
# the home team, away team and match date are parsed out of it afterwards.
# NOTE(review): `csv` is also the name of a stdlib module; harmless as long as
# that module is not imported in this notebook.
csv = 'cleaned_stats_atlvsmtl-02-22-2025.csv'
df = pd.read_csv(csv)

In [13]:
fname = os.path.basename(csv)

# regex: match 'cleaned_stats_' + home + 'vs' + away + '-' + date + '.csv'
m = re.match(r"cleaned_stats_([a-z]{2,4})vs([a-z]{2,4})-(\d{2}-\d{2}-\d{4})\.csv", fname, re.I)

# Fail with a readable message when the name is not in the expected shape.
# Without this guard the metadata rows below would never be created, and the
# concat that appends them would die with a NameError instead.
if m is None:
    raise ValueError(
        f"filename does not match 'cleaned_stats_<home>vs<away>-MM-DD-YYYY.csv': {fname!r}"
    )

home, away, date_str = m.groups()
home, away = home.upper(), away.upper()
date = pd.to_datetime(date_str, format="%m-%d-%Y")
print("home_team:", home)   # ATL
print("away_team:", away)   # MTL
print("date     :", date)   # 2025-02-22 00:00:00

# Metadata rows in the same long layout as the stats:
# stat == 'teams' carries the team codes, stat == 'date' carries the match date
# (the date is stored on both the home and the away side).
add_df = pd.DataFrame({
    'home_value': [home],
    'away_value': [away],
    'stat': 'teams'
})

new_row = pd.DataFrame({
    'home_value': [date],
    'away_value': [date],
    'stat': 'date'
})

# Append both metadata rows (teams first, then date) after the stats rows.
df = pd.concat([df, add_df, new_row], ignore_index=True)


df

home_team: ATL
away_team: MTL
date     : 2025-02-22 00:00:00


Unnamed: 0,stat,home_value,away_value
0,shooting_Possession %,48.0,52.0
1,shooting_Shots,14.0,12.0
2,shooting_Shots on Goal,8.0,5.0
3,shooting_Blocked Shots,3.0,1.0
4,shooting_Total Passes,471.0,492.0
5,shooting_Passing Accuracy %,80.3,82.3
6,shooting_Corners,7.0,5.0
7,shooting_Total Crosses,5.0,13.0
8,shooting_Offsides,1.0,2.0
9,shooting_Aerial Duels Won,19.0,10.0


In [14]:
### pivot long to wide

# Flatten the long frame into one record: every row contributes a
# "<stat>_home" key followed by a "<stat>_away" key, in row order.
# A repeated stat name overwrites the earlier entry.
out = {}
for stat_name, home_val, away_val in zip(df['stat'], df['home_value'], df['away_value']):
    out.update({f"{stat_name}_home": home_val, f"{stat_name}_away": away_val})

wide = pd.DataFrame([out])

In [15]:
## rename date_away to match_date

# Drop the home copy of the date, re-publish the away copy as the trailing
# `match_date` column, then remove the away copy as well.
wide = (
    wide
    .drop(columns=['date_home'])
    .assign(match_date=lambda frame: frame['date_away'])
    .drop(columns=['date_away'])
)

wide

Unnamed: 0,shooting_Possession %_home,shooting_Possession %_away,shooting_Shots_home,shooting_Shots_away,shooting_Shots on Goal_home,shooting_Shots on Goal_away,shooting_Blocked Shots_home,shooting_Blocked Shots_away,shooting_Total Passes_home,shooting_Total Passes_away,...,possession_86_90_away,xg_Total Team XG_home,xg_Total Team XG_away,xg_Shots_home,xg_Shots_away,xg_Shots On Target_home,xg_Shots On Target_away,teams_home,teams_away,match_date
0,48.0,52.0,14.0,12.0,8.0,5.0,3.0,1.0,471.0,492.0,...,49.08,2.2,3.5,14.0,12.0,8.0,5.0,ATL,MTL,2025-02-22


In [16]:
### simplify colnames
# One translation table covers the three character substitutions
# (space -> "_", "%" -> "pct", "-" -> "_"); lower-casing is applied afterwards.
_colname_table = str.maketrans({' ': '_', '%': 'pct', '-': '_'})
wide.columns = [name.translate(_colname_table).lower() for name in wide.columns]

wide

Unnamed: 0,shooting_possession_pct_home,shooting_possession_pct_away,shooting_shots_home,shooting_shots_away,shooting_shots_on_goal_home,shooting_shots_on_goal_away,shooting_blocked_shots_home,shooting_blocked_shots_away,shooting_total_passes_home,shooting_total_passes_away,...,possession_86_90_away,xg_total_team_xg_home,xg_total_team_xg_away,xg_shots_home,xg_shots_away,xg_shots_on_target_home,xg_shots_on_target_away,teams_home,teams_away,match_date
0,48.0,52.0,14.0,12.0,8.0,5.0,3.0,1.0,471.0,492.0,...,49.08,2.2,3.5,14.0,12.0,8.0,5.0,ATL,MTL,2025-02-22


In [17]:
## create function to reframe stats

def reframe_stats(filename, df):
    """Turn one match's long-format stats into a single wide row.

    Parameters
    ----------
    filename : str
        Path or bare name of the stats file. Only the basename is inspected; it
        must start with ``cleaned_stats_<home>vs<away>-MM-DD-YYYY.csv`` where the
        team codes are 2-4 letters (matched case-insensitively).
    df : pandas.DataFrame
        Long-format stats with ``stat``, ``home_value`` and ``away_value``
        columns. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with ``<stat>_home`` / ``<stat>_away`` columns in input
        row order (a repeated stat name keeps the last occurrence), followed by
        ``teams_home``, ``teams_away`` (upper-cased codes) and ``match_date``.
        In column names spaces and hyphens become ``_``, ``%`` becomes ``pct``,
        and everything is lower-cased.

    Raises
    ------
    ValueError
        If the basename does not match the expected pattern.
    """
    fname = os.path.basename(filename)

    # regex: match 'cleaned_stats_' + home + 'vs' + away + '-' + date + '.csv'
    m = re.match(r"cleaned_stats_([a-z]{2,4})vs([a-z]{2,4})-(\d{2}-\d{2}-\d{4})\.csv", fname, re.I)
    if m is None:
        # Previously this fell through and crashed on undefined locals;
        # report the offending filename instead.
        raise ValueError(
            f"filename does not match 'cleaned_stats_<home>vs<away>-MM-DD-YYYY.csv': {fname!r}"
        )

    home, away, date_str = m.groups()
    home, away = home.upper(), away.upper()
    date = pd.to_datetime(date_str, format="%m-%d-%Y")

    # Stats rows first, then the two metadata rows (teams, date), so that the
    # metadata overrides any same-named stat and lands at the end of the record.
    records = list(zip(df['stat'], df['home_value'], df['away_value']))
    records.append(('teams', home, away))
    records.append(('date', date, date))

    out = {}
    for stat, home_value, away_value in records:
        out[f"{stat}_home"] = home_value
        out[f"{stat}_away"] = away_value

    # Home and away carry the same date: keep a single copy as the last column.
    out.pop('date_home')
    out['match_date'] = out.pop('date_away')

    wide = pd.DataFrame([out])
    wide.columns = [c.replace(' ', '_').replace('%', 'pct').replace('-', '_').lower() for c in wide.columns]
    return wide


# Try the function on one real file; the same path feeds both the CSV reader
# and the filename parser inside reframe_stats.
sample_csv = 'cleaned_stats_sdvsatx-05-31-2025.csv'
dfidk = pd.read_csv(sample_csv)
idk2 = reframe_stats(sample_csv, dfidk)

In [18]:
# Display the single-row wide frame produced by reframe_stats for the SD vs ATX file.
idk2

Unnamed: 0,shooting_possession_pct_home,shooting_possession_pct_away,shooting_shots_home,shooting_shots_away,shooting_shots_on_goal_home,shooting_shots_on_goal_away,shooting_blocked_shots_home,shooting_blocked_shots_away,shooting_total_passes_home,shooting_total_passes_away,...,possession_86_90_away,xg_total_team_xg_home,xg_total_team_xg_away,xg_shots_home,xg_shots_away,xg_shots_on_target_home,xg_shots_on_target_away,teams_home,teams_away,match_date
0,58.1,41.9,12.0,8.0,4.0,2.0,1.0,2.0,601.0,377.0,...,11.67,2.0,0.5,12.0,8.0,4.0,2.0,SD,ATX,2025-05-31


In [19]:
import glob
import os

In [20]:
# Reframe every per-match stats CSV in the working directory and write the wide
# version, under the same filename, into the reframed_stats folder.
# NOTE(review): the output location is an absolute, machine-specific path.
output_dir = "G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/raw/reframed_stats"

# Same pattern reframe_stats uses to parse a filename. Checking it up front lets
# the batch skip unrelated CSVs instead of aborting partway through.
stats_name = re.compile(r"cleaned_stats_([a-z]{2,4})vs([a-z]{2,4})-(\d{2}-\d{2}-\d{4})\.csv", re.I)

for file in glob.glob('*.csv'):
    filename = os.path.basename(file)
    if stats_name.match(filename) is None:
        print(f"skipping {filename}: not a cleaned_stats_<home>vs<away>-MM-DD-YYYY.csv file")
        continue
    df = pd.read_csv(file)
    cleaned_df = reframe_stats(filename, df)
    cleaned_df.to_csv(f"{output_dir}/{filename}", index=False)