In [1]:
import os
import re
import glob
import hashlib
import pandas as pd
from datetime import datetime
from pathlib import Path

In [2]:
df = pd.read_csv('G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/raw/feed/cleaned_feed_atlvschi-03-31-2024.csv.csv')

In [5]:
TYPE_PREFIXES = ["player_stats_", "simple_stats_", "players_", "stats_", "feed_"]

def canonical_key_from_name(stem: str) -> str:
    """Return 'aaa_bbb_yyyymmdd' from any cleaned_* filename."""
    name = stem.lower()
    if name.startswith("cleaned_"):
        name = name[len("cleaned_"):]                      # strip cleaned_
    # strip dataset prefix
    for pref in TYPE_PREFIXES:
        if name.startswith(pref):
            name = name[len(pref):]
            break

    # now name should look like "atlvschi-03-31-2024" or "atl-vs-chi-2024-03-31", etc.

    # 1) extract date in common patterns -> YYYYMMDD
    yyyymmdd = None
    # patterns: 03-31-2024 | 2024-03-31 | 3-1-2024
    for pat in [r"(\d{4})[^\d]?(\d{1,2})[^\d]?(\d{1,2})",  # YYYY-MM-DD
                r"(\d{1,2})[^\d]?(\d{1,2})[^\d]?(\d{4})"]: # MM-DD-YYYY
        m = re.search(pat, name)
        if m:
            g = tuple(int(x) for x in m.groups())
            try:
                if len(str(g[0])) == 4:   # YYYY,MM,DD
                    dt = datetime(g[0], g[1], g[2])
                else:                     # MM,DD,YYYY
                    dt = datetime(g[2], g[0], g[1])
                yyyymmdd = dt.strftime("%Y%m%d")
                # drop the matched date chunk to isolate teams
                name = name[:m.start()] + name[m.end():]
                break
            except ValueError:
                pass

    # 2) extract teams
    # common forms: "atlvs chi", "atl-vs-chi", "atl-chi", "atlvschi"
    teams = None
    # try with explicit vs
    m = re.search(r"([a-z]{3})[ _\-]*v?s[ _\-]*([a-z]{3})", name)
    if m:
        t1, t2 = m.group(1), m.group(2)
        teams = (t1, t2)
    else:
        # fall back to contiguous 6 letters (e.g., 'atlvschi' -> 'atl','vsc'? not good)
        # instead, remove non-letters and look for exact 6 or 7-8 with separators
        letters = re.sub(r"[^a-z]", "", name)
        if len(letters) >= 6:
            t1, t2 = letters[:3], letters[3:6]
            teams = (t1, t2)

    # final guardrails
    home, away = (teams or ("unk", "unk"))
    yyyymmdd = yyyymmdd or "00000000"

    return f"{home}_{away}_{yyyymmdd}"

def compute_match_id_from_filename(path: Path) -> tuple[str, str, str]:
    """Returns (type_prefix_wo_trailing_, canonical_key, match_id)."""
    stem = path.stem
    # find type for output subfolder (mirror raw/<type>/...)
    t = None
    s = stem.lower()
    if s.startswith("cleaned_"):
        s2 = s[len("cleaned_"):]
    else:
        s2 = s
    for pref in TYPE_PREFIXES:
        if s2.startswith(pref):
            t = pref.rstrip("_")
            break
    # fallback to folder name
    if t is None:
        t = path.parent.name.lower()

    key = canonical_key_from_name(stem)
    match_hash = hashlib.md5(key.encode()).hexdigest()[:8]
    return t, key, f"match_{match_hash}"

def add_match_id(file_path: str, out_root="G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/w_match_id"):
    p = Path(file_path)
    dtype, canon_key, match_id = compute_match_id_from_filename(p)

    df = pd.read_csv(p)
    df["match_id_hash"] = match_id

    # rename file to keep original key + match id (easy to eyeball):
    # cleaned_<type>_<canon_key>_<match_id>.csv
    out_dir = Path(out_root) / dtype
    out_dir.mkdir(parents=True, exist_ok=True)
    new_name = f"cleaned_{dtype}_{match_id}{p.suffix}"
    out_path = out_dir / new_name
    df.to_csv(out_path, index=False)
    print(f"✔ {p} → {out_path}  key={canon_key} id={match_id}")
    return out_path

# ---- run for all raw subfolders ----
for sub in ["feed", "reframed_stats", "players", "simple_stats"]:
    for file in glob.glob(f'G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/raw/{sub}/*.csv'):
        add_match_id(file)

✔ G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\raw\feed\cleaned_feed_minvsrsl-03-29-2025.csv.csv → G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\w_match_id\feed\cleaned_feed_match_d36bdec5.csv  key=min_rsl_20250329 id=match_d36bdec5
✔ G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\raw\feed\cleaned_feed_orlvscin-06-28-2025.csv.csv → G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\w_match_id\feed\cleaned_feed_match_23a33625.csv  key=orl_cin_20250628 id=match_23a33625
✔ G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\raw\feed\cleaned_feed_sjvscol-04-13-2024.csv.csv → G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\w_match_id\feed\cleaned_feed_match_44650644.csv  key=sjv_col_20240413 id=match_44650644
✔ G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\raw\feed\cleaned_feed_torvsdc-05-10-2025.csv.csv → G:\My Drive\GitHubProjects\MLS\data\data_clean\matches\w_match_id\feed\cleaned_feed_match_12f32646.csv  key=tor_vsd_20250510 id=match_12f

In [6]:
for file in glob.glob(f'G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/w_match_id/stats/*.csv'):
    df = pd.read_csv(file)
    df = df[['shooting_goals_home', 'shooting_goals_away', 'teams_home', 'teams_away', 'match_id_hash', 'match_date']]
    file_name = 'cleaned_simple_stats_' + Path(file).stem.split('match_')[1] + '.csv'
    
    df.to_csv(f'G:/My Drive/GitHubProjects/MLS/data/data_clean/matches/w_match_id/simple_stats/{file_name}', index=False)