In [None]:
import pandas as pd
import os


In [22]:


def _find_year_file(filenames, year, tag):
    """Return the first name in ``filenames`` containing ``"{year}_{tag}"``, or None."""
    needle = f"{year}_{tag}"
    return next((name for name in filenames if needle in name), None)


def _extract_wins(qbrec):
    """Parse the integer before the first '-' of a record string; pd.NA if unparseable."""
    try:
        return int(str(qbrec).split('-')[0])
    except (ValueError, TypeError):
        # Missing values (NaN -> "nan") and malformed records end up here.
        return pd.NA


def build_advanced_dataset(data_dir, start_year=2018, end_year=2024):
    """Build one merged passing table from per-season HTML exports in ``data_dir``.

    For every season in ``start_year..end_year`` (inclusive) the function looks
    for a file whose name contains ``"{year}_Playoffs_Advanced"`` and one whose
    name contains ``"{year}_Standard_Passing"``. Each file is parsed with
    ``pd.read_html`` (first table only) and tagged with a ``Season`` column.
    All advanced tables are stacked, all standard tables are stacked, and the
    standard columns are left-joined onto the advanced rows on
    ``Player``/``Team``/``Season``.

    Parameters
    ----------
    data_dir : str or path-like
        Directory holding the HTML files.
    start_year, end_year : int
        Inclusive season range to load.

    Returns
    -------
    pandas.DataFrame
        One row per advanced-table row, plus the standard-table columns that
        are not already present in the advanced table, plus
        ``playoff_games_won`` (nullable ``Int64``): the integer before the
        first ``-`` in ``QBrec``, or ``<NA>`` when it cannot be parsed.
        NOTE(review): ``QBrec`` is presumably a "W-L" / "W-L-T" record string;
        confirm against the data files.

    Raises
    ------
    ValueError
        If no advanced file, or no standard file, is found for any season in
        the range.
    KeyError
        If neither table provides a ``QBrec`` column (or a merge key is
        missing from one of the tables).

    Notes
    -----
    * A season without an advanced file is skipped entirely (its standard file
      is not read). A season without a standard file still contributes its
      advanced rows.
    * When several files match a pattern for the same year, the
      lexicographically first name is used so the choice is deterministic.
    * Columns present in both tables are taken from the advanced table. This
      includes ``QBrec``: keeping both copies would make the merge emit
      ``QBrec_x``/``QBrec_y`` and break the ``QBrec`` lookup below.
    """
    merge_keys = ['Player', 'Team', 'Season']

    # List the directory once; sorting makes the "first match" reproducible
    # because os.listdir returns entries in arbitrary order.
    filenames = sorted(os.listdir(data_dir))

    adv_list = []
    std_list = []

    for year in range(start_year, end_year + 1):
        adv_file = _find_year_file(filenames, year, "Playoffs_Advanced")
        if adv_file is None:
            print(f"No advanced playoff file found for {year}, skipping")
            continue

        # header=1: the first row of the advanced table is ignored and the
        # second row supplies the column names.
        adv_df = pd.read_html(os.path.join(data_dir, adv_file), header=1)[0]
        adv_df['Season'] = year
        adv_list.append(adv_df)

        std_file = _find_year_file(filenames, year, "Standard_Passing")
        if std_file is None:
            print(f"No standard passing file found for {year}, skipping")
            continue

        std_df = pd.read_html(os.path.join(data_dir, std_file), header=0)[0]
        std_df['Season'] = year
        std_list.append(std_df)

    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    if not adv_list:
        raise ValueError(
            f"No advanced playoff files found in {data_dir!r} "
            f"for seasons {start_year}-{end_year}"
        )
    if not std_list:
        raise ValueError(
            f"No standard passing files found in {data_dir!r} "
            f"for seasons {start_year}-{end_year}"
        )

    adv_all = pd.concat(adv_list, ignore_index=True)
    std_all = pd.concat(std_list, ignore_index=True)

    # Keep only the merge keys and the columns the advanced table lacks.
    adv_columns = set(adv_all.columns)
    repeated_cols = [
        col for col in std_all.columns
        if col in adv_columns and col not in merge_keys
    ]
    std_all_unique = std_all.drop(columns=repeated_cols)

    merged_df = adv_all.merge(std_all_unique, on=merge_keys, how='left')

    if 'QBrec' not in merged_df.columns:
        raise KeyError(
            "'QBrec' column not found in the advanced or standard passing tables"
        )

    # Nullable Int64 keeps the wins as integers while still representing <NA>.
    merged_df['playoff_games_won'] = (
        merged_df['QBrec'].apply(_extract_wins).astype('Int64')
    )

    return merged_df


In [24]:
# Assemble the merged passing table for the 2018-2024 seasons from the
# HTML exports stored in the local "data sources" folder.
big_df = build_advanced_dataset(
    "data sources",
    start_year=2018,
    end_year=2024,
)


In [25]:
# Write the merged table to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
big_df.to_csv(
    "merged_passing_data.csv",
    index=False,
)
