In [23]:
import pandas as pd
import os
pd.set_option('display.max_columns', None)

years = list(range(2016, 2025))
stats = ["passing", "rushing", "receiving"]

# Raw column labels -> labels used in the combined table.  Labels that a given
# file does not contain are simply ignored by DataFrame.rename.
COLUMN_RENAMES = {
    'Player': 'player',
    'Yds.2': 'Scrim_Yds',
    'Avg': 'Scrim_Avg',
    'TD.2': 'Tot_TD',
}


def load_stat_table(file_name, year, stat):
    """Read one ``{year}_{stat}.csv`` file and normalise it for concatenation.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read.
    year : int
        Value stored in the added ``season`` column.
    stat : str
        Value stored in the added ``stat_type`` column, recording which file
        each row came from.

    Returns
    -------
    pandas.DataFrame or None
        The table with columns renamed per ``COLUMN_RENAMES``, the ``Awards``
        column removed, ``Rk`` coerced to numeric (unparseable values become
        NaN) and ``season`` / ``stat_type`` set.  ``None`` (after printing a
        message) when the file has no ``player`` or ``Rk`` column.
    """
    table = (
        pd.read_csv(file_name)
        .rename(columns=COLUMN_RENAMES)
        .drop(columns=['Awards'], errors='ignore')
    )
    if 'player' not in table.columns or 'Rk' not in table.columns:
        print(f"❌ 'player' or 'Rk' column missing in {file_name}, skipping this file.")
        return None
    # assign() overwrites Rk in place and appends season/stat_type at the end,
    # so the column order matches plain item assignment.
    return table.assign(
        Rk=pd.to_numeric(table['Rk'], errors='coerce'),
        season=year,
        stat_type=stat,
    )


def build_player_college_dict(records):
    """Combine per-file tables and keep one row per player per season.

    Parameters
    ----------
    records : list of pandas.DataFrame
        Non-empty list of tables as returned by ``load_stat_table``.

    Returns
    -------
    tuple
        ``(combined, filtered, by_player)`` where ``combined`` is the
        concatenation of ``records`` with ``*`` stripped from player names and
        rows without a numeric ``Rk`` removed, ``filtered`` holds the
        lowest-``Rk`` row for every ``(player, season)`` pair with a constant
        ``Col/Pro`` column placed right after ``player``, and ``by_player``
        maps each player name to that player's rows of ``filtered``.
    """
    combined = pd.concat(records, ignore_index=True)
    # Remove all '*' from player names
    combined['player'] = combined['player'].str.replace('*', '', regex=False)
    combined = combined.dropna(subset=['Rk'])

    # NOTE(review): Rk is compared across stat types here, and the shared
    # column labels (Att, Yds, TD, Y/A, Y/G and their ".1" variants) appear to
    # describe different stats depending on stat_type -- confirm before
    # comparing those columns between rows of different stat_type.
    # NOTE(review): players are identified by name only, so two different
    # players sharing a name in one season collapse into one row -- verify
    # whether Team should be part of the key.
    best_rows = combined.groupby(['player', 'season'])['Rk'].idxmin()
    filtered = combined.loc[best_rows].copy()

    # Place Col/Pro immediately after 'player' (dropping any existing copy
    # from the ordering first so it is not listed twice).
    ordered = [label for label in filtered.columns if label != 'Col/Pro']
    ordered.insert(ordered.index('player') + 1, 'Col/Pro')
    filtered['Col/Pro'] = 'College'
    filtered = filtered[ordered]

    by_player = {name: group for name, group in filtered.groupby('player')}
    return combined, filtered, by_player


all_records = []
missing_files = []
# Always (re)bind the result so later cells never hit a NameError or pick up a
# stale dictionary left in the kernel when no data could be loaded.
player_college_dict = {}

for year in years:
    for stat in stats:
        file_name = f"{year}_{stat}.csv"
        if not os.path.exists(file_name):
            missing_files.append(file_name)
            continue
        df = load_stat_table(file_name, year, stat)
        if df is not None:
            all_records.append(df)

if missing_files:
    # Report skipped inputs instead of silently producing a partial dataset.
    print(f"⚠️ Skipped {len(missing_files)} missing file(s): {', '.join(missing_files)}")

if all_records:
    # Create player_college_dict: one row per season per player (lowest Rk)
    combined_df, filtered_df, player_college_dict = build_player_college_dict(all_records)
    print(f"✅ Created dictionary for {len(player_college_dict)} players (lowest Rk per season, all seasons included).")
else:
    print("❌ No data found. Please check your CSV files.")

✅ Created dictionary for 10738 players (lowest Rk per season, all seasons included).


In [24]:
# Spot-check one entry: display the DataFrame stored under this player's name
# (the lowest-Rk row for each of the player's seasons, as built in the cell above).
player_college_dict["Brashard Smith"]

Unnamed: 0,Rk,player,Col/Pro,Team,Conf,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,season,stat_type,Rec,Yds.1,Y/R,TD.1,Y/G.1,Plays,Scrim_Yds,Scrim_Avg,Tot_TD
23046,625.0,Brashard Smith,College,Miami (FL),ACC,12.0,,6.0,,199.0,2.0,,,,3.8,,,16.6,,2021,receiving,14.0,23.0,14.2,0.0,1.9,20.0,222.0,11.1,2.0
27054,445.0,Brashard Smith,College,Miami (FL),ACC,11.0,,3.0,,308.0,0.0,,,,7.3,,,28.0,,2022,receiving,33.0,22.0,9.3,0.0,2.0,36.0,330.0,9.2,0.0
31445,506.0,Brashard Smith,College,Miami (FL),ACC,12.0,,9.0,,263.0,2.0,,,,14.7,,,21.9,,2023,receiving,22.0,132.0,12.0,1.0,11.0,31.0,395.0,12.7,3.0
33584,15.0,Brashard Smith,College,SMU,ACC,14.0,,235.0,,1332.0,14.0,,,,5.7,,,95.1,,2024,rushing,39.0,327.0,8.4,4.0,23.4,274.0,1659.0,6.1,18.0
