In [18]:
import pandas as pd
import os
import re

pd.set_option('display.max_columns', None)

# Function to clean player names
def clean_player_name(player_name):
    if not isinstance(player_name, str):
        return player_name
    player_name = re.sub(r'[^\w\s]', '', player_name)  # Remove punctuation
    suffixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']
    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
    player_name = re.sub(pattern, '', player_name, flags=re.IGNORECASE)
    return ' '.join(player_name.split())

years = list(range(2016, 2025))
stats = ["passing", "rushing", "receiving"]
all_records = []

for year in years:
    for stat in stats:
        file_name = f"{year}_{stat}.csv"
        if os.path.exists(file_name):
            df = pd.read_csv(file_name)

            if 'Player' in df.columns:
                df.rename(columns={'Player': 'player'}, inplace=True)

            if 'Awards' in df.columns:
                df.drop(columns=['Awards'], inplace=True)

            # Rename ambiguous column names
            rename_dict = {
                'Yds.2': 'Scrim_Yds',
                'Avg': 'Scrim_Avg',
                'TD.2': 'Tot_TD'
            }
            df.rename(columns={k: v for k, v in rename_dict.items() if k in df.columns}, inplace=True)

            if 'player' not in df.columns or 'Rk' not in df.columns:
                print(f"❌ 'player' or 'Rk' column missing in {file_name}, skipping this file.")
                continue

            df['Rk'] = pd.to_numeric(df['Rk'], errors='coerce')
            df['season'] = year
            df['stat_type'] = stat
            all_records.append(df)

if all_records:
    combined_df = pd.concat(all_records, ignore_index=True)

    combined_df['player'] = combined_df['player'].str.replace('*', '', regex=False)
    combined_df['player'] = combined_df['player'].apply(clean_player_name)

    combined_df = combined_df.dropna(subset=['Rk'])

    idx = combined_df.groupby(['player', 'season'])['Rk'].idxmin()
    filtered_df = combined_df.loc[idx].copy()

    # Reorder columns to place 'season' after 'player'
    cols = list(filtered_df.columns)
    if 'season' in cols:
        cols.remove('season')
    player_idx = cols.index('player')
    cols = cols[:player_idx + 1] + ['season'] + cols[player_idx + 1:]
    filtered_df = filtered_df[cols]

    # Drop Rk column
    if 'Rk' in filtered_df.columns:
        filtered_df.drop(columns=['Rk'], inplace=True)

    player_college_dict = {
        name: group.reset_index(drop=True)
        for name, group in filtered_df.groupby('player')
    }

    print(f"✅ Created dictionary for {len(player_college_dict)} players (lowest Rk per season, all seasons included).")
else:
    print("❌ No data found. Please check your CSV files.")


✅ Created dictionary for 10709 players (lowest Rk per season, all seasons included).


In [19]:
player_college_dict["Luther Burden"]

Unnamed: 0,player,season,Team,Conf,G,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Y/A,AY/A,Y/C,Y/G,Rate,stat_type,Rec,Yds.1,Y/R,TD.1,Y/G.1,Plays,Scrim_Yds,Scrim_Avg,Tot_TD
0,Luther Burden,2022,Missouri,SEC,13.0,,18.0,,375.0,6.0,,,,4.9,,,28.8,,receiving,45.0,88.0,8.3,2.0,6.8,63.0,463.0,7.3,8.0
1,Luther Burden,2023,Missouri,SEC,13.0,,7.0,,1212.0,9.0,,,,4.4,,,93.2,,receiving,86.0,31.0,14.1,0.0,2.4,93.0,1243.0,13.4,9.0
2,Luther Burden,2024,Missouri,SEC,12.0,,9.0,,676.0,6.0,,,,12.8,,,56.3,,receiving,61.0,115.0,11.1,2.0,9.6,70.0,791.0,11.3,8.0


In [15]:
all_keys = list(player_college_dict.keys())

print(all_keys) #i.e. Amon_Ra St. Brown is now AmonRa St Brown

['AD Miller', 'ADarius Purifoy', 'AJ Abbott', 'AJ Alexander', 'AJ Barner', 'AJ Bianco', 'AJ Branisel', 'AJ Brown', 'AJ Bush', 'AJ Carter', 'AJ Cole', 'AJ Coney', 'AJ Davis', 'AJ Dillon', 'AJ Duffy', 'AJ Epenesa', 'AJ Erdely', 'AJ Gates', 'AJ Green', 'AJ Hairston', 'AJ Henning', 'AJ Howard', 'AJ Jones', 'AJ Krawczyk', 'AJ Lewis', 'AJ Mayer', 'AJ Newberry', 'AJ Ouellette', 'AJ Padgett', 'AJ Phillips', 'AJ Richardson', 'AJ Swann', 'AJ Taylor', 'AJ Toney', 'AJ Turner', 'AJ Vongphachanh', 'AJ Williams', 'AJon Vivens', 'AMarion Peterson', 'AMontae Spivey', 'AT Perry', 'Aaren Vaughns', 'Aaron Allen', 'Aaron Anderson', 'Aaron Bedgood', 'Aaron Brewer', 'Aaron Cephus', 'Aaron Duckworth', 'Aaron Dumas', 'Aaron Fuller', 'Aaron Greene', 'Aaron Hackett', 'Aaron Hansford', 'Aaron Hepp', 'Aaron Jackson', 'Aaron Jarman', 'Aaron Jones', 'Aaron Key', 'Aaron Mathews', 'Aaron McAllister', 'Aaron McLaughlin', 'Aaron McLean', 'Aaron Molina', 'Aaron Moore', 'Aaron Peck', 'Aaron Philo', 'Aaron Short', 'Aaron S