In [1]:
import pandas as pd
import numpy as np
import sys
import os

Notes:
* Going to take the top 75 scoring players in the leage
    - in 1995-96 season, there were only 92 players in the leage
    - lowest ranked scorer that won the MVP was Steve Nash in 2005 where he averaged 15.5 PPG (63rd in the leage)
* csv files are in the directory of the 2nd year in the season. ex. the 95-96 season will be in the 1996 directory
* in the directory for the year, there is a players.csv file, a teams.csv file, an advanced.csv file, and a mvp_voting.csv file
    - the teams.csv file is so that I can associate a team record with the players
    - advanced data only goes back until the 1980 season
    - mvp voting I only collected back to 1980
* the mvp.csv dataset is in the outer data dir and has the name of each mvp winner since the 65-66 season


**GET DATA SAMPLE**

In [2]:
# get standard player data 
def get_data(year):
    players = pd.read_csv("data/" + year + "/players.csv")
    unique_rows = players.groupby("Player").filter(lambda x: len(x) == 1)
    return unique_rows

In [3]:
# get standard player data 
def get_advanced_data(year):
    advanced = pd.read_csv("data/" + year + "/advanced.csv")
    unique_rows = advanced.groupby("Player").filter(lambda x: len(x) == 1)
    metrics = [ "Player", "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", "OWS", "DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP"]
    advanced_filtered = unique_rows[metrics]
    
    return advanced_filtered

In [4]:
# get the team data and turn it into something usable

def get_team_data(year):
    teams_vs = pd.read_csv("data/" + year + "/teams.csv")
    teams_vs = teams_vs.drop(columns=['Rk'])
    
    teams = pd.DataFrame(columns=['Team', 'Abbreviation', 'Wins', 'Losses', 'Win%'])
    
    # Get abbreviation and record
    for index, row in teams_vs.iterrows():
    
        # extract wins and losses
        row_wins = 0
        row_losses = 0
        for record in row:
            if pd.notna(record) and len(record) < 6:
                win_loss = record.split('-')
                row_wins += int(win_loss[0])
                row_losses += int(win_loss[1])
    
        # don't divide by 0
        if row_losses == 0:
            win_pct = 1
        else:
            win_pct = row_wins/(row_losses+row_wins)
    
        # add to df    
        teams.loc[len(teams)] = ({'Team': row['Team'], 'Abbreviation': row.index[row.name + 1], 'Wins': row_wins, 'Losses': row_losses, 'Win%': win_pct})
    
    # get overall team rank in the nba
    teams = teams.sort_values(by='Win%', ascending=False)
    teams = teams.reset_index(drop=True)
    teams['Team_Rank'] = teams.index+1
    
    return teams

In [5]:
def get_mvp_voting(year):
    mvp_voting = pd.read_csv("data/" + year + "/mvp_voting.csv")
    metrics = [ "Player", "Share"]
    mvp_voting_filtered = mvp_voting[metrics]
    return mvp_voting_filtered

In [6]:
# transform the player data into something uniform that we can use
def get_top_75_filtered(players):
    
    # remove entries for players that played on multiple teams, just look at the last team they played for
    players_filtered = players.drop_duplicates(subset='Player', keep='last').reset_index()
    
    # sort by points per game and take top 75 candidates
    players_sorted = players_filtered.sort_values(by="PTS", ascending=False)
    top_75 = players_sorted.head(75).copy()
    
    # metrics we are using
    metrics = ["Player", "Age", "Team", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]

    top_75_filtered = top_75[metrics]
    return top_75_filtered


In [7]:
# combine into final dataframe
def merge_data(top_75_filtered, teams, advanced, mvp_voting):
    player_team_df = pd.merge(top_75_filtered, teams, left_on='Team', right_on='Abbreviation')
    player_team_advanced_df = pd.merge(player_team_df, advanced, left_on='Player', right_on='Player')
    player_team_advanced_voting_df = pd.merge(mvp_voting, player_team_advanced_df, left_on='Player', right_on='Player')
    return player_team_advanced_voting_df

In [8]:
# verify that all teams matched, this should print an empty dataframe
def verify_team_match(top_75_filtered, teams):
    unmatched_teams = top_75_filtered[~top_75_filtered['Team_x'].isin(teams['Abbreviation'])]
    if len(unmatched_teams['Team_x']) > 0:
        print(unmatched_teams)
        return 1
    return 0
        

In [9]:
# verify that all advanced stats matched with a player, this should print an empty dataframe
def verify_player_match(top_75_filtered, players):
    unmatched_players = top_75_filtered[~top_75_filtered['Player'].isin(players['Player'])]
    if len(unmatched_players['Player']) > 0:
        print(unmatched_players)
        return 1
    return 0

In [10]:
# verify that all players in the MVP voting are in the top 75 dataframe
def verify_mvp_match(top_75_filtered, mvp_voting):
    unmatched_players = mvp_voting[~mvp_voting['Player'].isin(top_75_filtered['Player'])]
    if len(unmatched_players['Player']) > 0:
        print(unmatched_players)
        return 1
    return 0

In [11]:
# only keep numerical variables, don't worry about player name, team name, etc
def get_numerical_df(player_season_final_df):
    numerical_df = player_season_final_df.select_dtypes(include=['number'])
    return numerical_df

In [12]:
def get_mvp_by_year(year):
    mvps = pd.read_csv("data/mvps.csv")
    result = mvps.loc[mvps['Season'].str[:4].astype(int) == int(year) - 1, 'Player'].values

    if(len(result) == 1):
        return result[0]
    else:
        return 0

In [13]:
def get_final_df_by_year(year):
    # get the data
    og_data = get_data(year)
    top_75 = get_top_75_filtered(og_data)
    team_data = get_team_data(year)
    advanced_data = get_advanced_data(year)

    # there is no mvp_voting data for 2025 yet
    if year != "2025":
        mvp_voting = get_mvp_voting(year)
    else:
        mvp_voting = top_75[["Player"]].copy()
        mvp_voting["Share"] = 0
        

    # merge it
    player_team_advanced_combined = merge_data(top_75, team_data, advanced_data, mvp_voting)

    # verify merge worked correctly
    if verify_team_match(player_team_advanced_combined, team_data) > 0:
        print("Team merge did not work correctly. Aborting")
        sys.exit()

    # verify merge worked correctly
    if verify_player_match(player_team_advanced_combined, advanced_data) > 0:
        print("Advanced stats merge did not work correctly. Aborting")
        sys.exit()

    # verify merge worked correctly
    if verify_player_match(player_team_advanced_combined, mvp_voting) > 0:
        print("Advanced stats merge did not work correctly. Aborting")
        sys.exit()


    # add year and MVP attribute
    player_team_advanced_combined['Year'] = int(year)
    player_team_advanced_combined['MVP'] = (player_team_advanced_combined['Player'] == get_mvp_by_year(year)).astype(int) 

    # using these metrics makes the model overfit like crazy because the correlate so much with the MVP
    # might want to minimize features through dimentionality reduction?
    metrics = ['Player', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Wins', 'Losses', 'Win%', 'Team_Rank', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Year', 'MVP', 'Share']

    # using these metrics and a batch size of 1 overfits the training data like crazy
    # metrics = ["PER", "WS", "WS/48", "PTS", "FG%", "AST", "Age", "MP", "TRB", "G", "MVP"]
    
    return player_team_advanced_combined[metrics]

get_final_df_by_year("2025")

Unnamed: 0,Player,Age,G,GS,MP,FG,FGA,FG%,3P,3P%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,MVP,Share
0,Giannis Antetokounmpo,30,19,19,34.8,12.7,20.7,0.616,0.2,0.214,...,0.9,3.5,0.256,6.9,2.4,9.3,1.9,2025,0,0
1,LaMelo Ball,23,18,18,34.1,10.7,24.9,0.430,4.7,0.356,...,0.5,1.5,0.116,5.9,-0.6,5.4,1.1,2025,0,0
2,Nikola Jokić,29,16,16,37.6,11.3,20.1,0.564,2.3,0.522,...,0.8,3.8,0.300,10.7,3.0,13.7,2.4,2025,0,0
3,Shai Gilgeous-Alexander,26,21,21,34.3,10.4,20.6,0.507,2.1,0.344,...,1.5,4.1,0.272,7.8,2.5,10.4,2.3,2025,0,0
4,Luka Dončić,25,16,16,36.9,10.3,22.7,0.452,3.4,0.342,...,0.9,2.1,0.175,5.4,0.7,6.2,1.2,2025,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,Trey Murphy III,24,9,9,32.6,6.0,15.4,0.388,2.6,0.307,...,0.1,0.1,0.022,-0.7,-1.6,-2.3,0.0,2025,0,0
71,Andrew Wiggins,29,18,18,28.8,5.8,12.7,0.459,2.2,0.417,...,0.6,1.8,0.169,2.9,0.0,2.9,0.6,2025,0,0
72,Collin Sexton,26,21,19,28.3,5.9,12.1,0.482,1.8,0.427,...,-0.1,0.8,0.062,0.6,-2.2,-1.6,0.1,2025,0,0
73,Austin Reaves,26,19,19,33.7,5.8,13.2,0.440,2.6,0.355,...,0.1,0.7,0.051,0.1,-2.0,-1.9,0.0,2025,0,0


In [14]:
def get_all_years_df():

    all_years_df = []
    # get all the data from all the years
    for year_dir in os.listdir("data"):

        #skip current year and don't add data before 1980 because a lot of the advanced stats didn't exist yet
        if year_dir == "2025" or (year_dir != "mvps.csv" and int(year_dir) < 1980):
            continue
            
        # verify that the item is a directory and not a file. the mvps.csv file will be ignored
        if os.path.isdir(os.path.join("data", year_dir)):
            year_df = get_final_df_by_year(year_dir)
            all_years_df.append(year_df)

    final_df = pd.concat(all_years_df)
            
    return final_df
       
mvp_candidates_df = get_all_years_df()
print(mvp_candidates_df.shape)
mvp_candidates_df

(672, 53)


Unnamed: 0,Player,Age,G,GS,MP,FG,FGA,FG%,3P,3P%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,MVP,Share
0,Kareem Abdul-Jabbar,32,82,,38.3,10.2,16.9,0.604,0.0,0.000,...,5.3,14.8,0.227,4.8,2.4,7.2,7.3,1980,1,0.665
1,Julius Erving,29,78,78.0,36.1,10.7,20.7,0.519,0.1,0.200,...,5.2,12.5,0.213,5.7,1.9,7.6,6.8,1980,0,0.143
2,George Gervin,27,78,,37.6,13.1,24.9,0.528,0.4,0.314,...,1.3,10.6,0.173,5.5,-1.6,3.9,4.4,1980,0,0.086
3,Larry Bird,23,82,82.0,36.0,8.5,17.8,0.474,0.7,0.406,...,5.6,11.2,0.182,3.0,1.5,4.5,4.8,1980,0,0.068
4,Tiny Archibald,31,80,80.0,35.8,4.8,9.9,0.482,0.1,0.222,...,2.9,8.9,0.148,1.4,-0.3,1.1,2.3,1980,0,0.009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,Jalen Brunson,27,77,77.0,35.4,10.3,21.4,0.479,2.7,0.401,...,2.4,11.2,0.198,6.3,-0.4,5.8,5.4,2024,0,0.143
5,Jayson Tatum,25,74,74.0,35.7,9.1,19.3,0.471,3.1,0.376,...,4.1,10.4,0.189,4.5,0.6,5.1,4.7,2024,0,0.087
6,Anthony Edwards,22,79,78.0,35.1,9.1,19.7,0.461,2.4,0.357,...,4.7,7.5,0.130,2.7,0.5,3.3,3.7,2024,0,0.018
7,Domantas Sabonis,27,82,82.0,35.7,7.7,13.0,0.594,0.4,0.379,...,4.0,12.6,0.206,4.0,2.4,6.5,6.2,2024,0,0.003


**Determine Highest Correlating Variables**

In [15]:
numerical_candidates_df = get_numerical_df(get_all_years_df())

# make training set 70% of years from 1980-2024 and test set will be the other 30%
years = np.arange(1980, 2025)
train_years = np.random.choice(years, size=int(0.7 * len(years)), replace=False)
test_years = np.setdiff1d(years, train_years)

train_df = numerical_candidates_df[numerical_candidates_df['Year'].isin(train_years)]
test_df = numerical_candidates_df[numerical_candidates_df['Year'].isin(test_years)]

# print(train_df)

In [16]:
# find highest correlating for Share
share_correlations = train_df.corr()["Share"].sort_values(ascending=False)
print(share_correlations.head(20))

Share    1.000000
MVP      0.731642
VORP     0.651410
PER      0.650038
WS/48    0.631261
WS       0.629056
BPM      0.627182
OBPM     0.561397
OWS      0.522411
PTS      0.466032
FG       0.430816
DBPM     0.412652
USG%     0.396529
DWS      0.389115
FT       0.363250
Win%     0.356313
FTA      0.352483
FGA      0.334561
Wins     0.314413
2P       0.313129
Name: Share, dtype: float64


In [17]:
# find highest correlating for MVP (1 hot encoded vector)
mvp_correlations = train_df.corr()["MVP"].sort_values(ascending=False)
print(mvp_correlations.head(20))

MVP      1.000000
Share    0.731642
WS       0.434624
VORP     0.432466
WS/48    0.429494
PER      0.420097
BPM      0.408838
OBPM     0.374784
OWS      0.370830
PTS      0.277836
FG       0.266140
DBPM     0.254156
DWS      0.248700
Win%     0.240379
USG%     0.224275
FT       0.214040
Wins     0.211726
2P       0.209304
DRB      0.204744
FTA      0.196293
Name: MVP, dtype: float64


**Make Prediction**

In [22]:
def calculate_pred(correlations, player, year_df):
    pred_val = 0
    
    for stat, cor in correlations.head(17).items():
        if stat == "MVP" or stat == "Share":
            continue

        # print("evaluating " + str(stat) + " with cor " + str(cor) + " where player[" + str(stat) + "] = " + str(player[stat]) + " and max(year_df[" + str(stat) + "]) = " + str(max(year_df[stat]))) 
        pred_val += cor * player[stat]/np.mean(year_df[stat])

    return pred_val

In [23]:
# predict with correlation with MVP voting share
def predict_by_yr_share(year_df):

    # I used chatGPT to help me figure out how to do this:
    year_df["Pred_Val"] = year_df.apply(lambda row: calculate_pred(share_correlations, row, year_df), axis=1)

    return year_df.sort_values(by='Pred_Val', ascending=False)

In [24]:
def predict_by_yr_mvp(year_df):

    # I used chatGPT to help me figure out how to do this:
    year_df["Pred_Val"] = year_df.apply(lambda row: calculate_pred(mvp_correlations, row, year_df), axis=1)

    return year_df.sort_values(by='Pred_Val', ascending=False)

**Evaluate Model**

In [25]:
def evaluate_share_accuracy_train():
    correct = 0
    for year in train_df["Year"].unique():
        if get_mvp_by_year(str(year)) == predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"]:
            print("Correct for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " == " + predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"])
            correct += 1
        else:
            print("Wrong for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " != " + predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"])
    return (correct/len(train_df["Year"].unique()))

evaluate_share_accuracy_train()

Wrong for 1982 season: Moses Malone != Magic Johnson
Wrong for 1983 season: Moses Malone != Larry Bird
Correct for 1985 season: Larry Bird == Larry Bird
Correct for 1986 season: Larry Bird == Larry Bird
Wrong for 1987 season: Magic Johnson != Michael Jordan
Correct for 1988 season: Michael Jordan == Michael Jordan
Wrong for 1989 season: Magic Johnson != Michael Jordan
Correct for 1992 season: Michael Jordan == Michael Jordan
Wrong for 1993 season: Charles Barkley != Michael Jordan
Wrong for 1994 season: Hakeem Olajuwon != David Robinson
Correct for 1995 season: David Robinson == David Robinson
Wrong for 1997 season: Karl Malone != Michael Jordan
Correct for 1999 season: Karl Malone == Karl Malone
Correct for 2000 season: Shaquille O'Neal == Shaquille O'Neal
Correct for 2004 season: Kevin Garnett == Kevin Garnett
Wrong for 2007 season: Dirk Nowitzki != LeBron James
Wrong for 2008 season: Kobe Bryant != LeBron James
Correct for 2009 season: LeBron James == LeBron James
Correct for 2010 s

0.5806451612903226

In [26]:
def evaluate_mvp_accuracy_train():
    correct = 0
    for year in train_df["Year"].unique():
        if get_mvp_by_year(str(year)) == predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"]:
            print("Correct for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " == " + predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"])
            correct += 1
        else:
            print("Wrong for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " != " + predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"])
    return (correct/len(train_df["Year"].unique()))

evaluate_mvp_accuracy_train()

Wrong for 1982 season: Moses Malone != Magic Johnson
Wrong for 1983 season: Moses Malone != Larry Bird
Correct for 1985 season: Larry Bird == Larry Bird
Correct for 1986 season: Larry Bird == Larry Bird
Wrong for 1987 season: Magic Johnson != Michael Jordan
Correct for 1988 season: Michael Jordan == Michael Jordan
Wrong for 1989 season: Magic Johnson != Michael Jordan
Correct for 1992 season: Michael Jordan == Michael Jordan
Wrong for 1993 season: Charles Barkley != Michael Jordan
Wrong for 1994 season: Hakeem Olajuwon != David Robinson
Correct for 1995 season: David Robinson == David Robinson
Wrong for 1997 season: Karl Malone != Michael Jordan
Correct for 1999 season: Karl Malone == Karl Malone
Correct for 2000 season: Shaquille O'Neal == Shaquille O'Neal
Correct for 2004 season: Kevin Garnett == Kevin Garnett
Correct for 2007 season: Dirk Nowitzki == Dirk Nowitzki
Wrong for 2008 season: Kobe Bryant != LeBron James
Correct for 2009 season: LeBron James == LeBron James
Correct for 201

0.6129032258064516

In [27]:
def evaluate_share_accuracy_test():
    correct = 0
    for year in test_df["Year"].unique():
        if get_mvp_by_year(str(year)) == predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"]:
            print("Correct for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " == " + predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"])
            correct += 1
        else:
            print("Wrong for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " != " + predict_by_yr_share(get_final_df_by_year(str(year))).iloc[0]["Player"])
    return (correct/len(test_df["Year"].unique()))

evaluate_share_accuracy_test()

Correct for 1980 season: Kareem Abdul-Jabbar == Kareem Abdul-Jabbar
Wrong for 1981 season: Julius Erving != Magic Johnson
Correct for 1984 season: Larry Bird == Larry Bird
Wrong for 1990 season: Magic Johnson != Michael Jordan
Correct for 1991 season: Michael Jordan == Michael Jordan
Correct for 1996 season: Michael Jordan == Michael Jordan
Wrong for 1998 season: Michael Jordan != David Robinson
Wrong for 2001 season: Allen Iverson != Shaquille O'Neal
Correct for 2002 season: Tim Duncan == Tim Duncan
Wrong for 2003 season: Tim Duncan != Tracy McGrady
Wrong for 2005 season: Steve Nash != Kevin Garnett
Wrong for 2006 season: Steve Nash != LeBron James
Correct for 2018 season: James Harden == James Harden
Correct for 2022 season: Nikola Jokić == Nikola Jokić


0.5

In [None]:
def evaluate_mvp_accuracy_test():
    correct = 0
    for year in test_df["Year"].unique():
        if get_mvp_by_year(str(year)) == predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"]:
            print("Correct for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " == " + predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"])
            correct += 1
        else:
            print("Wrong for " + str(year) + " season: " + get_mvp_by_year(str(year)) + " != " + predict_by_yr_mvp(get_final_df_by_year(str(year))).iloc[0]["Player"])
    return (correct/len(test_df["Year"].unique()))

evaluate_mvp_accuracy_test()
    

Correct for 1980 season: Kareem Abdul-Jabbar == Kareem Abdul-Jabbar
Wrong for 1981 season: Julius Erving != Magic Johnson
Correct for 1984 season: Larry Bird == Larry Bird
Wrong for 1990 season: Magic Johnson != Michael Jordan
Correct for 1991 season: Michael Jordan == Michael Jordan
Correct for 1996 season: Michael Jordan == Michael Jordan
Wrong for 1998 season: Michael Jordan != David Robinson
Wrong for 2001 season: Allen Iverson != Shaquille O'Neal


**2025 Prediction**

In [None]:
predict_by_yr_share(get_final_df_by_year("2025")).head(5)

In [None]:
predict_by_yr_mvp(get_final_df_by_year("2025")).head(5)