In [1]:
#Importing libraries

import numpy as np
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import warnings
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.simplefilter(action = 'ignore', category=SettingWithCopyWarning)

In [16]:
#Setting the year marks we need (useful for loops later)

first = 1980
last = 2023
years = [i for i in range(first, 2023)]
seasons = len(years)


#Parsing the tables with the MVP candidates for every year and storing the dfs in a list

mvp_all = []
columns = ["Rank", 'Player',  'Age', 'Tm', 'First', 'Pts Won',
           'Pts Max', 'Share', 'G', 'MP', 'PTS', 'TRB', 'AST', 'STL',
           'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']


for year in years:
    mvp_url = "https://www.basketball-reference.com/awards/awards_{}.html".format(year)
    all_awards_year = pd.read_html(mvp_url)
    mvp_year = all_awards_year[0]
    mvp_year.columns = columns
    mvp_year["Year"] = year
    mvp_all.append(mvp_year) 

In [6]:
#Scraping full data for all years. This includes per game, per 36mins, per 100 possessions and advanced data.
#Then we only keep the data for the players receiving MVP votes each year and adding a "Year" column to separate between seasons


mvp_per_game_stats = []
#mvp_per_36_stats = []
mvp_advanced_stats = []
#mvp_per_possession_stats = []

    
for year in years:
    
    names = mvp_all[year-first]["Player"].values
    
    per_game_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    #per_36_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_minute.html".format(year)
    advanced_url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)
    #per_possession_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_poss.html".format(year)
    
    per_game_tab = pd.read_html(per_game_url)
    #per_36_tab = pd.read_html(per_36_url)
    advanced_tab = pd.read_html(advanced_url)
    #per_possession_tab = pd.read_html(per_possession_url)
    
    per_game_tab[0]["Player"] = per_game_tab[0]["Player"].str.replace("*","")
    #per_36_tab[0]["Player"] = per_36_tab[0]["Player"].str.replace("*","")
    advanced_tab[0]["Player"] = advanced_tab[0]["Player"].str.replace("*","")
    #per_possession_tab[0]["Player"] = per_possession_tab[0]["Player"].str.replace("*","")
    
    per_game_tab_mvp = per_game_tab[0].loc[per_game_tab[0]["Player"].isin(names)]
    #per_36_tab_mvp = per_36_tab[0].loc[per_36_tab[0]["Player"].isin(names)]
    advanced_tab_mvp = advanced_tab[0].loc[advanced_tab[0]["Player"].isin(names)]
    #per_possession_tab_mvp = per_possession_tab[0].loc[per_possession_tab[0]["Player"].isin(names)]
    
    per_game_tab_mvp["Year"] = year
    #per_36_tab_mvp["Year"] = year
    advanced_tab_mvp["Year"] = year
    #per_possession_tab_mvp["Year"] = year
    
    mvp_per_game_stats.append(per_game_tab_mvp)
    #mvp_per_36_stats.append(per_36_tab_mvp)
    mvp_advanced_stats.append(advanced_tab_mvp)
    #mvp_per_possession_stats.append(per_possession_tab_mvp)

In [8]:
#Removing two columns we won't need and changing all numerical features to float (except two that are integers).


per_game_to_float_cols = ['MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

#per_possession_to_float_cols = ['MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg']

#per_36_to_float_cols = ['MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

advanced_to_float_cols = ['MP', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
       'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

to_integer_cols = ['Age', 'G']

for i in range(seasons):
    
    mvp_per_game_stats[i].drop(['Rk', 'GS'], axis=1, inplace=True)
    #mvp_per_36_stats[i].drop(['Rk', 'GS'], axis=1, inplace=True)
    mvp_advanced_stats[i].drop('Rk', axis=1, inplace=True)
    #mvp_per_possession_stats[i].drop(['Rk', 'GS'], axis=1, inplace=True)
    
    mvp_per_game_stats[i].drop_duplicates(subset=['Player'], keep = 'first', inplace=True)
    #mvp_per_36_stats[i].drop_duplicates(subset=['Player'], keep = 'first', inplace=True)
    mvp_advanced_stats[i].drop_duplicates(subset=['Player'], keep = 'first', inplace=True)
    #mvp_per_possession_stats[i].drop_duplicates(subset=['Player'], keep = 'first', inplace=True)

    mvp_per_game_stats[i][per_game_to_float_cols] = mvp_per_game_stats[i][per_game_to_float_cols].astype(float)
    mvp_per_game_stats[i][to_integer_cols] = mvp_per_game_stats[i][to_integer_cols].astype(int)
    
    # mvp_per_possession_stats[i][per_possession_to_float_cols] = mvp_per_possession_stats[i][per_possession_to_float_cols].astype(float)
    # mvp_per_possession_stats[i][to_integer_cols] = mvp_per_possession_stats[i][to_integer_cols].astype(int)
    
    # mvp_per_36_stats[i][per_36_to_float_cols] = mvp_per_36_stats[i][per_36_to_float_cols].astype(float)
    # mvp_per_36_stats[i][to_integer_cols] = mvp_per_36_stats[i][to_integer_cols].astype(int)
    
    mvp_advanced_stats[i][advanced_to_float_cols] = mvp_advanced_stats[i][advanced_to_float_cols].astype(float)
    mvp_advanced_stats[i][to_integer_cols] = mvp_advanced_stats[i][to_integer_cols].astype(int)

In [159]:
#Reading the columns of each dataset


#print(mvp_per_36_stats[29].columns)
#print(mvp_per_possession_stats[29].columns)
print(mvp_advanced_stats[29].columns)
print(mvp_per_game_stats[29].columns)

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year'],
      dtype='object')
Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Unnamed: 29', 'ORtg',
       'DRtg', 'Year'],
      dtype='object')
Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
       'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 24', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'Year'],
      dtype='object')
Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS

In [29]:
#Stacking data for all years in one dataframe and removing some empty columns that were there by default


all_per_game = pd.concat(mvp_per_game_stats, ignore_index=True)
#all_per_possession = pd.concat(mvp_per_possession_stats, ignore_index=True)
#all_per_possession.drop('Unnamed: 29', axis=1, inplace=True)
#all_per_36 = pd.concat(mvp_per_36_stats, ignore_index=True)
all_advanced = pd.concat(mvp_advanced_stats, ignore_index=True)
all_advanced.drop(columns=['Unnamed: 19', 'Unnamed: 24'], axis=1, inplace=True)

In [99]:
#Stacking the MVP data for all years too. 

mvp_full = pd.concat(mvp_all, ignore_index=True)

pure_mvp_full = mvp_full[['Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'WS', 'WS/48']]

In [33]:
#Isolating features not included in the initial MVP tables to be added. We keep Player, Team and Year as anchors for merging later   
    
advanced_to_merge = all_advanced.drop(['Age', 'Pos', 'G', 'MP', 'WS', 'WS/48'], axis=1)
per_game_to_merge = all_per_game[["Player", "Tm", "ORB", "DRB", "TOV", "PF", "Year"]]

ratings = all_per_possession[["Player", "Tm", "ORtg", "DRtg", "Year"]]

added_stats_df = pd.merge(pd.merge(per_game_to_merge, advanced_to_merge, on = ["Player", "Tm", "Year"], how = "left"), ratings, on = ["Player", "Tm", "Year"], how = 'left')




In [100]:
#Merging per_game and advanced stats with MVP table

mvp_all_stats = pd.merge(mvp_full, added_stats_df, on = ["Player", "Tm", "Year"], how = 'left')

In [121]:
#There are a handful of players in total that received MVP votes but they played for multiple teams in that season.
#Apart from Chauncey Billups in 2009 nobody else had significant share, so we remove them.
#We change Billups Team that year from TOT that means total to DEN for denver nuggets since he only played 2 games for the other team.
#Remaing players with 2 or more teams wiil be removed from the data later. This way we keep Billups in 2009.


mvp_all_stats[(mvp_all_stats["Player"] == "Chauncey Billups") & (mvp_all_stats["Tm"] == "TOT")] = mvp_all_stats[(mvp_all_stats["Player"] == "Chauncey Billups") & (mvp_all_stats["Tm"] == "TOT")].replace("TOT", "DEN")



In [125]:
#A dictionary with NBA teams full names and abbreviations, so that we can extract and match wins per season for players


team_dict = {"Boston Celtics": "BOS", "Brooklyn Nets": "BKN", "New York Knicks": "NYK",
             "Philadelphia 76ers": "PHI", "Toronto Raptors": "TOR", "Chicago Bulls": "CHI",
             "Cleveland Cavaliers": "CLE", "Detroit Pistons": "DET", "Indiana Pacers": "IND",
             "Milwaukee Bucks": "MIL", "Atlanta Hawks": "ATL", "Charlotte Hornets": "CHH",
             "Miami Heat": "MIA", "Orlando Magic": "ORL", "Washington Wizards": "WAS",
             "Denver Nuggets": "DEN", "Minnesota Timberwolves": "MIN", "Oklahoma City Thunder": "OKC",
             "Portland Trail Blazers": "POR", "Utah Jazz": "UTA","Golden State Warriors": "GSW",
             "Los Angeles Clippers": "LAC", "Los Angeles Lakers": "LAL", "Phoenix Suns": "PHO",
             "Sacramento Kings": "SAC", "Dallas Mavericks": "DAL", "Houston Rockets": "HOU",
             "Memphis Grizzlies": "MEM", "New Orleans Pelicans": "NOP", "San Antonio Spurs": "SAS",
             "Seattle SuperSonics": "SEA", "Kansas City Kings": "KCK","New Jersey Nets": "NJN",
             "Washington Bullets": "WSB", "Charlotte Bobcats": "CHA", "New Orleans Hornets": "NOH",
             "San Diego Clippers": "SDC", "New Orleans/Oklahoma City Hornets": "NOK",
             "Vancouver Grizzlies": "VAN"
            }



In [143]:
#Scraping NBA standings from 1980. The basketball-reference websites have different structure from 2016 so we need 
#to slightly change the code. Hence a separate loop. 



west_stands = []
east_stands = []
for i in range(1980,2016):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html".format(i)
    tables = pd.read_html(url)
    east = tables[0][~tables[0].W.str.contains("|".join("Division"))]
    west = tables[1][~tables[1].W.str.contains("|".join("Division"))]
    east["Eastern Conference"] = east["Eastern Conference"].str.replace("*","")
    west["Western Conference"] = west["Western Conference"].str.replace("*","")
    east = east.replace({"Eastern Conference":team_dict})
    west = west.replace({"Western Conference":team_dict})
    east = east[["Eastern Conference", "W", "W/L%"]]
    west = west[["Western Conference", "W", "W/L%"]]
    east = east.rename(columns={'Eastern Conference': 'Tm'})
    west = west.rename(columns={'Western Conference': 'Tm'})
    east_stands.append(east)
    west_stands.append(west)
    
for i in range(2016, 2023):
    url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html".format(i)
    tables = pd.read_html(url)
    east = tables[2][~tables[2].W.str.contains("|".join("Division"))]
    west = tables[3][~tables[3].W.str.contains("|".join("Division"))]
    east["Eastern Conference"] = east["Eastern Conference"].str.replace("*","")
    west["Western Conference"] = west["Western Conference"].str.replace("*","")
    east = east.replace({"Eastern Conference":team_dict})
    west = west.replace({"Western Conference":team_dict})
    east = east[["Eastern Conference", "W", "W/L%"]]
    west = west[["Western Conference", "W", "W/L%"]]
    east = east.rename(columns={'Eastern Conference': 'Tm'})
    west = west.rename(columns={'Western Conference': 'Tm'})
    east_stands.append(east)
    west_stands.append(west)

In [144]:
#Unite standings for eastern and western conference per year and add a Year column to match with Players data.


all_standings_per_year = []
for i in range(len(west_stands)):
    east_stands[i][["W", "W/L%"]] = east_stands[i][["W", "W/L%"]].astype({'W':int, 'W/L%':float})
    west_stands[i][["W", "W/L%"]] = west_stands[i][["W", "W/L%"]].astype({'W':int, 'W/L%':float})
    east_stands[i]["Year"] = i + 1980
    west_stands[i]["Year"] = i + 1980
    standings_per_year = pd.concat([east_stands[i], west_stands[i]], ignore_index=True)
    all_standings_per_year.append(standings_per_year)

In [146]:
#Merge all year standings in one dataframe

full_standings = pd.concat(all_standings_per_year, ignore_index=True)

In [148]:
#Merge player data and standings from 1980 to get number of wins and percentage for every candidate

mvp_df = pd.merge(mvp_all_stats, full_standings, on = ['Tm', 'Year'], how = 'left')

In [150]:
#Old data don't include 3P% so set NaN values to 0. As a feature it shouldn't be important anyway.
#Players with TOT as a team (played for more than one team that year) are removed as mentioned above. 


mvp_df['3P%'] = mvp_df['3P%'].fillna(0)
mvp_df.dropna(inplace = True)
mvp_df.reset_index(drop = True, inplace = True)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,OWS,DWS,OBPM,DBPM,BPM,VORP,ORtg,DRtg,W,W/L%
0,1,Kareem Abdul-Jabbar,32,LAL,147.0,147.0,221,0.665,82,38.3,...,9.5,5.3,4.8,2.4,7.2,7.3,118.0,100.0,60.0,0.732
1,2,Julius Erving,29,PHI,31.5,31.5,221,0.143,78,36.1,...,7.3,5.2,5.7,1.9,7.6,6.8,111.0,99.0,59.0,0.720
2,3,George Gervin,27,SAS,19.0,19.0,221,0.086,78,37.6,...,9.3,1.3,5.5,-1.6,3.9,4.4,115.0,110.0,41.0,0.500
3,4,Larry Bird,23,BOS,15.0,15.0,221,0.068,82,36.0,...,5.6,5.6,3.0,1.5,4.5,4.8,109.0,98.0,61.0,0.744
4,5T,Tiny Archibald,31,BOS,2.0,2.0,221,0.009,80,35.8,...,5.9,2.9,1.4,-0.3,1.1,2.3,115.0,105.0,61.0,0.744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,7,Ja Morant,22,MEM,0.0,10.0,1000,0.010,57,33.1,...,4.6,2.1,6.2,-0.1,6.1,3.9,116.0,111.0,56.0,0.683
686,8,Stephen Curry,33,GSW,0.0,4.0,1000,0.004,64,34.5,...,4.6,3.4,5.4,0.4,5.8,4.4,115.0,108.0,53.0,0.646
687,9,Chris Paul,36,PHO,0.0,2.0,1000,0.002,65,32.9,...,5.8,3.6,3.1,2.3,5.4,4.0,124.0,107.0,64.0,0.780
688,10T,DeMar DeRozan,32,CHI,0.0,1.0,1000,0.001,76,36.1,...,7.0,1.8,3.4,-0.9,2.5,3.1,117.0,115.0,46.0,0.561


In [152]:
#Save data to a csv file

mvp_df.to_csv('Data/MVP_per_game.csv', index=False)

In [173]:
#Also isolating pure MVP stats to use with otherwise averaged plyer data (per 100 possessions or per 36 mins)

#pure_mvp_full = mvp_full[['Rank', 'Player', 'Age', 'Tm', 'First', 'Pts Won', 'Pts Max', 'Share', 'G', 'MP', 'WS', 'WS/48', 'Year']]

In [None]:
# per_possession_columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P',
#        '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
#        'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg', 'Year']

In [174]:
# per_possession_to_merge = all_per_possession.drop(['Age', 'Pos', 'G', 'MP'], axis = 1)

# added_stats_per_poss_df = pd.merge(per_possession_to_merge,
#                                    advanced_to_merge,
#                                    on = ["Player", "Tm", "Year"],
#                                    how = "left"
#                                   )


In [175]:
# per_possession_mvp_stats = pd.merge(pure_mvp_full,
#                                     added_stats_per_poss_df,
#                                     on=['Player', 'Tm', 'Year'],
#                                     how = 'left'
#                                    )

In [177]:
#per_possession_mvp_stats[(per_possession_mvp_stats["Player"] == "Chauncey Billups") & (per_possession_mvp_stats["Tm"] == "TOT")] = per_possession_mvp_stats[(per_possession_mvp_stats["Player"] == "Chauncey Billups") & (per_possession_mvp_stats["Tm"] == "TOT")].replace("TOT", "DEN")






In [178]:
# mvp_per_possession_df = pd.merge(per_possession_mvp_stats,
#                                  full_standings,
#                                  on = ['Tm', 'Year'],
#                                  how = 'left'
#                                 )

In [181]:
# mvp_per_possession_df['3P%'] = mvp_per_possession_df['3P%'].fillna(0)
# mvp_per_possession_df.dropna(inplace = True)
# mvp_per_possession_df.reset_index(drop = True, inplace = True)

In [183]:
#Save data to a csv file

#mvp_per_possession_df.to_csv('Data/MVP_per_possession.csv', index=False)