# 0. Basic configuration

In [1]:
# Imports
import numpy as np
import pandas as pd
import pickle

In [2]:
# File paths (relative to the notebook's working directory)
DATA_PATH = '../data'

# CSV outputs written by this notebook: season stats, season salaries, and the join of both
STAT_FILE = f'{DATA_PATH}/stat.csv'
SALARY_FILE = f'{DATA_PATH}/salary.csv'
PLAYER_FILE = f'{DATA_PATH}/player.csv'

# Intermediate pickle holding the complete nba_api responses fetched for each player
NBA_API_FILE = f'{DATA_PATH}/players_stats.pkl'

# 1. Get player statistics using the [NBA API](https://github.com/swar/nba_api/)

In [3]:
# Install NBA API
!pip install nba_api



## 1.1. Basic API operation (understanding how it works)

In [4]:
# Get a list of teams
from nba_api.stats.static import teams

nba_teams = teams.get_teams()
print(f'Number of teams fetched: {len(nba_teams)}')
nba_teams[:2]

Number of teams fetched: 30


[{'id': 1610612737,
  'full_name': 'Atlanta Hawks',
  'abbreviation': 'ATL',
  'nickname': 'Hawks',
  'city': 'Atlanta',
  'state': 'Atlanta',
  'year_founded': 1949},
 {'id': 1610612738,
  'full_name': 'Boston Celtics',
  'abbreviation': 'BOS',
  'nickname': 'Celtics',
  'city': 'Boston',
  'state': 'Massachusetts',
  'year_founded': 1946}]

In [5]:
teams.find_team_by_abbreviation('LAL')

{'id': 1610612747,
 'full_name': 'Los Angeles Lakers',
 'abbreviation': 'LAL',
 'nickname': 'Lakers',
 'city': 'Los Angeles',
 'state': 'California',
 'year_founded': 1948}

In [6]:
# Get a list of players
from nba_api.stats.static import players

nba_players = players.get_players()
print(f'Number of players fetched: {len(nba_players)}')
nba_players[:3]

Number of players fetched: 4723


[{'id': 76001,
  'full_name': 'Alaa Abdelnaby',
  'first_name': 'Alaa',
  'last_name': 'Abdelnaby',
  'is_active': False},
 {'id': 76002,
  'full_name': 'Zaid Abdul-Aziz',
  'first_name': 'Zaid',
  'last_name': 'Abdul-Aziz',
  'is_active': False},
 {'id': 76003,
  'full_name': 'Kareem Abdul-Jabbar',
  'first_name': 'Kareem',
  'last_name': 'Abdul-Jabbar',
  'is_active': False}]

In [7]:
players.find_players_by_full_name("Anthony Davis")

[{'id': 203076,
  'full_name': 'Anthony Davis',
  'first_name': 'Anthony',
  'last_name': 'Davis',
  'is_active': True}]

In [8]:
# Get career stats for a player
from nba_api.stats.endpoints import playercareerstats

# Anthony Davis
career = playercareerstats.PlayerCareerStats(player_id=203076)
career.season_totals_regular_season.get_data_frame()

Unnamed: 0,PLAYER_ID,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,203076,2012-13,0,1610612740,NOH,20.0,64,60,1846.0,349,...,0.751,165,357,522,63,75,112,89,158,867
1,203076,2013-14,0,1610612740,NOP,21.0,67,66,2358.0,522,...,0.791,207,466,673,105,89,189,109,200,1394
2,203076,2014-15,0,1610612740,NOP,22.0,68,68,2455.0,642,...,0.805,173,523,696,149,100,200,95,141,1656
3,203076,2015-16,0,1610612740,NOP,23.0,61,61,2164.0,560,...,0.758,130,497,627,116,78,125,121,148,1481
4,203076,2016-17,0,1610612740,NOP,24.0,75,75,2708.0,770,...,0.802,172,712,884,157,94,167,181,168,2099
5,203076,2017-18,0,1610612740,NOP,25.0,75,75,2727.0,780,...,0.828,187,644,831,174,115,193,162,159,2110
6,203076,2018-19,0,1610612740,NOP,26.0,56,56,1850.0,530,...,0.794,174,498,672,218,88,135,112,132,1452
7,203076,2019-20,0,1610612747,LAL,27.0,62,62,2131.0,551,...,0.846,142,435,577,200,91,143,154,156,1618
8,203076,2020-21,0,1610612747,LAL,28.0,36,36,1162.0,301,...,0.738,62,224,286,110,45,59,74,60,786
9,203076,2021-22,0,1610612747,LAL,28.0,22,22,786.0,213,...,0.728,64,156,220,67,29,50,44,51,534


In [9]:
career.get_available_data()

dict_keys(['SeasonTotalsRegularSeason', 'CareerTotalsRegularSeason', 'SeasonTotalsPostSeason', 'CareerTotalsPostSeason', 'SeasonTotalsAllStarSeason', 'CareerTotalsAllStarSeason', 'SeasonTotalsCollegeSeason', 'CareerTotalsCollegeSeason', 'SeasonTotalsShowcaseSeason', 'CareerTotalsShowcaseSeason', 'SeasonRankingsRegularSeason', 'SeasonRankingsPostSeason'])

In [10]:
# Get information about a player
from nba_api.stats.endpoints import commonplayerinfo

player_info = commonplayerinfo.CommonPlayerInfo(player_id=203076)
player_info.common_player_info.get_data_frame()

Unnamed: 0,PERSON_ID,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,...,PLAYERCODE,FROM_YEAR,TO_YEAR,DLEAGUE_FLAG,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GREATEST_75_FLAG
0,203076,Anthony,Davis,Anthony Davis,"Davis, Anthony",A. Davis,anthony-davis,1993-03-11T00:00:00,Kentucky,USA,...,anthony_davis,2012,2021,N,Y,Y,2012,1,1,Y


In [11]:
player_info.get_available_data()

dict_keys(['CommonPlayerInfo', 'PlayerHeadlineStats', 'AvailableSeasons'])

## 1.2. Fetching data

In [12]:
import logging
import time
from requests.exceptions import Timeout

from nba_api.stats.static import players
from nba_api.stats.endpoints import playercareerstats, commonplayerinfo

import os

LOG_FILE_PATH = '../logs/getstats.log'

# basicConfig opens the log file right away and fails if its folder is missing,
# so make sure the folder exists first.
os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

logging.basicConfig(filename=LOG_FILE_PATH, encoding='utf-8', level=logging.INFO)

def getPlayersData(persist_file, max_tries=10):
    """Fetch info and career stats for every player and checkpoint them to disk.

    The function is resumable: players already present in `persist_file` are
    skipped, and the file is rewritten after each newly fetched player.

    Parameters
    ----------
    persist_file : str
        Path of the pickle file used as checkpoint. It holds a list of dicts
        with the keys 'id', 'info' and 'career'.
    max_tries : int, default 10
        Maximum number of attempts per player when the request times out.
        After a failed attempt the function sleeps `tries * 60` seconds.

    Returns
    -------
    None
        Results are only written to `persist_file`.
    """
    logging.info('Starting data collection')

    # Load data already fetched.
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # `persist_file` at a checkpoint produced by this function.
    logging.info('Loading previous saved data')
    try:
        # A separate name for the handle keeps `persist_file` bound to the path,
        # which is needed again for every save below.
        with open(persist_file, 'rb') as saved_file:
            players_stats = pickle.load(saved_file)
    except FileNotFoundError:
        # First run: nothing has been saved yet
        players_stats = []
    except Exception:
        # Keep the original best-effort behaviour (start from scratch),
        # but leave a trace of what went wrong instead of hiding it.
        logging.exception('Could not load previous data, starting from scratch')
        players_stats = []

    logging.info(f'{len(players_stats)} players loaded')

    # Set of ids already on disk, for constant-time "already fetched" checks
    fetched_ids = {p['id'] for p in players_stats}

    # Iterate over all players in static data
    for player in players.get_players():
        player_id = player['id']

        # Skip if player already fetched
        if player_id in fetched_ids:
            logging.info(f"skipping player_id = {player_id}")
            continue

        # Will try to get a player until success or `max_tries`
        success = False
        tries = 0
        while (not success) and (tries < max_tries):
            try:
                logging.info(f"Attempting to get player {player_id} at try {tries}")

                # Request player info and career stats using NBA API
                info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)
                career = playercareerstats.PlayerCareerStats(player_id=player_id)
            except Timeout:
                logging.error(f"Failed to get player {player_id}")
                tries += 1
                # Linear back-off: wait one more minute after each failure
                time.sleep(tries * 60)
                continue

            players_stats.append({
                'id': player_id,
                'info': info,
                'career': career
            })
            fetched_ids.add(player_id)

            logging.info(f"Player {player_id} successfully fetched!")

            # Save updated information to disk
            logging.info(f'Saving to file')
            with open(persist_file, 'wb') as output_file:
                pickle.dump(players_stats, output_file)

            logging.info(f'Persistent file has now {len(players_stats)} players')
            success = True

        if not success:
            logging.error(f"Giving up on player {player_id} after {max_tries} tries")

In [13]:
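# Disabled on purpose: this issues two API requests per player and takes a long time.
# Uncomment to (re)build NBA_API_FILE.
# getPlayersData(persist_file=NBA_API_FILE)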

## 1.3. Transforming the fetched data into a CSV table

In [14]:
# Load the raw nba_api responses saved by getPlayersData.
# The context manager closes the file even if unpickling fails.
# NOTE: from here on `players` is this list and no longer the
# nba_api.stats.static.players module imported earlier.
with open(NBA_API_FILE, 'rb') as file:
    players = pickle.load(file)

In [15]:
# Build one long table with the regular-season rows of every player,
# tagging each row with the player's display name.
season_frames = []
for player in players:
    name = player['info'].common_player_info.get_data_frame()['DISPLAY_FIRST_LAST'][0]
    data = player['career'].season_totals_regular_season.get_data_frame()
    data.insert(1, 'PLAYER_NAME', name)
    season_frames.append(data)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop copies the whole table on every iteration.
player_stats = pd.concat(season_frames) if season_frames else pd.DataFrame()

In [16]:
player_stats.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76001,Alaa Abdelnaby,1990-91,0,1610612757,POR,23.0,43,0,290.0,...,0.568,27.0,62.0,89.0,12,4.0,12.0,22.0,39,135
1,76001,Alaa Abdelnaby,1991-92,0,1610612757,POR,24.0,71,1,934.0,...,0.752,81.0,179.0,260.0,30,25.0,16.0,66.0,132,432
2,76001,Alaa Abdelnaby,1992-93,0,1610612749,MIL,25.0,12,0,159.0,...,0.75,12.0,25.0,37.0,10,6.0,4.0,13.0,24,64
3,76001,Alaa Abdelnaby,1992-93,0,1610612738,BOS,25.0,63,52,1152.0,...,0.76,114.0,186.0,300.0,17,19.0,22.0,84.0,165,514
4,76001,Alaa Abdelnaby,1992-93,0,0,TOT,25.0,75,52,1311.0,...,0.759,126.0,211.0,337.0,27,25.0,26.0,97.0,189,578


In [17]:
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28153 entries, 0 to 0
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PLAYER_ID          28153 non-null  object 
 1   PLAYER_NAME        28153 non-null  object 
 2   SEASON_ID          28153 non-null  object 
 3   LEAGUE_ID          28153 non-null  object 
 4   TEAM_ID            28153 non-null  object 
 5   TEAM_ABBREVIATION  28144 non-null  object 
 6   PLAYER_AGE         28153 non-null  float64
 7   GP                 28153 non-null  object 
 8   GS                 21671 non-null  object 
 9   MIN                27384 non-null  object 
 10  FGM                28153 non-null  object 
 11  FGA                28153 non-null  object 
 12  FG_PCT             28139 non-null  object 
 13  FG3M               22120 non-null  object 
 14  FG3A               22120 non-null  object 
 15  FG3_PCT            21898 non-null  object 
 16  FTM                28153 n

In [18]:
player_stats.to_csv(STAT_FILE, index=False)

# 2. Get player salaries by scraping the [HoopsHype](https://hoopshype.com/salaries/) site

## 2.1. Fetching data

In [19]:
!pip install lxml



In [20]:
import requests
from bs4 import BeautifulSoup

In [21]:
# Main salary page URL
URL = "https://hoopshype.com/salaries/players/"

# Fetch the page. The timeout keeps the cell from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
request = requests.get(URL, timeout=30)
request.raise_for_status()

# Parse the HTML into a BeautifulSoup object
html = request.text
soup = BeautifulSoup(html, 'html.parser')

In [22]:
# Get page links to salary by season
season_selector = soup.find("div", {"class": "salaries-team-selector-top"})
if season_selector is None:
    # Fail with a clear message if the page layout changed, instead of an
    # AttributeError on None.
    raise RuntimeError('Season selector ("salaries-team-selector-top") not found in the page')

# Map season label -> URL; `string=True` keeps only anchors that have text
# (it replaces the deprecated `text=` argument).
links = {
    a.text.strip(): a['href']
    for a in season_selector.find_all("a", href=True, string=True)
}
dict(list(links.items())[0:3])

{'2021/22': 'https://hoopshype.com/salaries/players/',
 '2020/21': 'https://hoopshype.com/salaries/players/2020-2021/',
 '2018/19': 'https://hoopshype.com/salaries/players/2018-2019/'}

In [23]:
# Download every season page and keep the first HTML table found in it,
# stored as a pandas DataFrame under its season label
salaries = {}
for season, link in links.items():
    season_tables = pd.read_html(link)
    salaries[season] = season_tables[0]

## 2.2. Transforming data

In [24]:
# Light clean-up of the fetched tables: drop the unnecessary 'Unnamed: 0' column
# and use the player name as index
df_salaries = []
for season_table in salaries.values():
    cleaned_table = season_table.drop(columns='Unnamed: 0')
    df_salaries.append(cleaned_table.set_index('Player'))

# Preview the first row of the first three tables
display(df_salaries[0].head(1))
display(df_salaries[1].head(1))
df_salaries[2].head(1)

Unnamed: 0_level_0,2021/22,2022/23,2023/24,2024/25,2025/26,2026/27
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stephen Curry,"$45,780,966","$48,070,014","$51,915,615","$55,761,217","$59,606,817",$0


Unnamed: 0_level_0,2020/21,2020/21(*)
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Stephen Curry,"$43,006,362","$43,006,362"


Unnamed: 0_level_0,2018/19,2018/19(*)
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Stephen Curry,"$37,457,154","$38,320,489"


In [25]:
# Merge all season tables into a single one, aligned on the player-name index.
# The outer join keeps every player that appears in at least one season.
base_table = df_salaries[0]
remaining_tables = df_salaries[1:]
grouped_salary = base_table.join(remaining_tables, how='outer')

player_count = len(grouped_salary)
print(f'Salaries data for {player_count} players\n')
grouped_salary.iloc[345:348]

Salaries data for 2768 players



Unnamed: 0_level_0,2021/22,2022/23,2023/24,2024/25,2025/26,2026/27,2020/21,2020/21(*),2018/19,2018/19(*),...,1999/00,1999/00(*),1997/98,1997/98(*),1995/96,1995/96(*),1993/94,1993/94(*),1991/92,1991/92(*)
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cade Cunningham,"$10,050,120","$10,552,800","$11,055,360","$13,940,809","$18,123,052",$0,,,,,...,,,,,,,,,,
Cal Bowdler,,,,,,,,,,,...,"$1,025,880","$1,591,268",,,,,,,,
Calbert Cheaney,,,,,,,,,,,...,"$2,000,000","$3,102,250","$3,600,000","$5,789,577","$2,800,000","$4,733,321","$2,000,000","$3,570,595",,


In [26]:
# Strip the currency formatting ("$" and thousands separators) from the strings.
# The "$" pattern is a raw string: '\$' in a normal literal is an invalid escape sequence.
salary = grouped_salary.replace({r'\$': '', ',': ''}, regex=True)

# then convert the columns to a numeric type
salary = salary.apply(pd.to_numeric)

# and rewrite the season names in the columns ("2020/21" -> "2020-21").
# A "(*)" in the column name marks the inflation-adjusted salary (today's value);
# it becomes the "_deflacted" suffix.
salary.columns = (
    salary.columns
    .str.replace('/', '-', regex=False)
    .str.replace('(*)', '_deflacted', regex=False)
)

salary.iloc[345:348]

Unnamed: 0_level_0,2021-22,2022-23,2023-24,2024-25,2025-26,2026-27,2020-21,2020-21_deflacted,2018-19,2018-19_deflacted,...,1999-00,1999-00_deflacted,1997-98,1997-98_deflacted,1995-96,1995-96_deflacted,1993-94,1993-94_deflacted,1991-92,1991-92_deflacted
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cade Cunningham,10050120.0,10552800.0,11055360.0,13940809.0,18123052.0,0.0,,,,,...,,,,,,,,,,
Cal Bowdler,,,,,,,,,,,...,1025880.0,1591268.0,,,,,,,,
Calbert Cheaney,,,,,,,,,,,...,2000000.0,3102250.0,3600000.0,5789577.0,2800000.0,4733321.0,2000000.0,3570595.0,,


In [27]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2768 entries, A.C. Green to Zylan Cheatham
Data columns (total 68 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2021-22            536 non-null    float64
 1   2022-23            536 non-null    float64
 2   2023-24            536 non-null    float64
 3   2024-25            536 non-null    float64
 4   2025-26            536 non-null    float64
 5   2026-27            536 non-null    float64
 6   2020-21            578 non-null    float64
 7   2020-21_deflacted  578 non-null    float64
 8   2018-19            576 non-null    float64
 9   2018-19_deflacted  576 non-null    float64
 10  2016-17            545 non-null    float64
 11  2016-17_deflacted  545 non-null    float64
 12  2014-15            514 non-null    float64
 13  2014-15_deflacted  514 non-null    float64
 14  2012-13            494 non-null    float64
 15  2012-13_deflacted  494 non-null    float64
 16  2010-11   

In [28]:
## Now it is time to change the DataFrame format. We want to have a final DF in the form:
## /--------------|---------|---------|---------\
## |    player    | season  | nominal |  real   |
## |--------------|---------|---------|---------|
## | Player Name  | 2018-19 | 2250000 | 2500000 |
## | Player Name  | 2019-20 | 2300000 | 2530000 |
## | Player Name  | 2020-21 | 3000000 | 3010000 |
## | Other Player | 2016-17 | 1885000 | 2035000 |
## | ...          | ...     | ...     | ...     |
## \--------------|---------|---------|---------/

# First, split the column labels in two groups: columns carrying the
# '_deflacted' marker hold 'real' values, all the others hold 'nominal' values
nominal_cols, real_cols = [], []
for column in salary.columns:
    if 'deflacted' in column:
        real_cols.append(column)
    else:
        nominal_cols.append(column)

nominal_cols[:3], real_cols[:3]

(['2021-22', '2022-23', '2023-24'],
 ['2020-21_deflacted', '2018-19_deflacted', '2016-17_deflacted'])

In [29]:
# Reshape the wide table (one row per player, one column per season) into the
# long form: one row per (player, season) with 'nominal' and 'real' columns.
salary_frames = []
for name, row in salary.iterrows():
    # Nominal values; a 0 is turned into NaN so that it counts as missing in dropna below
    nominal = row[nominal_cols].replace(0, np.nan).rename('nominal').to_frame()
    # Real values, relabelled with the bare season name so they align with `nominal`
    real = row[real_cols].rename(lambda x: x.replace('_deflacted', '')).rename('real').to_frame()
    player = (
        nominal.join(real)
        .sort_index()
        .reset_index()
        .rename(columns={'index': 'season'})
        # Drop the seasons in which the player has neither value
        .dropna(how='all', subset=['nominal', 'real'])
    )
    player.insert(0, 'player', name)
    salary_frames.append(player)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop copies the whole table on every iteration.
player_salaries = pd.concat(salary_frames) if salary_frames else pd.DataFrame()

In [30]:
player_salaries.iloc[500:520]

Unnamed: 0,player,season,nominal,real
17,Amir Johnson,2007-08,3666666.0,4536819.0
18,Amir Johnson,2008-09,3666667.0,4319885.0
19,Amir Johnson,2009-10,3666666.0,4382411.0
20,Amir Johnson,2010-11,5000000.0,5913724.0
21,Amir Johnson,2011-12,5500000.0,6281547.0
22,Amir Johnson,2012-13,6000000.0,6740436.0
23,Amir Johnson,2013-14,6500000.0,7176238.0
24,Amir Johnson,2014-15,7000000.0,7571353.0
25,Amir Johnson,2015-16,12000000.0,12963417.0
26,Amir Johnson,2016-17,12000000.0,12835406.0


In [31]:
len(player_salaries)

15899

In [32]:
player_salaries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15899 entries, 0 to 30
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   player   15899 non-null  object 
 1   season   15899 non-null  object 
 2   nominal  15899 non-null  float64
 3   real     14550 non-null  float64
dtypes: float64(2), object(2)
memory usage: 621.1+ KB


In [33]:
player_salaries.to_csv(SALARY_FILE, index=False)

# 3. Joining everything

In [34]:
# Loading files
stat = pd.read_csv(STAT_FILE)
salary = pd.read_csv(SALARY_FILE)

In [35]:
stat.head(3)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,SEASON_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,76001,Alaa Abdelnaby,1990-91,0,1610612757,POR,23.0,43,0.0,290.0,...,0.568,27.0,62.0,89.0,12,4.0,12.0,22.0,39,135
1,76001,Alaa Abdelnaby,1991-92,0,1610612757,POR,24.0,71,1.0,934.0,...,0.752,81.0,179.0,260.0,30,25.0,16.0,66.0,132,432
2,76001,Alaa Abdelnaby,1992-93,0,1610612749,MIL,25.0,12,0.0,159.0,...,0.75,12.0,25.0,37.0,10,6.0,4.0,13.0,24,64


In [36]:
# Setting `stat` index to ['PLAYER_NAME', 'SEASON_ID']
df_stat = stat.set_index(['PLAYER_NAME', 'SEASON_ID'])
df_stat.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_ID,LEAGUE_ID,TEAM_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,GS,MIN,FGM,FGA,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
PLAYER_NAME,SEASON_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Lorenzen Wright,2003-04,953,0,1610612763,MEM,28.0,65,46.0,1674.0,257,586,...,0.733,144.0,301.0,445.0,71,45.0,58.0,77.0,192,610
Shawn Kemp,1991-92,431,0,1610612760,SEA,22.0,64,23.0,1808.0,362,718,...,0.748,264.0,401.0,665.0,86,70.0,124.0,156.0,261,994
Corey Beck,1998-99,1133,0,1610612766,CHH,28.0,16,0.0,150.0,14,31,...,0.462,3.0,20.0,23.0,20,7.0,2.0,11.0,21,35


In [37]:
salary.head(3)

Unnamed: 0,player,season,nominal,real
0,A.C. Green,1990-91,1750000.0,3473015.0
1,A.C. Green,1991-92,1750000.0,3317240.0
2,A.C. Green,1992-93,1750000.0,3217865.0


In [38]:
# Give the salary table the same key names used by `stat`, then index it
# by ['PLAYER_NAME', 'SEASON_ID'] so both tables can be joined on that pair
salary_columns = ['PLAYER_NAME', 'SEASON_ID', 'SALARY_NOMINAL', 'SALARY_REAL']
join_keys = salary_columns[:2]

salary_renamed = salary.copy()
salary_renamed.columns = salary_columns
df_salary = salary_renamed.set_index(join_keys)
df_salary.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,SALARY_NOMINAL,SALARY_REAL
PLAYER_NAME,SEASON_ID,Unnamed: 2_level_1,Unnamed: 3_level_1
A.C. Green,1990-91,1750000.0,3473015.0
A.C. Green,1991-92,1750000.0,3317240.0
A.C. Green,1992-93,1750000.0,3217865.0


In [39]:
# Joining both datasets on their shared ['PLAYER_NAME', 'SEASON_ID'] index.
# how='inner' keeps only the player-seasons present in both tables.
# NOTE(review): `stat` can hold several rows for one player-season (one per team
# plus a 'TOT' row), so each of those rows receives the same salary values —
# confirm this duplication is intended.
player = df_stat.join(df_salary, how='inner')

In [40]:
# Total rows
len(player)

16341

In [41]:
# Total unique players (all seasons), read straight from the PLAYER_NAME index level
player_names = player.index.get_level_values('PLAYER_NAME')
len(player_names.unique())

2497

In [42]:
player.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 16341 entries, ('A.C. Green', '1990-91') to ('Zylan Cheatham', '2019-20')
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PLAYER_ID          16341 non-null  int64  
 1   LEAGUE_ID          16341 non-null  int64  
 2   TEAM_ID            16341 non-null  int64  
 3   TEAM_ABBREVIATION  16341 non-null  object 
 4   PLAYER_AGE         16341 non-null  float64
 5   GP                 16341 non-null  int64  
 6   GS                 16341 non-null  float64
 7   MIN                16341 non-null  float64
 8   FGM                16341 non-null  int64  
 9   FGA                16341 non-null  int64  
 10  FG_PCT             16341 non-null  float64
 11  FG3M               16341 non-null  float64
 12  FG3A               16341 non-null  float64
 13  FG3_PCT            16341 non-null  float64
 14  FTM                16341 non-null  int64  
 15  FTA                163

In [43]:
player.to_csv(PLAYER_FILE)