In [1]:
# Import dependencies
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
def createPlayerDF(stat, year):
    # Set url for given year
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_{stat}.html'
    page = requests.get(url)
    
    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # Find the sought after table of data
    table = soup.find_all(class_="full_table")

    # Store the headers/column names
    head = soup.find(class_="thead")
    column_names_raw = [head.text for item in head][0]

    # Clean the column_names_raw list
    column_names = column_names_raw.replace("\n",",").split(",")[2:-1]
    
    # Create the dataframe
    players = []

    for i in range(len(table)):
        player_ = []

        for td in table[i].find_all("td"):
            player_.append(td.text)

        players.append(player_)

    df = pd.DataFrame(players, columns=column_names).set_index("Player")

    # Cleaning the player's name from occasional special characters
    df.index = df.index.str.replace('*', "", regex=True)
    
    return df

In [4]:
def createRosters(team):
    roster = []

    # Set url for given team
    url = f"https://www.basketball-reference.com/teams/{team}/{datetime.now().year}.html"
    page = requests.get(url)

    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the sought after table of data
    table = soup.find(id="roster")
    player_table = table.find_all(attrs={"data-stat" : "player"})

    # Create a list of all players in the player table
    for p in player_table[1:]:
        roster.append(p.text)

    # Remove "TW" suffix
    for i in range(len(roster)):
        if "\xa0\xa0(TW)" in roster[i]:
            roster[i] = roster[i].replace("\xa0\xa0(TW)", "")

    return roster

In [5]:
def createPlayerTotals(df):
    # Drop categorical and unnecessary columns
    columns_to_drop=["Pos", "Age", "Tm", "GS", "FG", "FG%", "3P", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FT%", "TRB", "PTS"]
    df = df.drop(columns=columns_to_drop)

    # Convert data to numeric instead of object
    df = df.astype(float)
    
    return df

In [29]:
def createTeamAverages(roster, player_totals):
    # Initilizing
    team_averages = [0,0,0,0,0,0,0,0,0,0]
    
    # Normalize team stats to 240 minutes per game
    factor = calculatePerMinuteFactor(roster, player_totals)

    # Loop through roster to store team average stats        
    for player in roster:
        totals = player_totals[player]
        if totals[0] > 0:
            # Store the number of games played before dropping column
            g = totals["G"]

            # Drop the now unnecessary columns
            totals = totals.drop(labels=["MP", "G"])

            # Adjust player totals columns to show stats per game
            for col in totals.index:
                totals[col] = totals[col] / g * factor

            team_averages += totals
                             
    return team_averages

In [30]:
def calculatePerMinuteFactor(roster, player_totals):
    # Initilizing
    team_minutes = 0
    
    for player in roster:
        totals = player_totals[player]
        if totals[0] > 0:
            team_minutes += totals["MP"] / totals["G"]

    if team_minutes != 240:
        factor = 240/team_minutes
    
    return factor

In [8]:
# Get nba players data into dataframes from the year 2016 - present
currentYear = datetime.now().year
startYear = 2016
year_totals = {}

for year in range(startYear, currentYear+1):
    year_totals[year] = createPlayerDF('totals', str(year))

In [9]:
# Get nba team rosters into dataframes
nba_teams = teams.get_teams()
nba_team_abr = [team['abbreviation'] for team in nba_teams]
team_rosters = {}

# Convert abreviation for Brooklyn, Pheonix, & Charlotte for basketball-reference.com
nba_team_abr[14] = "BRK"
nba_team_abr[19] = "PHO"
nba_team_abr[29] = "CHO"

for team in nba_team_abr:
    team_rosters[team] = createRosters(team)

In [10]:
# Create dataframe for each player's stats over the last 5 years
currentYear = datetime.now().year
num_years = 5
player_totals = {}

for team in team_rosters.values():
    for player in team:
        # Initializing
        player_totals[player] = [0,0,0,0,0,0,0,0,0,0,0,0]

        for i in range(0, num_years+1):
            if player in year_totals[currentYear-i].index:
                player_totals[player] += createPlayerTotals(year_totals[currentYear-i]).loc[player]            

In [27]:
# Create a dataframe for each teams per minute average based on current roster
team_averages = {}

for team in team_rosters:
    team_averages[team] = createTeamAverages(team_rosters[team], player_totals)

In [215]:
year_totals[2020]

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Steven Adams,C,26,OKC,63,63,1680,283,478,0.592,1,3,0.333,282,475,0.594,0.593,117,201,0.582,207,376,583,146,51,67,94,122,684
Bam Adebayo,PF,22,MIA,72,72,2417,440,790,0.557,2,14,0.143,438,776,0.564,0.558,264,382,0.691,176,559,735,368,82,93,204,182,1146
LaMarcus Aldridge,C,34,SAS,53,53,1754,391,793,0.493,61,157,0.389,330,636,0.519,0.532,158,191,0.827,103,289,392,129,36,87,74,128,1001
Kyle Alexander,C,23,MIA,2,0,13,1,2,0.5,0,0,,1,2,0.5,0.5,0,0,,2,1,3,0,0,0,1,1,2
Nickeil Alexander-Walker,SG,21,NOP,47,1,591,98,266,0.368,46,133,0.346,52,133,0.391,0.455,25,37,0.676,9,75,84,89,17,8,54,57,267
Grayson Allen,SG,24,MEM,38,0,718,117,251,0.466,57,141,0.404,60,110,0.545,0.58,39,45,0.867,8,77,85,52,10,2,33,53,330
Jarrett Allen,C,21,BRK,70,64,1852,302,465,0.649,0,6,0.0,302,459,0.658,0.649,171,270,0.633,216,455,671,110,40,92,77,162,775
Kadeem Allen,PG,27,NYK,10,0,117,19,44,0.432,5,16,0.313,14,28,0.5,0.489,7,11,0.636,2,7,9,21,5,2,8,7,50
Al-Farouq Aminu,PF,29,ORL,18,2,380,25,86,0.291,9,36,0.25,16,50,0.32,0.343,19,29,0.655,24,63,87,21,18,8,17,27,78
Justin Anderson,SG,26,BRK,10,1,107,10,38,0.263,6,29,0.207,4,9,0.444,0.342,2,4,0.5,1,20,21,8,0,6,4,13,28


In [203]:
team_rosters["TOR"]

['Chris Boucher',
 'Fred VanVleet',
 'Svi Mykhailiuk',
 'Scottie Barnes',
 'Gary Trent Jr.',
 'Precious Achiuwa',
 'Dalano Banton',
 'Pascal Siakam',
 'OG Anunoby',
 'Malachi Flynn',
 'Justin Champagnie',
 'Khem Birch',
 'Yuta Watanabe',
 'Isaac Bonga',
 'Goran Dragić',
 'David Johnson']

In [213]:
player_totals["Precious Achiuwa"]

G        94
MP     1579
FGA     500
3PA      42
FTA     164
ORB     154
DRB     310
AST      72
STL      40
BLK      50
TOV      79
PF      167
Name: Precious Achiuwa, dtype: int64

In [53]:
player_totals["Precious Achiuwa"].index

Index(['G', 'MP', 'FGA', '3PA', 'FTA', 'ORB', 'DRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF'],
      dtype='object')

In [28]:
team_averages["TOR"]

FGA    83.751929
3PA    31.554998
FTA    19.600832
ORB    12.312486
DRB    30.368204
AST    21.942520
STL     8.130506
BLK     4.853526
TOV    11.791164
PF     20.677176
Name: Chris Boucher, dtype: float64

In [50]:
# Convert year totals dictionary to dataframe and save as csv

dfs = []

for year, df in year_totals.items():
    temp_df = df
    temp_df['Year'] = year
    dfs.append(temp_df)
    
year_totals_df = pd.concat(dfs)
year_totals_df = year_totals_df.reset_index()
year_totals_df = year_totals_df.set_index(["Year", "Player"])

file_dir = os.path.join("data", "year_totals.csv")
year_totals_df.to_csv(file_dir)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Year,Player,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2016,Quincy Acy,PF,25,SAC,59,29,876,119,214,0.556,19,49,0.388,100,165,0.606,0.6,50,68,0.735,65,123,188,27,29,24,27,103,307
2016,Jordan Adams,SG,21,MEM,2,0,15,2,6,0.333,0,1,0.0,2,5,0.4,0.333,3,5,0.6,0,2,2,3,3,0,2,2,7
2016,Steven Adams,C,22,OKC,80,80,2014,261,426,0.613,0,0,,261,426,0.613,0.613,114,196,0.582,219,314,533,62,42,89,84,223,636
2016,Arron Afflalo,SG,30,NYK,71,57,2371,354,799,0.443,91,238,0.382,263,561,0.469,0.5,110,131,0.84,23,243,266,144,25,10,82,142,909
2016,Alexis Ajinça,C,27,NOP,59,17,861,150,315,0.476,0,1,0.0,150,314,0.478,0.476,52,62,0.839,75,194,269,31,19,36,54,134,352
2016,Cole Aldrich,C,27,LAC,60,5,800,134,225,0.596,0,0,,134,225,0.596,0.596,60,84,0.714,86,202,288,50,47,68,64,139,328
2016,LaMarcus Aldridge,PF,30,SAS,74,74,2261,536,1045,0.513,0,16,0.0,536,1029,0.521,0.513,259,302,0.858,176,456,632,110,38,81,99,151,1331
2016,Cliff Alexander,PF,20,POR,8,0,36,5,10,0.5,0,0,,5,10,0.5,0.5,0,0,,2,4,6,0,1,2,1,1,10
2016,Lavoy Allen,PF,26,IND,79,28,1599,191,370,0.516,0,0,,191,370,0.516,0.516,46,73,0.63,162,262,424,76,26,42,69,147,428
2016,Tony Allen,SG,34,MEM,64,57,1620,215,469,0.458,15,42,0.357,200,427,0.468,0.474,90,138,0.652,104,192,296,70,110,18,78,175,535


In [51]:
# Convert team roster dictionary to dataframe and save as csv

dfs = []
for team, players in team_rosters.items():
    temp_df = pd.DataFrame(players, columns=["Player"])
    temp_df['Team'] = team
    dfs.append(temp_df)
    
team_rosters_df = pd.concat(dfs)
team_rosters_df = team_rosters_df.set_index("Player")

file_dir = os.path.join("data", "team_rosters.csv")
team_rosters_df.to_csv(file_dir)

Unnamed: 0_level_0,Team
Player,Unnamed: 1_level_1
Trae Young,ATL
John Collins,ATL
Delon Wright,ATL
Kevin Huerter,ATL
Clint Capela,ATL
Danilo Gallinari,ATL
Lou Williams,ATL
Gorgui Dieng,ATL
Timothé Luwawu-Cabarrot,ATL
Bogdan Bogdanović,ATL


In [54]:
# Convert player totals dictionary to dataframe and save as csv

dfs = []
for player, stats in player_totals.items():
    temp_df = pd.DataFrame(stats, columns=stats.index)
    temp_df['Player'] = player
    dfs.append(temp_df)
    
player_totals_df = pd.concat(dfs)
player_totals_df = player_totals_df.set_index("Player")

TypeError: 'builtin_function_or_method' object is not iterable

In [58]:
for player, stats in player_totals.items():
    temp_df = pd.DataFrame(stats)
temp_df

Unnamed: 0,Arnoldas Kulboka
G,1.0
MP,3.0
FGA,0.0
3PA,0.0
FTA,0.0
ORB,0.0
DRB,0.0
AST,0.0
STL,0.0
BLK,0.0
