In [1]:
# Import dependencies
import os
import pandas as pd
import requests
import joblib
from bs4 import BeautifulSoup
from datetime import datetime
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
def createPlayerDF(stat, year):
    # Set url for given year
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_{stat}.html'
    page = requests.get(url)
    
    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # Find the sought after table of data
    table = soup.find_all(class_="full_table")

    # Store the headers/column names
    head = soup.find(class_="thead")
    column_names_raw = [head.text for item in head][0]

    # Clean the column_names_raw list
    column_names = column_names_raw.replace("\n",",").split(",")[2:-1]
    
    # Create the dataframe
    players = []

    for i in range(len(table)):
        player_ = []

        for td in table[i].find_all("td"):
            player_.append(td.text)

        players.append(player_)

    df = pd.DataFrame(players, columns=column_names).set_index("Player")

    # Cleaning the player's name from occasional special characters
    df.index = df.index.str.replace('*', "", regex=True)
    
    return df

In [4]:
def createRosters(team):
    roster = []

    # Set url for given team
    url = f"https://www.basketball-reference.com/teams/{team}/{datetime.now().year}.html"
    page = requests.get(url)

    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the sought after table of data
    table = soup.find(id="roster")
    player_table = table.find_all(attrs={"data-stat" : "player"})

    # Create a list of all players in the player table
    for p in player_table[1:]:
        roster.append(p.text)

    # Remove "TW" suffix
    for i in range(len(roster)):
        if "\xa0\xa0(TW)" in roster[i]:
            roster[i] = roster[i].replace("\xa0\xa0(TW)", "")

    return roster

In [5]:
def createPlayerTotals(df):
    # Drop categorical and unnecessary columns
    columns_to_drop=["Pos", "Age", "Tm", "GS", "FG", "FG%", "3P", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FT%", "TRB", "PTS"]
    df = df.drop(columns=columns_to_drop)

    # Convert data to numeric instead of object
    df = df.astype(float)
    
    return df

In [29]:
def createTeamAverages(roster):
    # Initilizing
    team_averages = [0,0,0,0,0,0,0,0,0,0]
    
    # Normalize team stats to 240 minutes per game
    factor = calculatePerMinuteFactor(roster, player_totals)

    # Loop through roster to store team average stats        
    for player in roster:
        totals = player_totals[player]
        if totals[0] > 0:
            # Store the number of games played before dropping column
            g = totals["G"]

            # Drop the now unnecessary columns
            totals = totals.drop(labels=["MP", "G"])

            # Adjust player totals columns to show stats per game
            for col in totals.index:
                totals[col] = totals[col] / g * factor

            team_averages += totals
                             
    return team_averages

In [30]:
def calculatePerMinuteFactor(roster, player_totals):
    # Initilizing
    team_minutes = 0
    
    for player in roster:
        totals = player_totals[player]
        if totals[0] > 0:
            team_minutes += totals["MP"] / totals["G"]

    if team_minutes != 240:
        factor = 240/team_minutes
    
    return factor

In [8]:
# Get nba players data into dataframes from the year 2016 - present
currentYear = datetime.now().year
startYear = 2016
year_totals = {}

for year in range(startYear, currentYear+1):
    year_totals[year] = createPlayerDF('totals', str(year))

In [9]:
# Get nba team rosters into dataframes
nba_teams = teams.get_teams()
nba_team_abr = [team['abbreviation'] for team in nba_teams]
team_rosters = {}

# Convert abreviation for Brooklyn, Pheonix, & Charlotte for basketball-reference.com
nba_team_abr[14] = "BRK"
nba_team_abr[19] = "PHO"
nba_team_abr[29] = "CHO"

for team in nba_team_abr:
    team_rosters[team] = createRosters(team)

In [10]:
# Create dataframe for each player's stats over the last 5 years
currentYear = datetime.now().year
num_years = 5
player_totals = {}

for team in team_rosters.values():
    for player in team:
        # Initializing
        player_totals[player] = [0,0,0,0,0,0,0,0,0,0,0,0]

        for i in range(0, num_years+1):
            if player in year_totals[currentYear-i].index:
                player_totals[player] += createPlayerTotals(year_totals[currentYear-i]).loc[player]            

In [154]:
# Create a dataframe for each teams per minute average based on current roster
team_averages = {}

for team in team_rosters:
    team_averages[team] = createTeamAverages(team_rosters[team], player_totals)

In [65]:
# Convert year totals dictionary to dataframe and save as csv

dfs = []

for year, df in year_totals.items():
    temp_df = df
    temp_df['Year'] = year
    dfs.append(temp_df)
    
year_totals_df = pd.concat(dfs)
year_totals_df = year_totals_df.reset_index()
year_totals_df = year_totals_df.set_index(["Year", "Player"])

file_dir = os.path.join("data", "year_totals.csv")
year_totals_df.to_csv(file_dir, encoding="utf-8-sig")

In [66]:
# Convert team roster dictionary to dataframe and save as csv

dfs = []

for team, players in team_rosters.items():
    temp_df = pd.DataFrame(players, columns=["Player"])
    temp_df['Team'] = team
    dfs.append(temp_df)
    
team_rosters_df = pd.concat(dfs)
team_rosters_df = team_rosters_df.set_index("Player")

file_dir = os.path.join("data", "team_rosters.csv")
team_rosters_df.to_csv(file_dir, encoding="utf-8-sig")

In [207]:
# Convert player totals dictionary to dataframe and save as csv

dfs = []

for player, stats in player_totals.items():
    if stats[0] > 0:
        temp_df = pd.DataFrame([stats])
    else:
        temp_df = pd.DataFrame({"G": 0, "MP": 0, "FGA": 0, "3PA": 0, "FTA": 0, "ORB": 0, "DRB": 0, "AST": 0, "STL": 0, "BLK": 0, "TOV": 0, "PF": 0,}, index=[player])
    dfs.append(temp_df)
    
player_totals_df = pd.concat(dfs)
player_totals_df.index.name = "Player"

file_dir = os.path.join("data", "player_totals.csv")
player_totals_df.to_csv(file_dir, encoding="utf-8-sig")

In [197]:
team = "TOR"

# Normalize team stats to 240 minutes per game
factor = calculatePerMinuteFactor(team_rosters[team], player_totals)

totals = player_totals_df.loc[team_rosters[team]]
totals_df = pd.DataFrame([totals.sum()])

g = totals_df["G"]

totals_df = totals_df.drop(columns=["MP", "G"])
team_averages = totals_df.divide(g, axis=0)
team_averages


Unnamed: 0,FGA,3PA,FTA,ORB,DRB,AST,STL,BLK,TOV,PF
0,8.388651,3.27089,2.057909,0.983288,2.85309,2.297318,0.801399,0.446172,1.17878,1.93082


In [206]:
totals = player_totals_df.loc[team_rosters[team]]
totals_df = pd.DataFrame()
dfs = []

for player in totals.index:
    test = totals.loc[player]
    print(test)
#     if totals[0] > 0:
#         # Store the number of games played before dropping column
#         g = totals["G"]

#         # Drop the now unnecessary columns
#         totals = totals.drop(labels=["MP", "G"])

#         # Adjust player totals columns to show stats per game
#         for col in totals.index:
#             totals[col] = totals[col] / g * factor

#         team_averages += totals

G       193.0
MP     3273.0
FGA    1246.0
3PA     517.0
FTA     424.0
ORB     333.0
DRB     655.0
AST     105.0
STL      91.0
BLK     239.0
TOV     110.0
PF      394.0
Name: Chris Boucher, dtype: float64
G       324.0
MP     8968.0
FGA    3616.0
3PA    1843.0
FTA     773.0
ORB     134.0
DRB     875.0
AST    1555.0
STL     399.0
BLK     117.0
TOV     494.0
PF      628.0
Name: Fred VanVleet, dtype: float64
G       202.0
MP     3671.0
FGA    1252.0
3PA     799.0
FTA     172.0
ORB      69.0
DRB     320.0
AST     293.0
STL     130.0
BLK      21.0
TOV     197.0
PF      222.0
Name: Svi Mykhailiuk, dtype: float64
G        37.0
MP     1311.0
FGA     463.0
3PA      98.0
FTA     107.0
ORB     103.0
DRB     186.0
AST     127.0
STL      38.0
BLK      32.0
TOV      69.0
PF      107.0
Name: Scottie Barnes, dtype: float64
G       169.0
MP     4425.0
FGA    1780.0
3PA     978.0
FTA     221.0
ORB      67.0
DRB     291.0
AST     217.0
STL     170.0
BLK      41.0
TOV     102.0
PF      262.0
Name: Gary Tre

In [199]:
player_totals_df.loc[team_rosters[team]]

Unnamed: 0_level_0,G,MP,FGA,3PA,FTA,ORB,DRB,AST,STL,BLK,TOV,PF
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Chris Boucher,193.0,3273.0,1246.0,517.0,424.0,333.0,655.0,105.0,91.0,239.0,110.0,394.0
Fred VanVleet,324.0,8968.0,3616.0,1843.0,773.0,134.0,875.0,1555.0,399.0,117.0,494.0,628.0
Svi Mykhailiuk,202.0,3671.0,1252.0,799.0,172.0,69.0,320.0,293.0,130.0,21.0,197.0,222.0
Scottie Barnes,37.0,1311.0,463.0,98.0,107.0,103.0,186.0,127.0,38.0,32.0,69.0,107.0
Gary Trent Jr.,169.0,4425.0,1780.0,978.0,221.0,67.0,291.0,217.0,170.0,41.0,102.0,262.0
Precious Achiuwa,96.0,1613.0,507.0,44.0,166.0,155.0,316.0,72.0,40.0,50.0,79.0,167.0
Dalano Banton,34.0,398.0,122.0,28.0,24.0,20.0,51.0,57.0,14.0,5.0,37.0,42.0
Pascal Siakam,363.0,10336.0,4240.0,1064.0,1189.0,488.0,1710.0,1032.0,323.0,251.0,612.0,968.0
OG Anunoby,282.0,7396.0,2299.0,1100.0,419.0,280.0,857.0,374.0,308.0,127.0,304.0,639.0
Malachi Flynn,74.0,1159.0,426.0,212.0,63.0,16.0,128.0,162.0,48.0,10.0,51.0,91.0


In [200]:
player_totals["Pascal Siakam"]

G        363.0
MP     10336.0
FGA     4240.0
3PA     1064.0
FTA     1189.0
ORB      488.0
DRB     1710.0
AST     1032.0
STL      323.0
BLK      251.0
TOV      612.0
PF       968.0
Name: Pascal Siakam, dtype: float64

In [198]:
totals = player_totals_df.loc[team_rosters[team]]
totals_df = pd.DataFrame([totals.sum()])
totals_df

Unnamed: 0,G,MP,FGA,3PA,FTA,ORB,DRB,AST,STL,BLK,TOV,PF
0,2573.0,59056.0,21584.0,8416.0,5295.0,2530.0,7341.0,5911.0,2062.0,1148.0,3033.0,4968.0


In [159]:
# Testing linear regression model using full team roster data

file_dir = os.path.join("model", "mlr_model.sav")
mlr_model = joblib.load(file_dir)

In [162]:
X = team_averages["TOR"]
X

[[FGA    83.751929
  3PA    31.554998
  FTA    19.600832
  ORB    12.312486
  DRB    30.368204
  AST    21.942520
  STL     8.130506
  BLK     4.853526
  TOV    11.791164
  PF     20.677176
  Name: Chris Boucher, dtype: float64]]

In [None]:

y_pred = 

In [156]:
team_averages["TOR"]

FGA    83.751929
3PA    31.554998
FTA    19.600832
ORB    12.312486
DRB    30.368204
AST    21.942520
STL     8.130506
BLK     4.853526
TOV    11.791164
PF     20.677176
Name: Chris Boucher, dtype: float64