In [23]:
# Import dependencies
import os
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder

# Show every row and every column when a DataFrame is displayed (no truncation)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [18]:
def createPlayerDF(stat, year):
    """Scrape one league-wide player table from basketball-reference.com.

    Parameters
    ----------
    stat : str
        Table slug inserted into the page URL (the notebook passes ``'totals'``).
    year : str or int
        Season identifier inserted into the page URL.

    Returns
    -------
    pandas.DataFrame
        One row per element with class ``full_table`` on the page, indexed by
        ``Player``. Every value is the raw cell text (strings, not numbers).

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status code.
    ValueError
        If the page contains no element with class ``thead``.
    """
    # Set url for given year
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_{stat}.html'
    page = requests.get(url)

    # Fail here with the HTTP status instead of later while parsing an error page
    page.raise_for_status()

    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Store the headers/column names
    head = soup.find(class_="thead")
    if head is None:
        raise ValueError(f"No header row (class 'thead') found at {url}")

    # The header text holds one label per line. The first two pieces and the last
    # one are discarded -- presumably a leading blank, a rank label that has no
    # <td> in the data rows, and a trailing blank (TODO confirm against the page).
    column_names = head.text.replace("\n", ",").split(",")[2:-1]

    # One list of cell texts per data row of the sought after table
    players = [
        [td.text for td in row.find_all("td")]
        for row in soup.find_all(class_="full_table")
    ]

    df = pd.DataFrame(players, columns=column_names).set_index("Player")

    # Cleaning the player's name from occasional special characters.
    # A lone '*' is not a valid regular expression, so it is matched literally.
    df.index = df.index.str.replace('*', "", regex=False)

    return df

In [19]:
def createRosters(team, year=None):
    """Scrape the roster of one team from basketball-reference.com.

    Parameters
    ----------
    team : str
        Team abbreviation inserted into the page URL.
    year : int or str, optional
        Season identifier inserted into the page URL. Defaults to the current
        calendar year, which is what this function always used before the
        parameter existed.

    Returns
    -------
    list of str
        Text of the ``data-stat="player"`` cells of the element with id
        ``roster`` (the first match is skipped), with the "TW" suffix removed.

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status code.
    ValueError
        If the page contains no element with id ``roster``.
    """
    if year is None:
        year = datetime.now().year

    # Set url for given team
    url = f"https://www.basketball-reference.com/teams/{team}/{year}.html"
    page = requests.get(url)

    # Fail here with the HTTP status instead of later while parsing an error page
    page.raise_for_status()

    # Convert the page html to a soup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the sought after table of data
    table = soup.find(id="roster")
    if table is None:
        raise ValueError(f"No roster table (id 'roster') found at {url}")

    player_cells = table.find_all(attrs={"data-stat": "player"})

    # The first match is skipped -- presumably the column header cell.
    # str.replace is a no-op for names without the "TW" suffix.
    return [cell.text.replace("\xa0\xa0(TW)", "") for cell in player_cells[1:]]

In [20]:
def createPlayerTotals(df):
    """Return a numeric copy of a player table without its unwanted columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing (at least) every column listed in ``discarded`` below.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, all cast to ``float``.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from ``df``.
    ValueError
        If a remaining value cannot be converted to ``float``.
    """
    # Categorical and unnecessary columns
    discarded = (
        "Pos", "Age", "Tm", "GS",
        "FG", "FG%", "3P", "3P%", "2P", "2PA", "2P%", "eFG%",
        "FT", "FT%", "TRB", "PTS",
    )
    remaining = df.drop(columns=list(discarded))

    # Numeric instead of object
    return remaining.astype(float)

In [8]:
# Scrape one season-totals table per season, from 2016 through the current calendar year
startYear = 2016
currentYear = datetime.now().year

# Keyed by the season as an int; the scraper receives it as a string
year_totals = {
    year: createPlayerDF('totals', str(year))
    for year in range(startYear, currentYear + 1)
}

In [21]:
# Get nba team rosters, keyed by team abbreviation
nba_teams = teams.get_teams()

# basketball-reference.com uses a different abbreviation for Brooklyn, Phoenix,
# & Charlotte. Translate by abbreviation rather than by list position so the
# result does not depend on the order in which nba_api returns the teams.
bbref_abr = {"BKN": "BRK", "PHX": "PHO", "CHA": "CHO"}
nba_team_abr = [
    bbref_abr.get(team['abbreviation'], team['abbreviation'])
    for team in nba_teams
]

team_rosters = {}

for team in nba_team_abr:
    team_rosters[team] = createRosters(team)

In [10]:
# Sum each rostered player's season totals over the seasons
# currentYear - num_years through currentYear
currentYear = datetime.now().year
num_years = 5
player_totals = {}

# NOTE(review): range(num_years + 1) walks six seasons when num_years is 5,
# although this cell was described as covering "the last 5 years" -- confirm
# which is intended. The original span is kept unchanged here.
seasons = [currentYear - i for i in range(num_years + 1)]

# Clean each season's table once up front instead of once per player per season
season_stats = {season: createPlayerTotals(year_totals[season]) for season in seasons}

for team in team_rosters.values():
    for player in team:
        # Initializing. A player found in no season keeps this all-zero list;
        # otherwise the first addition below (list + Series, element-wise)
        # produces a pandas Series that the later additions accumulate into.
        player_totals[player] = [0,0,0,0,0,0,0,0,0,0,0,0]

        for season in seasons:
            if player in season_stats[season].index:
                player_totals[player] = player_totals[player] + season_stats[season].loc[player]

In [65]:
# Convert year totals dictionary to dataframe and save as csv.
# Passing the dictionary itself to concat turns its keys into the outer index
# level, giving the (Year, Player) index directly. The frames stored in
# year_totals are left untouched -- no "Year" column is written into them, so
# cells that read year_totals behave the same before and after this one runs.
year_totals_df = pd.concat(year_totals, names=["Year", "Player"])

file_dir = os.path.join("data", "year_totals.csv")
year_totals_df.to_csv(file_dir, encoding="utf-8-sig")

In [None]:
# Save team roster dictionary as json.
# Written as plain utf-8: "utf-8-sig" would put a byte order mark at the start
# of the file, which is not allowed in JSON text and makes Python's json module
# reject the file when it is read back as utf-8.
with open(os.path.join("data", "team_rosters.json"), "w", encoding="utf-8") as outfile:
    # ensure_ascii=False keeps non-ASCII characters in player names as-is
    json.dump(team_rosters, outfile, ensure_ascii=False)

In [66]:
# Flatten the team roster dictionary into one Player -> Team table and save as csv
dfs = [
    pd.DataFrame(names, columns=["Player"]).assign(Team=abbr)
    for abbr, names in team_rosters.items()
]

team_rosters_df = pd.concat(dfs).set_index("Player")

file_dir = os.path.join("data", "team_rosters.csv")
team_rosters_df.to_csv(file_dir, encoding="utf-8-sig")

In [207]:
# Convert player totals dictionary to dataframe and save as csv
dfs = []

for player, stats in player_totals.items():
    # Read the first stat (presumably games played, "G") by position. On a
    # Series with string labels a plain [0] relies on the deprecated positional
    # fallback of Series.__getitem__; .iloc states the intent explicitly.
    # Values that are still plain lists are indexed directly.
    first_stat = stats.iloc[0] if isinstance(stats, pd.Series) else stats[0]

    if first_stat > 0:
        # One row; its label comes from the name of the Series
        temp_df = pd.DataFrame([stats])
    else:
        # No recorded stats: emit an explicit all-zero row for the player
        temp_df = pd.DataFrame({"G": 0, "MP": 0, "FGA": 0, "3PA": 0, "FTA": 0, "ORB": 0, "DRB": 0, "AST": 0, "STL": 0, "BLK": 0, "TOV": 0, "PF": 0}, index=[player])
    dfs.append(temp_df)

player_totals_df = pd.concat(dfs)
player_totals_df.index.name = "Player"

file_dir = os.path.join("data", "player_totals.csv")
player_totals_df.to_csv(file_dir, encoding="utf-8-sig")