In [1]:
# Import Dependencies
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import json

#I am using nba api which is a python library created for analyzing NBA stats
from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import commonplayerinfo

In [2]:
# Start a visible (non-headless) Chrome window that splinter can drive.
# ChromeDriverManager().install() returns the path of a chromedriver binary.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [C:\Users\Collin\.wdm\drivers\chromedriver\win32\94.0.4606.61\chromedriver.exe] found in cache


## Web Scrape the Top 20 Players by Total Career 3-Pointers Made

In [3]:
# Open the NBA.com all-time leaders page for regular-season three-pointers made
# (StatCategory=FG3M) in the Chrome window controlled by `browser`
url = 'https://www.nba.com/stats/alltime-leaders/?SeasonType=Regular%20Season&StatCategory=FG3M'
browser.visit(url)

In [4]:
# Take the HTML currently loaded in the browser and parse it with Beautiful Soup.
# NOTE(review): the HTML is captured as soon as this cell runs; if the stats table is
# filled in by JavaScript after page load it may not be present yet - confirm.
html = browser.html
bball_soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Empty table that the scraping cells fill in, one row per player
sharpshooter_df = pd.DataFrame(
    columns=[
        'Player Name',
        'Games Played',
        'Minutes Played',
        '3 Pointers Made',
        '3 Pointers Attempted',
        '3 Point %',
        'Active?',
        '3PM Rank',
        '3P% Rank',
    ]
)
# Names of the players added to the table so far
sharpshooter_name = list()
sharpshooter_df.head()

Unnamed: 0,Player Name,Games Played,Minutes Played,3 Pointers Made,3 Pointers Attempted,3 Point %,Active?,3PM Rank,3P% Rank


In [6]:
# Load nba_api's static list of players; each entry is a dict with the keys
# id, full_name, first_name, last_name and is_active
all_players = players.get_players()
# Spot-check one entry to see the record layout (index 1081 was Kevin Durant when this was run)
print(all_players[1081])

{'id': 201142, 'full_name': 'Kevin Durant', 'first_name': 'Kevin', 'last_name': 'Durant', 'is_active': True}


In [7]:
#Define function to find if player is active from player name
def get_player_active(player_full_name, player_pool=None):
    """Return the ``is_active`` flag of the player with the given full name.

    The comparison ignores letter case and leading/trailing whitespace, so
    names scraped from a web page can be passed in directly.

    Parameters
    ----------
    player_full_name : str
        Full name to look for, e.g. ``"Ray Allen"``.
    player_pool : list of dict, optional
        Player records with ``'full_name'`` and ``'is_active'`` keys.
        Defaults to the notebook-level ``all_players`` list.

    Returns
    -------
    The ``'is_active'`` value of the first matching record, or ``None`` when
    no record matches.
    """
    if player_pool is None:
        player_pool = all_players
    # Normalise the wanted name once instead of on every loop iteration
    wanted = player_full_name.strip().lower()
    for player in player_pool:
        if player['full_name'].strip().lower() == wanted:
            return player['is_active']
    return None

In [8]:
# Find the Top 20 basketball players with the most made 3PMs.
# The rows are collected in a list and added to the dataframe in a single pd.concat
# call, because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
top_3pm_records = []
for i in range(0, 20):
    # Leaderboard rows are <tr> elements whose "index" attribute is 0, 1, 2, ...
    player_leaders = bball_soup.find('tr', index=str(i))
    if player_leaders is None:
        raise ValueError(f"Leaderboard row {i} was not found; the page may not have finished loading or its layout changed")
    stats1 = player_leaders.find_all('td')
    player_name = player_leaders.find('a').text
    top_3pm_records.append({
        'Player Name': player_name,
        'Games Played': stats1[2].text,
        # The minutes cell holds several lines of text; the value is on the second line
        'Minutes Played': stats1[3].text.split('\n')[1],
        '3 Pointers Made': stats1[8].text,
        '3 Pointers Attempted': stats1[9].text,
        # Stored as a fraction (e.g. 0.4) rather than the percentage text on the page
        '3 Point %': float(stats1[10].text)/100,
        'Active?': get_player_active(player_name),
        # Rows come in leaderboard order, so the rank is the row position plus one
        '3PM Rank': i + 1,
        '3P% Rank': ''
    })

# Update the table and the name list together, only after every row parsed cleanly,
# so the two cannot get out of step if a row fails part-way through
sharpshooter_df = pd.concat(
    [sharpshooter_df, pd.DataFrame(top_3pm_records, columns=sharpshooter_df.columns)],
    ignore_index=True,
)
sharpshooter_name.extend(record['Player Name'] for record in top_3pm_records)

In [9]:
# Preview the first five scraped rows
sharpshooter_df.head()

Unnamed: 0,Player Name,Games Played,Minutes Played,3 Pointers Made,3 Pointers Attempted,3 Point %,Active?,3PM Rank,3P% Rank
0,Ray Allen,1300,46350,2973,7429,0.4,False,1,
1,Stephen Curry,762,26151,2832,6540,0.433,True,2,
2,Reggie Miller,1389,47620,2560,6486,0.395,False,3,
3,Kyle Korver,1232,31183,2450,5715,0.429,True,4,
4,James Harden,877,30198,2445,6728,0.363,True,5,


In [10]:
# Correct the dataset and change Vince Carter's Active Status to Retired.
# His row is selected by name and the column by label (instead of the fixed
# position iloc[5,6]) so the correction still hits the right cell when the
# leaderboard order changes.
sharpshooter_df.loc[sharpshooter_df['Player Name'] == 'Vince Carter', 'Active?'] = False

In [11]:
# Save the 3PM leaders to sharpshooter.csv (no index column, with a header row).
# The same file is written again later in the notebook once the 3P% leaders are merged in.
sharpshooter_df.to_csv("sharpshooter.csv", index=False, header=True)

## Pull Data for All Time Leaders in 3 Pointers Made Per Game

In [12]:
# Go to the landofbasketball.com page with the all-time leaders in 3PM per game
url = 'https://www.landofbasketball.com/all_time_leaders/3_pointers_per_game_career_season.htm'
browser.visit(url)
# Parse the loaded page with Beautiful Soup (this replaces the soup of the previous page)
html = browser.html
bball_soup = BeautifulSoup(html, 'html.parser')

The leaderboard at this URL only includes players with a minimum of 400 games played or 600 3-pointers made.

In [13]:
# Find the 10 players with the highest all-time 3PM per game.
# The leaderboard rows are the <tr class="a-top"> elements; only the first 10 are kept.
top_player_code = bball_soup.find_all("tr", {"class": "a-top"})
per_game_records = [
    {
        # The row's first link holds the player name, its first bold element the average
        "Player Name": row.a.text,
        "Career 3PM per Game": row.b.text,
        "Active": get_player_active(row.a.text)
    }
    for row in top_player_code[:10]
]
# Build the table in one step; DataFrame.append was removed in pandas 2.0
per_game_df = pd.DataFrame(per_game_records, columns=['Player Name', 'Career 3PM per Game','Active'])

In [14]:
# Show the per-game leaders table (up to 10 rows)
per_game_df.head(10)

Unnamed: 0,Player Name,Career 3PM per Game,Active
0,Stephen Curry,3.72,True
1,Damian Lillard,3.01,True
2,Buddy Hield,2.98,True
3,Klay Thompson,2.92,True
4,James Harden,2.79,True
5,Donovan Mitchell,2.61,True
6,Paul George,2.49,True
7,D'Angelo Russell,2.42,True
8,Eric Gordon,2.34,True
9,Ray Allen,2.29,False


In [15]:
# Save the per-game leaders to per_game.csv (no index column, with a header row)
per_game_df.to_csv("per_game.csv", index=False, header=True)

## Pull Data For Top Players 3PM per Game

In [16]:
# The cells in this section were re-run by hand for each of the top 10 players; the version saved here is the run for Ray Allen

In [17]:
#Define function to find player id from player name
def get_player_id(player_full_name, player_pool=None):
    """Return the ``id`` of the player with the given full name.

    The comparison ignores letter case and leading/trailing whitespace.

    Parameters
    ----------
    player_full_name : str
        Full name to look for, e.g. ``"Ray Allen"``.
    player_pool : list of dict, optional
        Player records with ``'full_name'`` and ``'id'`` keys.
        Defaults to the notebook-level ``all_players`` list.

    Returns
    -------
    The ``'id'`` value of the first matching record, or ``None`` when no
    record matches.
    """
    if player_pool is None:
        player_pool = all_players
    # Normalise the wanted name once instead of on every loop iteration
    wanted = player_full_name.strip().lower()
    for player in player_pool:
        if player['full_name'].strip().lower() == wanted:
            return player['id']
    return None

In [18]:
# Empty table for one player's season-by-season three-point numbers
each_df = pd.DataFrame(
    columns=[
        'Season Made Up',
        'Season',
        'Games Played',
        'Season 3PM',
        'Season 3PA',
        'Season 3P %',
        'Total 3PM',
    ]
)

# Open Ray Allen's career-totals page on NBA.com (found through his player id) and parse it
url = player_url = "https://www.nba.com/stats/player/" + str(get_player_id("Ray Allen")) + "/career/?PerMode=Totals"
browser.visit(url)
html = browser.html
bball_soup = BeautifulSoup(html, 'html.parser')

In [19]:
# Read the season rows of the career table, stopping at the first missing row.
# The end of the table is detected with explicit None checks instead of a
# try/except AttributeError around the whole body: that handler also swallowed
# unrelated AttributeErrors (for example DataFrame.append no longer existing in
# pandas 2.0) and silently left the table empty.
season_records = []
for i in range(0, 100):
    player_leaders = bball_soup.find('tr', index=str(i))
    if player_leaders is None:
        break  # no <tr> with this index: past the last row
    season_link = player_leaders.find('a')
    if season_link is None:
        break  # row without a link: treated as the end, as before
    stats1 = player_leaders.find_all('td')
    season_records.append({
        'Season Made Up': '',
        'Season': season_link.text,
        'Games Played': stats1[3].text,
        'Season 3PM': stats1[10].text,
        'Season 3PA': stats1[11].text,
        'Season 3P %': stats1[12].text,
        'Total 3PM': ''
    })

# Add all collected rows in one step; DataFrame.append was removed in pandas 2.0
each_df = pd.concat(
    [each_df, pd.DataFrame(season_records, columns=each_df.columns)],
    ignore_index=True,
)

In [20]:
#Calculate Total 3PM by season
# Columns are addressed by name rather than by position (iloc[i,3], iloc[i,6], iloc[i,0])
# so the cell keeps working if the column order of each_df changes.
# Oldest season first; the preview shows labels such as "1996-97" ordering
# chronologically when sorted as text
each_df = each_df.sort_values(by=['Season'])
# Running career total: the scraped values are text, so convert before summing
each_df['Total 3PM'] = each_df['Season 3PM'].astype(int).cumsum()
# Number the seasons 2000, 2001, ... starting from the player's first season.
# NOTE(review): presumably this lets different players' careers be lined up on a
# common axis - confirm.
season = 2000
each_df['Season Made Up'] = list(range(season, season + len(each_df)))
each_df.head()

Unnamed: 0,Season Made Up,Season,Games Played,Season 3PM,Season 3PA,Season 3P %,Total 3PM
19,2000,1996-97,82,117,298,39.3,117
18,2001,1997-98,82,134,368,36.4,251
17,2002,1998-99,50,74,208,35.6,325
16,2003,1999-00,82,172,407,42.3,497
15,2004,2000-01,82,202,467,43.3,699


In [21]:
# Save this player's season table to Allen.csv (no index column, with a header row)
each_df.to_csv("Allen.csv", index=False, header=True)

## Pull Data for 3PM and 3PA per Game per Season

In [22]:
# Go to the basketball-reference.com page with league-wide per-game stats by season
url = "https://www.basketball-reference.com/leagues/NBA_stats_per_game.html"
browser.visit(url)

In [23]:
# Parse the loaded page with Beautiful Soup (this replaces the soup of the previous page)
html = browser.html
bball_soup = BeautifulSoup(html, 'html.parser')

In [24]:
# Empty table for the league-wide three-point numbers, one row per season
season_3P_df = pd.DataFrame(
    columns=[
        'Season',
        'Season 3PM per Game',
        'Season 3PA per Game',
        'Average 3P%',
    ]
)
season_3P_df.head()

Unnamed: 0,Season,Season 3PM per Game,Season 3PA per Game,Average 3P%


In [25]:
#Using this range will give me data dating back to the 1979-1980 season which is when the 3 pointer was added
# These row numbers are header rows repeated inside the table; reading them as a
# season causes an error, so they are skipped
header_rows = {20, 21, 42, 43}
league_season_records = []
for i in range(1,47):
    if i in header_rows:
        continue
    # Each table row is tagged with its row number in a "data-row" attribute
    inputTag = bball_soup.find(attrs={"data-row" : str(i)})
    stats = inputTag.find_all('td')
    league_season_records.append({
        "Season": inputTag.a.text,
        "Season 3PM per Game": stats[9].text,
        "Season 3PA per Game": stats[10].text,
        "Average 3P%": stats[23].text
    })

# Add all seasons in one step; DataFrame.append was removed in pandas 2.0
season_3P_df = pd.concat(
    [season_3P_df, pd.DataFrame(league_season_records, columns=season_3P_df.columns)],
    ignore_index=True,
)







In [26]:
# Preview the ten most recent seasons (rows are in the page's order, newest first)
season_3P_df.head(10)

Unnamed: 0,Season,Season 3PM per Game,Season 3PA per Game,Average 3P%
0,2020-21,12.7,34.6,0.367
1,2019-20,12.2,34.1,0.358
2,2018-19,11.4,32.0,0.355
3,2017-18,10.5,29.0,0.362
4,2016-17,9.7,27.0,0.358
5,2015-16,8.5,24.1,0.354
6,2014-15,7.8,22.4,0.35
7,2013-14,7.7,21.5,0.36
8,2012-13,7.2,20.0,0.359
9,2011-12,6.4,18.4,0.349


In [27]:
# Save the per-season league numbers to season_totals.csv (no index column, with a header row)
season_3P_df.to_csv("season_totals.csv", index=False, header=True)

## Pull Data for All Time Leaders 3P%

In [28]:
# Go to the NBA.com all-time leaders page for regular-season three-point percentage
# (StatCategory=FG3_PCT)
url = 'https://www.nba.com/stats/alltime-leaders/?SeasonType=Regular%20Season&StatCategory=FG3_PCT'
browser.visit(url)

In [29]:
# Parse the loaded page with Beautiful Soup (this replaces the soup of the previous page)
html = browser.html
bball_soup = BeautifulSoup(html, 'html.parser')

In [30]:
#Read in players with highest career 3P%
# A player who is already in sharpshooter_df only gets his '3P% Rank' filled in;
# every other player is collected as a new row. New rows are added with one
# pd.concat call after the loop because DataFrame.append was removed in pandas 2.0.
rank_column = sharpshooter_df.columns.get_loc('3P% Rank')
new_names = []
new_records = []
for i in range(0, 20):
    player_leader = bball_soup.find('tr', index=str(i))
    if player_leader is None:
        raise ValueError(f"Leaderboard row {i} was not found; the page may not have finished loading or its layout changed")
    player_name = player_leader.find('a').text
    # Rows come in leaderboard order, so the rank is the row position plus one
    pct_rank = i + 1
    if player_name in sharpshooter_name:
        # sharpshooter_name is kept in the same order as the rows of sharpshooter_df,
        # so the position in the list is the row position in the table
        sharpshooter_df.iloc[sharpshooter_name.index(player_name), rank_column] = pct_rank
    elif player_name in new_names:
        # Same name seen earlier in this loop: update the pending row instead of duplicating it
        new_records[new_names.index(player_name)]['3P% Rank'] = pct_rank
    else:
        stats1 = player_leader.find_all('td')
        new_records.append({
            'Player Name': player_name,
            'Games Played': stats1[2].text,
            # The minutes cell holds several lines of text; the value is on the second line
            'Minutes Played': stats1[3].text.split('\n')[1],
            '3 Pointers Made': stats1[8].text,
            '3 Pointers Attempted': stats1[9].text,
            '3 Point %': float(stats1[10].text)/100,
            'Active?': get_player_active(player_name),
            '3PM Rank': '',
            '3P% Rank': pct_rank
        })
        new_names.append(player_name)

# Grow the table and the name list together so they stay in step
if new_records:
    sharpshooter_df = pd.concat(
        [sharpshooter_df, pd.DataFrame(new_records, columns=sharpshooter_df.columns)],
        ignore_index=True,
    )
    sharpshooter_name.extend(new_names)

In [31]:
#sharpshooter_df.head(50)

In [32]:
# Overwrite sharpshooter.csv with the table that now also holds the 3P% leaders
sharpshooter_df.to_csv("sharpshooter.csv", index=False, header=True)

## Get Shot Chart Data

In [33]:
# Load nba_api's static list of NBA teams (used to turn an abbreviation into a team id)
all_teams = teams.get_teams()
# Start with an empty frame; the shot-chart rows are added to it further down
shot_df = pd.DataFrame()

In [34]:
#Define function to find player id from player name
def get_player_id(player_full_name, player_pool=None):
    """Return the ``id`` of the player with the given full name.

    NOTE(review): this notebook defines ``get_player_id`` twice; this later
    definition is the one in effect once this cell has run. Consider keeping
    only one of them.

    The comparison ignores letter case and leading/trailing whitespace.

    Parameters
    ----------
    player_full_name : str
        Full name to look for, e.g. ``"Ray Allen"``.
    player_pool : list of dict, optional
        Player records with ``'full_name'`` and ``'id'`` keys.
        Defaults to the notebook-level ``all_players`` list.

    Returns
    -------
    The ``'id'`` value of the first matching record, or ``None`` when no
    record matches.
    """
    if player_pool is None:
        player_pool = all_players
    # Normalise the wanted name once instead of on every loop iteration
    wanted = player_full_name.strip().lower()
    for player in player_pool:
        if player['full_name'].strip().lower() == wanted:
            return player['id']
    return None

In [35]:
def get_team_id(team_abbreviation, team_pool=None):
    """Return the ``id`` of the team with the given abbreviation.

    The comparison ignores letter case and leading/trailing whitespace.

    Parameters
    ----------
    team_abbreviation : str
        Team abbreviation to look for, e.g. ``"LAL"``.
    team_pool : list of dict, optional
        Team records with ``'abbreviation'`` and ``'id'`` keys.
        Defaults to the notebook-level ``all_teams`` list.

    Returns
    -------
    The ``'id'`` value of the first matching record, or ``None`` when no
    record matches.
    """
    if team_pool is None:
        team_pool = all_teams
    # Normalise the wanted abbreviation once instead of on every loop iteration
    wanted = team_abbreviation.strip().lower()
    for team in team_pool:
        if team['abbreviation'].strip().lower() == wanted:
            return team['id']
    return None

In [36]:
# Pick the player whose shots will be charted and look up his player id
current_player = "shaquille o'neal"
player_id = get_player_id(current_player)
# Print the name and the id, one per line
print(current_player, player_id, sep='\n')

shaquille o'neal
406


In [37]:
# Look up the team id for the abbreviation 'LAL' and print it as a check
team_id = get_team_id('LAL')
print(team_id)

1610612747


In [38]:
#Create a dataframe for shot chart details
#shot_df = pd.DataFrame(columns=['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'HTM', 'VTM'])

In [39]:
# Request shot chart data from the nba_api ShotChartDetail endpoint for the
# selected player while on the selected team: field-goal attempts ('FGA'),
# regular season only
shot_chart_json = shotchartdetail.ShotChartDetail(
    season_type_all_star='Regular Season',
    context_measure_simple='FGA',
    player_id=player_id,
    team_id=team_id,
)

In [40]:
# Turn the endpoint's JSON text into plain Python data
shot_chart_data = json.loads(shot_chart_json.get_json())
# Only the first result set is used: 'headers' holds the column names and
# 'rowSet' the shot rows
useful_data = shot_chart_data['resultSets'][0]
print(useful_data['headers'])
shot_dictionary = useful_data['rowSet']

['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'HTM', 'VTM']


In [41]:
# Create a dataframe from the useful data.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the rows
# are added after whatever shot_df already holds, as before.
# NOTE(review): the columns are left with their default integer labels (0, 1, 2, ...)
# so the CSV written from this frame keeps its current layout. Passing
# columns=useful_data['headers'] would give them readable names - confirm nothing
# downstream depends on the integer labels before changing it.
shot_df = pd.concat([shot_df, pd.DataFrame(shot_dictionary)], ignore_index=True)
print(len(shot_df))

9498


In [42]:
#print(shot_df[0])

In [43]:
# Save the shot rows to shot_chart_attempt.csv (no index column, with a header row)
shot_df.to_csv("shot_chart_attempt.csv", index=False, header=True)

## Code Not Used

In [None]:
#I am using nba api which is a python library created for analyzing NBA stats
from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import shotchartdetail
from nba_api.stats.endpoints import commonplayerinfo

In [None]:
# Fetch the CommonPlayerInfo record for player id 201142 (Kevin Durant in the
# static player list) and print the value at position 18 of its first row.
# NOTE(review): position 18 is presumably the team id (the same position is stored
# under 'team_id' elsewhere in this section) - confirm against the endpoint's headers.
common_player = commonplayerinfo.CommonPlayerInfo(201142)
common_player_data = json.loads(common_player.get_json())
team_id = common_player_data['resultSets'][0]['rowSet'][0][18]
print(team_id)

In [None]:
# Read in information for all players from nba_api's static list
# (the same call is made near the top of the notebook)
all_players = players.get_players()
# Spot-check one entry to see the record layout
print(all_players[1081])

In [None]:
#Find the year every player retired
#CommonPlayerInfo is an endpoint with player data such as retiring year, position, team and a lot more
# Only the players at positions 100-119 of all_players are looked up here.
player_records = []
for i in range(100,120):
    # Fixed: the original line read `201142.CommonPlayerInfo(...)`, which is a syntax error
    common_player = commonplayerinfo.CommonPlayerInfo(all_players[i]['id'])
    common_player_data = json.loads(common_player.get_json())
    player_row = common_player_data['resultSets'][0]['rowSet'][0]
    # Keep only players whose value at position 25 is greater than 2000.
    # NOTE(review): position 25 is presumably the player's last season year - confirm
    # against the endpoint's headers. The original comment said "retired before 1978"
    # while the code compares against 2000; the code's threshold is kept - confirm
    # which one is intended.
    if player_row[25]>2000:
        player_records.append({
            'id': all_players[i]['id'],
            'full_name': all_players[i]['full_name'],
            'position': player_row[15],
            'team_id': player_row[18]
        })

# player_df was never created in the original cell; build it once from the collected
# rows (DataFrame.append was also removed in pandas 2.0)
player_df = pd.DataFrame(player_records, columns=['id', 'full_name', 'position', 'team_id'])

In [None]:
# Save the collected player details to player_info.csv (no index column, with a header row)
player_df.to_csv("player_info.csv", index=False, header=True)

In [None]:
# Find the team the player was on that year
# The endpoint module is imported here because none of the notebook's import
# cells bring it in; without it this cell fails with a NameError.
from nba_api.stats.endpoints import playerdashboardbyyearoveryear

# Request the 2016-17 regular-season dashboard for player id 203518
player_info = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
    player_id=203518,
    season='2016-17',
    season_type_playoffs='Regular Season',
)

In [None]:
# Parse the endpoint's JSON response text into plain Python data
player_data = json.loads(player_info.get_json())
#print(player_data)