# NBA + NCAAB Player Data Collection

In this notebook, we'll be diving into how we can collect player data from the NBA and the NCAA using a package called [sportsreference](https://github.com/roclark/sportsreference). We'll be extracting the past 10 years of player data in the NCAA, then extracting the past 10 years of player data in the NBA, and finally combining the two to see all the different players that have played in both leagues in the past 10 years.

In [2]:
# Standard imports
import numpy as np
import pandas as pd
import json
import time
import os.path
from os import path
import math
import datetime
import unidecode
import requests
from bs4 import BeautifulSoup

## Part 1: Getting all the players in the NCAA from the past 10 years, and all their corresponding college stats

In [1]:
# Modules from sportsrefernece.ncaab for college basketball
from sportsreference.ncaab.boxscore import Boxscore as NCAAB_Boxscore
from sportsreference.ncaab.conferences import Conferences as NCAAB_Conferences
from sportsreference.ncaab.rankings import Rankings as NCAAB_Rankings
from sportsreference.ncaab.roster import Player as NCAAB_Player
from sportsreference.ncaab.roster import Roster as NCAAB_Roster
from sportsreference.ncaab.schedule import Schedule as NCAAB_Schedule
from sportsreference.ncaab.teams import Teams as NCAAB_Teams

In [3]:
s = NCAAB_Player('stephen-curry-1')
s

<sportsreference.ncaab.roster.Player at 0x10e8027f0>

In [5]:
s.dataframe.columns

Index(['assist_percentage', 'assists', 'block_percentage', 'blocks',
       'box_plus_minus', 'conference', 'defensive_box_plus_minus',
       'defensive_rebound_percentage', 'defensive_rebounds',
       'defensive_win_shares', 'effective_field_goal_percentage',
       'field_goal_attempts', 'field_goal_percentage', 'field_goals',
       'free_throw_attempt_rate', 'free_throw_attempts',
       'free_throw_percentage', 'free_throws', 'games_played', 'games_started',
       'height', 'minutes_played', 'offensive_box_plus_minus',
       'offensive_rebound_percentage', 'offensive_rebounds',
       'offensive_win_shares', 'personal_fouls', 'player_efficiency_rating',
       'player_id', 'points', 'points_produced', 'position',
       'steal_percentage', 'steals', 'team_abbreviation',
       'three_point_attempt_rate', 'three_point_attempts',
       'three_point_percentage', 'three_pointers', 'total_rebound_percentage',
       'total_rebounds', 'true_shooting_percentage', 'turnover_percentag

In [3]:
# Getting the past 10 years
past_10_years = ['201' + str(end) for end in range(10)]

# Storing player_ids for ease of usage
all_ids = set()

In [394]:
past_10_years

['2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019']

In [400]:
gsw = NBA_Roster('GSW', year='2010')

In [416]:
gsw.players[4].dataframe

Unnamed: 0,and_ones,assist_percentage,assists,block_percentage,blocking_fouls,blocks,box_plus_minus,center_percentage,defensive_box_plus_minus,defensive_rebound_percentage,...,turnovers,two_point_attempts,two_point_percentage,two_pointers,two_pointers_assisted_percentage,usage_percentage,value_over_replacement_player,weight,win_shares,win_shares_per_48_minutes
2005-06,,14.9,78.0,0.9,,11.0,-3.1,0,-0.4,10.7,...,58.0,236.0,0.441,104.0,0.423,20.2,-0.2,185,0.5,0.028
2006-07,,18.8,319.0,0.6,,21.0,-1.0,0,-0.6,7.7,...,221.0,867.0,0.509,441.0,0.449,22.4,0.7,185,4.0,0.072
2007-08,,15.8,315.0,0.6,,27.0,1.7,0,-0.4,9.8,...,173.0,1175.0,0.545,640.0,0.455,21.7,2.9,185,9.0,0.14
2008-09,,17.5,93.0,0.6,,8.0,-3.3,0,-1.8,11.5,...,67.0,404.0,0.46,186.0,0.323,25.8,-0.3,185,0.2,0.012
2009-10,,21.2,340.0,0.7,,25.0,-1.5,0,-1.4,9.0,...,244.0,1178.0,0.47,554.0,0.314,29.5,0.3,185,1.3,0.023
2010-11,,23.4,450.0,0.5,,23.0,0.2,0,-1.8,8.5,...,252.0,1232.0,0.478,589.0,0.35,28.1,1.8,185,6.0,0.089
2011-12,,28.6,346.0,0.6,,18.0,-0.6,0,-1.9,9.6,...,177.0,839.0,0.462,388.0,0.358,28.7,0.8,185,2.6,0.059
2012-13,,27.1,496.0,0.9,,36.0,0.8,0,0.0,9.8,...,254.0,1108.0,0.454,503.0,0.372,26.3,2.2,185,4.6,0.071
2013-14,,24.8,471.0,0.6,,23.0,0.0,0,-0.6,9.8,...,264.0,1069.0,0.474,507.0,0.268,26.0,1.5,185,4.9,0.078
2014-15,,20.5,329.0,0.8,,25.0,-0.1,0,-0.7,6.4,...,198.0,1070.0,0.487,521.0,0.319,27.9,1.3,185,3.6,0.065


In [415]:
print(gsw.players[4]._season)

['2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', 'Career', '2017-18', '2018-19']


In [417]:
for player in gsw.players:
    print(player._season)
    print(player.name)

['2006-07', '2007-08', '2008-09', '2009-10', '2011-12', 'Career', '2010-11', '2012-13']
Kelenna Azubuike
['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', 'Career', '2012-13']
Raja Bell
['2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', 'Career']
Andris Biedriņš
['2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', 'Career']
Stephen Curry
['2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', 'Career', '2017-18', '2018-19']
Monta Ellis
['1999-00', '2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', 'Career']
Devean George
['2009-10', 'Career']
Chris Hunter
['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '

In [395]:
davidson = NCAAB_Roster('davidson', year='2010')

In [399]:
for player in davidson.players:
    print(player._season)
    print(player.name)

['2009-10', '2010-11', '2011-12', '2012-13', 'Career']
Jake Cohen
['2009-10', '2010-11', '2011-12', '2012-13', 'Career']
J.P. Kuhlman
['2006-07', '2007-08', '2008-09', '2009-10', 'Career']
Will Archambault
['2007-08', '2008-09', '2009-10', '2010-11', 'Career']
Brendan Mckillop
['2006-07', '2007-08', '2008-09', '2009-10', 'Career']
Steve Rossiter
['2008-09', '2009-10', '2010-11', 'Career']
Ben Allison
['2006-07', '2007-08', '2008-09', '2009-10', 'Career']
Bryant Barr
['2009-10', '2010-11', '2011-12', '2012-13', 'Career']
Nik Cochran
['2008-09', '2009-10', '2010-11', '2011-12', 'Career']
Frank Ben-Eze
['2006-07', '2007-08', '2008-09', '2009-10', 'Career']
Dan Nelms
['2008-09', '2009-10', '2010-11', '2011-12', 'Career']
Will Reigel
['2008-09', '2009-10', '2010-11', '2011-12', 'Career']
A.J. Atkinson
['2007-08', '2008-09', '2009-10', 'Career']
Aaron Bond


In [None]:
# Finding all the different player_ids for each of the different teams in the NCAA over the past 10 years
for year in past_10_years:
    teams = NCAAB_Teams(year)
    for team in teams:
        player_ids = NCAAB_Roster(team.abbreviation, year, True).players.keys()
        print("Finished with " + team.abbreviation + " in " + year + ". Moving on to the next year.")
        all_ids.update(player_ids)

In [None]:
player_ids = {'player_ids': list(all_ids)}

In [None]:
# Saving all player_ids to local file.
with open('player_ids.json', 'w') as outfile:
    json.dump(player_ids, outfile)
    outfile.close()

### Now that we've obtained all the different player_ids from the past 10 years in the NCAA, we can create a dataframe from all of the players' career averages.

In [4]:
with open('player_ids.json', 'r') as file:
    data = json.loads(file.read())
    player_ids = data['player_ids']

In [5]:
len(player_ids)

20599

In [6]:
def clean_height(df):
    def convert_height_to_int(string):
        """
        Converts height data in format 6-8 (6 feet 8 inches) to only inches (80).
        """
        return int(string[0]) * 12 + int(string[2])
    
    df['height'] = df['height'].apply(convert_height_to_int)
    return df

In [7]:
def construct_data(player_ids, num_players, interval_length, only_career=False, serialized=False):
    """
    Constructs the dataframe of player's data over the years they've played, with options to only return career
    average stats or return serialized data.
    
    Calculates total time to run and prints time taken between given interval length.
    """
    # Return dataframe if it already exists as a csv.
    if serialized and str(path.exists('ncaa_data.csv')):
        return pd.read_csv('ncaa_data.csv')
    
    # Begin recording time
    start_time = time.time()
    last_interval = start_time
    
    # Create basic df
    combined_stats = NCAAB_Player(player_ids[0]).dataframe
    
    # Add player data over time to dataframe.
    for i in range(1, num_players):
        player_id = player_ids[i]
       
        # Getting yearly and career average stats for each player.
        try:
            stats = NCAAB_Player(player_id).dataframe # Remove .loc['Career'] to also get yearly stats
            combined_stats = combined_stats.append(stats)
        except AttributeError:
            print("Error here")
            print(player_id)
            pass
        
        # Printing interval breakpoints
        if i % interval_length == 0:
            print("Currently at: " + str(i))
            curr_time = time.time()
            print("Taken " + str(curr_time - last_interval) + " seconds since the last breakpoint.")
            last_interval = curr_time

    end_time = time.time()
    print("Taken " + str(end_time - start_time) + " seconds to run the above code.")

    # Formatting dataframe
    combined_stats = combined_stats.rename_axis('row_type').reset_index()
    cols = ['player_id']  + [col for col in combined_stats if col != 'player_id']
    combined_stats = combined_stats[cols]
    clean_stats = clean_height(combined_stats)
    
    # Store data in csv
    clean_stats.to_csv('ncaa_data.csv')
    
    return clean_stats

In [8]:
per_id = 0.5 # Takes around 0.5 seconds to query per player_id
duration = len(player_ids) * per_id / 60 / 60
hours = math.floor(duration)
minutes = round(duration % 1 * 60)
print("""Constructing the player data for all the NCAAB players in the past 10 years will take around

{} hours and {} minutes

to finish! Long time-- watch a movie or something and let this run in the background.""".format(hours, minutes))

Constructing the player data for all the NCAAB players in the past 10 years will take around

2 hours and 52 minutes

to finish! Long time-- watch a movie or something and let this run in the background.


In [9]:
data = construct_data(player_ids, 50, 5, serialized=True)
data

Unnamed: 0.1,Unnamed: 0,player_id,row_type,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,...,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes
0,0,raymar-morgan-1,2006-07,,24,,13,,big-ten,,...,0.538,18.5,69.0,233,0.498,116,,230,2.8,0.148
1,1,raymar-morgan-1,2007-08,12.3,58,1.4,19,,big-ten,,...,0.6,16.5,83.0,302,0.586,177,26.2,230,5.6,0.224
2,2,raymar-morgan-1,2008-09,11.2,41,0.8,7,,big-ten,,...,0.568,16.4,62.0,230,0.552,127,23.6,230,3.7,0.19
3,3,raymar-morgan-1,2009-10,12.9,63,2.9,25,,big-ten,,...,0.569,16.2,69.0,274,0.544,149,22.0,230,4.3,0.174
4,4,raymar-morgan-1,Career,12.2,186,1.8,64,,,,...,0.571,16.8,283.0,1039,0.548,569,24.0,230,16.4,0.186
5,5,erik-stevenson-1,2018-19,18.2,78,1.0,7,3.4,aac,3.0,...,0.455,18.0,58.0,82,0.415,34,19.5,210,1.6,0.077
6,6,erik-stevenson-1,2019-20,18.1,39,1.5,6,10.5,aac,4.9,...,0.549,9.9,21.0,77,0.481,37,22.7,210,2.5,0.232
7,7,erik-stevenson-1,Career,18.2,117,1.2,13,5.9,,3.7,...,0.494,14.8,79.0,159,0.447,71,20.6,210,4.1,0.131
8,8,kevin-booze-1,2014-15,30.3,85,0.0,0,-5.2,southland,-3.9,...,0.51,23.7,55.0,71,0.338,24,21.4,170,1.1,0.083
9,9,kevin-booze-1,2015-16,37.6,114,0.2,1,-3.6,southland,-3.3,...,0.526,21.9,67.0,125,0.472,59,23.1,170,1.5,0.096


### TODO: Deal with quantitative variables (conference, position, team_abbreviation). First two may be dealt with one hot encoding, but last one is difficult. 

### Idea: Funnel players into two groups, determined by whether or not they play for a historically well-drafted school.

### TODO: Fix NaN values for people with career averages of NaN (look at raymar-morgan-1)

# Part 2: Getting all the players in the NBA rookies that have come from the NCAA from the past 9 years, and all their corresponding rookie year stats

For this part, we did this two different ways, one mapping all the different NCAAB players to their possible NBA counterparts, and the other mapping all the NBA players to their possible NCAAB counterparts (there's no guarentee someone who played in the NCAA played in the NBA, or vice versa). Both options are listed, though we'll probably use the second one more because it's the cleaner, quicker way (less queries to make as there's less NBA players than NCAA players).

In [10]:
# Modules from sportsrefernece.ncaab for college basketball
from sportsreference.nba.boxscore import Boxscore as NBA_Boxscore
from sportsreference.nba.roster import Player as NBA_Player
from sportsreference.nba.roster import Roster as NBA_Roster
from sportsreference.nba.schedule import Schedule as NBA_Schedule
from sportsreference.nba.teams import Teams as NBA_Teams

## First way, taking all the different NCAA player ids, and seeing if these players exist in the NBA. Not as quick, as this way is many-to-one.

In [11]:
def convert_ncaa_nba_name(name):
    """
    Converts the format of the NCAA player_id to the NBA player_id.
    """
    splitted = name.split("-")
    first = splitted[0]
    last, num = splitted[len(splitted) - 2:] # To take into account middle names and hyphened names.
    return last[:5] + first[:2] + '0' + num

In [41]:
def get_nba_players(player_ids, num_players=0):
    """
    Gets the num_players number of players from the given NCAA player_ids
    from 2010 onwards.
    """
    players = []
    ncaa_ids = []
    if not num_players:
        num_players = len(player_ids)
    for num in range(len(player_ids)):
        nba_name = convert_ncaa_nba_name(player_ids[num])
        nba_player = NBA_Player(nba_name)
        
        try:
            nba_player.dataframe
        except TypeError:
            print("NCAA player", player_ids[num], "didn't make it to the NBA from 2009 to today.")
            continue
        
        date_str = nba_player._most_recent_season[:-3]
        try:
            date_obj = datetime.datetime.strptime(date_str, '%Y').date()
        except ValueError:
            print("Not enough data on NBA player", nba_player.player_id, ". Most likely not recent enough player to have data supported.")
            continue
        
        date_2010 = datetime.datetime.strptime('2010', '%Y').date()
        if date_obj >= date_2010:
            print("Found NBA player", nba_player.player_id)
            players.append(nba_player)
            ncaa_ids.append(player_ids[num])
    return players, ncaa_ids


In [13]:
# Timing how long it takes to retrive 'num' number of players
start_time = time.time()
current_nba_players, ncaa_ids = get_nba_players(player_ids, 100)
end_time = time.time()
print("Took", end_time - start_time, "seconds.")
current_nba_players, ncaa_ids

NCAA player garrett-golday-1 didn't make it to the NBA from 2009 to today.
NCAA player josh-armstrong-1 didn't make it to the NBA from 2009 to today.
NCAA player xairius-larry-1 didn't make it to the NBA from 2009 to today.
NCAA player cameron-miles-1 didn't make it to the NBA from 2009 to today.
NCAA player devon-friend-1 didn't make it to the NBA from 2009 to today.
NCAA player tyler-foster-2 didn't make it to the NBA from 2009 to today.
NCAA player mikhail-mclean-1 didn't make it to the NBA from 2009 to today.
NCAA player eric-horn-2 didn't make it to the NBA from 2009 to today.
NCAA player shane-reybold-1 didn't make it to the NBA from 2009 to today.
NCAA player josh-cuthbertson-1 didn't make it to the NBA from 2009 to today.
NCAA player zjori-bosha-1 didn't make it to the NBA from 2009 to today.
NCAA player questin-shropshire-1 didn't make it to the NBA from 2009 to today.
NCAA player robert-mitchell-1 didn't make it to the NBA from 2009 to today.
NCAA player griffin-hoffmann-1 di

([<sportsreference.nba.roster.Player at 0x10dbb9c88>,
  <sportsreference.nba.roster.Player at 0x118e88fd0>,
  <sportsreference.nba.roster.Player at 0x118e89d30>],
 ['nik-stauskas-1', 'lance-stephenson-1', 'jamie-jones-2'])

## Other way, from NBA to NCAA, where we get all the NBA Players from the past 10 years, and then their respective NBA data. Doing it this way because there are less NBA players than NCAA players (less requests)

### Very difficult to convert nba name to ncaa name. Lots of verification, for cases where this doesn't work, we will just exclude their data. 

### I.e. Frank Mason III, Troy Brown Jr., P.J. Tucker. May need to expand this function and fully encapsulate all the different cases

In [14]:
def convert_nba_ncaa_name(name):
    """
    Converts the format of the NBA player_id to the NCAA player_id.
    """
    return name.lower().replace(" ", "-") + "-1"

# Set columns to NBA and NCAAB so that the tables can merge properly

In [240]:
nba_curry = NBA_Player('curryst01').dataframe
nba_curry.rename(columns=lambda x: 'NBA_' + x, inplace=True)
col_names = ['name'] + list(nba_curry.columns)
nba_curry['name'] = 'Stephen Curry'
nba_curry = nba_curry[col_names]

In [244]:
rook = nba_curry.iloc[[0]]
rook

Unnamed: 0,name,NBA_and_ones,NBA_assist_percentage,NBA_assists,NBA_block_percentage,NBA_blocking_fouls,NBA_blocks,NBA_box_plus_minus,NBA_center_percentage,NBA_defensive_box_plus_minus,...,NBA_turnovers,NBA_two_point_attempts,NBA_two_point_percentage,NBA_two_pointers,NBA_two_pointers_assisted_percentage,NBA_usage_percentage,NBA_value_over_replacement_player,NBA_weight,NBA_win_shares,NBA_win_shares_per_48_minutes
2009-10,Stephen Curry,,24.6,472,0.5,,19,1.0,0,-0.7,...,243,763,0.474,362,0.307,21.8,2.2,190,4.7,0.077


In [338]:
curry = NBA_Player('curryst01')
curry

<sportsreference.nba.roster.Player at 0x11af0afd0>

In [None]:
datetime.datetime.strptime(curry._season[0], '%Y').date()

In [353]:
curry.__dict__

{'_most_recent_season': '2018-19',
 '_index': 11,
 '_player_id': 'curryst01',
 '_season': ['2009-10',
  '2010-11',
  '2011-12',
  '2012-13',
  '2013-14',
  '2014-15',
  '2015-16',
  '2016-17',
  '2017-18',
  '2018-19',
  '2019-20',
  'Career'],
 '_name': 'Stephen Curry',
 '_team_abbreviation': ['GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  'GSW',
  ''],
 '_position': ['PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  'PG',
  ''],
 '_height': '6-3',
 '_weight': '190lb',
 '_birth_date': '1988-03-14',
 '_nationality': 'United States of America',
 '_games_played': ['80',
  '74',
  '26',
  '78',
  '78',
  '80',
  '79',
  '79',
  '51',
  '69',
  '4',
  '698'],
 '_games_started': ['77',
  '74',
  '23',
  '78',
  '78',
  '80',
  '79',
  '79',
  '51',
  '69',
  '4',
  '692'],
 '_player_efficiency_rating': ['16.3',
  '19.4',
  '21.2',
  '21.3',
  '24.1',
  '28.0',
  '31.5',
  '24.6',
  '28.2',
  '24.4',
  '20.9',
  '23.8']

In [2]:
other_curry = NCAAB_Player('stephen-curry-1')
other_curry.dataframe

NameError: name 'NCAAB_Player' is not defined

In [355]:
curry._season[0][0:4]

'2009'

In [361]:
a = datetime.datetime.strptime(curry._season[0][0:4], '%Y').date()

In [362]:
a = datetime.datetime.strptime(curry._season[0][0:4], '%Y').date()
b = datetime.datetime.strptime(other_curry._most_recent_season[0:4], '%Y').date()

In [363]:
a > b

True

In [255]:
college_curry = NCAAB_Player('stephen-curry-1').dataframe
college_curry.rename(columns=lambda x: 'NCAA_' + x, inplace=True)
col_names = ['name'] + list(college_curry.columns)
college_curry['name'] = 'Stephen Curry'
college_curry = college_curry[col_names]
one_yr = college_curry.iloc[[college_curry.shape[0] - 2]]
one_yr

Unnamed: 0,name,NCAA_assist_percentage,NCAA_assists,NCAA_block_percentage,NCAA_blocks,NCAA_box_plus_minus,NCAA_conference,NCAA_defensive_box_plus_minus,NCAA_defensive_rebound_percentage,NCAA_defensive_rebounds,...,NCAA_true_shooting_percentage,NCAA_turnover_percentage,NCAA_turnovers,NCAA_two_point_attempts,NCAA_two_point_percentage,NCAA_two_pointers,NCAA_usage_percentage,NCAA_weight,NCAA_win_shares,NCAA_win_shares_per_40_minutes
2008-09,Stephen Curry,40.2,189,0.6,8,,southern,,,130,...,0.604,13.5,126,351,0.519,182,38.3,185,9.9,0.348


In [265]:
merged = one_yr.merge(rook)
merged

Unnamed: 0,name,NCAA_assist_percentage,NCAA_assists,NCAA_block_percentage,NCAA_blocks,NCAA_box_plus_minus,NCAA_conference,NCAA_defensive_box_plus_minus,NCAA_defensive_rebound_percentage,NCAA_defensive_rebounds,...,NBA_turnovers,NBA_two_point_attempts,NBA_two_point_percentage,NBA_two_pointers,NBA_two_pointers_assisted_percentage,NBA_usage_percentage,NBA_value_over_replacement_player,NBA_weight,NBA_win_shares,NBA_win_shares_per_48_minutes
0,Stephen Curry,40.2,189,0.6,8,,southern,,,130,...,243,763,0.474,362,0.307,21.8,2.2,190,4.7,0.077


In [288]:
beginning = pd.DataFrame(columns=one_yr.merge(rook).columns)
new = beginning.append(merged)
new

Unnamed: 0,name,NCAA_assist_percentage,NCAA_assists,NCAA_block_percentage,NCAA_blocks,NCAA_box_plus_minus,NCAA_conference,NCAA_defensive_box_plus_minus,NCAA_defensive_rebound_percentage,NCAA_defensive_rebounds,...,NBA_turnovers,NBA_two_point_attempts,NBA_two_point_percentage,NBA_two_pointers,NBA_two_pointers_assisted_percentage,NBA_usage_percentage,NBA_value_over_replacement_player,NBA_weight,NBA_win_shares,NBA_win_shares_per_48_minutes
0,Stephen Curry,40.2,189,0.6,8,,southern,,,130,...,243,763,0.474,362,0.307,21.8,2.2,190,4.7,0.077


In [259]:
format_df('Stephen Curry', NBA_Player('curryst01').dataframe, False, lambda x: 'NBA_' + x)

Unnamed: 0,name,NBA_and_ones,NBA_assist_percentage,NBA_assists,NBA_block_percentage,NBA_blocking_fouls,NBA_blocks,NBA_box_plus_minus,NBA_center_percentage,NBA_defensive_box_plus_minus,...,NBA_turnovers,NBA_two_point_attempts,NBA_two_point_percentage,NBA_two_pointers,NBA_two_pointers_assisted_percentage,NBA_usage_percentage,NBA_value_over_replacement_player,NBA_weight,NBA_win_shares,NBA_win_shares_per_48_minutes
2009-10,Stephen Curry,,24.6,472,0.5,,19,1.0,0,-0.7,...,243,763,0.474,362,0.307,21.8,2.2,190,4.7,0.077


In [257]:
def format_df(player_name, df, is_college, func):
    """
    Format's the dataframe to be merged with other league dataframe later.
    """
    df.rename(columns=func, inplace=True)
    col_names = ['name'] + list(df.columns)
    df['name'] = player_name
    df = df[col_names]
    if is_college == True:
        return df.iloc[[df.shape[0] - 2]]
    else:
        return df.iloc[[0]]

In [433]:
def get_nba_ncaa_10_years(set_players, one_loop=True):
    """
    Getting the college basketball data for all NBA Players in the past 10 years.
    """
    # Generating columns for combined dataframe
    nba_cols = format_df('Stephen Curry', NBA_Player('curryst01').dataframe, False, lambda x: 'NBA_' + x) 
    ncaa_cols = format_df('Stephen Curry', NCAAB_Player('stephen-curry-1').dataframe, True, lambda x: 'NCAAB_' + x)
    all_cols = nba_cols.merge(ncaa_cols).columns

    combined = pd.DataFrame(columns=all_cols)
    seen = set() # To keep track of seen NBA players
    failed = dict()

    for year in past_10_years:
        
        sub_year = pd.DataFrame(columns=all_cols)
        
        teams = NBA_Teams(year=year)
        for team in teams:
            
            start = time.time() # For time measuring purposes
            
            players = NBA_Roster(team.abbreviation, year).players
            for player in players:
                if player.name in seen:
                    continue
                seen.add(player.name)
                unaccented_name = unidecode.unidecode(player.name) # We use this because maybe some of NBA players played in the NCAA with an accented name
                ncaab_player_id = convert_nba_ncaa_name(unaccented_name) 
                
                try:
                    college_stats = NCAAB_Player(ncaab_player_id)
                except TypeError: # Player doesn't exist
                    print("Couldn't find NCAA player date for", player.name, ". Moving on.")
                    if 'lost' not in failed:
                        failed['lost'] = [player.name]
                    else:
                        failed['lost'].append(player.name)
                    continue
                
                if ncaab_player_id in set_players:
                    
                    # Confirming that the college player we find for the given NBA player has indeed played in college before the NBA
                    # (verifying that they are the same person, as you can't play in the NBA and then play in the NCAA)
                    last_college_date = datetime.datetime.strptime(college_stats._most_recent_season[0:4], '%Y').date()
                    first_nba_date = datetime.datetime.strptime(player._season[0][0:4], '%Y').date()
                   
                    if first_nba_date > last_college_date:
                        
                        # Generating properly formatted dataframes for college and NBA stats
                        new_college = format_df(player.name, college_stats.dataframe, True, lambda x: 'NCAAB_' + x)
                        new_nba = format_df(player.name, player.dataframe, False, lambda x: 'NBA_' + x)

                        merged = new_nba.merge(new_college)
                        sub_year = sub_year.append(merged)
#                         combined = combined.append(merged)
                    else:
                        print("NBA Date before college date for", player.name, ". Moving on.")
                        if 'invalid-date' not in failed:
                            failed['invalid-date'] = [player.name]
                        else:
                            failed['invalid-date'].append(player.name)
                        continue
                
                else:
                    print("College id not in set_players for", player.name, ". Moving on.")
                    if 'old-player' not in failed:
                        failed['old-player'] = [player.name]
                    else:
                        failed['old-player'].append(player.name)
                    continue

            print("\n")
            print("Looked at", team.name, "on year", year, ". Moving to the next team.")
            print("\n")

            if one_loop:
                end = time.time()
                print("One iteration for one team and one year would take", end - start, "seconds to run.")
                print("Would take", (end - start) * 300 / 60 / 60, "hours to find all players that played in the NBA in the past 10 years and their respective college stats.")
                return combined.reset_index().drop(columns=['index']), failed
        
        cleaned_year = sub_year.reset_index().drop(columns=['index'])
        cy = cleaned_year.to_csv("{}_Player_Data.csv".format(year))
        
        combined = combined.append(cleaned_year)
    
    return combined.reset_index().drop(columns=['index']), failed


In [434]:
data, failed = get_nba_ncaa_10_years(player_ids, one_loop=False)
csv_data = data.to_csv("all_player_data.csv")

Couldn't find NCAA player date for Lou Amundson . Moving on.
Couldn't find NCAA player date for Leandro Barbosa . Moving on.
College id not in set_players for Earl Clark . Moving on.
College id not in set_players for Jarron Collins . Moving on.
Couldn't find NCAA player date for Goran Dragić . Moving on.
College id not in set_players for Jared Dudley . Moving on.
College id not in set_players for Channing Frye . Moving on.
College id not in set_players for Taylor Griffin . Moving on.
College id not in set_players for Grant Hill . Moving on.
College id not in set_players for Dwayne Jones . Moving on.
College id not in set_players for Robin Lopez . Moving on.
College id not in set_players for Steve Nash . Moving on.
College id not in set_players for Jason Richardson . Moving on.
Couldn't find NCAA player date for Amar'e Stoudemire . Moving on.
College id not in set_players for Alando Tucker . Moving on.


Looked at Phoenix Suns on year 2010 . Moving to the next team.


College id not in 

College id not in set_players for Daniel Gibson . Moving on.
College id not in set_players for Danny Green . Moving on.
Couldn't find NCAA player date for J.J. Hickson . Moving on.
Couldn't find NCAA player date for Zydrunas Ilgauskas . Moving on.
College id not in set_players for Cedric Jackson . Moving on.
College id not in set_players for Darnell Jackson . Moving on.
Couldn't find NCAA player date for LeBron James . Moving on.
College id not in set_players for Antawn Jamison . Moving on.
Couldn't find NCAA player date for Jamario Moon . Moving on.
Couldn't find NCAA player date for Shaquille O'Neal . Moving on.
College id not in set_players for Anthony Parker . Moving on.
College id not in set_players for Leon Powe . Moving on.
Couldn't find NCAA player date for Sebastian Telfair . Moving on.
Couldn't find NCAA player date for Anderson Varejão . Moving on.
College id not in set_players for Delonte West . Moving on.
College id not in set_players for Jawad Williams . Moving on.
Colleg

College id not in set_players for Bobby Brown . Moving on.
College id not in set_players for Devin Brown . Moving on.
College id not in set_players for Darren Collison . Moving on.
College id not in set_players for Aaron Gray . Moving on.
College id not in set_players for Jason Hart . Moving on.
College id not in set_players for Sean Marks . Moving on.
College id not in set_players for Emeka Okafor . Moving on.
College id not in set_players for Chris Paul . Moving on.
College id not in set_players for Morris Peterson . Moving on.
College id not in set_players for James Posey . Moving on.
College id not in set_players for Darius Songaila . Moving on.
Couldn't find NCAA player date for Peja Stojaković . Moving on.
College id not in set_players for Marcus Thornton . Moving on.
College id not in set_players for David West . Moving on.
College id not in set_players for Julian Wright . Moving on.


Looked at New Orleans Hornets on year 2010 . Moving to the next team.


College id not in set_

College id not in set_players for Dwyane Wade . Moving on.
Couldn't find NCAA player date for Dorell Wright . Moving on.


Looked at Miami Heat on year 2010 . Moving to the next team.


College id not in set_players for Gilbert Arenas . Moving on.
Couldn't find NCAA player date for Andray Blatche . Moving on.
College id not in set_players for Earl Boykins . Moving on.
College id not in set_players for Paul Davis . Moving on.
College id not in set_players for Randy Foye . Moving on.
College id not in set_players for Alonzo Gee . Moving on.
College id not in set_players for Mike James . Moving on.
College id not in set_players for JaVale McGee . Moving on.
College id not in set_players for Mike Miller . Moving on.
Couldn't find NCAA player date for Fabricio Oberto . Moving on.
College id not in set_players for Al Thornton . Moving on.
College id not in set_players for Nick Young . Moving on.


Looked at Washington Wizards on year 2010 . Moving to the next team.


College id not in set_pl



Looked at Cleveland Cavaliers on year 2011 . Moving to the next team.


Couldn't find NCAA player date for Pape Sy . Moving on.


Looked at Atlanta Hawks on year 2011 . Moving to the next team.


College id not in set_players for Patrick Ewing . Moving on.


Looked at New Orleans Hornets on year 2011 . Moving to the next team.


College id not in set_players for Orien Greene . Moving on.


Looked at New Jersey Nets on year 2011 . Moving to the next team.




Looked at Charlotte Bobcats on year 2011 . Moving to the next team.




Looked at Milwaukee Bucks on year 2011 . Moving to the next team.


NBA Date before college date for Jordan Hamilton . Moving on.


Looked at Denver Nuggets on year 2012 . Moving to the next team.


College id not in set_players for Derrick Byars . Moving on.
Couldn't find NCAA player date for Eric Dawson . Moving on.
College id not in set_players for Justin Dentmon . Moving on.


Looked at San Antonio Spurs on year 2012 . Moving to the next team.




Looked 

College id not in set_players for Reggie Bullock . Moving on.


Looked at Los Angeles Clippers on year 2014 . Moving to the next team.




Looked at Houston Rockets on year 2014 . Moving to the next team.




Looked at Minnesota Timberwolves on year 2014 . Moving to the next team.




Looked at Portland Trail Blazers on year 2014 . Moving to the next team.




Looked at Oklahoma City Thunder on year 2014 . Moving to the next team.




Looked at San Antonio Spurs on year 2014 . Moving to the next team.


College id not in set_players for Dionte Christmas . Moving on.


Looked at Phoenix Suns on year 2014 . Moving to the next team.


Couldn't find NCAA player date for Ricky Ledo . Moving on.
College id not in set_players for Gal Mekel . Moving on.


Looked at Dallas Mavericks on year 2014 . Moving to the next team.




Looked at Denver Nuggets on year 2014 . Moving to the next team.


Couldn't find NCAA player date for Ognjen Kuzmić . Moving on.
Couldn't find NCAA player date for Nemanja

Couldn't find NCAA player date for Boban Marjanović . Moving on.


Looked at San Antonio Spurs on year 2016 . Moving to the next team.




Looked at Charlotte Hornets on year 2016 . Moving to the next team.


Couldn't find NCAA player date for Edy Tavares . Moving on.


Looked at Atlanta Hawks on year 2016 . Moving to the next team.


Couldn't find NCAA player date for Bryce Dejean-Jones . Moving on.


Looked at New Orleans Pelicans on year 2016 . Moving to the next team.




Looked at Toronto Raptors on year 2016 . Moving to the next team.


Couldn't find NCAA player date for Nemanja Bjelica . Moving on.


Looked at Minnesota Timberwolves on year 2016 . Moving to the next team.


Couldn't find NCAA player date for Salah Mejri . Moving on.


Looked at Dallas Mavericks on year 2016 . Moving to the next team.


College id not in set_players for Joe Young . Moving on.


Looked at Indiana Pacers on year 2016 . Moving to the next team.


Couldn't find NCAA player date for Mario Hezonja . Mo



Looked at Oklahoma City Thunder on year 2018 . Moving to the next team.




Looked at Washington Wizards on year 2018 . Moving to the next team.


Couldn't find NCAA player date for James Webb III . Moving on.


Looked at Brooklyn Nets on year 2018 . Moving to the next team.


College id not in set_players for Sterling Brown . Moving on.
Couldn't find NCAA player date for D.J. Wilson . Moving on.


Looked at Milwaukee Bucks on year 2018 . Moving to the next team.




Looked at Portland Trail Blazers on year 2018 . Moving to the next team.


Couldn't find NCAA player date for T.J. Leaf . Moving on.


Looked at Indiana Pacers on year 2018 . Moving to the next team.


Couldn't find NCAA player date for Frank Ntilikina . Moving on.


Looked at New York Knicks on year 2018 . Moving to the next team.


Couldn't find NCAA player date for Naz Mitrou-Long . Moving on.
Couldn't find NCAA player date for Royce O'Neale . Moving on.


Looked at Utah Jazz on year 2018 . Moving to the next team.




In [438]:
data

Unnamed: 0,name,NBA_and_ones,NBA_assist_percentage,NBA_assists,NBA_block_percentage,NBA_blocking_fouls,NBA_blocks,NBA_box_plus_minus,NBA_center_percentage,NBA_defensive_box_plus_minus,...,NCAAB_true_shooting_percentage,NCAAB_turnover_percentage,NCAAB_turnovers,NCAAB_two_point_attempts,NCAAB_two_point_percentage,NCAAB_two_pointers,NCAAB_usage_percentage,NCAAB_weight,NCAAB_win_shares,NCAAB_win_shares_per_40_minutes
0,Landry Fields,,9.0,155,0.5,,17,0.6,0,0.1,...,0.560,11.7,83,420,0.521,219,31.5,215,6.0,0.206
1,Andy Rautins,,18.7,3,0.0,,0,-16.0,0,-4.3,...,0.632,22.6,98,56,0.571,32,18.4,195,4.9,0.173
2,Patrick Patterson,,7.6,41,3.0,,37,0.5,0,0.5,...,0.624,8.6,41,305,0.626,191,18.0,235,7.0,0.223
3,Gani Lawal,,0.0,0,0.0,,0,-4.9,0,0.5,...,0.547,15.3,78,324,0.531,172,26.1,234,4.1,0.177
4,Cole Aldrich,,4.0,4,3.6,,7,-0.8,0,2.6,...,0.596,14.1,56,265,0.562,149,19.9,245,5.9,0.245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,Collin Sexton,,15.3,243,0.2,,6,-5.2,0,-3.7,...,0.567,14.2,92,307,0.495,152,32.9,190,4.4,0.177
528,Jevon Carter,,17.4,69,1.8,,11,-4.0,0,-0.1,...,0.549,14.4,98,313,0.441,138,25.0,185,7.0,0.214
529,Dusty Hannahs,,28.8,5,0.0,,0,-12.2,0,-5.1,...,0.589,12.0,60,192,0.490,94,27.1,208,2.9,0.132
530,Julian Washburn,,7.7,14,0.7,,2,-3.4,0,1.4,...,0.505,11.1,42,201,0.433,87,18.1,210,3.3,0.118


In [457]:
names = data['name'].to_list()
names

['Landry Fields',
 'Andy Rautins',
 'Patrick Patterson',
 'Gani Lawal',
 'Cole Aldrich',
 'Jeremy Lin',
 'Ekpe Udoh',
 'Dexter Pittman',
 'Derrick Caracter',
 'Devin Ebanks',
 'Lazar Hayward',
 'Dominique Jones',
 'Xavier Henry',
 'Greivis Vásquez',
 'Paul George',
 'Lance Stephenson',
 'Jeremy Evans',
 'Derrick Favors',
 'Gordon Hayward',
 'DeMarcus Cousins',
 'Hassan Whiteside',
 'Solomon Alabi',
 'Ed Davis',
 'Craig Brackins',
 'Evan Turner',
 'Al-Farouq Aminu',
 'Eric Bledsoe',
 'Willie Warren',
 'Trevor Booker',
 'Jordan Crawford',
 'John Wall',
 'Greg Monroe',
 'Avery Bradley',
 'Luke Harangody',
 'Luke Babbitt',
 'Armon Johnson',
 'Manny Harris',
 'Samardo Samuels',
 'Quincy Pondexter',
 'Damion James',
 'Ben Uzoh',
 'Sherron Collins',
 'Larry Sanders',
 'Kenneth Faried',
 'Julyan Stone',
 'Cory Joseph',
 'Kawhi Leonard',
 'Malcolm Thomas',
 'Reggie Jackson',
 'Ryan Reid',
 'Alec Burks',
 'Tobias Harris',
 'Darington Hobson',
 'Jon Leuer',
 'Jimmer Fredette',
 'Tyler Honeycutt',

In [6]:
failed

NameError: name 'failed' is not defined

In [471]:
len(failed['invalid-date'])

7

In [467]:
size = 0
for key in failed.keys():
    size += len(failed[key])
print("Number of unique NBA players in the past 10 years:", size + len(names))

Number of unique NBA players in the past 10 years: 1234


# Extraneous for now

### There is a variety of different ways players get into the NBA.

### Cases:

1. From NCAA to NBA Draft
2. From NCAA to going undrafted and signed separately
3. From overseas to NBA Draft
4. From overseas to undrafted and signed separately

Problems with finding NBA players that came from the NCAA over the past ten years with this package:
1. URL formatting (hard to determine a perfect one-to-one mapping from NCAA player to NBA player, if the player has indeed played in both leagues.)

In [17]:
curry = NBA_Player('curryst01')
curry

<sportsreference.nba.roster.Player at 0x11d4c4748>

In [20]:
# Removing empty/not useful qualitative columns
new_curry = curry.dataframe.drop(columns=[
    'and_ones', 
    'blocking_fouls', 
    'lost_ball_turnovers', 
    'net_plus_minus', 
    'offensive_fouls',
    'on_court_plus_minus',
    'other_turnovers',
    'passing_turnovers',
    'player_id',
    'points_generated_by_assists',
    'shooting_fouls',
    'shooting_fouls_drawn',
    'shots_blocked',
    'take_fouls',
    'team_abbreviation'
])

# Evaluating runtimes for different methods of data extraction

In [44]:
start_time = time.time()
maindata = []
times = []
midpoint = time.time()
for i in range(100):
    name = player_ids[i]
    realname = name
    name = name.lower()
    name = name.replace(" ", "-")
    website_url = requests.get('https://www.sports-reference.com/cbb/players/'+ name +'.html')
    soup = BeautifulSoup(website_url.content,'lxml')
    soup.prettify()
    My_table = soup.find('table',{'id':'players_per_game'})
    if My_table is not None:
        maindata = maindata + [realname]
        tabledata = My_table.findAll('td')
        data = []
        for cell in tabledata:
            data.append(cell.get_text())
        maindata = maindata +data[-28:]
    if i % 10 == 0:
        midendpoint = time.time()
        times.append(midendpoint - midpoint)
        print(midendpoint - midpoint)
        midpoint = midendpoint
        
end_time = time.time()
end_time - start_time

0.38002777099609375
3.728515148162842
7.556318044662476
5.162214040756226
5.096082925796509
5.424991130828857
5.260880947113037
5.241074085235596
5.042371034622192
6.61317777633667


53.03249406814575

In [45]:
more_times = []
for i in range(10):
    current = time.time()
    construct_data(player_ids, 10, 1)
    end = time.time()
    more_times.append(end - current)

Currently at: 1
Taken 0.9880430698394775 seconds since the last breakpoint.
Currently at: 2
Taken 0.4531288146972656 seconds since the last breakpoint.
Currently at: 3
Taken 0.4620842933654785 seconds since the last breakpoint.
Currently at: 4
Taken 0.41640186309814453 seconds since the last breakpoint.
Currently at: 5
Taken 0.39745616912841797 seconds since the last breakpoint.
Currently at: 6
Taken 0.411693811416626 seconds since the last breakpoint.
Currently at: 7
Taken 0.6019840240478516 seconds since the last breakpoint.
Currently at: 8
Taken 0.5324478149414062 seconds since the last breakpoint.
Currently at: 9
Taken 0.43987321853637695 seconds since the last breakpoint.
Taken 4.703129053115845 seconds to run the above code.
Currently at: 1
Taken 0.9887392520904541 seconds since the last breakpoint.
Currently at: 2
Taken 0.4690537452697754 seconds since the last breakpoint.
Currently at: 3
Taken 0.4631178379058838 seconds since the last breakpoint.
Currently at: 4
Taken 0.4180061

In [46]:
times

[0.38002777099609375,
 3.728515148162842,
 7.556318044662476,
 5.162214040756226,
 5.096082925796509,
 5.424991130828857,
 5.260880947113037,
 5.241074085235596,
 5.042371034622192,
 6.61317777633667]

In [47]:
more_times

[4.722679853439331,
 4.949055910110474,
 4.979331970214844,
 4.570301294326782,
 4.459066152572632,
 4.52507209777832,
 4.517408132553101,
 5.924187898635864,
 6.417668104171753,
 6.589348793029785]

In [48]:
np.mean(times)

4.95056529045105

In [49]:
np.mean(more_times)

5.165412020683289