## Obtaining Data

For this project, I collected data from the very helpful website "[basketball-reference.com](https://www.basketball-reference.com/)". [This guide](https://lfbueno.com/2019-02-19-scrape-bb/) by Luis Felipe Bueno was also very helpful.

In [40]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

I'm going to need a for-loop eventually, since the data is on a number of pages. But I'll scrape a single page first, so I can get my syntax set.

In [2]:
# For more information on the scraping syntax, see Luis Felipe Bueno's article linked above.

stats_page = requests.get('https://www.basketball-reference.com/leagues/NBA_2019_per_game.html')

In [3]:
content = stats_page.content

In [4]:
soup = BeautifulSoup(content, 'html.parser')
table = soup.find(name='table', attrs={'id':'per_game_stats'})

In [5]:
table_str = str(table)
df = pd.read_html(table_str)[0]

df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


In [6]:
df.shape

(734, 30)

In [7]:
df.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
729,528,Tyler Zeller,C,29,MEM,4,1,20.5,4.0,7.0,...,0.778,2.3,2.3,4.5,0.8,0.3,0.8,1.0,4.0,11.5
730,529,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,...,0.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8
731,530,Ivica Zubac,C,21,TOT,59,37,17.6,3.6,6.4,...,0.802,1.9,4.2,6.1,1.1,0.2,0.9,1.2,2.3,8.9
732,530,Ivica Zubac,C,21,LAL,33,12,15.6,3.4,5.8,...,0.864,1.6,3.3,4.9,0.8,0.1,0.8,1.0,2.2,8.5
733,530,Ivica Zubac,C,21,LAC,26,25,20.2,3.8,7.2,...,0.733,2.3,5.3,7.7,1.5,0.4,0.9,1.4,2.5,9.4


In [8]:
# One difficulty is that there are three historical basketball leagues: there was a BAA league before the NBA,
# and for a decade or so there was a competitor ABA league. So I'll need separate for-loops for each league.

nba_years = np.arange(1950, 2020, 1) # The first NBA season was 1949-1950.
nba_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'

df = pd.DataFrame()

for year in nba_years:
    req_url = nba_url.format(year)
    page = requests.get(req_url)
    content = page.content
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find(name='table', attrs={'id':'per_game_stats'})
    df2 = pd.read_html(str(table))[0]
    df2['Year'] = year # I want to keep the year data for my model.
    df2['Lge'] = 'NBA' # Same with the data on which league it is.
    
    df = pd.concat([df, df2])
    
df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Lge
0,1,Curly Armstrong,G-F,31,FTW,63,,,2.3,8.2,...,,,2.8,,,,3.4,7.3,1950,NBA
1,2,Cliff Barker,SG,29,INO,49,,,2.1,5.6,...,,,2.2,,,,2.0,5.7,1950,NBA
2,3,Leo Barnhorst,SF,25,CHS,67,,,2.6,7.4,...,,,2.1,,,,2.9,6.5,1950,NBA
3,4,Ed Bartels,F,24,TOT,15,,,1.5,5.7,...,,,1.3,,,,1.9,4.2,1950,NBA
4,4,Ed Bartels,F,24,DNN,13,,,1.6,6.3,...,,,1.5,,,,2.1,4.5,1950,NBA


In [9]:
df.shape

(27101, 32)

In [10]:
baa_years = [1947, 1948, 1949] # These are the only years the BAA existed.
baa_url = 'https://www.basketball-reference.com/leagues/BAA_{}_per_game.html'

for year in baa_years:
    req_url = baa_url.format(year)
    page = requests.get(req_url)
    content = page.content
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find(name='table', attrs={'id':'per_game_stats'})
    df2 = pd.read_html(str(table))[0]
    df2['Year'] = year
    df2['Lge'] = 'BAA'
    
    df = pd.concat([df, df2])
    
aba_years = np.arange(1968, 1976, 1) # These are the only years the ABA existed.
aba_url = 'https://www.basketball-reference.com/leagues/ABA_{}_per_game.html'

for year in aba_years:
    req_url = aba_url.format(year)
    page = requests.get(req_url)
    content = page.content
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find(name='table', attrs={'id':'per_game_stats'})
    df2 = pd.read_html(str(table))[0]
    df2['Year'] = year
    df2['Lge'] = 'ABA'
    
    df = pd.concat([df, df2])
    
df.shape

(29237, 32)

In [11]:
df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Lge
0,1,Curly Armstrong,G-F,31,FTW,63,,,2.3,8.2,...,,,2.8,,,,3.4,7.3,1950,NBA
1,2,Cliff Barker,SG,29,INO,49,,,2.1,5.6,...,,,2.2,,,,2.0,5.7,1950,NBA
2,3,Leo Barnhorst,SF,25,CHS,67,,,2.6,7.4,...,,,2.1,,,,2.9,6.5,1950,NBA
3,4,Ed Bartels,F,24,TOT,15,,,1.5,5.7,...,,,1.3,,,,1.9,4.2,1950,NBA
4,4,Ed Bartels,F,24,DNN,13,,,1.6,6.3,...,,,1.5,,,,2.1,4.5,1950,NBA


In [12]:
df.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Lge
145,125,Hank Williams,SF,22,UTS,39,,11.6,1.8,4.2,...,1.5,2.3,0.6,0.4,0.1,0.8,1.9,4.2,1975,ABA
146,126,Milt Williams,PG,29,SSL,4,,23.8,2.8,4.8,...,2.3,3.3,3.0,2.5,0.0,2.5,2.5,5.5,1975,ABA
147,127,John Williamson,SG,23,NYA,75,,25.0,4.9,10.2,...,1.3,2.0,2.6,0.8,0.3,2.0,2.5,11.5,1975,ABA
148,128,Willie Wise,SF,27,VIR,16,,35.9,8.0,18.5,...,4.4,6.4,3.4,1.6,0.2,2.8,3.1,20.9,1975,ABA
149,129,Dennis Wuycik,SF,24,SSL,25,,8.8,1.4,3.0,...,0.8,1.5,0.7,0.2,0.0,1.0,1.6,3.2,1975,ABA


In [13]:
# Need to reset the index for the sake of a later for-loop.

df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Lge
29232,125,Hank Williams,SF,22,UTS,39,,11.6,1.8,4.2,...,1.5,2.3,0.6,0.4,0.1,0.8,1.9,4.2,1975,ABA
29233,126,Milt Williams,PG,29,SSL,4,,23.8,2.8,4.8,...,2.3,3.3,3.0,2.5,0.0,2.5,2.5,5.5,1975,ABA
29234,127,John Williamson,SG,23,NYA,75,,25.0,4.9,10.2,...,1.3,2.0,2.6,0.8,0.3,2.0,2.5,11.5,1975,ABA
29235,128,Willie Wise,SF,27,VIR,16,,35.9,8.0,18.5,...,4.4,6.4,3.4,1.6,0.2,2.8,3.1,20.9,1975,ABA
29236,129,Dennis Wuycik,SF,24,SSL,25,,8.8,1.4,3.0,...,0.8,1.5,0.7,0.2,0.0,1.0,1.6,3.2,1975,ABA


In [14]:
# Confirming I scraped the years correctly.
df.describe()

Unnamed: 0,Year
count,29237.0
mean,1991.729521
std,19.138099
min,1947.0
25%,1977.0
50%,1995.0
75%,2008.0
max,2019.0


In [15]:
# Finally, I'll save this into a csv so I can easily use it in other notebooks.

df.to_csv('raw-player-data.csv', index=None, header=True)

I've now obtained individual data, but that's not enough. I also need to get data on which players were on which teams, and when. (To see why, see my "Engineering Features" notebook.)

In [16]:
# Again, I'll start with scraping a single page so I can get the syntax down.

stats_page = requests.get('https://www.basketball-reference.com/teams/TOR/2019.html')
content = stats_page.content
soup = BeautifulSoup(content, 'html.parser')
table = soup.find(name='table', attrs={'id': 'roster'})

table_str = str(table)
df2 = pd.read_html(table_str)[0]

df2.head()

Unnamed: 0,No.,Player,Pos,Ht,Wt,Birth Date,Unnamed: 6,Exp,College
0,3,OG Anunoby,SF,6-8,232,"July 17, 1997",gb,1,Indiana
1,25,Chris Boucher,PF,6-10,200,"January 11, 1993",lc,1,Oregon
2,4,Lorenzo Brown,PG,6-5,189,"August 26, 1990",us,4,NC State
3,33,Marc Gasol,C,7-1,255,"January 29, 1985",es,10,
4,14,Danny Green,SG,6-6,215,"June 22, 1987",us,9,UNC


In [17]:
# Let's try with another team.

stats_page = requests.get('https://www.basketball-reference.com/teams/PHI/2019.html')
content = stats_page.content
soup = BeautifulSoup(content, 'html.parser')
table = soup.find(name='table', attrs={'id': 'roster'})

table_str = str(table)
df3 = pd.read_html(table_str)[0]

df3.head()

Unnamed: 0,No.,Player,Pos,Ht,Wt,Birth Date,Unnamed: 6,Exp,College
0,43,Jonah Bolden,PF,6-10,220,"January 2, 1996",au,R,UCLA
1,0,Corey Brewer,SF,6-9,186,"March 5, 1986",us,11,Florida
2,23,Jimmy Butler,SF,6-7,230,"September 14, 1989",us,7,Marquette
3,22,Wilson Chandler,PF,6-8,235,"May 10, 1987",us,10,DePaul
4,33,Robert Covington,SF,6-8,211,"December 14, 1990",us,5,Tennessee State


In [18]:
# I only need the "Player" column. But when I concatenate the dataframes, I'll need to keep each team/year
# pair separate. I can put that information in the column name.

pd.DataFrame(df3['Player']).rename(columns={'Player': 'PHI 2019'})

Unnamed: 0,PHI 2019
0,Jonah Bolden
1,Corey Brewer
2,Jimmy Butler
3,Wilson Chandler
4,Robert Covington
5,Joel Embiid
6,James Ennis
7,Markelle Fultz
8,Tobias Harris
9,Haywood Highsmith


In [19]:
temp1 = pd.DataFrame(df2['Player']).rename(columns={'Player': 'TOR 2019'})
temp2 = pd.DataFrame(df3['Player']).rename(columns={'Player': 'PHI 2019'})

df3 = pd.concat([temp1, temp2], axis=1)
df3.head()

Unnamed: 0,TOR 2019,PHI 2019
0,OG Anunoby,Jonah Bolden
1,Chris Boucher,Corey Brewer
2,Lorenzo Brown,Jimmy Butler
3,Marc Gasol,Wilson Chandler
4,Danny Green,Robert Covington


In [20]:
# One difficulty is that I need to get a list of each team that basketball-reference uses in its URLs.
# I can get this from my original dataframe.

df['Tm'].value_counts()

TOT    2495
NYK    1137
Tm     1103
BOS    1097
DET     959
       ... 
MMS      14
CAP      14
WSA      14
MNM      11
DNA      11
Name: Tm, Length: 105, dtype: int64

In [49]:
teams = df['Tm'].value_counts().index.tolist()
teams

['TOT',
 'NYK',
 'Tm',
 'BOS',
 'DET',
 'PHI',
 'LAL',
 'MIL',
 'CHI',
 'CLE',
 'ATL',
 'PHO',
 'HOU',
 'GSW',
 'POR',
 'SAS',
 'DEN',
 'DAL',
 'IND',
 'UTA',
 'LAC',
 'SEA',
 'NJN',
 'SAC',
 'MIA',
 'ORL',
 'MIN',
 'TOR',
 'WAS',
 'WSB',
 'MEM',
 'CHH',
 'PHW',
 'CIN',
 'OKC',
 'STL',
 'CHA',
 'SYR',
 'NOH',
 'MNL',
 'BLB',
 'BAL',
 'KCK',
 'BRK',
 'FTW',
 'NOP',
 'KEN',
 'BUF',
 'SFW',
 'NYA',
 'INA',
 'ROC',
 'DNR',
 'VAN',
 'SDC',
 'CHO',
 'DLC',
 'NOJ',
 'CAR',
 'MLH',
 'VIR',
 'UTS',
 'WSC',
 'CHS',
 'STB',
 'PRO',
 'INO',
 'SDR',
 'KCO',
 'SDA',
 'PTP',
 'TRI',
 'NOB',
 'MMT',
 'HSM',
 'MMF',
 'PTC',
 'FLO',
 'NOK',
 'LAS',
 'MMP',
 'SAA',
 'OAK',
 'INJ',
 'WAT',
 'MNP',
 'TRH',
 'ANA',
 'SSL',
 'CLR',
 'NYN',
 'TEX',
 'SHE',
 'NJA',
 'PIT',
 'CHP',
 'CHZ',
 'AND',
 'DNN',
 'DTF',
 'MMS',
 'CAP',
 'WSA',
 'MNM',
 'DNA']

In [22]:
# An f-string will be much better here than .format. I'll try it to confirm it works correctly.

tm = 'NYK'
yr = 2010
url = f'https://www.basketball-reference.com/teams/{tm}/{yr}.html'
url

'https://www.basketball-reference.com/teams/NYK/2010.html'

In [23]:
# The last import part is to make sure I don't try to scrape a team/year pair that doesn't exist.
# Let's test it first with a single team.

team = 'NYK'
df4 = pd.DataFrame()
years = []

# Use a for-loop to find all the years that share a row with the team name.
for i in range(len(df)):
    if df.loc[i, 'Tm'] != team:
        continue
    elif df.loc[i, 'Year'] == 2019:
        years.append(2019)
        years.append(2020)
        # I only scraped player stats up until 2019, since the 2019-2020 player stats don't yet exist.
        # However, information about who's on what team in 2020 *does* exist, so I want to scrape that too.
        # Luckily, all and only 2020 teams also existed in 2019, so I can just say that if they have 2019
        # data, I should scrape the 2020 data too.
    else:
        years.append(int(df.loc[i, 'Year']))

years = list(set(years)) # Put it into a set to get rid of duplicates.

for year in years:
    url = f'https://www.basketball-reference.com/teams/{team}/{year}.html'
    stats_page = requests.get(url)
    content = stats_page.content
    soup = BeautifulSoup(content, 'html.parser')
    table = soup.find(name='table', attrs={'id': 'roster'})
    temp1 = pd.read_html(str(table))[0]
    temp2 = pd.DataFrame(temp1['Player']).rename(columns={'Player': f'{team} {year}'})
        
    df4 = pd.concat([df4, temp2], axis=1)

df4.head()

Unnamed: 0,NYK 1947,NYK 1948,NYK 1949,NYK 1950,NYK 1951,NYK 1952,NYK 1953,NYK 1954,NYK 1955,NYK 1956,...,NYK 2011,NYK 2012,NYK 2013,NYK 2014,NYK 2015,NYK 2016,NYK 2017,NYK 2018,NYK 2019,NYK 2020
0,Aud Brindley,Carl Braun,Carl Braun,Ed Bartels,Vince Boryla,Vince Boryla,Vince Boryla,Don Ackerman,Don Anielak,Dick Atha,...,Carmelo Anthony,Carmelo Anthony,Carmelo Anthony,Cole Aldrich,Quincy Acy,Arron Afflalo,Carmelo Anthony,Ron Baker,Kadeem Allen,Frank Ntilikina
1,Tommy Byrnes,Tommy Byrnes,Tommy Byrnes,Vince Boryla,Nat Clifton,Nat Clifton,Carl Braun,Jim Baechtold,Jim Baechtold,Jim Baechtold,...,Renaldo Balkman,Renaldo Balkman,Earl Barron,Carmelo Anthony,Cole Aldrich,Lou Amundson,Ron Baker,Michael Beasley,Ron Baker,Dennis Smith Jr.
2,Bob Cluggish,Leo Gottlieb,Joe Colone,Carl Braun,Ray Ellefson,Harry Gallatin,Dick Bunt,Vince Boryla,Carl Braun,Carl Braun,...,Chauncey Billups,Mike Bibby,Ronnie Brewer,Andrea Bargnani,Lou Amundson,Thanasis Antetokounmpo,Willy Hernangómez,Trey Burke,Trey Burke,Damyean Dotson
3,Bob Fitzgerald,Sonny Hertzberg,Harry Gallatin,Harry Donovan,Harry Gallatin,George Kaftan,Nat Clifton,Carl Braun,Fred Christ,Nat Clifton,...,Derrick Brown,Tyson Chandler,Marcus Camby,Shannon Brown,Carmelo Anthony,Carmelo Anthony,Justin Holiday,Damyean Dotson,Damyean Dotson,Kevin Knox
4,Frido Frey,Dick Holub,Gene James,Harry Gallatin,Gene James,Ray Lumpp,Jerry Fleishman,Nat Clifton,Nat Clifton,Walter Dukes,...,Anthony Carter,Baron Davis,Tyson Chandler,Tyson Chandler,Andrea Bargnani,José Calderón,Brandon Jennings,Tim Hardaway,Henry Ellenson,Mitchell Robinson


In [25]:
# Now I can finally run the full for-loop. It's nested, though, so it takes a while.
# Only...whoops! There's an error. Let's run try/except syntax to see why.

df5 = pd.DataFrame()

for team in teams:
    years = []
    for i in range(len(df)):
        if df.loc[i, 'Tm'] != team:
            continue
        else:
            years.append(int(df.loc[i, 'Year']))
    years = list(set(years))
    for year in years:
        url = f'https://www.basketball-reference.com/teams/{team}/{year}.html'
        stats_page = requests.get(url)
        content = stats_page.content
        soup = BeautifulSoup(content, 'html.parser')
        table = soup.find(name='table', attrs={'id': 'roster'})
        try:
            temp1 = pd.read_html(str(table))[0]
        except:
            print(f'{team}, {year}')
        #temp2 = pd.DataFrame(temp1['Player']).rename(columns={'Player': f'{team} {year}'})
        
        #df5 = pd.concat([df5, temp2], axis=1)
            
#df5.head()

TOT, 1947
TOT, 1948
TOT, 1949
TOT, 1950
TOT, 1951
TOT, 1952
TOT, 1953
TOT, 1954
TOT, 1955
TOT, 1956
TOT, 1957
TOT, 1958
TOT, 1959
TOT, 1960
TOT, 1961
TOT, 1962
TOT, 1963
TOT, 1964
TOT, 1965
TOT, 1966
TOT, 1967
TOT, 1968
TOT, 1969
TOT, 1970
TOT, 1971
TOT, 1972
TOT, 1973
TOT, 1974
TOT, 1975
TOT, 1976
TOT, 1977
TOT, 1978
TOT, 1979
TOT, 1980
TOT, 1981
TOT, 1982
TOT, 1983
TOT, 1984
TOT, 1985
TOT, 1986
TOT, 1987
TOT, 1988
TOT, 1989
TOT, 1990
TOT, 1991
TOT, 1992
TOT, 1993
TOT, 1994
TOT, 1995
TOT, 1996
TOT, 1997
TOT, 1998
TOT, 1999
TOT, 2000
TOT, 2001
TOT, 2002
TOT, 2003
TOT, 2004
TOT, 2005
TOT, 2006
TOT, 2007
TOT, 2008
TOT, 2009
TOT, 2010
TOT, 2011
TOT, 2012
TOT, 2013
TOT, 2014
TOT, 2015
TOT, 2016
TOT, 2017
TOT, 2018
TOT, 2019
Tm, 1947
Tm, 1948
Tm, 1949
Tm, 1950
Tm, 1951
Tm, 1952
Tm, 1953
Tm, 1954
Tm, 1955
Tm, 1956
Tm, 1957
Tm, 1958
Tm, 1959
Tm, 1960
Tm, 1961
Tm, 1962
Tm, 1963
Tm, 1964
Tm, 1965
Tm, 1966
Tm, 1967
Tm, 1968
Tm, 1969
Tm, 1970
Tm, 1971
Tm, 1972
Tm, 1973
Tm, 1974
Tm, 1975
Tm, 1976


In [50]:
# Ah, of course; there are two values that aren't really "teams" per se. Let's remove them.

teams.remove('TOT')
teams.remove('Tm')
teams

['NYK',
 'BOS',
 'DET',
 'PHI',
 'LAL',
 'MIL',
 'CHI',
 'CLE',
 'ATL',
 'PHO',
 'HOU',
 'GSW',
 'POR',
 'SAS',
 'DEN',
 'DAL',
 'IND',
 'UTA',
 'LAC',
 'SEA',
 'NJN',
 'SAC',
 'MIA',
 'ORL',
 'MIN',
 'TOR',
 'WAS',
 'WSB',
 'MEM',
 'CHH',
 'PHW',
 'CIN',
 'OKC',
 'STL',
 'CHA',
 'SYR',
 'NOH',
 'MNL',
 'BLB',
 'BAL',
 'KCK',
 'BRK',
 'FTW',
 'NOP',
 'KEN',
 'BUF',
 'SFW',
 'NYA',
 'INA',
 'ROC',
 'DNR',
 'VAN',
 'SDC',
 'CHO',
 'DLC',
 'NOJ',
 'CAR',
 'MLH',
 'VIR',
 'UTS',
 'WSC',
 'CHS',
 'STB',
 'PRO',
 'INO',
 'SDR',
 'KCO',
 'SDA',
 'PTP',
 'TRI',
 'NOB',
 'MMT',
 'HSM',
 'MMF',
 'PTC',
 'FLO',
 'NOK',
 'LAS',
 'MMP',
 'SAA',
 'OAK',
 'INJ',
 'WAT',
 'MNP',
 'TRH',
 'ANA',
 'SSL',
 'CLR',
 'NYN',
 'TEX',
 'SHE',
 'NJA',
 'PIT',
 'CHP',
 'CHZ',
 'AND',
 'DNN',
 'DTF',
 'MMS',
 'CAP',
 'WSA',
 'MNM',
 'DNA']

In [51]:
df5 = pd.DataFrame()

for team in teams:
    years = []
    for i in range(len(df)):
        if df.loc[i, 'Tm'] == team:
            years.append(int(df.loc[i, 'Year']))
    years = list(set(years))
    for year in years:
        url = f'https://www.basketball-reference.com/teams/{team}/{year}.html'
        stats_page = requests.get(url)
        content = stats_page.content
        soup = BeautifulSoup(content, 'html.parser')
        table = soup.find(name='table', attrs={'id': 'roster'})
        temp1 = pd.read_html(str(table))[0]
        temp2 = pd.DataFrame(temp1['Player']).rename(columns={'Player': f'{team} {year}'})
        
        df5 = pd.concat([df5, temp2], axis=1)
            
df5.head()

Unnamed: 0,NYK 1947,NYK 1948,NYK 1949,NYK 1950,NYK 1951,NYK 1952,NYK 1953,NYK 1954,NYK 1955,NYK 1956,...,CHP 1962,CHZ 1963,AND 1950,DNN 1950,DTF 1947,MMS 1975,CAP 1974,WSA 1970,MNM 1968,DNA 1975
0,Aud Brindley,Carl Braun,Carl Braun,Ed Bartels,Vince Boryla,Vince Boryla,Vince Boryla,Don Ackerman,Don Anielak,Dick Atha,...,Walt Bellamy,Walt Bellamy,Charlie Black,Ed Bartels,Chet Aubuchon,Roger Brown,Phil Chenier,Mike Barrett,Richard Clark,Byron Beck
1,Tommy Byrnes,Tommy Byrnes,Tommy Byrnes,Vince Boryla,Nat Clifton,Nat Clifton,Carl Braun,Jim Baechtold,Jim Baechtold,Jim Baechtold,...,George Bon Salle,Barney Cable,Frankie Brian,Bob Brown,Moe Becker,George Carter,Archie Clark,Rick Barry,Mel Daniels,Mack Calvin
2,Bob Cluggish,Leo Gottlieb,Joe Colone,Carl Braun,Ray Ellefson,Harry Gallatin,Dick Bunt,Vince Boryla,Carl Braun,Carl Braun,...,Barney Cable,Johnny Cox,Jake Carter,Jim Browne,Harold Brown,Mel Daniels,Elvin Hayes,Gary Bradds,Donnie Freeman,Mike Green
3,Bob Fitzgerald,Sonny Hertzberg,Harry Gallatin,Harry Donovan,Harry Gallatin,George Kaftan,Nat Clifton,Carl Braun,Fred Christ,Nat Clifton,...,Howie Carl,Terry Dischinger,Bill Closs,Jake Carter,Bob Dille,Larry Finch,Tom Kozelko,Larry Brown,Les Hunter,Bobby Jones
4,Frido Frey,Dick Holub,Gene James,Harry Gallatin,Gene James,Ray Lumpp,Jerry Fleishman,Nat Clifton,Nat Clifton,Walter Dukes,...,Ralph Davis,Al Ferrari,Frank Gates,Jack Cotton,John Janisch,Stew Johnson,Manny Leaks,Frank Card,Ervin Inniger,Patrick McFarland


In [52]:
df5.to_csv('raw-team-data.csv', index=None, header=True)

The last step is to get data on which players made the all-star team, and when.

[Richmond Macaspac](https://github.com/macr) helped me a lot with this part.

In [31]:
# Richmond's style is to use requests-html, which needs to be pip-installed.

pip install requests-html

SyntaxError: invalid syntax (<ipython-input-31-13716e638758>, line 3)

In [32]:
from requests_html import HTMLSession, HTML

session = HTMLSession()
r = session.get('https://www.basketball-reference.com/leagues/NBA_2018.html')
roster = r.html.find('#all_all_star_game_rosters', first=True)

In [33]:
roster.find('#div_all_star_game_rosters')

[]

In [34]:
print(roster.html)

<div class="grid_wrapper commented" id="all_all_star_game_rosters">
<div class="section_heading">
<span class="section_anchor" id="all_star_game_rosters_link" data-label="All-Star Game Rosters"/><h2>All-Star Game Rosters</h2> <div class="section_heading_text">
<ul>
</ul>
</div>
</div><!-- 	<div data-no-overall-control class="data_grid " id="div_all_star_game_rosters">
		
		<div class="data_grid_group solo">
		
		<div id="all_star_game_rosters_1" class="data_grid_box">
	
	<table>
		<caption>LeBron</caption>				<tr class="">
						<td class="left"><p>&nbsp;<a href='/players/a/aldrila01.html'>LaMarcus Aldridge</a>&nbsp;</p><p>&nbsp;<a href='/players/b/bealbr01.html'>Bradley Beal</a>&nbsp;</p><p>&nbsp;<a href='/players/c/couside01.html'>DeMarcus Cousins</a>&nbsp;<sup>(1)</sup>&nbsp;</p><p>&nbsp;<a href='/players/d/davisan02.html'>Anthony Davis</a>&nbsp;</p><p>&nbsp;<a href='/players/d/dragigo01.html'>Goran Dragić</a>&nbsp;</p><p>&nbsp;<a href='/players/d/drumman01.html'>Andre Drummond</a>&

In [35]:
roster = roster.html.split('<!--')[1].split('-->')[0]
roster = HTML(html=roster)
teams = roster.find('.data_grid_box')
data = [{'player': player.text, 'team': team.find('caption', first=True).text}
        for team in teams for player in team.find('.left p')]
df6 = pd.DataFrame(data)
df6

Unnamed: 0,player,team
0,LaMarcus Aldridge,LeBron
1,Bradley Beal,LeBron
2,DeMarcus Cousins (1),LeBron
3,Anthony Davis,LeBron
4,Goran Dragić,LeBron
5,Andre Drummond,LeBron
6,Kevin Durant,LeBron
7,Paul George,LeBron
8,Kyrie Irving,LeBron
9,LeBron James,LeBron


In [36]:
nba_years = np.arange(1951, 2020, 1) # There was no all-star game in 1950.
nba_url = 'https://www.basketball-reference.com/leagues/NBA_{}.html'

df7 = pd.DataFrame()

for year in nba_years:
    if year == 1999:
        continue
        # The first time I tried this, I didn't include the above if-statement, and ran into an error.
        # I then did try-except syntax, and discovered the problem was with the above year.
        # Due to a lockout the 1999 year was shortened, and there was no all-star game!
    else:
        req_url = nba_url.format(year)
        session = HTMLSession()
        r = session.get(req_url)
        roster = r.html.find('#all_all_star_game_rosters', first=True)
        roster.find('#div_all_star_game_rosters')
        roster = roster.html.split('<!--')[1].split('-->')[0]
        roster = HTML(html=roster)
        teams = roster.find('.data_grid_box')
        data = [{'Player': player.text} for team in teams for player in team.find('.left p')]
        df8 = pd.DataFrame(data)
        df8['Year'] = year # Like the first time, I need to keep the year/league data.
        df8['Lge'] = 'NBA'
    
        df7 = pd.concat([df7, df8])
    
df7.head()

Unnamed: 0,Player,Year,Lge
0,Paul Arizin*,1951,NBA
1,Vince Boryla,1951,NBA
2,Bob Cousy*,1951,NBA
3,Joe Fulks*,1951,NBA
4,Harry Gallatin*,1951,NBA


In [37]:
# I don't need to include the BAA this time because they didn't have all-star games.

aba_years = np.arange(1968, 1976, 1)
aba_url = 'https://www.basketball-reference.com/leagues/ABA_{}.html'

for year in aba_years:
    req_url = aba_url.format(year)
    session = HTMLSession()
    r = session.get(req_url)
    roster = r.html.find('#all_all_star_game_rosters', first=True)
    roster.find('#div_all_star_game_rosters')
    roster = roster.html.split('<!--')[1].split('-->')[0]
    roster = HTML(html=roster)
    teams = roster.find('.data_grid_box')
    data = [{'Player': player.text} for team in teams for player in team.find('.left p')]
    df8 = pd.DataFrame(data)
    df8['Year'] = year
    df8['Lge'] = 'ABA'
    
    df7 = pd.concat([df7, df8])
    
df7.head()

Unnamed: 0,Player,Year,Lge
0,Paul Arizin*,1951,NBA
1,Vince Boryla,1951,NBA
2,Bob Cousy*,1951,NBA
3,Joe Fulks*,1951,NBA
4,Harry Gallatin*,1951,NBA


In [38]:
df7['Lge'].describe() # Making sure I did this correctly.

count     1843
unique       2
top        NBA
freq      1663
Name: Lge, dtype: object

In [39]:
df7.to_csv('raw-allstar-data.csv', index=None, header=True)