# Wiki NBA Stat Scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [16]:
# Wikipedia article to scrape; this is the address passed to requests.get(url)
url = 'https://en.wikipedia.org/wiki/Michael_Jordan'

In [None]:
# Check to see if this works on other players
# url = 'https://en.wikipedia.org/wiki/Kobe_Bryant'

In [50]:
# Some players' wiki pages lay out the stats table differently.
# For pages like this one, use this URL instead:
# url = 'https://en.wikipedia.org/wiki/Shawn_Kemp'

# wikitable = soup.find('table', {'class':'wikitable'})

In [51]:
# Download the page and parse it into a BeautifulSoup tree.
# The timeout keeps this cell from hanging forever if the server never answers.
r = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of going on to parse an error page
r.raise_for_status()
text = r.text
soup = BeautifulSoup(text, 'html.parser')
# Last expression of the cell: shows the HTTP status code (200 means OK)
r.status_code

200

In [None]:
# Use prettify to get a good
# look at the HTML
# print(soup.prettify())

In [52]:
# The stat table comes in two layouts:
#   Format 1 - a table whose class is 'wikitable sortable'
#   Format 2 - a table whose class is 'wikitable'
# x is True for Format 1 and y is True for Format 2; exactly one is set.
# soup.find returns None when nothing matches, so the layout is detected
# with an explicit None check rather than by catching an exception.
wikitable = soup.find('table', {'class':'wikitable sortable'})
x = wikitable is not None
if not x:
    # Format 2
    wikitable = soup.find('table', {'class':'wikitable'})
if wikitable is None:
    raise ValueError('No wikitable stats table found on this page')
y = not x
# Find all 'th' (column titles) and 'td' (stat values)
th = wikitable.find_all(['th'])
td = wikitable.find_all(['td'])
# Filled by a later cell with the cells that precede the career rows
regular_season = []

In [None]:
# th # delete this later

In [None]:
# td # delete this later

In [53]:
# Column titles: the text of every header cell, trailing whitespace removed
stat_header = [cell.get_text().rstrip() for cell in th]

In [65]:
# stat_header # delete this later

In [55]:
# Stat values: the text of every data cell, trailing whitespace removed
stat = [cell.get_text().rstrip() for cell in td]

In [56]:
# Copy the stat cells into regular_season, stopping at the first cell of the
# career rows. Those rows have a different shape and would be a problem when
# the values are paired with the column titles.
# Format 1 marks that cell as 'Career'; Format 2 marks it as 'Career totals'.
# If the marker never appears, every cell is kept and the loop still ends.
# `is True` is deliberate: a later cell reuses `x` as a loop variable, and a
# leftover tuple must not be mistaken for the Format 1 flag on a re-run.
if x is True or y is True:
    end_marker = 'Career' if x is True else 'Career totals'
    for s in stat:
        if s == end_marker:
            break
        regular_season.append(s)
    # Clear both flags so running this cell again cannot append the cells twice
    x = False
    y = False

In [58]:
# Repeat the column titles once for every complete row held in
# regular_season, so titles and values can be paired position by position
season_count = len(regular_season) // len(stat_header)
header = stat_header * season_count

In [59]:
# Build a list of (title, value) tuples.
# Pairing them keeps each column title connected to its stat value.
result = [*zip(header, regular_season)]

In [60]:
result # delete later

[('Year', '1996–97'),
 ('Team', 'Phoenix'),
 ('GP', '65'),
 ('GS', '2'),
 ('MPG', '10.5'),
 ('FG%', '.423'),
 ('3P%', '.418'),
 ('FT%', '.824'),
 ('RPG', '1.0'),
 ('APG', '2.1'),
 ('SPG', '.3'),
 ('BPG', '.0'),
 ('PPG', '3.3'),
 ('Year', '1997–98'),
 ('Team', 'Phoenix'),
 ('GP', '76'),
 ('GS', '9'),
 ('MPG', '21.9'),
 ('FG%', '.459'),
 ('3P%', '.415'),
 ('FT%', '.860'),
 ('RPG', '2.1'),
 ('APG', '3.4'),
 ('SPG', '.8'),
 ('BPG', '.1'),
 ('PPG', '9.1'),
 ('Year', '1998–99'),
 ('Team', 'Dallas'),
 ('GP', '40'),
 ('GS', '40'),
 ('MPG', '31.7'),
 ('FG%', '.363'),
 ('3P%', '.374'),
 ('FT%', '.826'),
 ('RPG', '2.9'),
 ('APG', '5.5'),
 ('SPG', '.9'),
 ('BPG', '.1'),
 ('PPG', '7.9'),
 ('Year', '1999–00'),
 ('Team', 'Dallas'),
 ('GP', '56'),
 ('GS', '27'),
 ('MPG', '27.4'),
 ('FG%', '.477'),
 ('3P%', '.403'),
 ('FT%', '.882'),
 ('RPG', '2.2'),
 ('APG', '4.9'),
 ('SPG', '.7'),
 ('BPG', '.1'),
 ('PPG', '8.6'),
 ('Year', '2000–01'),
 ('Team', 'Dallas'),
 ('GP', '70'),
 ('GS', '70'),
 ('MPG', '34.1'

In [61]:
# One empty list per stat column; the routing loop in a later cell fills them.
# The generator yields a fresh list for every name, so no two names share one.
(sYear, sSeason, sTeam, sGP, sGS, sMPG, sFG,
 s3P, sFT, sRPG, sAPG, sSPG, sBPG, sPPG) = ([] for _ in range(14))

In [62]:
# Route every (title, value) pair in result to the list that collects that
# column. 'Year' appears in the Format 1 layout and 'Season' in Format 2.
# The loop variables are named title/value so the loop does not overwrite
# the format flag `x`.
stat_lists = {
    'Year': sYear,
    'Season': sSeason,
    'Team': sTeam,
    'GP': sGP,
    'GS': sGS,
    'MPG': sMPG,
    'FG%': sFG,
    '3P%': s3P,
    'FT%': sFT,
    'RPG': sRPG,
    'APG': sAPG,
    'SPG': sSPG,
    'BPG': sBPG,
    'PPG': sPPG,
}
for title, value in result:
    # Pairs whose title has no list above are skipped
    if title in stat_lists:
        stat_lists[title].append(value)

In [63]:
# Format 1
# Assemble the filled lists into a DataFrame, one row per position in the
# lists. zip stops at the shortest list, so every row is complete.
format1_columns = {
    'Year': sYear, 'Team': sTeam, 'GP': sGP, 'GS': sGS, 'MPG': sMPG,
    'FG%': sFG, '3P%': s3P, 'FT%': sFT, 'RPG': sRPG, 'APG': sAPG,
    'SPG': sSPG, 'BPG': sBPG, 'PPG': sPPG,
}
df = pd.DataFrame(list(zip(*format1_columns.values())),
                  columns=list(format1_columns))

In [14]:
# Format 2
# Same assembly as the Format 1 frame, keyed on 'Season' and without the
# '3P%' and 'SPG' columns. zip stops at the shortest list.
format2_columns = {
    'Season': sSeason, 'Team': sTeam, 'GP': sGP, 'GS': sGS, 'MPG': sMPG,
    'FG%': sFG, 'FT%': sFT, 'BPG': sBPG, 'RPG': sRPG, 'APG': sAPG,
    'PPG': sPPG,
}
df1 = pd.DataFrame(list(zip(*format2_columns.values())),
                   columns=list(format2_columns))

In [64]:
# All the stats were successfully scraped from the website and now sit in this dataframe
df

Unnamed: 0,Year,Team,GP,GS,MPG,FG%,3P%,FT%,RPG,APG,SPG,BPG,PPG
0,1996–97,Phoenix,65,2,10.5,0.423,0.418,.824,1.0,2.1,0.3,0.0,3.3
1,1997–98,Phoenix,76,9,21.9,0.459,0.415,.860,2.1,3.4,0.8,0.1,9.1
2,1998–99,Dallas,40,40,31.7,0.363,0.374,.826,2.9,5.5,0.9,0.1,7.9
3,1999–00,Dallas,56,27,27.4,0.477,0.403,.882,2.2,4.9,0.7,0.1,8.6
4,2000–01,Dallas,70,70,34.1,0.487,0.406,.895,3.2,7.3,1.0,0.1,15.6
5,2001–02,Dallas,82,82,34.6,0.483,0.455,.887,3.1,7.7,0.6,0.0,17.9
6,2002–03,Dallas,82,82,33.1,0.465,0.413,.909,2.9,7.3,1.0,0.1,17.7
7,2003–04,Dallas,78,78,33.5,0.47,0.405,.916,3.0,8.8,0.9,0.1,14.5
8,2004–05,Phoenix,75,75,34.3,0.502,0.431,.887,3.3,11.5*,1.0,0.1,15.5
9,2005–06,Phoenix,79,79,35.4,0.512,0.439,.921*,4.2,10.5*,0.8,0.2,18.8


In [15]:
df1

Unnamed: 0,Season,Team,GP,GS,MPG,FG%,FT%,BPG,RPG,APG,PPG
0,1989–90,Seattle,81,1,13.8,0.479,0.736,0.9,4.3,0.3,6.5
1,1990–91,Seattle,81,66,30.1,0.508,0.661,1.5,8.4,1.8,15.0
2,1991–92,Seattle,64,23,28.3,0.504,0.748,1.9,10.4,1.3,15.5
3,1992–93,Seattle,78,68,33.1,0.492,0.712,1.9,10.7,2.0,17.8
4,1993–94,Seattle,79,73,32.9,0.538,0.741,2.1,10.8,2.6,18.1
5,1994–95,Seattle,82,79,32.7,0.547,0.749,1.5,10.9,1.8,18.7
6,1995–96,Seattle,79,76,33.3,0.561,0.742,1.6,11.4,2.2,19.6
7,1996–97,Seattle,81,75,34.0,0.51,0.742,1.0,10.0,1.9,18.7
8,1997–98,Cleveland,80,80,34.6,0.445,0.727,1.1,9.3,2.5,20.1
9,1998–99,Cleveland,42,42,35.1,0.482,0.789,1.1,9.2,2.4,20.5


In [None]:
# Repeat the same for the playoff numbers

In [None]:
# Upload in SQLite
# If player is in db do nothing
# Else upload data

In [None]:
# Create visualizations in matplotlib