# Player data

After looking for resources online, I found a great ESPN columnist named John Hollinger, who publishes detailed statistics on basketball players that could be useful for our data set. So now I'm going to work on scraping information from that website. My plan is that, when I use my model, the input features will be the players and their relevant stats, as well as the stats of the team as a whole, so team data from the same columnist will also be relevant.

In [82]:
import pandas as pd
import re
import urllib.request as ur

In [83]:
# Download the statistics page and extract the raw column headers of its
# stats table.
#
# For offline testing, a locally saved copy of the page can be read instead:
# webpage = open('../GIT_NO/PLAYER_STATS_2scrape/Page_1.htm', 'r').read()
url = ''

# Use a context manager so the HTTP response is closed as soon as the body
# has been read, instead of being left open until garbage collection.
with ur.urlopen(url) as response:
    fp = response.read().decode('utf-8')
webpage = fp

# Accumulator that the scraped table is later concatenated onto.
master_df = pd.DataFrame()

# Find the table. `[\s\S]*?` matches any character including line breaks,
# so the table is found even when its markup spans several lines.
table = re.findall(r'<table class="tablehead" cellspacing="1" cellpadding="3">[\s\S]*?</table>', webpage)[0]

# Find the header row, then split it into the contents of its <td> cells.
# The cells may still contain anchor markup at this point.
table_headers = re.findall(r'<tr class="colhead" align="right">([\s\S]*?)</tr>', table)[0]
table_headers = re.findall(r'<td.*?>(.*?)</td>', table_headers)
table_headers

['RK',
 'PLAYER',
 'GP',
 'MPG',
 '<a title="True Shooting Percentage" href="//insider.espn.com/nba/hollinger/statistics/_/sort/trueShootingPct/qualified/false">TS%</a>',
 '<a title="Assist Ratio" href="//insider.espn.com/nba/hollinger/statistics/_/sort/assistRatio/qualified/false">AST</a>',
 '<a title="Turnover Ratio" href="//insider.espn.com/nba/hollinger/statistics/_/sort/turnoverRatio/order/false/qualified/false">TO</a>',
 '<a title="Usage Rate" href="//insider.espn.com/nba/hollinger/statistics/_/sort/usageRate/qualified/false">USG</a>',
 '<a title="Offensive Rebound Rate" href="//insider.espn.com/nba/hollinger/statistics/_/sort/offReboundRate/qualified/false">ORR</a>',
 '<a title="Defensive Rebound Rate" href="//insider.espn.com/nba/hollinger/statistics/_/sort/defReboundRate/qualified/false">DRR</a>',
 '<a title="Rebound Rate" href="//insider.espn.com/nba/hollinger/statistics/_/sort/reboundRate/qualified/false">REBR</a>',
 '<a title="Player Efficiency Rating" href="//insider.espn.

In [84]:
"""
Remove the HTML anchor tags from the headers
"""
# `[^>]*` stops at the end of each opening <a ...> tag. A greedy `.*` would
# run to the last `">` in the cell and swallow any text sitting between two
# anchors in the same header.
table_headers = [re.sub(r'<a [^>]*>|</a>', '', header) for header in table_headers]
table_headers

['RK',
 'PLAYER',
 'GP',
 'MPG',
 'TS%',
 'AST',
 'TO',
 'USG',
 'ORR',
 'DRR',
 'REBR',
 'PER',
 'VA',
 'EWA']

In [85]:
"""
    Find the player stats
"""
# Player rows carry a class of the form "oddrow player-<digits>-<digits>" or
# "evenrow player-<digits>-<digits>". The class alternation is non-capturing
# so findall() yields just the inner HTML of each row.
row_pattern = re.compile(r'<tr class="(?:oddrow player|evenrow player)-\d+-\d+" align="right">(.*?)</tr>')
cell_pattern = re.compile(r'<td.*?>(.*?)</td>')

# Removes, from every cell: anchor tags, everything from the first comma
# onwards (presumably the team suffix after the player name; confirm against
# the page), and all whitespace.
# `[^>]*` keeps the opening-tag match from running past the tag's own `>`.
# NOTE: stripping all whitespace also removes the space inside player names
# (e.g. 'MarkquisNowell').
noise_pattern = re.compile(r'<a href=[^>]*>|</a>|,.*|\s')

# Flatten row by row into a single list of cleaned cell strings.
player_stats_raw = [
    noise_pattern.sub('', cell)
    for row_html in row_pattern.findall(table)
    for cell in cell_pattern.findall(row_html)
]
player_stats_raw

['1',
 'MarkquisNowell',
 '1',
 '4.0',
 '.532',
 '51.5',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '55.49',
 '2.7',
 '0.1',
 '2',
 'DrewPeterson',
 '1',
 '3.0',
 '1.500',
 '50.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '43.55',
 '1.5',
 '0.0',
 '3',
 'JoelEmbiid',
 '34',
 '34.0',
 '.645',
 '15.4',
 '10.1',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '34.35',
 '410.1',
 '13.7',
 '4',
 'HamidouDiallo',
 '2',
 '2.5',
 '.500',
 '25.0',
 '25.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '32.95',
 '0.0',
 '0.0',
 '5',
 'AdamaSanogo',
 '3',
 '3.7',
 '.926',
 '0.0',
 '18.8',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '32.68',
 '3.6',
 '0.1',
 '6',
 'NikolaJokic',
 '53',
 '33.8',
 '.646',
 '27.9',
 '9.2',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '31.22',
 '551.8',
 '18.4',
 '7',
 'ShaiGilgeous-Alexander',
 '53',
 '34.5',
 '.647',
 '20.0',
 '6.6',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '30.57',
 '0.0',
 '0.0',
 '8',
 'MamadiDiakite',
 '3',
 '5.3',
 '.785',
 '20.7',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '30.36',
 '0.0',
 '0.0',
 '9',
 

In [86]:
# The stats list is flat and ordered row by row, so the values belonging to
# a given column are every n-th item starting at that column's offset.
n_columns = len(table_headers)
player_data = {
    header: player_stats_raw[offset::n_columns]
    for offset, header in enumerate(table_headers)
}

# Turn the columns into a frame and append it to the running master table,
# renumbering the index from zero.
df = pd.DataFrame(player_data)
master_df = pd.concat([master_df, df], ignore_index=True)
master_df

Unnamed: 0,RK,PLAYER,GP,MPG,TS%,AST,TO,USG,ORR,DRR,REBR,PER,VA,EWA
0,1.0,MarkquisNowell,1,4.0,0.532,51.5,0.0,0.0,0.0,0.0,0.0,55.49,2.7,0.1
1,2.0,DrewPeterson,1,3.0,1.5,50.0,0.0,0.0,0.0,0.0,0.0,43.55,1.5,0.0
2,3.0,JoelEmbiid,34,34.0,0.645,15.4,10.1,0.0,0.0,0.0,0.0,34.35,410.1,13.7
3,4.0,HamidouDiallo,2,2.5,0.5,25.0,25.0,0.0,0.0,0.0,0.0,32.95,0.0,0.0
4,5.0,AdamaSanogo,3,3.7,0.926,0.0,18.8,0.0,0.0,0.0,0.0,32.68,3.6,0.1
5,6.0,NikolaJokic,53,33.8,0.646,27.9,9.2,0.0,0.0,0.0,0.0,31.22,551.8,18.4
6,7.0,ShaiGilgeous-Alexander,53,34.5,0.647,20.0,6.6,0.0,0.0,0.0,0.0,30.57,0.0,0.0
7,8.0,MamadiDiakite,3,5.3,0.785,20.7,0.0,0.0,0.0,0.0,0.0,30.36,0.0,0.0
8,9.0,GiannisAntetokounmpo,53,34.9,0.648,18.8,10.5,0.0,0.0,0.0,0.0,29.59,0.0,0.0
9,10.0,LeonardMiller,6,3.5,0.75,8.3,8.3,0.0,0.0,0.0,0.0,28.72,0.0,0.0
