In [1]:
import json
import re
import requests
import scrapy
import pandas as pd

In [2]:
#access permission
headers = {'User-Agent': 'UNC Journo Class'}

#stores url page information in sel
resp = requests.get('http://goheels.com/roster.aspx?path=wbball&print=true', headers=headers)
body_str = resp.content.decode('utf-8')
sel = scrapy.Selector(text=body_str)

In [3]:
table = sel.css('table')[0]
table

<Selector xpath='descendant-or-self::table' data='<table class="sidearm-table sidearm-tabl'>

In [4]:
#extracts the string value of the table headers of the table of players
cols = table.css('th').xpath('string()').extract()
cols

['No.', 'Name', 'Pos.', 'Ht.', 'Yr.', 'Hometown / High School']

In [5]:
#stores table rows into rows
rows = table.css('tr')[1:]

In [6]:
players = []

#goes through every row and stores its link 
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        a = d.css('a')
        if a:
            t = a.xpath('text()').extract()[0]
            data['href'] = a.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()[0]
        data[cols[i]] = t
    players.append(data)

In [7]:
players

[{'Hometown / High School': 'Savage, Minn. / Shakopee',
  'Ht.': '6-2',
  'Name': 'Taylor Koenen',
  'No.': '1',
  'Pos.': 'G',
  'Yr.': 'So.',
  'href': '/roster.aspx?rp_id=12642'},
 {'Hometown / High School': 'Richmond, Va. / Cosby',
  'Ht.': '6-0',
  'Name': 'Jocelyn Jones',
  'No.': '4',
  'Pos.': 'G',
  'Yr.': 'R-Fr.',
  'href': '/roster.aspx?rp_id=12643'},
 {'Hometown / High School': 'Wesley Chapel, N.C. / Weddington',
  'Ht.': '5-11',
  'Name': 'Stephanie Watts',
  'No.': '5',
  'Pos.': 'G',
  'Yr.': 'Jr.',
  'href': '/roster.aspx?rp_id=12644'},
 {'Hometown / High School': 'Cove City, N.C. / West Craven',
  'Ht.': '5-8',
  'Name': 'Jamie Cherry',
  'No.': '10',
  'Pos.': 'G',
  'Yr.': 'Sr.',
  'href': '/roster.aspx?rp_id=12645'},
 {'Hometown / High School': 'Newburgh, Ind. / Reitz Memorial',
  'Ht.': '6-4',
  'Name': 'Emily Sullivan',
  'No.': '11',
  'Pos.': 'F',
  'Yr.': 'So.',
  'href': '/roster.aspx?rp_id=12646'},
 {'Hometown / High School': 'Raleigh, N.C. / Millbrook',
  'H

In [8]:
def getBio(player):
    player_url = 'http://goheels.com' + player['href']
    #print('bio', player_url)
    resp = requests.get(player_url, headers=headers)
    player_txt = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [9]:
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

def getStats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        if 'stats' not in obj_str:
            continue

        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        obj_pairs = obj_str.split(',')
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        colonized = [":".join(p) for p in clean_pairs]
        commas = ','.join(colonized)
        json_str = "{" + commas + "}"
        clean_objs.append(json.loads(json_str))
    
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])
    
    #print('stats', stats_url)
    
    #PARSE AND CLEAN STATS DATA
    resp = requests.get(stats_url, headers=headers)
    stats_str = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=stats_str)
    if sel.css('table').xpath('string()').extract() != []:
        table = sel.css('table')[0]
        caption = table.css('caption').xpath('string()').extract()
        #print(caption)
        if caption != ['Career Statistics']:
            table = sel.css('table')[1]
    
        #career statistics
        cols = table.css('thead').css('tr').css('th')[1:].xpath('string()').extract()
        #print(cols)
        
        list_stats = []
        #get total career statistics
        row_data = table.css('tfoot')[0].css('tr')[0].css('td').xpath('string()').extract()
        #print(row_data)
        total_cs = list(zip(cols, row_data))
        stat_item_1 = {"Total Career Statistics" : total_cs}
        list_stats.append(stat_item_1)
        
        #get other seasons statistics
        i = 0
        for seasons in table.css('tbody')[0].css('tr').css('th').xpath('string()').extract():
            row_data = table.css('tbody')[0].css('tr')[i].css('td').xpath('string()').extract()
            season = row_data = table.css('tbody')[0].css('tr')[i].css('th').xpath('string()').extract()
            i = i + 1
            #print(season)
            #print(row_data)
            season_cs = list(zip(cols, row_data))
            stat_item_2 = {str(season) : total_cs}
            list_stats.append(stat_item_2)
        
        player['raw_stats'] = list_stats
        
#         for p in player['raw_stats']:
#             print("hi")
#             clean_pairs = []
#             clean_p = str(p).replace("(", '').replace(")", '')
#             clean_p = clean_p.replace("[", '{').replace("]", '}')
#             print(clean_p[1])
#             print(clean_p)


In [10]:
for p in players:
    getBio(p)
    getStats(p)

In [11]:
players[0]

{'Hometown / High School': 'Savage, Minn. / Shakopee',
 'Ht.': '6-2',
 'Name': 'Taylor Koenen',
 'No.': '1',
 'Pos.': 'G',
 'Yr.': 'So.',
 'bio': "\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON 2016-17\r\nAveraged 6.6 points and 5.1 rebounds in first season at North Carolina • Played in all 31 games and started 12 of the final 13 games of the season • Finished the year strong by averaging 10.0 points and a team-leading 7.4 rebounds in the last five games of the season • Registered first career double-double with 11 points and 11 rebounds in the win over Pitt in the first round of the ACC Tournament • Scored a season-high 17 points in the victory over Georgia Tech • Scored in double figures in six games overall, including four of the last six games of 2016-17.\xa0\r\n\r\nPREP/PERSONAL\r\nFive-star rated by ESPN • Ranked 11th best guard and 43rd overall in 2016 class by ESPN • Top-ranked player 

In [12]:
#formats for json file
to_dump = [p.copy() for p in players]
for p in to_dump:
    p.pop('sel')
with open('scraped_players.json', 'w') as f:
    json.dump(to_dump, f)

In [13]:
cat scraped_players.json | cut -c 1-100

[{"No.": "1", "href": "/roster.aspx?rp_id=12642", "Name": "Taylor Koenen", "Pos.": "G", "Ht.": "6-2"


In [14]:
to_dump[0]

{'Hometown / High School': 'Savage, Minn. / Shakopee',
 'Ht.': '6-2',
 'Name': 'Taylor Koenen',
 'No.': '1',
 'Pos.': 'G',
 'Yr.': 'So.',
 'bio': "\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON 2016-17\r\nAveraged 6.6 points and 5.1 rebounds in first season at North Carolina • Played in all 31 games and started 12 of the final 13 games of the season • Finished the year strong by averaging 10.0 points and a team-leading 7.4 rebounds in the last five games of the season • Registered first career double-double with 11 points and 11 rebounds in the win over Pitt in the first round of the ACC Tournament • Scored a season-high 17 points in the victory over Georgia Tech • Scored in double figures in six games overall, including four of the last six games of 2016-17.\xa0\r\n\r\nPREP/PERSONAL\r\nFive-star rated by ESPN • Ranked 11th best guard and 43rd overall in 2016 class by ESPN • Top-ranked player 

In [15]:
df = pd.read_json("scraped_players.json")

In [16]:
#displays chart from json file
df

Unnamed: 0,Hometown / High School,Ht.,Name,No.,Pos.,Yr.,bio,href,img,raw_stats,stats_url
0,"Savage, Minn. / Shakopee",6-2,Taylor Koenen,1,G,So.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12642,/images/2017/10/2/Koenen_Taylor_2017_84.jpg?wi...,"[{'Total Career Statistics': [['GP', '62'], ['...",http://goheels.com/services/responsive-roster-...
1,"Richmond, Va. / Cosby",6-0,Jocelyn Jones,4,G,R-Fr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12643,/images/2017/10/2/Jones_Jocelyn_2017_124.jpg?w...,"[{'Total Career Statistics': [['GP', '31'], ['...",http://goheels.com/services/responsive-roster-...
2,"Wesley Chapel, N.C. / Weddington",5-11,Stephanie Watts,5,G,Jr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12644,/images/2017/10/2/Watts_Stephanie_2017_79.jpg?...,"[{'Total Career Statistics': [['GP', '58'], ['...",http://goheels.com/services/responsive-roster-...
3,"Cove City, N.C. / West Craven",5-8,Jamie Cherry,10,G,Sr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12645,/images/2017/10/2/Cherry_Jamie_2017_76.jpg?wid...,"[{'Total Career Statistics': [['GP', '129'], [...",http://goheels.com/services/responsive-roster-...
4,"Newburgh, Ind. / Reitz Memorial",6-4,Emily Sullivan,11,F,So.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12646,/images/2017/10/2/Sullivan_Emily_2017_143.jpg?...,"[{'Total Career Statistics': [['GP', '43'], ['...",http://goheels.com/services/responsive-roster-...
5,"Raleigh, N.C. / Millbrook",5-10,Dazia Powell,13,G,Fr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12647,/images/2017/10/2/Powell_Dazia_2017_94.jpg?wid...,,http://goheels.com/services/responsive-roster-...
6,"Chapel Hill, N.C. / Durham Academy",5-8,Liz Roberts,14,G,So.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12648,/images/2017/10/3/Roberts_Liz_2017_66.jpg?widt...,"[{'Total Career Statistics': [['GP', '14'], ['...",http://goheels.com/services/responsive-roster-...
7,"Raleigh, N.C. / Cardinal Gibbons",6-0,Olivia Smith,15,G,So.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12649,/images/2017/10/2/Smith_Olivia_2017_61.jpg?wid...,"[{'Total Career Statistics': [['GP', '47'], ['...",http://goheels.com/services/responsive-roster-...
8,"Purlear, N.C. / Forest Trail Academy",5-8,Leah Church,20,G,Fr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12650,/images/2017/10/2/Church_Leah_2017_105.jpg?wid...,"[{'Total Career Statistics': [['GP', '28'], ['...",http://goheels.com/services/responsive-roster-...
9,"Tarboro, N.C. / Page/Vanderbilt",5-9,Paris Kea,22,G,R-Jr.,\r\n Biography\r\n ...,/roster.aspx?rp_id=12651,/images/2017/10/2/Kea_Paris_2017_14.jpg?width=300,"[{'Total Career Statistics': [['GP', '62'], ['...",http://goheels.com/services/responsive-roster-...


In [17]:
df.columns

Index(['Hometown / High School', 'Ht.', 'Name', 'No.', 'Pos.', 'Yr.', 'bio',
       'href', 'img', 'raw_stats', 'stats_url'],
      dtype='object')

In [18]:
df = df.set_index("Name")

In [19]:
df.loc["Paris Kea"]

Hometown / High School                      Tarboro, N.C. / Page/Vanderbilt
Ht.                                                                     5-9
No.                                                                      22
Pos.                                                                      G
Yr.                                                                   R-Jr.
bio                       \r\n                        Biography\r\n     ...
href                                               /roster.aspx?rp_id=12651
img                       /images/2017/10/2/Kea_Paris_2017_14.jpg?width=300
raw_stats                 [{'Total Career Statistics': [['GP', '62'], ['...
stats_url                 http://goheels.com/services/responsive-roster-...
Name: Paris Kea, dtype: object

In [20]:
#womxn's basketball team members grouped by year
grouped = df.groupby("Yr.")
type(grouped)
grouped.groups

{'Fr.': Index(['Dazia Powell', 'Leah Church', 'Jaelynn Murray', 'Janelle Bailey'], dtype='object', name='Name'),
 'Jr.': Index(['Stephanie Watts', 'Destinee Walker'], dtype='object', name='Name'),
 'R-Fr.': Index(['Jocelyn Jones'], dtype='object', name='Name'),
 'R-Jr.': Index(['Paris Kea'], dtype='object', name='Name'),
 'So.': Index(['Taylor Koenen', 'Emily Sullivan', 'Liz Roberts', 'Olivia Smith',
        'Naomi Van Nes'],
       dtype='object', name='Name'),
 'Sr.': Index(['Jamie Cherry'], dtype='object', name='Name')}

In [21]:
players[0]['raw_stats'][1]

{"['2016-17']": [('GP', '62'),
  ('GS', '43'),
  ('Minutes', '1916'),
  ('Totals', '30.9'),
  ('3-Point', '178'),
  ('Free-Throws', '523'),
  ('Rebounds', '.340'),
  ('PF', '32'),
  ('FO', '181'),
  ('AST', '.177'),
  ('AST/G', '67'),
  ('T/O', '111'),
  ('BLK', '.604'),
  ('STL', '114'),
  ('PTS', '255'),
  ('AVG', '369'),
  ('MIN', '6.0'),
  ('AVG', '137'),
  ('FG', '3'),
  ('FGA', '161'),
  ('PCT', '2.6'),
  ('FG', '150'),
  ('FGA', '44'),
  ('PCT', '56'),
  ('FT', '455'),
  ('FTA', '7.3')]}

In [22]:
#womxn's basketball team members grouped by position
grouped = df.groupby("Pos.")
type(grouped)
grouped.groups

{'C': Index(['Naomi Van Nes', 'Janelle Bailey'], dtype='object', name='Name'),
 'F': Index(['Emily Sullivan', 'Jaelynn Murray'], dtype='object', name='Name'),
 'G': Index(['Taylor Koenen', 'Jocelyn Jones', 'Stephanie Watts', 'Jamie Cherry',
        'Dazia Powell', 'Liz Roberts', 'Olivia Smith', 'Leah Church',
        'Paris Kea', 'Destinee Walker'],
       dtype='object', name='Name')}