In [1]:
import json 
import re 

import requests 
import scrapy

In [2]:
headers = {'User-Agent': 'UNC Journo Class'}

In [3]:
#source
base_url = 'http://goheels.com'
url = base_url + '/roster.aspx?path=mbball'

In [4]:
#get source
resp = requests.get(url, headers=headers)

In [5]:
#convert data to string
body_str = resp.content.decode('utf-8')

In [6]:
#select
sel = scrapy.Selector(text=body_str)

In [7]:
#table
table = sel.css('table')[0]

In [8]:
table

<Selector xpath='descendant-or-self::table' data='<table class="sidearm-table sidearm-tabl'>

In [9]:
#columns
cols = table.css('th').xpath('string()').extract()

In [10]:
cols

['No.', 'Name', 'Pos.', 'Ht.', 'Wt.', 'Yr.', 'Hometown / High School']

In [11]:
#rows starting from data
rows = table.css('tr')[1:]

In [16]:
#grabbing data and organizing it in a list
playerz = []
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        a = d.css('a')
        if a:
            t = a.xpath('text()').extract()[0]
            data['href'] = a.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()[0]
        data[cols[i]] = t
    playerz.append(data)

In [17]:
playerz

[{'Hometown / High School': 'Columbia, S.C. / Hammond School',
  'Ht.': '6-1',
  'Name': 'Seventh Woods',
  'No.': '0',
  'Pos.': 'G',
  'Wt.': '185',
  'Yr.': 'So.',
  'href': '/roster.aspx?rp_id=13521'},
 {'Hometown / High School': 'Greensboro, N.C. / Wesleyan Christian Academy',
  'Ht.': '6-6',
  'Name': 'Theo Pinson',
  'No.': '1',
  'Pos.': 'F/G',
  'Wt.': '220',
  'Yr.': 'Sr.',
  'href': '/roster.aspx?rp_id=13515'},
 {'Hometown / High School': 'Apopka, Fla. / Lake Highland Preparatory',
  'Ht.': '6-0',
  'Name': 'Joel Berry II',
  'No.': '2',
  'Pos.': 'G',
  'Wt.': '195',
  'Yr.': 'Sr.',
  'href': '/roster.aspx?rp_id=13508'},
 {'Hometown / High School': 'Guilderland, N.Y. / Northfield Mount Hermon School (Mass.)',
  'Ht.': '6-3',
  'Name': 'Andrew Platek',
  'No.': '3',
  'Pos.': 'G',
  'Wt.': '195',
  'Yr.': 'Fr.',
  'href': '/roster.aspx?rp_id=13528'},
 {'Hometown / High School': 'Douglasville, Ga. / Douglas County',
  'Ht.': '6-5',
  'Name': 'Brandon Robinson',
  'No.': '4',


In [18]:
#we gotta go deeper 
def fetch_bio(player):
    player_url = base_url + player['href']
    print('Fetch bio', player_url)
    resp = requests.get(player_url, headers=headers)
    player_txt = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [19]:
#regex 
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

In [20]:
#getting juicy stats for each player
def fetch_stats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        # We only want the stats object...
        if 'stats' not in obj_str:
            continue

        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        obj_pairs = obj_str.split(',')
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        colonized = [":".join(p) for p in clean_pairs]
        commas = ','.join(colonized)
        json_str = "{" + commas + "}"
        clean_objs.append(json.loads(json_str))
    
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])
    
    print('Fetch stats', stats_url)
    resp = requests.get(stats_url, headers=headers)
    json_stats = json.loads(resp.content.decode("utf-8"))
    player['raw_stats'] = json_stats

In [22]:
for p in playerz:
    fetch_bio(p)
    fetch_stats(p)

Fetch bio http://goheels.com/roster.aspx?rp_id=13521
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13521&path=mbball&year=2017&player_id=4736
Fetch bio http://goheels.com/roster.aspx?rp_id=13515
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13515&path=mbball&year=2017&player_id=4636
Fetch bio http://goheels.com/roster.aspx?rp_id=13508
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13508&path=mbball&year=2017&player_id=4632
Fetch bio http://goheels.com/roster.aspx?rp_id=13528
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13528&path=mbball&year=2017&player_id=5171
Fetch bio http://goheels.com/roster.aspx?rp_id=13516
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=13516&path=mbball&year=2017&player_id=4733
Fetch bio http://goheels.com/roster.aspx?rp_id=13518
Fetch stats http://goheels.com/services/respons

In [23]:
#did we juice that lemon??? we did 
playerz[0]

{'Hometown / High School': 'Columbia, S.C. / Hammond School',
 'Ht.': '6-1',
 'Name': 'Seventh Woods',
 'No.': '0',
 'Pos.': 'G',
 'Wt.': '185',
 'Yr.': 'So.',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON (2016-17)\r\n\r\nTied a school record by appearing in all 40 games • Averaged 7.7 minutes per game in the primary relief role behind Joel Berry II at the point • Had 49 assists, 42 turnovers and 21 steals • Had 25 assists and 13 turnovers in ACC regular-season play • Made two steals five times • Was on the floor for five minutes vs. Louisville as UNC out-scored the Cards, 10-2 • Played his bestgame in conference play at Duke when he scored four points and had a team-high four assists in eight minutes • Played a seven and a half minute stretch in the first half as UNC cut Duke’s lead from three to one • Was the first time he led UNC in assists since Long Beach State on 11/15 • Had t

In [24]:
txt = p['raw_stats']['career_stats']

In [25]:
sel = scrapy.Selector(text=txt)

In [26]:
sel.css('section')

[]

In [27]:
#time to PARSE
def parse_stats(player):
    stats = {}
    for raw_key, raw_val in player['raw_stats'].items():
        txt = player['raw_stats'][raw_key]
        if not txt:
            print('Skipping {} for {}'.format(raw_key, player['Name']))
            continue
        sel = scrapy.Selector(text=txt)
        # Get all the tables
        for section in sel.css('section'):
            title = section.css('h5').xpath('string()').extract()[0]
            cols = section.css('tr')[0].css('th').xpath('string()').extract()
            print('NEW SECTION', title)
            print('COLS', cols)
            these_stats = []
            print('TRS', section.css('tr'))
            for r in section.css('tr')[1:]:
                print('row', r.xpath('string()').extract()[0].replace('\r', '').replace('\n', '').strip())
                s = {}
                for i, d in enumerate(r.css('td'), 0):
                    s[cols[i].lower()] = d.xpath('string()').extract()[0]
                yr = r.css('th').xpath('string()')
                if yr:
                    yr = yr.extract()[0]
                    if yr.lower() in ('total', 'season'):
                        print('SKIPPING...')
                        continue
                    print('THE YEAR IS', yr)
                    s['year'] = yr
                these_stats.append(s)
                print('THE STATS ARE', these_stats)
            existing = stats.get(raw_key, {})
            existing[title] = these_stats
            stats[raw_key] = existing
    player['stats'] = stats

In [28]:
#did it juice
playerz[0]

{'Hometown / High School': 'Columbia, S.C. / Hammond School',
 'Ht.': '6-1',
 'Name': 'Seventh Woods',
 'No.': '0',
 'Pos.': 'G',
 'Wt.': '185',
 'Yr.': 'So.',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON (2016-17)\r\n\r\nTied a school record by appearing in all 40 games • Averaged 7.7 minutes per game in the primary relief role behind Joel Berry II at the point • Had 49 assists, 42 turnovers and 21 steals • Had 25 assists and 13 turnovers in ACC regular-season play • Made two steals five times • Was on the floor for five minutes vs. Louisville as UNC out-scored the Cards, 10-2 • Played his bestgame in conference play at Duke when he scored four points and had a team-high four assists in eight minutes • Played a seven and a half minute stretch in the first half as UNC cut Duke’s lead from three to one • Was the first time he led UNC in assists since Long Beach State on 11/15 • Had t

In [30]:
for p in playerz:
    parse_stats(p)

NEW SECTION Game-By-Game Statistics
COLS ['Date', 'Opponent', 'GS', 'MIN', 'FGM/A', '%', '3FG/A', '%', 'FTM/A', '%', 'OFF', 'DEF', 'TOT', 'AVG', 'PF', 'AST', 'T/O', 'BLK', 'STL', 'PTS', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <th scope='>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>                        \r\n          '>, <Selector xpath='descendant-or-self::tr' data='<tr>       

THE STATS ARE [{'date': '11/10/17', 'opponent': '*', 'gs': '26', 'min': '1-6', 'fgm/a': '.167', '%': '0', '3fg/a': '.167', 'ftm/a': '.000', 'off': '3', 'def': '3', 'tot': '3.0', 'avg': '1', 'pf': '5', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '3', 'pts': '3.0', 'year': ' Northern Iowa'}, {'date': '11/15/17', 'opponent': '*', 'gs': '33', 'min': '5-12', 'fgm/a': '.417', '%': '0', '3fg/a': '.000', 'ftm/a': '.900', 'off': '5', 'def': '5', 'tot': '4.0', 'avg': '3', 'pf': '6', 'ast': '2', 't/o': '0', 'blk': '1', 'stl': '19', 'pts': '11.0', 'year': ' Bucknell'}, {'date': '11/20/17', 'opponent': '*', 'gs': '29', 'min': '4-8', 'fgm/a': '.500', '%': '1', '3fg/a': '.000', 'ftm/a': '.500', 'off': '4', 'def': '5', 'tot': '4.3', 'avg': '1', 'pf': '2', 'ast': '2', 't/o': '0', 'blk': '0', 'stl': '9', 'pts': '10.3', 'year': 'at STANFORD'}, {'date': '11/23/17', 'opponent': '*', 'gs': '25', 'min': '4-7', 'fgm/a': '.571', '%': '2', '3fg/a': '.333', 'ftm/a': '.500', 'off': '7', 'def': '9', 'tot': '5.5', '

row 12/01/17                            at Davidson                                                        13                            4-7                            .571                            0-1                            .000                            0-0                            .000                            1                            1                            2                            2.3                            3                            1                            0                            0                            0                            8                            3.0
THE YEAR IS at Davidson
THE STATS ARE [{'date': '11/10/17', 'opponent': '', 'gs': '16', 'min': '0-2', 'fgm/a': '.000', '%': '0', '3fg/a': '.000', 'ftm/a': '1.000', 'off': '2', 'def': '2', 'tot': '2.0', 'avg': '1', 'pf': '3', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '2', 'pts': '2.0', 'year': ' Northern Iowa'}, {'date': '11/23/17', 'opponent': '', 'gs': '12', 'min': '1-4', '

THE YEAR IS at Syracuse
THE STATS ARE [{'date': '11/10/17', 'opponent': '', 'gs': '14', 'min': '3-4', 'fgm/a': '.750', '%': '3', '3fg/a': '.000', 'ftm/a': '.600', 'off': '5', 'def': '8', 'tot': '8.0', 'avg': '0', 'pf': '1', 'ast': '0', 't/o': '0', 'blk': '0', 'stl': '9', 'pts': '9.0', 'year': ' Northern Iowa'}, {'date': '11/15/17', 'opponent': '', 'gs': '17', 'min': '6-10', 'fgm/a': '.600', '%': '7', '3fg/a': '.000', 'ftm/a': '.800', 'off': '6', 'def': '13', 'tot': '10.5', 'avg': '0', 'pf': '0', 'ast': '2', 't/o': '2', 'blk': '0', 'stl': '16', 'pts': '12.5', 'year': ' Bucknell'}, {'date': '11/20/17', 'opponent': '', 'gs': '12', 'min': '3-4', 'fgm/a': '.750', '%': '2', '3fg/a': '.000', 'ftm/a': '1.000', 'off': '3', 'def': '5', 'tot': '8.7', 'avg': '4', 'pf': '0', 'ast': '3', 't/o': '0', 'blk': '0', 'stl': '7', 'pts': '10.7', 'year': 'at STANFORD'}, {'date': '11/23/17', 'opponent': '', 'gs': '14', 'min': '1-4', 'fgm/a': '.250', '%': '1', '3fg/a': '.000', 'ftm/a': '.750', 'off': '7', 'def

THE STATS ARE [{'statistic': 'Points', 'value': '9', 'opponent': 'vs. Tulane'}, {'statistic': 'Minutes', 'value': '13', 'opponent': 'vs. Western Carolina'}, {'statistic': 'Field Goals Made', 'value': '4', 'opponent': 'vs. Tulane'}, {'statistic': 'Field Goal Attempts', 'value': '4', 'opponent': 'vs. Tulane'}, {'statistic': '3-Point Field Goals Made', 'value': '0', 'opponent': 'vs. Duke'}, {'statistic': '3-Point Field Goal Attempts', 'value': '0', 'opponent': 'vs. Duke'}, {'statistic': 'Free Throws Made', 'value': '4', 'opponent': 'vs. STANFORD'}, {'statistic': 'Free Throw Attempts', 'value': '4', 'opponent': 'vs. Tulane'}, {'statistic': 'Rebounds', 'value': '8', 'opponent': 'vs. Western Carolina'}, {'statistic': 'Assists', 'value': '0', 'opponent': 'vs. Duke'}, {'statistic': 'Blocks', 'value': '2', 'opponent': 'vs. Western Carolina'}, {'statistic': 'Steals', 'value': '0', 'opponent': 'vs. Duke'}]


In [32]:
#make json file to dump juicy stats in 
to_dump = [p.copy() for p in playerz]
for p in to_dump:
    p.pop('sel')
    for k in list(p.keys()):
        if 'raw' in k:
            p.pop(k)
with open('bball-scraped.json', 'w') as f:
    json.dump(to_dump, f)

In [34]:
cat bball-scraped.json | cut -c 1-100

[{"No.": "0", "href": "/roster.aspx?rp_id=13521", "Name": "Seventh Woods", "Pos.": "G", "Ht.": "6-1"


In [35]:
#checking the juice
to_dump[0]

{'Hometown / High School': 'Columbia, S.C. / Hammond School',
 'Ht.': '6-1',
 'Name': 'Seventh Woods',
 'No.': '0',
 'Pos.': 'G',
 'Wt.': '185',
 'Yr.': 'So.',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            FRESHMAN SEASON (2016-17)\r\n\r\nTied a school record by appearing in all 40 games • Averaged 7.7 minutes per game in the primary relief role behind Joel Berry II at the point • Had 49 assists, 42 turnovers and 21 steals • Had 25 assists and 13 turnovers in ACC regular-season play • Made two steals five times • Was on the floor for five minutes vs. Louisville as UNC out-scored the Cards, 10-2 • Played his bestgame in conference play at Duke when he scored four points and had a team-high four assists in eight minutes • Played a seven and a half minute stretch in the first half as UNC cut Duke’s lead from three to one • Was the first time he led UNC in assists since Long Beach State on 11/15 • Had t