In [None]:
import lxml.html
from lxml.cssselect import CSSSelector
import requests
import csv

In [None]:
# Team pages to scrape, one URL per team; the order here is the order in which
# the roster tables are fetched by the cells below.
team_page_links = [
    # NOTE(review): this is the only entry without the "/team/" path segment — confirm the URL resolves.
    'https://www.geargeek.com/anaheim-ducks',
    'https://www.geargeek.com/team/arizona-coyotes',
    'https://www.geargeek.com/team/calgary-flames',
    'https://www.geargeek.com/team/edmonton-oilers',
    'https://www.geargeek.com/team/los-angeles-kings',
    'https://www.geargeek.com/team/san-jose-sharks',
    'https://www.geargeek.com/team/vancouver-canucks',
    'https://www.geargeek.com/team/vegas-golden-knights',
    'https://www.geargeek.com/team/chicago-blackhawks',
    'https://www.geargeek.com/team/colorado-avalanche',
    'https://www.geargeek.com/team/dallas-stars',
    'https://www.geargeek.com/team/minnesota-wild',
    'https://www.geargeek.com/team/nashville-predators',
    'https://www.geargeek.com/team/st-louis-blues',
    'https://www.geargeek.com/team/winnipeg-jets',
    'https://www.geargeek.com/team/boston-bruins',
    'https://www.geargeek.com/team/buffalo-sabres',
    'https://www.geargeek.com/team/detroit-red-wings',
    'https://www.geargeek.com/team/florida-panthers',
    'https://www.geargeek.com/team/montreal-canadiens',
    'https://www.geargeek.com/team/ottawa-senators',
    'https://www.geargeek.com/team/tampa-bay-lightning',
    'https://www.geargeek.com/team/toronto-maple-leafs',
    'https://www.geargeek.com/team/carolina-hurricanes',
    'https://www.geargeek.com/team/columbus-blue-jackets',
    'https://www.geargeek.com/team/new-jersey-devils',
    'https://www.geargeek.com/team/new-york-islanders',
    'https://www.geargeek.com/team/new-york-rangers',
    'https://www.geargeek.com/team/philadelphia-flyers',
    'https://www.geargeek.com/team/pittsburgh-penguins',
    # NOTE(review): slug ends in "capital", not "capitals" — confirm against the site.
    'https://www.geargeek.com/team/washington-capital',
]

In [None]:
# Matches any <a> nested under a <td> that is the first child of its parent,
# itself nested under the <table> whose id is "player_roster".
selector_player_link = CSSSelector(
    "table#player_roster td:first-child a"
)

In [None]:
# Build players.csv: one row per link matched by selector_player_link on each
# team page, written as [link text, href]. The href is stored exactly as it
# appears in the page.
with open('players.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
    for team_page_link in team_page_links:
        # Without a timeout, requests waits indefinitely on a stalled server.
        team_page = requests.get(team_page_link, timeout=30)
        if not team_page.ok:
            # An error page would simply match no roster links and the team
            # would vanish from the output unnoticed; report it instead.
            print("skipping " + team_page_link + ": HTTP " + str(team_page.status_code))
            continue
        team_tree = lxml.html.fromstring(team_page.text)
        for link in selector_player_link(team_tree):
            writer.writerow([link.text, link.get('href')])

In [None]:
# On a player page each equipment type sits in its own div with the brand in
# an adjacent div. The enclosing "equipment_block" is selected first so that
# type and brand are paired per block rather than by their position in two
# separately collected lists.
(
    selector_block,
    selector_equip_name,
    selector_equip_brand,
    selector_team,
) = map(
    CSSSelector,
    (
        "div.equipment_block",
        "div.equip_name",
        "div.equip_brand",
        "div.team a",
    ),
)

In [None]:
# Create gear.csv (replacing any existing file) containing only the column header row.
gear_header = ['Team', 'Name', 'Link', 'Stick', 'Glove', 'Pants', 'Helmet', 'Skates']
with open('gear.csv', mode='w', newline='') as writefile:
    writer = csv.writer(writefile, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
    writer.writerow(gear_header)

In [None]:
# For each player listed in players.csv, fetch the player's page, read the team
# and the brand of each piece of gear, and append one row to gear.csv.
# gear.csv is opened in append mode, so running this cell again without first
# re-creating the file adds every row a second time.
with open('players.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    with open('gear.csv', 'a', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        # Equipment names that are kept, in the order they are written out.
        gear_names = ('Stick', 'Glove', 'Pants', 'Helmet', 'Skates')
        for row in reader:
            if len(row) < 2:
                # Blank or truncated line: there is no link to fetch.
                continue
            print("fetching gear data for " + row[0])
            # Without a timeout, one stalled connection hangs the whole scrape.
            player_page = requests.get('https://www.geargeek.com' + row[1], timeout=30)
            if not player_page.ok:
                # An error page will not contain the expected markup.
                print("skipping " + row[0] + ": HTTP " + str(player_page.status_code))
                continue
            player_tree = lxml.html.fromstring(player_page.text)
            # A page with no team link, or an empty one, yields a blank team
            # instead of aborting the loop part-way through the player list.
            team_links = selector_team(player_tree)
            team = (team_links[0].text or '').strip() if team_links else ''
            # Every tracked item starts blank; items absent from the page stay blank.
            gear = dict.fromkeys(gear_names, '')
            for block in selector_block(player_tree):
                names = selector_equip_name(block)
                brands = selector_equip_brand(block)
                if not names or not brands:
                    # Block lacks a name or brand element; nothing to record.
                    continue
                name = (names[0].text or '').strip()
                if name in gear:
                    gear[name] = (brands[0].text or '').strip()
            writer.writerow([team] + row + [gear[item] for item in gear_names])

In [None]:
# everything is analogous for goalies but they have different equipment types
# Goalie links are matched inside the table whose id is "player_roster_goalie".
selector_goalie_link = CSSSelector("table#player_roster_goalie td:first-child a")
# Build goalies.csv: one row per matched link on each team page, written as
# [link text, href].
with open('goalies.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
    for team_page_link in team_page_links:
        # Without a timeout, requests waits indefinitely on a stalled server.
        team_page = requests.get(team_page_link, timeout=30)
        if not team_page.ok:
            # An error page would simply match no goalie links and the team
            # would vanish from the output unnoticed; report it instead.
            print("skipping " + team_page_link + ": HTTP " + str(team_page.status_code))
            continue
        team_tree = lxml.html.fromstring(team_page.text)
        for link in selector_goalie_link(team_tree):
            writer.writerow([link.text, link.get('href')])

In [None]:
# Selectors for a goalie's page: the per-item equipment block, the equipment
# name and brand inside a block, and the team link. These repeat the selector
# definitions used for the skater pages.
(
    selector_block,
    selector_equip_name,
    selector_equip_brand,
    selector_team,
) = map(
    CSSSelector,
    (
        "div.equipment_block",
        "div.equip_name",
        "div.equip_brand",
        "div.team a",
    ),
)
# Create goaliegear.csv (replacing any existing file) containing only the column header row.
goalie_gear_header = ['Team', 'Name', 'Link', 'Stick', 'Skates', 'Mask', 'Gloves', 'Blocker', 'Pads']
with open('goaliegear.csv', mode='w', newline='') as writefile:
    writer = csv.writer(writefile, quoting=csv.QUOTE_NONNUMERIC, delimiter=',')
    writer.writerow(goalie_gear_header)

In [None]:
# For each goalie listed in goalies.csv, fetch the goalie's page, read the team
# and the brand of each piece of gear, and append one row to goaliegear.csv.
# goaliegear.csv is opened in append mode, so running this cell again without
# first re-creating the file adds every row a second time.
with open('goalies.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    with open('goaliegear.csv', 'a', newline='') as writefile:
        writer = csv.writer(writefile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        # Equipment names that are kept, in the order they are written out.
        goalie_gear_names = ('Stick', 'Skates', 'Mask', 'Gloves', 'Blocker', 'Pads')
        for row in reader:
            if len(row) < 2:
                # Blank or truncated line: there is no link to fetch.
                continue
            print("fetching gear data for " + row[0])
            # Without a timeout, one stalled connection hangs the whole scrape.
            player_page = requests.get('https://www.geargeek.com' + row[1], timeout=30)
            if not player_page.ok:
                # An error page will not contain the expected markup.
                print("skipping " + row[0] + ": HTTP " + str(player_page.status_code))
                continue
            player_tree = lxml.html.fromstring(player_page.text)
            # A page with no team link, or an empty one, yields a blank team
            # instead of aborting the loop part-way through the goalie list.
            team_links = selector_team(player_tree)
            team = (team_links[0].text or '').strip() if team_links else ''
            # Every tracked item starts blank; items absent from the page stay blank.
            gear = dict.fromkeys(goalie_gear_names, '')
            for block in selector_block(player_tree):
                names = selector_equip_name(block)
                brands = selector_equip_brand(block)
                if not names or not brands:
                    # Block lacks a name or brand element; nothing to record.
                    continue
                name = (names[0].text or '').strip()
                if name in gear:
                    gear[name] = (brands[0].text or '').strip()
            writer.writerow([team] + row + [gear[item] for item in goalie_gear_names])