<a href="https://colab.research.google.com/github/dylanwalker/MGSC496/blob/main/MGSC496_NHLScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Here is the full code to create and run the Scrapy NHL team stats scraper.
Use ***Runtime->Run All***

In [None]:
%%capture
!pip install scrapy

In [None]:
!scrapy startproject nhlscraper

In [None]:
%%writefile /content/nhlscraper/nhlscraper/spiders/nhlspider.py
import scrapy
from nhlscraper.items import NHLscraperItem

class NHLspiderSpider(scrapy.Spider):
    """Scrape the team rows of the table at scrapethissite.com/pages/forms/.

    Each ``<tr class="team">`` row becomes one ``NHLscraperItem``. After a
    page has been processed, the spider follows the "Next" pagination link
    (when one exists) and parses that page with the same callback.
    """

    name = 'nhlspider'
    allowed_domains = ['www.scrapethissite.com']
    start_urls = ['http://www.scrapethissite.com/pages/forms/']

    # Item field name -> XPath (relative to a team row) of the cell's text.
    # "pct" and "diff" are matched with contains() instead of an exact class
    # comparison, so the cell is found even if its class attribute holds
    # more than that single token.
    _FIELD_XPATHS = {
        'team_name': 'td[@class="name"]/text()',
        'year': 'td[@class="year"]/text()',
        'wins': 'td[@class="wins"]/text()',
        'losses': 'td[@class="losses"]/text()',
        'ot_losses': 'td[@class="ot_losses"]/text()',
        'win_pct': 'td[contains(@class,"pct")]/text()',
        'goals_for': 'td[@class="gf"]/text()',
        'goals_against': 'td[@class="ga"]/text()',
        'diff': 'td[contains(@class,"diff")]/text()',
    }

    def parse(self, response):
        """Yield one item per team row, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``NHLscraperItem`` objects whose fields are whitespace-stripped
            strings, followed by a follow-up request when the page has a
            "Next" link.
        """
        team_rows = response.xpath('//table[@class="table"]//tr[@class="team"]')
        for row in team_rows:
            # A cell with no text node makes .get() return None (this is known
            # to happen for ot_losses). Defaulting every field to '' keeps one
            # empty cell from raising AttributeError on .strip(), which would
            # abort the generator and skip the pagination request below.
            values = {
                field: row.xpath(xpath).get(default='').strip()
                for field, xpath in self._FIELD_XPATHS.items()
            }
            yield NHLscraperItem(**values)

        next_link = response.xpath('//a[@aria-label="Next"]/@href').get()
        if next_link is not None:
            # response.follow accepts the href as-is and resolves it against
            # the current page URL.
            yield response.follow(url=next_link, callback=self.parse)

In [None]:
%%writefile /content/nhlscraper/nhlscraper/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NHLscraperItem(scrapy.Item):
    """One row of the scraped team table (a single team in a single year).

    The spider in this project fills every field with the whitespace-stripped
    text of the matching table cell, so all values are strings; a cell with
    no text is stored as an empty string.
    """
    # Text of the "name" cell.
    team_name = scrapy.Field()
    # Text of the "year" cell.
    year = scrapy.Field()
    # Text of the "wins" cell.
    wins = scrapy.Field()
    # Text of the "losses" cell.
    losses = scrapy.Field()
    # Text of the "ot_losses" cell; may be an empty string.
    ot_losses = scrapy.Field()
    # Text of the cell whose class contains "pct".
    win_pct = scrapy.Field()
    # Text of the "gf" cell.
    goals_for = scrapy.Field()
    # Text of the "ga" cell.
    goals_against = scrapy.Field()
    # Text of the cell whose class contains "diff".
    diff = scrapy.Field()
    


In [None]:
!cd /content/nhlscraper && scrapy crawl nhlspider -O nhl.json