In [1]:
import scrapy, re, urllib, datetime, io, requests
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup as bs

In [2]:
class item_statslinks(scrapy.Item):
    """Scrapy Item holding one statistics link scraped from a stats page.

    ``StatsSpider.parse`` yields one of these per link it extracts.
    """
    Team = scrapy.Field()  # value of the request's ``meta['team']``
    Year = scrapy.Field()  # text of the selected option in the page's year list
    Stat = scrapy.Field()  # anchor text of the extracted link
    Link = scrapy.Field()  # URL of the extracted link

In [3]:
class StatsLinkCSVPipeline(object):
    """Item pipeline that writes every item it receives to ``StatsLinks.csv``.

    The output file and its exporter live for the duration of the spider run:
    they are created in ``open_spider`` and released in ``close_spider``.
    """

    def open_spider(self, spider):
        """Create the output file and begin the CSV export."""
        # Opened in binary mode because the handle is given to CsvItemExporter.
        self.file = open("StatsLinks.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export ``item`` and return it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export, then close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

In [4]:
class StatsSpider(scrapy.Spider):
    """Spider that collects statistics links from the stats pages in a CSV.

    The start URLs are read from ``links_file``: rows whose ``key`` column is
    ``'stats'`` are de-duplicated, sorted by ``team`` and requested. For each
    response, every link inside the stats table whose URL matches ``regex`` is
    emitted as an ``item_statslinks`` item.

    Class attributes:
        name: Scrapy spider name.
        regex: pattern a link URL must match to be extracted.
        xpath: region of the page the link extractor is restricted to.
        year_xpath: text node of the selected option in the year drop-down.
        links_file: CSV read by ``start_requests``; it must provide the
            ``key``, ``team`` and ``link`` columns.
        max_start_links: how many of the sorted start links are requested.
            Defaults to 1 (the original behaviour); ``None`` requests them all.
    """
    name = "Stats"
    # Raw string so the backslashes reach the regex engine untouched instead of
    # being parsed as (invalid) string escapes.
    regex = r'.*\/stats\?id\=\d+\&year_stat_category_id\=\d+'
    xpath = "//body//div[@id='contentarea']//div[@id='stats_div']//table"
    year_xpath = ("//body//div[@id='contentarea']//fieldset//div"
                  "//form[@id='change_sport_form']"
                  "//select[@id='year_list']"
                  "//option[@selected='selected']//text()")
    links_file = 'links.csv'
    max_start_links = 1

    def start_requests(self):
        """Yield one request per selected stats link listed in ``links_file``.

        The row's ``team`` value travels with the request in ``meta`` so that
        ``parse`` can attach it to the items it produces.
        """
        links = pd.read_csv(self.links_file)
        stats = (links[links.key == 'stats']
                 .drop_duplicates()
                 .sort_values(['team']))

        # 'records' is the full orient name; the abbreviated 'record' is
        # rejected by current pandas releases.
        rows = stats.to_dict(orient='records')

        for row in rows[:self.max_start_links]:
            yield scrapy.Request(url=row['link'],
                                 callback=self.parse,
                                 meta={'team': row['team']})

    def parse(self, response):
        """Yield an ``item_statslinks`` for every matching link in ``response``.

        If the page has no selected year option, a warning is logged and the
        items are still produced with ``Year`` set to ``None``.
        """
        year = response.xpath(self.year_xpath).extract_first()
        if year is None:
            self.logger.warning("No selected year found on %s", response.url)

        # Only links inside the stats table that match the stats URL pattern.
        extractor = LinkExtractor(restrict_xpaths=(self.xpath,),
                                  allow=self.regex)

        for link in extractor.extract_links(response):
            yield item_statslinks({'Team': response.meta["team"],
                                   'Year': year,
                                   'Link': link.url,
                                   'Stat': link.text,
                                  })

In [5]:
# Configure the crawl: a fixed user agent, and the CSV pipeline defined in this
# notebook (hence the '__main__' module path) registered with order value 100.
process = CrawlerProcess(
    settings={
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'__main__.StatsLinkCSVPipeline': 100},
    }
)

# Schedule the spider, then run the crawl.
# NOTE(review): re-running this cell in the same kernel is expected to fail
# because the Twisted reactor cannot be restarted; confirm, and restart the
# kernel before a re-run.
process.crawl(StatsSpider)
process.start()

2017-12-10 16:43:02 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-10 16:43:02 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-10 16:43:02 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-10 16:43:02 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc