In [1]:
import scrapy, re, datetime, os
import pandas as pd
from scrapy.exporters import CsvItemExporter
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

In [2]:
class item_coaches(scrapy.Item):
    """Scrapy Item holding one row of a coach's record table.

    CoachSpider.parse builds one instance per table row from a dict of
    column name -> value, so every column present in the scraped table must
    be declared here as a Field. CoachesCSVPipeline writes the fields to CSV
    in its own fixed column order, independent of the order below.
    """
    # Columns read from the scraped table. The names presumably mirror the
    # page's own column headers -- confirm against the source site.
    Year = scrapy.Field()
    Org = scrapy.Field()
    Division = scrapy.Field()
    Wins = scrapy.Field()
    Losses = scrapy.Field()
    Ties = scrapy.Field()
    # The table's "WL%" column; the spider renames it to "WL" before
    # building the item.
    WL = scrapy.Field()
    Notes = scrapy.Field()
    # Not a table column: the spider fills this from the request meta.
    Name = scrapy.Field()

In [3]:
class CoachesCSVPipeline(object):
    """Scrapy item pipeline that writes every scraped item to Data/Coaches.csv.

    The output file is opened when the spider starts, each item is exported
    as one CSV row in a fixed column order, and the file is closed when the
    spider finishes.
    """

    def open_spider(self, spider):
        """Open the output file and start the CSV exporter.

        Creates the "Data" directory first if it is missing, so the pipeline
        does not fail on a fresh checkout. If the exporter cannot be set up,
        the file is closed again before the error propagates.
        """
        fileName = "Coaches.csv"
        out_dir = "Data"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        # Opened in binary mode, as in the original: the exporter is handed
        # the raw file object and writes encoded rows to it.
        self.file = open(os.path.join(out_dir, fileName), 'wb')
        try:
            self.exporter = CsvItemExporter(self.file, fields_to_export=["Name", "Year", "Org", "Division", "Wins", "Losses", "Ties", "WL", "Notes"])
            self.exporter.start_exporting()
        except Exception:
            # Do not leak the file handle when exporter setup fails.
            self.file.close()
            raise

    def close_spider(self, spider):
        """Finish the export and close the output file.

        The file is closed even if finish_exporting() raises; the exception
        still propagates to the caller.
        """
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Write one item to the CSV and return it unchanged."""
        self.exporter.export_item(item)
        return item

In [4]:
class CoachSpider(scrapy.Spider):
    """Spider that fetches coach pages and yields one item per record row.

    The list of pages comes from Links/teaminfolinks.csv, which is read when
    this class is defined. Only rows whose ``key`` column equals 'people'
    are used; each such row supplies a ``link`` (the URL to fetch) and a
    ``txt`` value that is attached to every item as its ``Name``.
    """
    name = "Coach"

    # How many of the sorted links to request. The default of 1 keeps the
    # original behaviour (only the first link is crawled); set to None to
    # crawl every link.
    link_limit = 1

    # Read in data
    file_in = 'teaminfolinks.csv'
    data = pd.read_csv(os.path.join("Links", file_in))
    # Chained instead of sort_values(inplace=True): same result, no in-place
    # mutation of an intermediate frame.
    df = data[data.key == 'people'].drop_duplicates().sort_values(['team'])
    # 'records' is the documented orient name; the short alias 'record' was
    # deprecated by pandas and is rejected by current versions.
    _list = df.to_dict(orient='records')

    def start_requests(self):
        """Yield one request per selected link, carrying the coach name in meta."""
        for url in self._list[:self.link_limit]:
            yield scrapy.Request(url=url['link'],
                                 callback=self.parse,
                                 meta={'name': url['txt']}
                                )

    def parse(self, response):
        """Turn the second HTML table on the page into item_coaches items.

        The table's "WL%" column is renamed to "WL", a "Name" column is
        filled from the request meta, and every row except the last is
        yielded as an item. If the page has fewer than two tables, a warning
        is logged and nothing is yielded.
        """
        # Create table
        tables = pd.read_html(response.body)
        if len(tables) < 2:
            # Clear message instead of a bare IndexError on tables[1].
            self.logger.warning("Expected at least 2 tables on %s, found %d",
                                response.url, len(tables))
            return
        records = tables[1].rename(columns={"WL%": "WL"})
        records['Name'] = response.meta['name']
        # The last row is dropped, as in the original -- presumably a
        # totals/summary row; confirm against the page layout.
        for record in records[:-1].to_dict(orient='records'):
            yield item_coaches(record)
        

In [5]:
# Crawler configuration: a fixed user-agent string, plus the CSV pipeline
# registered by its dotted path (it is defined in this notebook, hence the
# '__main__' module prefix).
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'ITEM_PIPELINES': {'__main__.CoachesCSVPipeline': 100},
}

# Build the crawler process, schedule the spider, then run the crawl.
process = CrawlerProcess(crawler_settings)
process.crawl(CoachSpider)
process.start()

2017-12-11 10:11:22 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-11 10:11:22 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-11 10:11:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-11 10:11:22 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc