In [1]:
import scrapy, re, urllib, datetime, io, requests, os
import pandas as pd
from scrapy.exporters import CsvItemExporter
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup as bs

In [2]:
class item_history(scrapy.Item):
    """Scrapy item holding one season row of a team's history table.

    Instances are built in ``HistorySpider.parse`` from the records of the
    scraped table, so the field names here must match that table's column
    names after the spider's rename step.
    """

    # Added by the spider from the request's ``meta['team']``.
    Team = scrapy.Field()

    # Season label and coaching / affiliation columns of the table.
    Year = scrapy.Field()
    HeadCoaches = scrapy.Field()  # table column "Head Coaches", renamed by the spider
    Division = scrapy.Field()
    Conference = scrapy.Field()

    # Season record.
    Wins = scrapy.Field()
    Losses = scrapy.Field()
    Ties = scrapy.Field()
    WL = scrapy.Field()  # table column "WL%", renamed by the spider

    # Free-text notes column of the table.
    Notes = scrapy.Field()
    

In [3]:
class RosterCSVPipeline(object):
    """Scrapy item pipeline that exports every scraped item to ``Data/History.csv``.

    The file is opened when the spider starts, one CSV row is written per
    item, and the file is closed when the spider finishes.
    """

    # CSV columns, in output order. Each name must match an ``item_history``
    # field name exactly; the original list used "Head Coaches", which does
    # not match the item's ``HeadCoaches`` field and therefore could not pick
    # up the coach values.
    FIELDS = ["Team", "Year", "HeadCoaches", "Division", "Conference",
              "Wins", "Losses", "Ties", "WL", "Notes"]

    def open_spider(self, spider):
        """Open the output file and start the CSV exporter.

        Args:
            spider: The spider being opened (unused).
        """
        fileName = "History.csv"
        # Binary mode is kept from the original; the exporter is handed the
        # raw file object.
        self.file = open(os.path.join("Data", fileName), 'wb')
        self.exporter = CsvItemExporter(self.file, fields_to_export=list(self.FIELDS))
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the output file.

        Args:
            spider: The spider being closed (unused).
        """
        try:
            self.exporter.finish_exporting()
        finally:
            # Release the file handle even if finishing the export fails.
            self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` to the CSV and pass it on unchanged.

        Args:
            item: The scraped item to export.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, so later pipelines can process it.
        """
        self.exporter.export_item(item)
        return item

In [4]:
class HistorySpider(scrapy.Spider):
    """Spider that scrapes the history table behind each team's history link.

    When the class is defined, ``Data/links.csv`` is read and the rows whose
    ``key`` column equals ``'history'`` are kept, de-duplicated and sorted by
    ``team``. Each remaining row supplies a ``link`` to request and a
    ``team`` name that is attached to every scraped record.
    """
    name = "History"
    file_in = 'links.csv'

    # How many of the sorted team links to request. The original code
    # hard-coded a ``[:1]`` slice, so the default of 1 keeps that behaviour.
    # Set it to ``None`` (for example ``process.crawl(HistorySpider,
    # max_teams=None)``) to request every link.
    max_teams = 1

    # Read in data
    data = pd.read_csv(os.path.join("Data", file_in))
    df = data[data.key == 'history'].drop_duplicates().sort_values(['team'])
    # 'records' is the full orient name; the abbreviated 'record' is rejected
    # by recent pandas releases.
    _list = df.to_dict(orient='records')

    def start_requests(self):
        """Yield one request per selected team link.

        The team name travels in ``meta`` so ``parse`` can label the rows.
        """
        for row in self._list[:self.max_teams]:
            yield scrapy.Request(url=row['link'],
                                 callback=self.parse,
                                 meta={'team': row['team']}
                                )

    def parse(self, response):
        """Turn the first HTML table of the page into ``item_history`` items.

        Args:
            response: The downloaded page; ``response.meta['team']`` holds
                the team name set in ``start_requests``.

        Yields:
            One ``item_history`` per table row, excluding the last row.
        """
        # Create tables
        tables = pd.read_html(response.body)
        history = tables[0]
        history['Team'] = response.meta['team']
        # Rename the columns whose names differ from the item's field names.
        history = history.rename(columns={"WL%": "WL", "Head Coaches": "HeadCoaches"})

        # The last row is skipped, as in the original; presumably it is a
        # summary/totals row. TODO confirm against the page.
        for record in history[:-1].to_dict(orient='records'):
            yield item_history(record)
        

In [5]:
# Build the crawler with a fixed user agent and register the CSV pipeline.
# The pipeline is addressed as '__main__.RosterCSVPipeline' because it is
# defined in this notebook rather than in an importable module.
process = CrawlerProcess(
    {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'__main__.RosterCSVPipeline': 100},
    }
)

# Schedule the history spider, then run the crawl.
process.crawl(HistorySpider)
process.start()

2017-12-11 09:35:32 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-11 09:35:32 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-11 09:35:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-11 09:35:32 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc

2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': 'Mountain West',
 'Division': 'FBS',
 'HeadCoaches': 'Fisher DeBerry',
 'Losses': 6,
 'Notes': nan,
 'Team': 'Air Force Falcons',
 'Ties': 0,
 'WL': 0.455,
 'Wins': 5,
 'Year': '2004-05'}
2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': 'Mountain West',
 'Division': 'FBS',
 'HeadCoaches': 'Fisher DeBerry',
 'Losses': 5,
 'Notes': nan,
 'Team': 'Air Force Falcons',
 'Ties': 0,
 'WL': 0.583,
 'Wins': 7,
 'Year': '2003-04'}
2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': 'Mountain West',
 'Division': 'FBS',
 'HeadCoaches': 'Fisher DeBerry',
 'Losses': 5,
 'Notes': 'San Francisco Bowl',
 'Team': 'Air Force Falcons',
 'Ties': 0,
 'WL': 0.615,
 'Wins': 8,
 'Year': '2002-03'}
2017-12-11 09:35:38 [scrapy.core.scrap

2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': '-',
 'Division': nan,
 'HeadCoaches': 'Bill Parcells',
 'Losses': 8,
 'Notes': nan,
 'Team': 'Air Force Falcons',
 'Ties': 0,
 'WL': 0.273,
 'Wins': 3,
 'Year': '1978-79'}
2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': '-',
 'Division': 'FBS',
 'HeadCoaches': 'Ben Martin',
 'Losses': 8,
 'Notes': nan,
 'Team': 'Air Force Falcons',
 'Ties': 1,
 'WL': 0.22699999999999998,
 'Wins': 2,
 'Year': '1977-78'}
2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/history/MFB/721>
{'Conference': '-',
 'Division': 'FBS',
 'HeadCoaches': 'Ben Martin',
 'Losses': 7,
 'Notes': nan,
 'Team': 'Air Force Falcons',
 'Ties': 0,
 'WL': 0.364,
 'Wins': 4,
 'Year': '1976-77'}
2017-12-11 09:35:38 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org