In [1]:
import scrapy, os
import pandas as pd
from scrapy.exporters import CsvItemExporter
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

In [2]:
class item_roster(scrapy.Item):
    """Scrapy Item holding one player row from a team's yearly roster table.

    The field names are the same ones listed in ``fields_to_export`` by
    ``RosterCSVPipeline``; the spider fills ``Year`` and ``Team`` itself and
    takes the remaining fields from the scraped table's columns.
    """
    # Season label read from the page's selected year option, e.g. '2013-14'.
    Year = scrapy.Field()
    # Team name carried over from the links file via the request meta.
    Team = scrapy.Field()
    # Jersey identifier; kept as text since values such as '3A' occur.
    Jersey = scrapy.Field()
    # Player name as shown on the page, e.g. 'Lee, Jonathan'.
    Player = scrapy.Field()
    # Position code such as 'TE' or 'DB'; may be NaN when the page has none.
    Pos = scrapy.Field()
    # Values such as 'Fr' / 'Jr' — presumably class year; may be NaN.
    Yr = scrapy.Field()
    # Integer count — presumably games played; TODO confirm against the site.
    GP = scrapy.Field()
    # Integer count — presumably games started; TODO confirm against the site.
    GS = scrapy.Field()
    

In [3]:
class RosterCSVPipeline(object):
    """Scrapy item pipeline that writes every scraped roster item to ``Data/Roster.csv``.

    The CSV file is opened when the spider starts, one row is exported per
    item, and the file is closed when the spider finishes.
    """

    def open_spider(self, spider):
        """Open the output file and start the CSV exporter.

        The ``Data`` directory is created when missing so that a run from a
        fresh checkout does not fail with ``FileNotFoundError``.

        Args:
            spider: the spider being opened (unused).
        """
        fileName = "Roster.csv"
        out_dir = "Data"
        os.makedirs(out_dir, exist_ok=True)
        # Binary mode is kept from the original; the exporter is handed the raw file object.
        self.file = open(os.path.join(out_dir, fileName), 'wb')
        try:
            self.exporter = CsvItemExporter(
                self.file,
                fields_to_export=["Year", "Team", "Jersey", "Player", "Pos", "Yr", "GP", "GS"],
            )
            self.exporter.start_exporting()
        except Exception:
            # Do not leak the file handle if the exporter cannot be set up.
            self.file.close()
            raise

    def close_spider(self, spider):
        """Finish the export and close the output file.

        The file is closed even when ``finish_exporting`` raises.

        Args:
            spider: the spider being closed (unused).
        """
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Export one item as a CSV row.

        Args:
            item: the scraped item to write.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, so later pipelines can process it.
        """
        self.exporter.export_item(item)
        return item

In [4]:
class rosterSpider(scrapy.Spider):
    """Spider that downloads team roster pages and yields one item per table row.

    The links to crawl are loaded once, when this class body is executed,
    from ``Links/teaminfolinks.csv``.

    Class attributes:
        name: Scrapy spider name.
        file_in: file name of the links CSV inside the ``Links`` directory.
        max_links: how many entries of ``_list`` to request; ``None`` requests
            all of them. Defaults to 1, which matches the previously
            hard-coded ``[:1]`` slice (presumably a development limit —
            confirm before raising it).
        data: the full links table as read from the CSV.
        df: rows of ``data`` whose ``key`` is ``'roster'``, de-duplicated and
            sorted by ``team``.
        _list: ``df`` as a list of dicts; ``link`` and ``team`` are read from
            each entry.
    """
    name = "Roster"
    file_in = 'teaminfolinks.csv'
    max_links = 1

    # Read in data
    data = pd.read_csv(os.path.join("Links", file_in))
    # Chained instead of an in-place sort so `df` is built in a single step.
    df = data[data.key == 'roster'].drop_duplicates().sort_values(['team'])
    # 'records' is the full orient name; the 'record' abbreviation is rejected by pandas 2.x.
    _list = df.to_dict(orient='records')

    def start_requests(self):
        """Yield one request per roster link, passing the team name along in ``meta``."""
        for url in self._list[:self.max_links]:
            yield scrapy.Request(url=url['link'],
                                 callback=self.parse,
                                 meta={'team': url['team']}
                                 )

    def parse(self, response):
        """Turn the first HTML table of a roster page into ``item_roster`` items.

        Args:
            response: the downloaded roster page; ``response.meta['team']``
                holds the team name set in ``start_requests``.

        Yields:
            One ``item_roster`` per row of the first table, with ``Year`` and
            ``Team`` added. Nothing is yielded when the page has no selected
            year option.
        """
        # Text of the currently selected entry of the year drop-down.
        year_xpath = ("//body//div[@id='contentarea']//fieldset//div//form[@id='change_sport_form']"
                      "//select[@id='year_list']//option[@selected='selected']//text()")
        year = response.selector.xpath(year_xpath).extract_first()
        if year is None:
            # Previously this surfaced as an opaque IndexError on year[0].
            self.logger.warning("No selected year found on %s; skipping page", response.url)
            return

        # Create tables
        tables = pd.read_html(response.body)
        roster = tables[0]
        # Drop the outer header row only when the header really is multi-level;
        # droplevel raises on a single-level index.
        if roster.columns.nlevels > 1:
            roster.columns = roster.columns.droplevel(0)
        roster['Year'] = year
        roster['Team'] = response.meta['team']

        # NOTE(review): item_roster presumably rejects columns it does not declare,
        # so the table header must match its fields — confirm.
        for record in roster.to_dict(orient='records'):
            yield item_roster(record)

In [5]:
# Run the roster spider inside this kernel. The pipeline is referenced through
# '__main__' because RosterCSVPipeline is defined in the notebook itself.
process = CrawlerProcess(
    {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'__main__.RosterCSVPipeline': 100},
    }
)

# Schedule the spider, then block until the crawl has finished.
process.crawl(rosterSpider)
process.start()

2017-12-11 10:13:17 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-11 10:13:17 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-11 10:13:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-11 10:13:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc

2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 1,
 'GS': 0,
 'Jersey': '92',
 'Player': 'Deeks, Lochlin',
 'Pos': nan,
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': nan}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 11,
 'GS': 0,
 'Jersey': '46',
 'Player': 'Dreslinski, Nate',
 'Pos': 'TE',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Jr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 4,
 'GS': 0,
 'Jersey': '81',
 'Player': 'Duncavage, Luke',
 'Pos': 'WR',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Jr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 1,
 'GS': 0,
 'Jersey': '3A',
 'Player': 'Dunn, Brett',
 'Pos': nan,
 'Team': 'Air Force Falcons',
 'Year': '2013-

2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 0,
 'GS': 0,
 'Jersey': '5A',
 'Player': 'Lacy, Jalen',
 'Pos': 'LB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Fr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 12,
 'GS': 0,
 'Jersey': '25',
 'Player': 'Ladipo, Roland',
 'Pos': 'DB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Fr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 10,
 'GS': 4,
 'Jersey': '24',
 'Player': 'Lee, Jonathan',
 'Pos': 'RB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Jr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 12,
 'GS': 0,
 'Jersey': '82',
 'Player': 'Link, Keith',
 'Pos': 'TE',
 'Team': 'Air Force Falcons',
 'Year': '2013-14

2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 5,
 'GS': 0,
 'Jersey': '22',
 'Player': 'Solano, Paco',
 'Pos': 'RB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Jr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 12,
 'GS': 12,
 'Jersey': '21',
 'Player': 'Spears, Christian',
 'Pos': 'DB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Jr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 8,
 'GS': 0,
 'Jersey': '6A',
 'Player': 'Steelhammer, Weston',
 'Pos': 'DB',
 'Team': 'Air Force Falcons',
 'Year': '2013-14',
 'Yr': 'Fr'}
2017-12-11 10:13:21 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/721/roster/11520>
{'GP': 1,
 'GS': 0,
 'Jersey': '9A',
 'Player': 'Stevenson, Marcus',
 'Pos': 'RB',
 'Team': 'Air Force Falcons',
 '