In [1]:
import scrapy, re, urllib, datetime, io, requests
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup as bs

In [12]:
class rosterSpider(scrapy.Spider):
    """Spider that downloads roster tables and appends them to a CSV file.

    When the class body is executed it reads ``file_in``, keeps the rows whose
    ``key`` column equals ``target``, and creates a timestamped output file
    containing only the header row. Each crawled page then appends the
    columns listed in ``variables`` from its first HTML table to that file.

    Class attributes:
        file_in:   CSV of links; the code reads its ``key``, ``team`` and
                   ``link`` columns.
        file_out:  output file name template, formatted with the current time.
        variables: output columns, in the order they are written.
        max_links: how many links to crawl (``None`` crawls all of them).
                   Defaults to 1, matching the original hard-coded ``[:1]``;
                   override with ``process.crawl(rosterSpider, max_links=None)``.
    """
    name = "Roster"

    file_in = "NewLinks2017-12-03 20:28:18.862583.csv"
    file_out = "Roster_{0}.csv"
    variables = ["Year", "Team", "Jersey", "Player", "Pos", "Yr", "GP", "GS"]
    header = ','.join(variables) + "\n"
    target = 'roster'
    max_links = 1

    # Read in data. The filter/dedupe/sort is chained so the sort runs on a
    # fresh frame rather than in place on a slice of `data`.
    data = pd.read_csv(file_in)
    df = data[data.key == target].drop_duplicates().sort_values(['team'])
    # 'records' is the supported spelling; the abbreviation 'record' is
    # rejected by current pandas releases.
    _list = df.to_dict(orient='records')

    # Create the output file and write the header once; parse() appends to it.
    fileName = file_out.format(datetime.datetime.now())

    with open(fileName, 'w') as f:
        f.write(header)

    def start_requests(self):
        """Yield one request per link row, carrying the team name in ``meta``."""
        for row in self._list[:self.max_links]:
            yield scrapy.Request(url=row['link'],
                                 callback=self.parse,
                                 meta={'team': row['team']})

    def parse(self, response):
        """Append the first table on the page, tagged with year and team, to the CSV."""
        # Text of the currently selected option in the year drop-down.
        year = response.selector.xpath(
            "//body//div[@id='contentarea']//fieldset//div"
            "//form[@id='change_sport_form']//select[@id='year_list']"
            "//option[@selected='selected']//text()"
        ).extract_first()

        if year is None:
            # Without a year the row would be mislabeled; skip the page
            # instead of failing on an empty selector result.
            self.logger.warning("No selected year found on %s; page skipped",
                                response.url)
            return

        # Parse the tables from a file-like object; passing the raw markup
        # directly is deprecated in newer pandas versions.
        roster = pd.read_html(io.BytesIO(response.body))[0]
        if isinstance(roster.columns, pd.MultiIndex):
            # Drop the outer header level so the column names match `variables`.
            roster.columns = roster.columns.droplevel(0)
        roster['Year'] = year
        roster['Team'] = response.meta['team']

        # Append the selected columns (no header, no index) to the output CSV.
        roster[self.variables].to_csv(self.fileName, mode='a',
                                      index=False, header=False)
        

In [13]:
# Build the crawler process; the only overridden setting is the user agent
# string sent with each request.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)

# Register the roster spider with the process, then start the crawl.
process.crawl(rosterSpider)
process.start()

2017-12-05 22:27:36 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-05 22:27:36 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-05 22:27:36 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-05 22:27:37 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc