In [1]:
import scrapy, re, urllib, datetime, io, requests
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup as bs

In [2]:
# Load the previously scraped link table and keep only the de-duplicated
# rows whose key is 'stats', ordered by team.
data = pd.read_csv("NewLinks2017-12-03 20:28:18.862583.csv")

# Chain the transforms into a new frame instead of sorting a filtered slice
# in place, so re-running the cell always starts from `data`.
stats = (
    data[data.key == 'stats']
    .drop_duplicates()
    .sort_values(['team'])
)

# 'records' is the documented orient name; the abbreviated 'record' is
# rejected by newer pandas releases.
stats_list = stats.to_dict(orient='records')

In [3]:
# Display the link of the first record in the team-sorted stats list.
stats_list[0]['link']

'http://stats.ncaa.org/team/721/stats/11520'

In [4]:
class StatsSpider(scrapy.Spider):
    """Collect per-category stat links from team stats pages.

    For every requested team page the spider reads the currently selected
    year, extracts each link inside the stats table whose URL matches
    ``regex``, and appends one ``team,year,text,link`` row per link to a
    timestamped CSV file (``fileName``).

    Note: the output file is created, and its header written, when this class
    body is executed, not when the crawl starts.
    """

    name = "Stats"

    # Raw string: the resulting pattern is identical, but the backslashes are
    # no longer interpreted as (invalid) string escape sequences, which avoids
    # invalid-escape warnings.
    regex = r'.*\/stats\?id\=\d+\&year_stat_category_id\=\d+'

    # Table that holds the stat-category links.
    xpath = "//body//div[@id='contentarea']//div[@id='stats_div']//table"

    # Text of the <option> currently selected in the year drop-down.
    year_xpath = ("//body//div[@id='contentarea']//fieldset//div"
                  "//form[@id='change_sport_form']//select[@id='year_list']"
                  "//option[@selected='selected']//text()")

    header = "Team,Year,Stat,Link\n"
    file_out = "StatsLinks_{0}.csv"

    # CSV of previously collected links; rows with key == 'stats' are crawled.
    source_csv = "NewLinks2017-12-03 20:28:18.862583.csv"

    # Column order of the appended rows; lines up with `header`.
    output_columns = ['team', 'year', 'text', 'link']

    # Number of source links to request. Set to None to crawl every link.
    max_links = 1

    # Create file to append to
    fileName = file_out.format(datetime.datetime.now())

    with open(fileName, 'w') as f:
        f.write(header)

    def start_requests(self):
        """Yield one request per 'stats' link listed in ``source_csv``.

        The team of each row is forwarded to ``parse`` through the request's
        ``meta`` dict under the ``'team'`` key.
        """
        # Pull in the data
        data = pd.read_csv(self.source_csv)
        stats = (
            data[data.key == 'stats']
            .drop_duplicates()
            .sort_values(['team'])
        )
        # 'records' is the documented orient name; the abbreviated 'record'
        # is rejected by newer pandas releases.
        stats_list = stats.to_dict(orient='records')

        # Loop through the links (limited to the first `max_links` entries)
        for row in stats_list[:self.max_links]:
            yield scrapy.Request(url=row['link'],
                                 callback=self.parse,
                                 meta={'team': row['team']})

    def parse(self, response):
        """Append one CSV row per matching stat link found on ``response``.

        Pages without a selected year are skipped with a logged error, so no
        rows are written for them.
        """
        # Year
        year = response.xpath(self.year_xpath).extract_first()
        if year is None:
            # Previously this surfaced as a bare IndexError from `[0]`.
            self.logger.error("No selected year found on %s; skipping page",
                              response.url)
            return

        # Get all the links on the page
        le = LinkExtractor(restrict_xpaths=(self.xpath,),
                           allow=self.regex)

        # Build up the output
        team = response.meta["team"]
        output = [
            {'team': team, 'year': year, 'link': link.url, 'text': link.text}
            for link in le.extract_links(response)
        ]

        # Passing the columns explicitly keeps the frame well-formed even when
        # no link matched (an empty frame then appends nothing instead of
        # raising KeyError on column selection).
        df = pd.DataFrame(output, columns=self.output_columns)

        # Append links to CSV; pandas opens the file itself in append mode.
        df.to_csv(self.fileName, mode='a', index=False, header=False)


In [5]:
# Settings handed to the crawler: only the user-agent string is overridden.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}

# Build the crawler process, register the spider, then start the crawl.
process = CrawlerProcess(crawler_settings)
process.crawl(StatsSpider)
process.start()

2017-12-09 11:37:05 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-09 11:37:05 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-09 11:37:05 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-09 11:37:05 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc