In [1]:
import scrapy, re, urllib, datetime, io
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup as bs

In [2]:
class TeamSpider(scrapy.Spider):
    """Collect team-page links from the stats.ncaa.org team list pages.

    One list page is requested per entry in ``years``. Every link on a
    returned page whose URL matches ``team_url_pattern`` is appended to the
    CSV file ``fileName`` as a ``link,team,year`` row.
    """

    name = "Team"

    # Academic years to request (one team-list page per year).
    years = [2014, 2015, 2016, 2017]
    base_url = "http://stats.ncaa.org/team/inst_team_list?academic_year={year}&conf_id=-1&division=11&sport_code=MFB"

    # Only links that look like http://stats.ncaa.org/team/<digits>/<digits>
    # are kept. Raw string: the escapes are meant for `re`, not for Python.
    team_url_pattern = re.compile(r"http://stats\.ncaa\.org/team/\d+/\d+")

    # Column order shared by the header line and every appended row.
    columns = ['link', 'team', 'year']

    # Create the output file and write its header.
    # NOTE: this runs when the class body is executed, i.e. as soon as the
    # cell defining the class is run, not when the crawl starts.
    # The timestamp avoids spaces and colons so the name is valid on every
    # common filesystem.
    fileName = "Teams_{0}.csv".format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    )
    # No trailing comma: the header must declare exactly the three columns
    # that each data row contains.
    header = ",".join(columns) + "\n"

    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(header)
    del f  # keep the closed file handle out of the class namespace

    def start_requests(self):
        """Yield one request per academic year in ``years``.

        Each response is handled by :meth:`parse`.
        """
        for year in self.years:
            url = self.base_url.format(year=year)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Append the team links found in ``response`` to ``fileName``.

        The year is read back from the ``academic_year`` query parameter of
        ``response.url``.

        Raises:
            KeyError: if ``response.url`` has no ``academic_year`` parameter.
        """
        # Imported here so the method does not rely on some other module
        # having already loaded the ``urllib.parse`` submodule.
        from urllib.parse import parse_qs, urlparse

        # Extract the year from the URL of the returned page
        curr_year = parse_qs(urlparse(response.url).query)['academic_year'][0]

        # Get all the links on the page and keep just the team links
        links = LinkExtractor().extract_links(response)
        data = [
            {'link': link.url, 'team': link.text, 'year': curr_year}
            for link in links
            if self.team_url_pattern.search(link.url) is not None
        ]

        # Passing `columns` keeps the frame well-formed (and the write a
        # no-op) when no link on the page matched.
        df = pd.DataFrame(data, columns=self.columns)

        # Append straight to the CSV; pandas manages the line terminators,
        # which avoids doubled line endings from a text-mode re-write.
        df.to_csv(self.fileName, mode='a', index=False, header=False,
                  encoding='utf-8')
        

In [3]:
# Build the crawler process; the only setting overridden is the user agent
# string sent with each request.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)

# Schedule the spider, then start the crawl.
process.crawl(TeamSpider)
process.start()

2017-12-02 14:55:51 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-02 14:55:51 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-02 14:55:51 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-02 14:55:51 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc