In [1]:
import scrapy, re, urllib, os
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.exporters import CsvItemExporter

In [2]:
class item_teamlinks(scrapy.Item):
    """Scrapy Item for Team Links for 2014-2017"""
    Link = scrapy.Field()
    Team = scrapy.Field()
    Year = scrapy.Field()

In [3]:
class TeamLinksCSVPipeline(object):
    """Scrapy Pipeline Object for Team Links 2014-2017"""
    def __init__(self):
        fileName = "TeamLinks.csv"
        self.file = open(os.path.join("Links", fileName), 'wb')
        self.exporter = CsvItemExporter(self.file, fields_to_export=['Team', 'Year', 'Link'])
        self.exporter.start_exporting()
 
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
 
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

In [4]:
class TeamSpider(scrapy.Spider): 
    name = "Team"
    
    # List of years to build up URLs to crawl 
    years = [2014, 2015, 2016, 2017] 
    
    # Build list of URLs and crawl 
    start_urls = ["http://stats.ncaa.org/team/inst_team_list?academic_year={year}&conf_id=-1&division=11&sport_code=MFB".format(year = year)
            for year in years]
    
    def parse(self, response): 
        """Parse the crawled pages"""
        # Extract current year from the returned object
        curr_year = urllib.parse.parse_qs(urllib.parse.urlparse(response.url).query)['academic_year'][0]
        
        # Get all the links on the page
        le = LinkExtractor() 
        links = le.extract_links(response)
        
        # Regex pattern to find just the team links 
        team_url_pattern = "http\:\/\/stats\.ncaa\.org\/team\/\d+\/\d+"
        
        # Extract the links, put into item object, yield item object to pipeline which saves results 
        for link in links: 
            match = re.search(team_url_pattern, link.url)
            if match != None: 
                yield item_teamlinks({'Link':link.url, 'Team':link.text, 'Year':curr_year })


In [5]:
process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
                          'ITEM_PIPELINES': {'__main__.TeamLinksCSVPipeline':100}
                         })

process.crawl(TeamSpider)
process.start() 

2017-12-11 10:09:44 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-11 10:09:44 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-11 10:09:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-11 10:09:44 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/193/11520', 'Team': 'Duke', 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/196/11520',
 'Team': 'East Carolina',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/204/11520',
 'Team': 'Eastern Mich.',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/231/11520', 'Team': 'FIU', 'Ye

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/671/11520',
 'Team': 'Louisiana',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/366/11520',
 'Team': 'Louisiana Tech',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/367/11520',
 'Team': 'Louisville',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/388/11520',
 'Team': 'Mar

{'Link': 'http://stats.ncaa.org/team/77/12424', 'Team': 'BYU', 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/129/12240',
 'Team': 'Central Mich.',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/458/12240',
 'Team': 'Charlotte',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/140/12240',
 'Team': 'Cincinnati',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/96/12240',
 'Team': 'Fresno St.',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/253/12240',
 'Team': 'Ga. Southern',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/257/12240',
 'Team': 'Georgia',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/254/12240',
 'Team': 'Georgia 

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/328/12240',
 'Team': 'Kansas',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/327/12240',
 'Team': 'Kansas St.',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/331/12240',
 'Team': 'Kent St.',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/334/12240',
 'Team': 'Kentucky',
 

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/646/11520',
 'Team': 'South Ala.',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/648/11520',
 'Team': 'South Carolina',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/651/11520',
 'Team': 'South Fla.',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/657/11520',
 'Team': 'So

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/709/11520',
 'Team': 'Toledo',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/716/11520', 'Team': 'Troy', 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/718/11520',
 'Team': 'Tulane',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/719/11520',
 'Team': 'Tulsa',
 'Year': '2014

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2014&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/736/11520',
 'Team': 'Vanderbilt',
 'Year': '2014'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/416/12424',
 'Team': 'Michigan St.',
 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/419/12424',
 'Team': 'Middle Tenn.',
 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/428/12424',
 'Team': 'Mi

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/509/12424',
 'Team': 'Northwestern',
 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/559/12240',
 'Team': 'Purdue',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/574/12240', 'Team': 'Rice', 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/587/12240',
 'Team': 'Rutgers',
 'Year

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/587/12424',
 'Team': 'Rutgers',
 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/663/12424', 'Team': 'SMU', 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/697/12240',
 'Team': 'Texas A&M',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/670/12240',
 'Team': 'Texas St.',
 'Year'

2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/742/12240',
 'Team': 'Virginia Tech',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2016&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/749/12240',
 'Team': 'Wake Forest',
 'Year': '2016'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/670/12424',
 'Team': 'Texas St.',
 'Year': '2017'}
2017-12-11 10:09:48 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/700/12424',
 'Team': 'Tex

2017-12-11 10:09:49 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2017&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/811/12424',
 'Team': 'Wyoming',
 'Year': '2017'}
2017-12-11 10:09:51 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB> (referer: None)
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/721/11980',
 'Team': 'Air Force',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/5/11980', 'Team': 'Akron', 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from 

2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/96/11980',
 'Team': 'Fresno St.',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/253/11980',
 'Team': 'Ga. Southern',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/257/11980',
 'Team': 'Georgia',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/254/11980',
 'Team': 'Georgia 

2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/490/11980',
 'Team': 'NC State',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/726/11980', 'Team': 'Navy', 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/463/11980',
 'Team': 'Nebraska',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/466/11980',
 'Team': 'Nevada',
 'Year': 

2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/688/11980',
 'Team': 'Syracuse',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/698/11980', 'Team': 'TCU', 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/690/11980',
 'Team': 'Temple',
 'Year': '2015'}
2017-12-11 10:09:52 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/team/inst_team_list?academic_year=2015&conf_id=-1&division=11&sport_code=MFB>
{'Link': 'http://stats.ncaa.org/team/694/11980',
 'Team': 'Tennessee',
 'Year': 

2017-12-11 10:09:52 [scrapy.core.engine] INFO: Spider closed (finished)
