In [1]:
import re, urllib, datetime, io, scrapy, os
import pandas as pd
from scrapy.exporters import CsvItemExporter
from scrapy import Spider, Request, signals
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.http.cookies import CookieJar
from bs4 import BeautifulSoup as bs

In [2]:
class item_Links(scrapy.Item):
    """Scrapy Item for Team Links for 2014-2017.

    One item describes a single hyperlink found on a team page whose URL matched
    one of the spider's regex patterns.

    The class name is significant: the CSV pipeline in this notebook strips the
    "item_" prefix and lower-cases the rest to choose the exporter, so these
    items go to the 'links' exporter.
    """
    team = scrapy.Field()  # team name taken from the page's legend link text
    link = scrapy.Field()  # URL of the extracted link
    txt = scrapy.Field()   # anchor text of the extracted link
    key = scrapy.Field()   # key of the spider pattern that matched the URL
    year = scrapy.Field()  # year carried in the request meta for the page

In [3]:
class item_Results(scrapy.Item):
    """Scrapy Item yearly team results.

    Built by the spider from one row of the second table (index 1) on a team
    page. The class name is significant: the CSV pipeline in this notebook maps
    it to the 'results' exporter by stripping "item_" and lower-casing.
    """
    Team = scrapy.Field()      # team name, added by the spider to every row
    # The remaining fields are filled from the table's own header row;
    # presumably the page labels them Date / Opponent / Result - verify
    # against the live page, since an unexpected header raises KeyError here.
    Date = scrapy.Field()
    Opponent = scrapy.Field()
    Result = scrapy.Field()

In [4]:
class item_TeamStats(scrapy.Item):
    """Scrapy Item yearly team stats.

    Built by the spider from one row of the third table (index 2) on a team
    page. The class name is significant: the CSV pipeline in this notebook maps
    it to the 'teamstats' exporter by stripping "item_" and lower-casing.
    """
    Team = scrapy.Field()   # team name, added by the spider to every row
    Stat = scrapy.Field()   # statistic label, e.g. '4th Down Conversion Pct'
    Rank = scrapy.Field()   # rank as scraped text; may carry a prefix such as 'T-89'
    Value = scrapy.Field()  # statistic value as scraped text, e.g. '0.593'
    Year = scrapy.Field()   # year carried in the request meta for the page

In [5]:
class item_IndividualLeaders(scrapy.Item):
    """Scrapy Item yearly individual leaders.

    Built by the spider from one row of the fourth table (index 3) on a team
    page. The class name is significant: the CSV pipeline in this notebook maps
    it to the 'individualleaders' exporter by stripping "item_" and lower-casing.
    """
    Team = scrapy.Field()    # team name, added by the spider to every row
    # Stat / Player / Value come from the table's own header row; presumably
    # the statistic label, the leading player and that player's figure -
    # TODO confirm against the live page.
    Stat = scrapy.Field()
    Player = scrapy.Field()
    Value = scrapy.Field()
    Year = scrapy.Field()    # year carried in the request meta for the page

In [6]:
class LinksResultsTeamIndividualCSVPipeline(object):
    """Scrapy Pipeline Object for Links, Results, Team, Individual.

    Keeps one ``CsvItemExporter`` per entry in ``types`` and writes each to
    ``Data/<type>.csv``. An item is routed by its class name: the ``item_``
    prefix is stripped and the remainder lower-cased, so ``item_TeamStats``
    goes to the ``teamstats`` exporter.
    """
    # Exporter keys; each must equal an item class name minus "item_", lower-cased.
    types = ['links', 'results', 'teamstats', 'individualleaders']

    def open_spider(self, spider):
        """Create the CSV files and start one exporter per item type.

        If any file cannot be opened, the files opened so far are closed
        before the error is re-raised, so no handle is leaked.
        """
        self.files = {}
        self.exporters = {}
        try:
            for item_type in self.types:
                # Scrapy exporters expect a file opened in binary mode.
                handle = open(os.path.join("Data", item_type + '.csv'), 'wb')
                self.files[item_type] = handle
                exporter = CsvItemExporter(handle)
                exporter.start_exporting()
                self.exporters[item_type] = exporter
        except Exception:
            for handle in self.files.values():
                handle.close()
            self.files = {}
            self.exporters = {}
            raise

    def close_spider(self, spider):
        """Finish every exporter and close every file.

        Safe to call even if ``open_spider`` never ran; the files are closed
        even when an exporter fails while finishing.
        """
        try:
            for exporter in getattr(self, 'exporters', {}).values():
                exporter.finish_exporting()
        finally:
            for handle in getattr(self, 'files', {}).values():
                handle.close()

    def process_item(self, item, spider):
        """Export ``item`` with the exporter matching its class name.

        Items whose type has no exporter are returned untouched, so that any
        later pipeline can still handle them, rather than raising ``KeyError``.

        Returns:
            The same ``item`` that was passed in.
        """
        # Define what item type it is
        item_type = type(item).__name__.replace("item_", "").lower()

        exporter = self.exporters.get(item_type)
        if exporter is None:
            if spider is not None:
                spider.logger.debug("No CSV exporter for item type %r; passing through",
                                    item_type)
            return item

        # Export
        exporter.export_item(item)
        return item

In [7]:
class DataSpider(Spider):
    """Spider that scrapes team pages listed in ``Data/TeamLinks.csv``.

    For every page it yields:

    * one ``item_Links`` per link whose URL matches an entry in ``patterns``;
    * one ``item_Results`` per row of the second table on the page;
    * one ``item_TeamStats`` per row (except the last) of the third table;
    * one ``item_IndividualLeaders`` per row (except the last) of the fourth table.
    """
    name = "Data"

    # Settings
    custom_settings = {'DOWNLOAD_DELAY': '1.5'}

    # How many rows of TeamLinks.csv to crawl. The default of 1 keeps the
    # original single-page run; set to None to crawl every row.
    link_limit = 1

    # Regex patterns (raw strings, so the backslashes reach ``re`` unchanged
    # without relying on invalid string escapes).
    patterns = {'people' : r".*/\d+\?sport_code\=MFB",
                'history' : r".*\/teams\/history\/MFB\/\d+",
                'roster' : r".*\/team\/\d+\/roster\/\d+",
                'stats' : r".*\/team\/\d+\/stats\/\d+",
               }

    def start_requests(self):
        """Read in TeamLinks and start crawling.

        Expects the CSV to have ``Link`` and ``Year`` columns. The year is
        forwarded to ``parse`` through the request meta.
        """
        links = pd.read_csv(os.path.join("Data", "TeamLinks.csv"))
        records = links.to_dict(orient='records')
        if self.link_limit is not None:
            records = records[:self.link_limit]

        for link in records:
            # NOTE(review): 'REDIRECT_ENABLED' is a Scrapy *setting* name, not
            # a recognised meta key, so it probably has no effect here. Kept so
            # that response.meta stays unchanged - confirm and drop if unused.
            yield Request(url=link['Link'],
                          callback=self.parse,
                          meta={'REDIRECT_ENABLED': True, 'year': link['Year']}
                         )

    @staticmethod
    def _table_records(table, team, year=None, drop_last=False):
        """Turn one raw page table into a list of row dicts.

        Row 1 of ``table`` supplies the column names; rows 0 and 1 are then
        dropped. ``Team`` (and ``Year`` when given) are added to every row, and
        the final row is discarded when ``drop_last`` is true. The input frame
        is not modified.
        """
        frame = table.rename(columns=table.iloc[1]).drop([0, 1])
        frame['Team'] = team
        if year is not None:
            frame['Year'] = year
        if drop_last:
            frame = frame[:-1]
        return frame.to_dict(orient='records')

    def parse(self, response):
        """Yield link, result, team-stat and individual-leader items for one page.

        Pages without a team name, or with fewer than four tables, are logged
        and skipped instead of raising ``IndexError``.
        """
        # Extract Team Name
        team = response.selector.xpath(
            "//body//div//fieldset//legend//a/text()").extract_first()
        if team is None:
            self.logger.warning("No team name found on %s; skipping page", response.url)
            return
        year = response.meta['year']

        # Extract the links and pass them to the pipeline for saving
        for link in LinkExtractor().extract_links(response):
            for key, pattern in self.patterns.items():
                if re.search(pattern, link.url) is not None:
                    yield item_Links({'team': team,
                                      'link': link.url,
                                      'txt': link.text,
                                      'key': key,
                                      'year': year
                                     })

        # Find all the tables. read_html is given a file-like object because
        # passing literal HTML directly is deprecated in recent pandas.
        try:
            tables = pd.read_html(io.StringIO(response.text))
        except ValueError:
            # pandas raises ValueError when the document contains no tables
            tables = []
        if len(tables) < 4:
            self.logger.warning("Expected at least 4 tables on %s, found %d; skipping tables",
                                response.url, len(tables))
            return

        # Results table (no Year column, every row kept)
        for record in self._table_records(tables[1], team):
            yield item_Results(record)

        # Team stats
        for record in self._table_records(tables[2], team, year=year, drop_last=True):
            yield item_TeamStats(record)

        # Individual stats
        for record in self._table_records(tables[3], team, year=year, drop_last=True):
            yield item_IndividualLeaders(record)
            

In [8]:
# Crawler-wide settings: a browser-like user agent, plus the CSV pipeline
# registered by its import path in this notebook's __main__ namespace.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'ITEM_PIPELINES': {'__main__.LinksResultsTeamIndividualCSVPipeline': 100},
}
process = CrawlerProcess(crawler_settings)

# Schedule the spider, then start the crawl.
process.crawl(DataSpider)
process.start()

2017-12-11 09:31:38 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-11 09:31:38 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-11 09:31:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-11 09:31:38 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc

2017-12-11 09:31:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/62793>
{'Rank': '29',
 'Stat': '4th Down Conversion Pct',
 'Team': 'Air Force Falcons',
 'Value': '0.593',
 'Year': 2014}
2017-12-11 09:31:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/62793>
{'Rank': '123',
 'Stat': '3rd Down Conversion Pct Defense',
 'Team': 'Air Force Falcons',
 'Value': '0.589',
 'Year': 2014}
2017-12-11 09:31:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/62793>
{'Rank': 'T-89',
 'Stat': '4th Down Conversion Pct Defense',
 'Team': 'Air Force Falcons',
 'Value': '0.579',
 'Year': 2014}
2017-12-11 09:31:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/62793>
{'Rank': '5',
 'Stat': 'Red Zone Offense',
 'Team': 'Air Force Falcons',
 'Value': '0.946',
 'Year': 2014}
2017-12-11 09:31:47 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/62793>
{'Rank': '102',
 'S