In [1]:
import re, urllib, datetime, io
import pandas as pd
from scrapy import Spider, Request, signals
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.http.cookies import CookieJar
from bs4 import BeautifulSoup as bs

In [2]:
class DataSpider(Spider):
    """Scrape a team page for interesting links and its three stats tables.

    For every team page requested in ``start_requests`` the spider:

    * collects every link whose URL matches one of ``patterns`` and appends
      it to ``newLinksFile``;
    * reads the HTML tables on the page and appends the results, team stats
      and individual stats tables to ``Results.csv``, ``TeamStats.csv`` and
      ``Individual.csv`` respectively.

    The output files are (re)created with their header row when the class
    body is executed, i.e. every time the defining cell is run.
    """

    name = "Data"

    # Settings
    custom_settings = {'DOWNLOAD_DELAY': 1.5}

    # Timestamp embedded in the per-run output file names.
    # NOTE(review): str(datetime) contains spaces and colons, which are not
    # valid in file names on Windows - kept as is so existing names still match.
    ts = datetime.datetime.now()

    # Input CSV listing the team pages to crawl; must have `link` and `year` columns.
    teams_file = "Teams_2017-12-02 14:55:51.037193.csv"

    # Number of rows of `teams_file` to crawl; set to None to crawl all of them.
    max_teams = 1

    # Regex patterns (raw strings so the backslashes reach the regex engine untouched)
    patterns = {'people': r".*/\d+\?sport_code\=MFB",
                'history': r".*\/teams\/history\/MFB\/\d+",
                'roster': r".*\/team\/\d+\/roster\/\d+",
                'stats': r".*\/team\/\d+\/stats\/\d+",
                }

    # Files
    # NOTE(review): `_files` is not referenced anywhere else in this class.
    _files = {'people': {'file': "People_{0}.csv".format(ts),
                         'header': "team,person,link,\n",
                         },
              'history': {'file': "History_{0}.csv".format(ts),
                          'header': "team,link,\n",
                          },
              'roster': {'file': "Roster_{0}.csv".format(ts),
                         'header': "team,link,\n",
                         },
              'stats': {'file': "Stats_{0}.csv".format(ts),
                        'header': "team,link,\n"
                        },
              }

    newLinksFile = "NewLinks{0}.csv".format(ts)
    resultsFile = "Results.csv"
    teamStatsFile = "TeamStats.csv"
    individualFile = "Individual.csv"

    # Column order used when appending matched links to `newLinksFile`.
    _link_columns = ['team', 'link', 'txt', 'key', 'match']

    # Each output file is truncated and given its header row here, at class
    # definition time; `parse` only ever appends to them.
    with open(newLinksFile, 'w') as f:
        f.write('team,link,txt,key,match\n')

    # Table results
    with open(resultsFile, "w") as f:
        f.write("Team,Date,Opponent,Result\n")

    # Table Team stats
    with open(teamStatsFile, "w") as f:
        f.write("Team,Stat,Rank,Value,Year\n")

    # Individual
    with open(individualFile, "w") as f:
        f.write("Team,Stat,Player,Value,Year\n")

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook ``spider_closed`` to the matching signal."""
        spider = super(DataSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def start_requests(self):
        """Yield one request per row of ``teams_file`` (limited by ``max_teams``).

        The row's ``year`` is carried in ``Request.meta`` so that ``parse``
        can tag the scraped rows with it.
        """
        # 'records' is the full orient name; the abbreviation 'record' is
        # rejected by newer pandas releases.
        teams = pd.read_csv(self.teams_file).to_dict(orient='records')

        for team in teams[:self.max_teams]:
            # NOTE(review): REDIRECT_ENABLED is a Scrapy *setting*; as a meta
            # key it is presumably ignored - kept unchanged, confirm and drop.
            yield Request(url=team['link'],
                          callback=self.parse,
                          meta={'REDIRECT_ENABLED': True, 'year': team['year']}
                          )

    @staticmethod
    def _prepare_table(table, team, **extra):
        """Use row 1 as the header, drop rows 0-1 and add team/extra columns."""
        header = table.iloc[1]
        return table.rename(columns=header).drop([0, 1]).assign(team=team, **extra)

    @staticmethod
    def _append_csv(frame, path):
        """Append ``frame`` to the CSV at ``path`` without index or header."""
        frame.to_csv(path, mode='a', index=False, header=False)

    def parse(self, response):
        """Extract matching links and the three stats tables from a team page.

        Pages without a team name, or with fewer than four HTML tables, are
        logged and skipped instead of raising.
        """
        # Team
        team = response.selector.xpath("//body//div//fieldset//legend//a/text()").extract_first()
        if team is None:
            self.logger.warning("No team name found on %s; page skipped", response.url)
            return
        print(team)

        year = response.meta['year']

        # Keep every link on the page whose URL matches one of the patterns;
        # a link matching several patterns yields one row per pattern.
        link_rows = [{'team': team,
                      'link': link.url,
                      'txt': link.text,
                      'key': key,
                      'match': True,
                      'year': year,
                      }
                     for link in LinkExtractor().extract_links(response)
                     for key, pattern in self.patterns.items()
                     if re.search(pattern, link.url) is not None]

        # Passing `columns` keeps the frame well-formed (and the write a
        # no-op) when no link matched, instead of failing on missing columns.
        self._append_csv(pd.DataFrame(link_rows, columns=self._link_columns),
                         self.newLinksFile)

        # Tables
        try:
            tables = pd.read_html(io.StringIO(response.text))
        except ValueError:
            # read_html raises ValueError when the page contains no tables.
            tables = []

        if len(tables) < 4:
            self.logger.warning("Expected at least 4 tables on %s, found %d; tables skipped",
                                response.url, len(tables))
            return

        # Results
        results = self._prepare_table(tables[1], team)
        # Team stats
        team_stats = self._prepare_table(tables[2], team, year=year)
        # Individual stats
        individual = self._prepare_table(tables[3], team, year=year)

        # Append; the last row of both stats tables is dropped, as before.
        self._append_csv(results[['team', 'Date', 'Opponent', 'Result']],
                         self.resultsFile)
        self._append_csv(team_stats[['team', 'Stat', 'Rank', 'Value', 'year']].iloc[:-1],
                         self.teamStatsFile)
        self._append_csv(individual[['team', 'Stat', 'Player', 'Value', 'year']].iloc[:-1],
                         self.individualFile)

    def spider_closed(self, spider):
        """Signal handler: announce that the crawl has finished."""
        print("Spider is done")
            

In [3]:
# Crawler-wide settings: identify as a desktop Chrome browser.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
}

# Register the spider with a fresh crawler process, then start the crawl.
process = CrawlerProcess(crawler_settings)
process.crawl(DataSpider)
process.start()

2017-12-06 21:24:15 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-06 21:24:15 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
2017-12-06 21:24:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-06 21:24:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddle

Air Force Falcons
Spider is done
