In [1]:
import scrapy, os
import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter
from scrapy.linkextractors import LinkExtractor
from scrapy.item import DictItem, Field

In [2]:
def create_item_class(class_name,field_list):
    """Build a new ``DictItem`` subclass at runtime.

    Args:
        class_name: Name given to the generated class.
        field_list: Iterable of field names; each one is declared on the
            generated class with its own fresh ``Field()``.

    Returns:
        The newly created class, whose ``fields`` attribute maps every name
        in ``field_list`` to a ``Field`` instance.
    """
    declared_fields = {name: Field() for name in field_list}
    return type(class_name, (DictItem,), {'fields': declared_fields})


In [3]:
class Pipeline(object):
    """Scrapy item pipeline that writes every item type to its own CSV file.

    The output file is named after the item's class name (with any
    ``"item_"`` substring removed, lower-cased). Items whose resulting name
    contains ``"link"`` are written under ``Links/``; everything else goes
    under ``Data/``.
    """

    def __init__(self):
        # Both dicts are keyed by the output file's base name (no extension).
        self.files = {}
        self.exporters = {}

    def create_file(self, file, _dir):
        """Open ``<_dir>/<file>.csv`` and start a CSV exporter for it.

        Does nothing when ``file`` has already been opened by this pipeline.

        Args:
            file: Base name of the CSV file, without the ``.csv`` extension.
            _dir: Directory the file is written to; created if missing.
        """
        if file in self.files:
            return

        # Make sure the target directory exists so the first item of a run
        # does not fail on a fresh checkout.
        if _dir and not os.path.isdir(_dir):
            os.makedirs(_dir)

        handle = open(os.path.join(_dir, file + '.csv'), 'wb')
        try:
            exporter = CsvItemExporter(handle)
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle (or register a half-initialised entry)
            # when the exporter cannot be set up.
            handle.close()
            raise

        self.files[file] = handle
        self.exporters[file] = exporter

    def close_spider(self, spider):
        """Finish every exporter and close every file opened by this pipeline."""
        try:
            for exporter in self.exporters.values():
                exporter.finish_exporting()
        finally:
            # Close the files even if one exporter failed to finish.
            for handle in self.files.values():
                handle.close()

    def process_item(self, item, spider):
        """Export ``item`` to the CSV file matching its class name.

        Args:
            item: The scraped item; its class name selects the output file.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # Derive the output file name from the item's type.
        file = type(item).__name__.replace("item_", "").lower()
        _dir = "Links" if "link" in file else "Data"

        self.create_file(file, _dir)
        self.exporters[file].export_item(item)

        return item

In [4]:
class TeamNameSpider(scrapy.Spider):
    """Spider that collects short-name / long-name pairs for teams.

    It starts from the URLs listed in ``Links/links_gamebygame.csv``, follows
    every team link found on those pages, and yields one item per team page
    holding the link text (``shortName``) and the page's legend text
    (``longName``).
    """
    name = "TeamNameSpider"

    # How many rows of the links file to request. Defaults to 1 (the value
    # that was previously hard-coded); set to None to request every row.
    max_links = 1

    def start_requests(self):
        """Yield one request per row of the links CSV, up to ``max_links``.

        The CSV is read with pandas and must provide ``link``, ``team`` and
        ``year`` columns.
        """
        file_in = 'links_gamebygame.csv'

        # Pull in the data
        data = pd.read_csv(os.path.join("Links", file_in))
        # 'records' is the documented orient name; the abbreviated 'record'
        # is rejected by newer pandas releases.
        _list = data.to_dict(orient='records')

        # Loop through the links
        for url in _list[:self.max_links]:
            print(url['link'])
            yield scrapy.Request(url=url['link'],
                                 callback=self.parse,
                                 meta={'team': url['team'],
                                       'year': url['year'],
                                      }
                                )

    def parse(self, response):
        """Follow every team link on the page, carrying the link text along."""
        # Raw string so the backslashes reach the regex engine without
        # triggering invalid-escape warnings; the pattern text is unchanged.
        regex = r"http\:\/\/stats\.ncaa\.org\/team\/\d+\/\d+"

        # Get all the links on the page
        le = LinkExtractor(allow=regex)
        links = le.extract_links(response)

        # Build up the output
        for link in links:
            # NOTE(review): link.text is passed through as-is; scraped short
            # names can carry surrounding whitespace - confirm whether
            # downstream consumers expect it stripped.
            yield scrapy.Request(url=link.url,
                                 callback=self.parse_data,
                                 meta={'team': link.text,}
                                )

    def parse_data(self, response):
        """Yield a single item pairing the link text with the page's team name."""
        # extract_first() gives None when the XPath matches nothing.
        teamName = response.xpath('//*[@id="contentarea"]/fieldset/legend/a/text()').extract_first()
        record = {'shortName': response.meta['team'],
                  'longName': teamName
                 }

        # Create dynamic items
        field_list = record.keys()
        DynamicItem = create_item_class('gamebygame_teamNames', field_list)
        item = DynamicItem()

        # yield items
        for k, v in record.items():
            item[k] = v
        yield item
        

In [5]:
# Configure an in-process crawler. The pipeline is referenced through
# '__main__' because the Pipeline class is defined in this notebook itself.
process = CrawlerProcess(
    {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'__main__.Pipeline': 100},
    }
)

# Schedule the spider, then start the crawl.
process.crawl(TeamNameSpider)
process.start()

2017-12-30 09:54:11 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-12-30 09:54:11 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2017-12-30 09:54:11 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2017-12-30 09:54:11 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'sc

http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=12240&org_id=721&stats_player_seq=-100


2017-12-30 09:54:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=12240&org_id=721&stats_player_seq=-100> (referer: None)
2017-12-30 09:54:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.ncaa.org/teams/62793> from <GET http://stats.ncaa.org/team/721/11520>
2017-12-30 09:54:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.ncaa.org/teams/19630> from <GET http://stats.ncaa.org/team/107/12240>
2017-12-30 09:54:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.ncaa.org/teams/19729> from <GET http://stats.ncaa.org/team/731/12240>
2017-12-30 09:54:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.ncaa.org/teams/19727> from <GET http://stats.ncaa.org/team/725/12240>
2017-12-30 09:54:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.nc

2017-12-30 09:54:31 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET http://stats.ncaa.org/teams/19726> from <GET http://stats.ncaa.org/team/721/12240>
2017-12-30 09:54:31 [scrapy.core.scraper] DEBUG: Scraped from <200 http://stats.ncaa.org/teams/19703>
{'longName': 'San Jose St. Spartans', 'shortName': ' San Jose St. '}
2017-12-30 09:54:31 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://stats.ncaa.org/teams/19630> (failed 2 times): 500 Internal Server Error
2017-12-30 09:54:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://stats.ncaa.org/teams/66692> (referer: http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=12240&org_id=721&stats_player_seq=-100)
2017-12-30 09:54:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://stats.ncaa.org/teams/19728> (referer: http://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=12240&org_id=721&stats_player_seq=-100)
2017-12-30 09:54:32 [scrapy.core.scraper] DEBUG: Scraped from 