# Collect This Year's Tournament Data
Authors: Connor Finn, Riley Greene <br>
Date: March 14, 2020 <br>
Note: <dir>
This notebook should only be run once, and on the correct day: after all regular-season games have finished.

In [12]:
# our imports for the model
from io import StringIO

import lxml.html as lh
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Create a Data Pipeline
This defines where the collected data is saved. It creates a folder titled "tournament_data" if one does not already exist, and writes the joined dataset to `prelim_data.csv` inside it. Any existing `prelim_data.csv` in that folder will be overwritten, so be sure to run this on the correct date.

In [18]:
import os

class collect_pipeline(object):
    """Join the two scraped tables and save the result as a CSV file.

    Typical use: ``make_folder()`` (optional), ``join_data(spider)``, then
    ``write_data()``.
    """

    # Where the joined dataset is written, relative to the working directory.
    output_dir = './tournament_data'
    output_file = 'prelim_data.csv'

    # Columns kept from the merged table, in output order. 'PTS.1' is renamed
    # to 'opp_PPG' after selection.
    cols_keep = ['Team', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA', '3P%',
                 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
                 'PF', 'PTS/G', 'W-L%', 'SRS', 'SOS', 'PTS.1', 'Seed']

    def make_folder(self):
        """Create the output folder if it does not already exist."""
        # exist_ok avoids the check-then-create race and makes re-runs safe.
        os.makedirs(self.output_dir, exist_ok=True)

    def join_data(self, spider):
        """Merge ``spider.data_one`` with ``spider.data_two`` on 'Team'.

        A left join is used, so every row of ``spider.data_one`` is kept.
        The result is stored on ``self.joined_data``.

        Raises:
            KeyError: if the merged table lacks any column in ``cols_keep``.
        """
        season_stats = pd.merge(spider.data_one, spider.data_two, on=['Team'], how='left')
        season_stats = season_stats[self.cols_keep]
        season_stats = season_stats.rename(columns={"PTS.1": "opp_PPG"})
        self.joined_data = season_stats

    def write_data(self):
        """Write ``self.joined_data`` to the output CSV, overwriting any existing file.

        ``join_data`` must have been called first.
        """
        # Ensure the folder exists so a fresh instance can write without a
        # prior make_folder() call.
        self.make_folder()
        self.joined_data.to_csv(os.path.join(self.output_dir, self.output_file), index=False)


## Create a Web Scraper
This class will collect the data in the exact same format as the original data collection. <br>
We will be scraping from sports-reference.com websites in order to collect a consistent dataset.

In [20]:

class bball_scraper():
    """Scrape per-team tables from the configured URLs into DataFrames.

    Usage: call ``set_urls`` with the first URL list and run ``parse_one``,
    then call ``set_urls`` with the second list and run ``parse_two``.
    Results are stored on ``data_one`` and ``data_two``.
    """

    # Seconds to wait for a page before giving up.
    request_timeout = 30

    # Flat column names assigned to the win/loss table after its top header
    # level is dropped. The final 'Team' entry is the column parse_two adds.
    history_columns = ['Rk', 'Season', 'Conf', 'W', 'L', 'W-L%', 'W.1', 'L.1', 'W-L%.1',
                       'SRS', 'SOS', 'PTS', 'PTS.1', 'AP Pre', 'AP High', 'AP Final',
                       'NCAA Tournament', 'Seed', 'Coach(es)', 'Team']

    def __init__(self):
        self.start_urls = []
        self.data_one = None   # set by parse_one
        self.data_two = None   # set by parse_two

    def set_urls(self, url_list):
        """Set the pages to visit.

        ``url_list`` is a list of dicts. ``parse_one`` reads the 'name',
        'year' and 'url' keys; ``parse_two`` reads 'name', 'url' and 'seed'.
        """
        self.start_urls = url_list

    def _fetch_tables(self, url):
        """Download ``url`` and return every <table> element found on the page."""
        page = requests.get(url, timeout=self.request_timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "lxml")
        return soup.find_all('table')

    def parse_one(self):
        """Scrape the second table of each page and stack the results in ``data_one``.

        With no URLs set, ``data_one`` becomes an empty DataFrame.
        """
        frames = []
        for item in self.start_urls:
            team_name = item['name']
            year = item['year']
            tables = self._fetch_tables(item['url'])

            # The table of interest is the second one on the page.
            df = pd.read_html(StringIO(str(tables[1])))[0]
            # Drop the rows labelled 1-3 (presumably the rank/opponent rows,
            # leaving the team's own totals; verify against the live page).
            df = df.drop([1, 2, 3])
            # Replace the literal 'Team' cell with the team's name.
            df = df.replace('Team', team_name)
            df['Date'] = year   # only the year is stored in this column
            df = df.rename(columns={'Unnamed: 0': 'Team'})
            frames.append(df)

        # Concatenate once rather than growing a frame inside the loop.
        if frames:
            self.data_one = pd.concat(frames, ignore_index=True, sort=False)
        else:
            self.data_one = pd.DataFrame()

    def parse_two(self):
        """Scrape the first row of each page's first table into ``data_two``.

        Each row is tagged with the team name and the seed from its URL dict.
        With no URLs set, ``data_two`` becomes an empty DataFrame that carries
        the ``history_columns`` names.
        """
        frames = []
        for item in self.start_urls:
            team_name = item['name']
            tables = self._fetch_tables(item['url'])

            # Build a dataframe of the win/loss data; only the first row is kept.
            win_loss_df = pd.read_html(StringIO(str(tables)))[0]
            # Copy so the assignments below do not write into a slice.
            first_row = win_loss_df.head(1).copy()
            # Columns are two-level at this point, hence the tuple keys.
            first_row['Team', 'Team'] = team_name
            first_row['Unnamed: 17_level_0', 'Seed'] = item["seed"]
            frames.append(first_row)

        if not frames:
            self.data_two = pd.DataFrame(columns=self.history_columns)
            return

        wl_teams = pd.concat(frames, ignore_index=True)

        # Flatten the two-level header, then apply the fixed column names.
        wl_teams.columns = wl_teams.columns.droplevel(level=0)
        wl_teams.columns = list(self.history_columns)

        self.data_two = wl_teams

### Generate URLS 
The scraper defined above takes in a list of dictionaries, each describing one URL to visit and collect data from. This part of the notebook generates those URLs.

In [21]:
# Restore the `ncaa_teams` variable saved earlier with IPython's %store magic
# (per the original note, it was stored from the ncaa_framework notebook).
# url_generator below indexes it by region to get a list of team names in seed order.
%store -r ncaa_teams

In [22]:
class url_generator():
    """Build the two lists of sports-reference URLs the scraper visits.

    Args:
        team_names: mapping of region -> list of sports-reference URL names
            for the teams, listed in seed order (first entry is seed 1).
        year: season used in the per-season URLs. Defaults to 2020.

    After ``build_array()``, ``build_url_one()`` and ``build_url_two()`` the
    results are available on ``url_list_one`` and ``url_list_two``.
    """

    base_url = "https://www.sports-reference.com/cbb/schools/"

    def __init__(self, team_names, year=2020):
        self.teams = []
        self.team_names = team_names
        self.url_list_one = []
        self.url_list_two = []
        self.year = year

    def build_array(self):
        """Fill ``self.teams`` with one ``[name, year, seed]`` entry per team.

        The seed is the team's 1-based position in its region's list. The
        list is rebuilt from scratch on every call, so calling this twice
        does not duplicate teams.
        """
        teams = []
        for region in self.team_names:
            team_list = self.team_names[region]
            for seed, sr_name in enumerate(team_list, start=1):
                teams.append([sr_name, self.year, seed])
        self.teams = teams

    def build_url_one(self):
        """Build the per-season page URL for every team into ``url_list_one``.

        Each entry is a dict with 'name', 'year' and 'url' keys.
        """
        url_one = []
        for team_name, _year, _seed in self.teams:
            url = self.base_url + str(team_name) + "/" + str(self.year) + ".html"  # season data
            url_one.append({'name': team_name, 'year': self.year, 'url': url})

        self.url_list_one = url_one

    def build_url_two(self):
        """Build the school overview page URL for every team into ``url_list_two``.

        Each entry is a dict with 'name', 'year' (always None), 'url' and
        'seed' keys.
        """
        url_two = []
        for team_name, _year, seed in self.teams:
            url = self.base_url + str(team_name) + "/"
            url_two.append({'name': team_name, 'year': None, 'url': url, 'seed': seed})

        self.url_list_two = url_two
        

## Collect the Data 
This part of the notebook is where we employ the created data scraper and data pipelines. 

In [25]:
%%capture [--no-stderr] 
print("start")
# create data pipeline object
pipe = collect_pipeline()
pipe.make_folder()

# make the spider object
spider = bball_scraper()

# make url_generator object
go_daddy = url_generator( ncaa_teams )

# Step 1: Make the urls 
go_daddy.build_array()
go_daddy.build_url_one()
go_daddy.build_url_two()

# parse the first
spider.set_urls(go_daddy.url_list_one)
spider.parse_one()


# parse the second
spider.set_urls(go_daddy.url_list_two)
spider.parse_two()



# save the other
pipe = collect_pipeline()
pipe.join_data(spider)
pipe.write_data()

print("finished_running")

KeyboardInterrupt: 