# Scrape Sports Reference for CBB data 

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

# Column order of the per-season team-stats table; each entry lines up
# positionally with the list of scraped columns assembled for a season.
head_names = [
    'School',
    'W%',
    'SRS',
    'SOS',
    'TmPts',
    'OppPts',
    'Pace',
    'ORtg',
    'FTr',
    '3PAr',
    'TS%',
    'TRB%',
    'AST%',
    'STL%',
    'BLK%',
    'eFG%',
    'TOV%',
    'ORB%',
    'FT/FGA',
    'sched_url',
]

def num(s):
    """Parse *s* as a number, preferring int and falling back to float.

    A ValueError from the int conversion triggers the float attempt; a
    ValueError from the float conversion propagates to the caller.
    """
    try:
        value = int(s)
    except ValueError:
        value = float(s)
    return value


# Initial season list (2017 back to 2010); `years` is reassigned in the
# following cell to a longer range.
years = [str(season) for season in range(2017, 2009, -1)]


In [2]:
# Season labels as strings, newest first: 2017 down to 1985 inclusive.
years = list(map(str, range(2017, 1984, -1)))
years

['2017',
 '2016',
 '2015',
 '2014',
 '2013',
 '2012',
 '2011',
 '2010',
 '2009',
 '2008',
 '2007',
 '2006',
 '2005',
 '2004',
 '2003',
 '2002',
 '2001',
 '2000',
 '1999',
 '1998',
 '1997',
 '1996',
 '1995',
 '1994',
 '1993',
 '1992',
 '1991',
 '1990',
 '1989',
 '1988',
 '1987',
 '1986',
 '1985']

In [3]:
decode = "utf-8"  # encoding applied to scraped cell text before parsing


def _stat_column(soup, data_stat):
    """Return the parsed values of every ``td`` cell tagged *data_stat*.

    Each cell's text is encoded with ``decode`` and handed to ``num``; an
    empty cell yields the string 'nan' as a missing-value placeholder, so
    the text written to the CSV for empty cells stays 'nan'.
    """
    cells = soup.findAll("td", {"data-stat": data_stat})
    texts = [cell.getText().encode(decode) for cell in cells]
    return [num(text) if text else 'nan' for text in texts]


# For each season: download the advanced school stats page, pull one list per
# stat column, and write the assembled table to data/team/team_stats_<year>.csv.
for year in years:

    print("Compiling the data for " + year)

    url = "http://www.sports-reference.com/cbb/seasons/"+year+"-advanced-school-stats.html"
    r = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently writing an empty CSV for this season.
    r.raise_for_status()
    data = r.text
    soup = BeautifulSoup(data, "html5lib")

    # The school-name cells supply the schedule URL, a slug-style team name
    # and the display name.
    name_cells = soup.findAll("td", {"data-stat": "school_name"})
    # Swap the trailing ".html" of each school link for "-schedule.html".
    team_schedurl = [node.find('a')['href'][:-5]+'-schedule.html' for node in name_cells]
    raw_names = [node.getText().encode(decode) for node in name_cells]

    # Lower-cased, dash-separated name; everything from the first 0xC2 byte
    # onward is dropped before decoding as ASCII.
    team_names = [
        raw.lower().replace(b" *", b" ").replace(b" ", b"-").split(b'\xc2')[0].decode('ascii')
        for raw in raw_names
    ]
    # Display name: same cleanup without lower-casing or dash substitution.
    full_names = [
        raw.replace(b" *", b" ").split(b'\xc2')[0].decode('ascii')
        for raw in raw_names
    ]

    team_WLpct = _stat_column(soup, "win_loss_pct")
    team_srs = _stat_column(soup, "srs")
    team_sos = _stat_column(soup, "sos")
    team_TmPts = _stat_column(soup, "pts")
    team_OppPts = _stat_column(soup, "opp_pts")
    team_pace = _stat_column(soup, "pace")
    team_ORtg = _stat_column(soup, "off_rtg")
    team_FTr = _stat_column(soup, "fta_per_fga_pct")
    team_3Ar = _stat_column(soup, "fg3a_per_fga_pct")
    team_TSpct = _stat_column(soup, "ts_pct")
    team_TRBpct = _stat_column(soup, "trb_pct")
    team_ASTpct = _stat_column(soup, "ast_pct")
    team_STLpct = _stat_column(soup, "stl_pct")
    team_BLKpct = _stat_column(soup, "blk_pct")
    team_eFGpct = _stat_column(soup, "efg_pct")
    team_TOVpct = _stat_column(soup, "tov_pct")
    team_ORBpct = _stat_column(soup, "orb_pct")
    # Kept in its own variable: previously this reused the name team_FTr,
    # which overwrote the 'FTr' data so both 'FTr' and 'FT/FGA' columns
    # ended up holding the ft_rate values.
    team_FTperFGA = _stat_column(soup, "ft_rate")

    # Order must match head_names position for position.
    stats_list = [team_names, team_WLpct, team_srs, team_sos, team_TmPts,
                  team_OppPts, team_pace, team_ORtg, team_FTr, team_3Ar,
                  team_TSpct, team_TRBpct, team_ASTpct, team_STLpct,
                  team_BLKpct, team_eFGpct, team_TOVpct, team_ORBpct,
                  team_FTperFGA, team_schedurl]

    team_dict = dict(zip(head_names, stats_list))

    team_data = pd.DataFrame(team_dict)

    # Enforce the column order given by head_names.
    team_data = team_data[head_names]

    # Fourth '/'-separated component of the schedule URL (index 3).
    team_data['name'] = [t.split('/')[3] for t in team_schedurl]
    team_data['fullName'] = full_names

    team_data.to_csv("data/team/team_stats_"+year+".csv", index=False)

    

Compiling the data for 2017
Compiling the data for 2016
Compiling the data for 2015
Compiling the data for 2014
Compiling the data for 2013
Compiling the data for 2012
Compiling the data for 2011
Compiling the data for 2010
Compiling the data for 2009
Compiling the data for 2008
Compiling the data for 2007
Compiling the data for 2006
Compiling the data for 2005
Compiling the data for 2004
Compiling the data for 2003
Compiling the data for 2002
Compiling the data for 2001
Compiling the data for 2000
Compiling the data for 1999
Compiling the data for 1998
Compiling the data for 1997
Compiling the data for 1996
Compiling the data for 1995
Compiling the data for 1994
Compiling the data for 1993
Compiling the data for 1992
Compiling the data for 1991
Compiling the data for 1990
Compiling the data for 1989
Compiling the data for 1988
Compiling the data for 1987
Compiling the data for 1986
Compiling the data for 1985
