# Scrape 2018 NCAA Tourney Results

### 1. Use BeautifulSoup to Scrape from NCAA Site

In [197]:
import os
from urllib.request import Request, urlopen

import pandas as pd
import psycopg2 as pg
from bs4 import BeautifulSoup
from psycopg2.extras import execute_values

In [24]:
# Download the interactive bracket page from the NCAA site.
url = "https://www.ncaa.com/interactive-bracket/basketball-men/d1"
# The request is sent with a browser-like User-Agent header
# (presumably because the site rejects urllib's default one -- TODO confirm).
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Read the whole body inside a context manager so the HTTP response is closed
# as soon as the download finishes instead of staying open for the life of the
# kernel. The timeout keeps a stalled request from hanging the notebook.
# `connection` holds the raw HTML bytes, which BeautifulSoup accepts directly.
with urlopen(req, timeout=30) as response:
    connection = response.read()

In [25]:
# Parse the downloaded page held in `connection` with the lxml parser.
soup = BeautifulSoup(markup=connection, features="lxml")

In [82]:
# Locate the bracket container, then collect every game section nested in it.
entire_bracket = soup.find('section', attrs={'id': 'bracket-section'})
if entire_bracket is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than failing on the attribute access below.
    raise ValueError("No <section id='bracket-section'> element found in the downloaded page")
# find_all is the current name of the legacy findChildren alias.
all_games = entire_bracket.find_all('section', attrs={'class': 'game-set'})

In [131]:
# Turn every game node into one record, then collect the records in a frame.
def _game_to_row(game_node):
    """Return (gid, team 1 name, team 1 score, team 2 name, team 2 score).

    The first two 'team-name' and 'team-score' elements of the node are used;
    the scores are converted to int, the game id is kept as text.
    """
    names = game_node.find_all(attrs={'class': 'team-name'})
    points = game_node.find_all(attrs={'class': 'team-score'})
    gid = game_node.find(attrs={'class': 'bracket-game'}).text
    return (gid, names[0].text, int(points[0].text), names[1].text, int(points[1].text))


rows = [_game_to_row(node) for node in all_games]

# store as df
result_columns = ['gid', 'Team1', 'Score1', 'Team2', 'Score2']
df_results = pd.DataFrame(rows, columns=result_columns)
df_results.head()

Unnamed: 0,gid,Team1,Score1,Team2,Score2
0,201,Virginia,54,UMBC,74
1,202,Creighton,59,Kansas St.,69
2,203,Kentucky,78,Davidson,73
3,204,Arizona,68,Buffalo,89
4,205,Miami (Fla.),62,Loyola Chicago,64


### 2. Match to TeamID using team spellings file

In [145]:
# Directory that holds the extracted data files.
extract_path = './data/extracted/'

# Lookup table with one row per known team-name spelling and its TeamID.
spellings_file = extract_path + 'TeamSpellings.csv'
spellings = pd.read_csv(spellings_file, encoding='ISO-8859-1')
spellings.head()

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101


In [181]:
# Lower-cased team names serve as the join keys into the spellings table.
df_results['team1_lower'] = df_results['Team1'].str.lower()
df_results['team2_lower'] = df_results['Team2'].str.lower()

# First lookup: the TeamID of Team1.
df_id = (
    df_results
    .merge(spellings, how='left', left_on='team1_lower', right_on='TeamNameSpelling')
    .rename(columns={'TeamID': 'TeamID_1'})
)

# Second lookup: the TeamID of Team2. Both merges bring in a TeamNameSpelling
# column, so pandas suffixes them _x / _y; they are dropped together with the
# temporary lower-case keys, and the game id becomes the index.
df_id2 = (
    df_id
    .merge(spellings, how='left', left_on='team2_lower', right_on='TeamNameSpelling')
    .rename(columns={'TeamID': 'TeamID_2'})
    .drop(columns=['TeamNameSpelling_x', 'TeamNameSpelling_y', 'team1_lower', 'team2_lower'])
    .set_index('gid')
)
df_id2.head()

Unnamed: 0_level_0,Team1,Score1,Team2,Score2,TeamID_1,TeamID_2
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
201,Virginia,54,UMBC,74,1438.0,1420.0
202,Creighton,59,Kansas St.,69,1166.0,1243.0
203,Kentucky,78,Davidson,73,1246.0,1172.0
204,Arizona,68,Buffalo,89,1112.0,1138.0
205,Miami (Fla.),62,Loyola Chicago,64,1274.0,1260.0


Some team IDs had to be entered manually because the team name scraped from the bracket does not match any spelling in the team spellings file.

In [182]:
# Team IDs that the spellings merge left empty, entered by hand.
# Keys are (gid, column); values are the TeamID to store in that cell.
# To list the rows that still need an entry, inspect
# df_id2[df_id2['TeamID_1'].isnull() | df_id2['TeamID_2'].isnull()].
manual_team_ids = {
    # Ohio State
    ('211', 'TeamID_1'): 1326,
    ('306', 'TeamID_1'): 1326,
    # Michigan State
    ('230', 'TeamID_1'): 1277,
    ('315', 'TeamID_2'): 1277,
    # CSU Fullerton
    ('224', 'TeamID_2'): 1168,
}

for (gid, column), team_id in manual_team_ids.items():
    # Check the label first: writing through .at to a game id that is not in
    # the index would not update any scraped game.
    if gid not in df_id2.index:
        raise KeyError("game id %r not found in df_id2; cannot set %s" % (gid, column))
    df_id2.at[gid, column] = team_id

# Casting a column that still contains NaN to int fails with a generic
# message, so report exactly which games are still missing a TeamID.
id_columns = ['TeamID_1', 'TeamID_2']
unmatched = df_id2[df_id2[id_columns].isnull().any(axis=1)]
if not unmatched.empty:
    raise ValueError(
        "TeamID still missing for these games:\n"
        + unmatched[['Team1', 'Team2'] + id_columns].to_string()
    )

# typecast (the left merges produced float columns)
for column in id_columns:
    df_id2[column] = df_id2[column].astype(int)

In [186]:
# Win1 is 1 when Team1 scored more than Team2 and 0 otherwise.
team1_won = df_id2['Score1'].gt(df_id2['Score2'])
df_id2['Win1'] = team1_won.astype(int)
df_id2.head()

Unnamed: 0_level_0,Team1,Score1,Team2,Score2,TeamID_1,TeamID_2,Win1
gid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
201,Virginia,54,UMBC,74,1438,1420,0
202,Creighton,59,Kansas St.,69,1166,1243,0
203,Kentucky,78,Davidson,73,1246,1172,1
204,Arizona,68,Buffalo,89,1112,1138,0
205,Miami (Fla.),62,Loyola Chicago,64,1274,1260,0


### 3. Load into PostgreSQL

In [198]:
# Build plain-int tuples for the database update. Every game contributes two
# tuples: one from Team1's point of view and one from Team2's, with the team
# ids and scores swapped and the win flag flipped. All Team1-view tuples come
# first, followed by all Team2-view tuples.
games = list(df_id2.itertuples(index=False))

datarows = [
    (int(g.TeamID_1), int(g.TeamID_2), int(g.Score1), int(g.Score2), int(g.Win1))
    for g in games
]
datarows += [
    (int(g.TeamID_2), int(g.TeamID_1), int(g.Score2), int(g.Score1), abs(int(g.Win1 - 1)))
    for g in games
]

In [200]:
# Statement that copies the scraped scores and outcomes into prod.features.
# execute_values expands the single %s into one VALUES tuple per entry of
# `datarows`; each tuple is matched to a 2018 'NCAA Tourney' row by its
# ("Team", "Opponent") pair.
update = '''
    UPDATE prod.features as f
       SET "Score" = data."Score",
           "OpponentScore" = data."OpponentScore",
           "Outcome" = data."Outcome"
      FROM (VALUES %s) AS data (
             "Team",
             "Opponent",
             "Score",
             "OpponentScore",
             "Outcome"
             )
     WHERE f."Season" = 2018
       and f."Season Type" = 'NCAA Tourney'
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
'''

# Connection settings are read from the environment so that the password is
# not stored in the notebook. PGPASSWORD must be set (a KeyError is raised
# otherwise); the remaining settings fall back to the values this notebook
# used before.
conn = pg.connect(database=os.environ.get('PGDATABASE', 'postgres'),
                  user=os.environ.get('PGUSER', 'postgres'),
                  password=os.environ['PGPASSWORD'],
                  host=os.environ.get('PGHOST', '35.185.225.167'))

try:
    # `with conn` commits when the block succeeds and rolls back when it
    # raises; `with conn.cursor()` closes the cursor on exit.
    with conn:
        with conn.cursor() as c:
            execute_values(c, update, datarows)
finally:
    # The connection context manager does not close the connection itself.
    conn.close()