## Goal:
Load `NCAA boxscore` data into a database table.

Import Libraries

In [8]:
import getpass
import time
from datetime import datetime

import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extensions import adapt, register_adapter, AsIs
from psycopg2.extras import execute_batch
# Let wide frames render every column (up to 500) instead of eliding the middle ones.
pd.options.display.max_columns = 500

Read in CSV

In [2]:
# Read the boxscore export into a DataFrame; the path is relative, so it depends on
# the working directory the notebook is started from.
boxscores_csv_path = '../Data/ncaa_simple_boxscores.csv'
ncaa_boxes = pd.read_csv(boxscores_csv_path)

Inspect the data

In [3]:
# (rows, columns) of the loaded frame.
ncaa_boxes.shape

(253918, 16)

In [4]:
# First five rows, to eyeball column contents and types.
ncaa_boxes.head()

Unnamed: 0,boxscore,away_name,away_abbr,away_score,away_rank,home_name,home_abbr,home_score,home_rank,non_di,top_25,winning_name,winning_abbr,losing_name,losing_abbr,season
0,2018-11-06-19-kansas,Michigan State,michigan-state,87,10.0,Kansas,kansas,92,1.0,False,True,Kansas,kansas,Michigan State,michigan-state,2019
1,2018-11-06-21-kentucky,Duke,duke,118,4.0,Kentucky,kentucky,84,2.0,False,True,Duke,duke,Kentucky,kentucky,2019
2,2018-11-06-21-gonzaga,Idaho State,idaho-state,79,,Gonzaga,gonzaga,120,3.0,False,True,Gonzaga,gonzaga,Idaho State,idaho-state,2019
3,2018-11-06-19-virginia,Towson,towson,42,,Virginia,virginia,73,5.0,False,True,Virginia,virginia,Towson,towson,2019
4,2018-11-06-19-tennessee,Lenoir-Rhyne,Lenoir-Rhyne,41,,Tennessee,tennessee,86,6.0,True,True,Tennessee,tennessee,Lenoir-Rhyne,Lenoir-Rhyne,2019


In [5]:
# Last five rows; in the output below these 1938 games have no boxscore id or
# home-team fields, so missing values must be handled before loading.
ncaa_boxes.tail()

Unnamed: 0,boxscore,away_name,away_abbr,away_score,away_rank,home_name,home_abbr,home_score,home_rank,non_di,top_25,winning_name,winning_abbr,losing_name,losing_abbr,season
253913,,Temple,temple,53,,,,40,,True,False,Temple,temple,,,1938
253914,,Colorado,colorado,48,,,,47,,True,False,Colorado,colorado,,,1938
253915,,Temple,temple,56,,,,44,,True,False,Temple,temple,,,1938
253916,,Oklahoma State,oklahoma-state,37,,,,24,,True,False,Oklahoma State,oklahoma-state,,,1938
253917,,Temple,temple,60,,,,36,,True,False,Temple,temple,,,1938


In [6]:
# Column labels of the frame; the table definition and INSERT statement further down
# list these same names.
ncaa_boxes.columns

Index(['boxscore', 'away_name', 'away_abbr', 'away_score', 'away_rank',
       'home_name', 'home_abbr', 'home_score', 'home_rank', 'non_di', 'top_25',
       'winning_name', 'winning_abbr', 'losing_name', 'losing_abbr', 'season'],
      dtype='object')

Define the SQL that (re)creates the table. It is executed further down, once the database connection is open.

In [7]:
# DDL for the target table, kept as a string and executed in a later cell.
# DROP TABLE IF EXISTS makes the statement re-runnable: an existing
# ncaa_simple_boxscores table is discarded before being recreated.
# The column list uses the same names, in the same order, as ncaa_boxes.columns.
# This cell only builds the string; nothing is sent to the database here.
create_table = """
DROP TABLE IF EXISTS ncaa_simple_boxscores;
CREATE TABLE ncaa_simple_boxscores (
boxscore varchar(50)
, away_name varchar(100)
, away_abbr varchar(100)
, away_score numeric
, away_rank numeric
, home_name varchar(100)
, home_abbr varchar(100)
, home_score numeric
, home_rank numeric
, non_di boolean
, top_25 boolean
, winning_name varchar(100)
, winning_abbr varchar(100)
, losing_name varchar(100)
, losing_abbr varchar(100)
, season numeric
)
"""

Establish database connection

In [9]:
# Ask for the password interactively so it is never stored in the notebook source.
mypasswd = getpass.getpass()

# Connection settings for the group database, handed to psycopg2 as keyword arguments.
db_settings = {
    'database': 'cs20_group4',
    'user': 'mwkmr',
    'host': 'pgsql.dsa.lan',
    'password': mypasswd,
}
conn = psycopg2.connect(**db_settings)

········


Define the cursor that will be used to write to the database.

In [10]:
# Open a cursor on the connection; it is used below to run the table-creation SQL.
cursor = conn.cursor()

Execute the create SQL.

In [11]:
# Send the DROP TABLE IF EXISTS / CREATE TABLE statements held in create_table.
cursor.execute(create_table)

In [12]:
# Commit the open transaction so the newly created table is persisted.
conn.commit()

Replace `NaN` values with `None` so that they are stored as SQL `NULL`, then insert the dataset into the newly created table.

In [13]:
# Turn missing values into None so psycopg2 sends SQL NULL.
# The cast to object dtype matters: a float64 column (e.g. away_rank, home_rank)
# cannot store None, so without the cast pandas may coerce it straight back to NaN,
# and psycopg2 would then write NaN into the numeric columns instead of NULL.
ncaa_boxes = ncaa_boxes.astype(object).where(pd.notnull(ncaa_boxes), None)

# Single source of truth for the column order: it drives both the INSERT statement
# and the order of the values pulled from the DataFrame, so the two cannot drift apart.
INSERT_COLUMNS = [
    'boxscore', 'away_name', 'away_abbr', 'away_score', 'away_rank',
    'home_name', 'home_abbr', 'home_score', 'home_rank', 'non_di', 'top_25',
    'winning_name', 'winning_abbr', 'losing_name', 'losing_abbr', 'season',
]

# One %s placeholder per column; the values are always passed as parameters,
# never formatted into the SQL text.
INSERT_SQL = 'INSERT INTO ncaa_simple_boxscores ({columns}) VALUES ({placeholders})'.format(
    columns=','.join(INSERT_COLUMNS),
    placeholders=','.join(['%s'] * len(INSERT_COLUMNS)),
)

# `with conn` commits when the block succeeds and rolls back on an exception (it does
# not close the connection); the cursor is closed when the block exits.
with conn, conn.cursor() as cursor:
    insert_rows = ncaa_boxes[INSERT_COLUMNS].itertuples(index=False, name=None)
    # Send the rows in pages rather than one network round trip per row.
    execute_batch(cursor, INSERT_SQL, insert_rows, page_size=1000)