# Create Prod Features Table

In [1]:
import psycopg2 as pg
import pandas as pd
import numpy as np

from psycopg2.extras import execute_values

## Pull Raw Data Regular Season Games

In [2]:
# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# Pull every regular season game as (season, day, winner id, loser id).
query = '''
    SELECT "Season", "DayNum", "WTeamID", "LTeamID"
    FROM prod."RegSeasonDetailedResults"
    '''

# Close the connection even when the read fails, so an error in the query
# does not leave an open session behind.
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [3]:
# Orient each game the way the prediction csv files do: Team1 is always the
# lower numbered team, and Outcome is 1 when Team1 won, 0 otherwise.
winner_has_lower_id = df['WTeamID'] < df['LTeamID']

df['Outcome'] = np.where(winner_has_lower_id, 1, 0)
# When the winner has the lower id it is Team1; otherwise the loser is.
df['Team1'] = np.where(winner_has_lower_id, df['WTeamID'], df['LTeamID'])
df['Team2'] = np.where(winner_has_lower_id, df['LTeamID'], df['WTeamID'])

In [4]:
# Stack two views of every game so that each team gets its own row.
# First view: Team1 is the subject and Team2 the opponent.
tmp1 = df[['Season', 'DayNum', 'Team1', 'Team2', 'Outcome']].rename(
    columns={'Team1': 'Team', 'Team2': 'Opponent'})

# Second view: Team2 is the subject, so the outcome is flipped.
tmp2 = (
    df[['Season', 'DayNum', 'Team1', 'Team2', 'Outcome']]
    .rename(columns={'Team2': 'Team', 'Team1': 'Opponent'})
    .assign(Outcome=lambda games: 1 - games['Outcome'])
)

df_final = pd.concat([tmp1, tmp2])

In [6]:
# Preview the first rows of the stacked regular season frame.
df_final.head()
# df_final.shape  (alternative check: row and column counts)

(164082, 5)

### Parse and Insert Regular Season Games

In [9]:
# Turn every stacked game row into a tuple of built-in ints, ordered as
# (Season, DayNum, Team, Opponent, Outcome).
# NOTE(review): the int() casts presumably exist so the database driver
# receives plain Python ints rather than numpy scalars -- confirm.
datarows = [
    (int(game.Season), int(game.DayNum), int(game.Team),
     int(game.Opponent), int(game.Outcome))
    for game in df_final.itertuples(index=False)
]

In [11]:
# Bulk insert of the regular season rows into prod.features. The
# ON CONFLICT clause is left commented out inside the SQL, so a duplicate
# key raises IntegrityError, which is reported below.
insert = '''
    INSERT INTO prod.features (
        "Season", "DayNum", "Team", "Opponent", "Outcome"
    )
    VALUES %s
    -- ON CONFLICT DO NOTHING
'''

# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

try:
    with conn.cursor() as c:
        # Rows are sent in pages of 5000 per statement.
        execute_values(c, insert, datarows, page_size=5000)
    conn.commit()
except pg.IntegrityError as e:
    # Nothing was committed; closing the connection below discards the
    # failed transaction.
    print(e)
    print("To force update use ON CONFLICT DO NOTHING")
finally:
    # Always release the connection, including on errors other than
    # IntegrityError, which previously left it open.
    conn.close()

duplicate key value violates unique constraint "features_pkey"
DETAIL:  Key ("Season", "DayNum", "Team", "Opponent")=(2003, 10, 1104, 1328) already exists.

To force update use ON CONFLICT DO NOTHING


## Pull Raw Data NCAA Tourney Games

In [12]:
# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# Pull every NCAA tourney game as (season, day, winner id, loser id).
query = '''
    SELECT "Season", "DayNum", "WTeamID", "LTeamID"
    FROM prod."TourneyDetailedResults"
    '''

# Close the connection even when the read fails, so an error in the query
# does not leave an open session behind.
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [13]:
# Orient each game the way the prediction csv files do: Team1 is always the
# lower numbered team, and Outcome is 1 when Team1 won, 0 otherwise.
winner_has_lower_id = df['WTeamID'] < df['LTeamID']

df['Outcome'] = np.where(winner_has_lower_id, 1, 0)
# When the winner has the lower id it is Team1; otherwise the loser is.
df['Team1'] = np.where(winner_has_lower_id, df['WTeamID'], df['LTeamID'])
df['Team2'] = np.where(winner_has_lower_id, df['LTeamID'], df['WTeamID'])

In [14]:
# Stack two views of every game so that each team gets its own row.
# First view: Team1 is the subject and Team2 the opponent.
tmp1 = df[['Season', 'DayNum', 'Team1', 'Team2', 'Outcome']].rename(
    columns={'Team1': 'Team', 'Team2': 'Opponent'})

# Second view: Team2 is the subject, so the outcome is flipped.
tmp2 = (
    df[['Season', 'DayNum', 'Team1', 'Team2', 'Outcome']]
    .rename(columns={'Team2': 'Team', 'Team1': 'Opponent'})
    .assign(Outcome=lambda games: 1 - games['Outcome'])
)

df_final = pd.concat([tmp1, tmp2])

In [16]:
# Preview the first rows of the stacked NCAA tourney frame.
df_final.head()
# df_final.shape  (alternative check: row and column counts)

Unnamed: 0,DayNum,Opponent,Outcome,Season,Team
0,134,1421,0,2003,1411
1,136,1436,1,2003,1112
2,136,1272,1,2003,1113
3,136,1166,1,2003,1141
4,136,1301,1,2003,1143


### Parse and Insert NCAA Tourney Games

In [17]:
# Turn every stacked game row into a tuple of built-in ints, ordered as
# (Season, DayNum, Team, Opponent, Outcome).
# NOTE(review): the int() casts presumably exist so the database driver
# receives plain Python ints rather than numpy scalars -- confirm.
datarows = [
    (int(game.Season), int(game.DayNum), int(game.Team),
     int(game.Opponent), int(game.Outcome))
    for game in df_final.itertuples(index=False)
]

In [20]:
# Bulk insert of the NCAA tourney rows into prod.features. The
# ON CONFLICT clause is left commented out inside the SQL, so a duplicate
# key raises IntegrityError, which is reported below.
insert = '''
    INSERT INTO prod.features (
        "Season", "DayNum", "Team", "Opponent", "Outcome"
    )
    VALUES %s
    -- ON CONFLICT DO NOTHING
'''

# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

try:
    with conn.cursor() as c:
        # Rows are sent in pages of 5000 per statement.
        execute_values(c, insert, datarows, page_size=5000)
    conn.commit()
except pg.IntegrityError as e:
    # Nothing was committed; closing the connection below discards the
    # failed transaction.
    print(e)
    print("To force update use ON CONFLICT DO NOTHING")
finally:
    # Always release the connection, including on errors other than
    # IntegrityError, which previously left it open.
    conn.close()

duplicate key value violates unique constraint "features_pkey"
DETAIL:  Key ("Season", "DayNum", "Team", "Opponent")=(2003, 134, 1411, 1421) already exists.

To force update use ON CONFLICT DO NOTHING


## Add Game Attributes

These are important attributes about the games that are listed in the features table.

Following the file description in "Data Section 1" on Kaggle, we use DayNum to distinguish regular season games from NCAA tourney games.

In [21]:
# Label every features row with its season phase, based on "DayNum":
# days 0-132 are marked 'Regular' and days 134-154 'NCAA Tourney'.
# Rows with any other DayNum (e.g. 133) are not touched by either statement.
update1 = '''
    UPDATE prod.features
    SET "Season Type" = 'Regular'
    WHERE "DayNum" >= 0 and "DayNum" <= 132
'''

update2 = '''
    UPDATE prod.features
    SET "Season Type" = 'NCAA Tourney'
    WHERE "DayNum" >= 134 and "DayNum" <= 154
'''

# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# Both updates are committed together; the connection is closed even if one
# of them fails, which also discards the uncommitted work.
try:
    with conn.cursor() as c:
        c.execute(update1)
        c.execute(update2)
    conn.commit()
finally:
    conn.close()

### Stage 1 Holdout Period

In [22]:
# Add the holdout_s1 flag column if it does not exist yet, then fill it for
# NCAA tourney rows: 1 for seasons 2014-2017, 0 for every other season.
# Rows whose "Season Type" is not 'NCAA Tourney' are left unchanged.
alter = 'ALTER TABLE prod.features ADD COLUMN IF NOT EXISTS holdout_s1 int'
update1 = '''
    UPDATE prod.features
      SET holdout_s1 = (CASE
        WHEN "Season" >= 2014 and "Season" <= 2017 THEN 1
        ELSE 0
        END
      )
      WHERE "Season Type" = 'NCAA Tourney'
'''
# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# The schema change and the update are committed together; the connection is
# closed even if one of them fails, which also discards the uncommitted work.
try:
    with conn.cursor() as c:
        c.execute(alter)
        c.execute(update1)
    conn.commit()
finally:
    conn.close()

### Stage 2 Holdout Period

In [23]:
# Add the holdout_s2 flag column if it does not exist yet, then fill it for
# NCAA tourney rows: 1 for the 2018 season, 0 for every other season.
# Rows whose "Season Type" is not 'NCAA Tourney' are left unchanged.
alter = 'ALTER TABLE prod.features ADD COLUMN IF NOT EXISTS holdout_s2 int'
update1 = '''
    UPDATE prod.features
      SET holdout_s2 = (CASE
        WHEN "Season" = 2018 THEN 1
        ELSE 0
        END
      )
      WHERE "Season Type" = 'NCAA Tourney'
'''
# NOTE(review): the connection credentials are hardcoded in this notebook;
# consider loading them from the environment before sharing it.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# The schema change and the update are committed together; the connection is
# closed even if one of them fails, which also discards the uncommitted work.
try:
    with conn.cursor() as c:
        c.execute(alter)
        c.execute(update1)
    conn.commit()
finally:
    conn.close()

## Create and Insert Features

1. Regular Season Win Percentage
2. Regular Season Opponents Win Percentage
3. Regular Season Win Percentage Differential
4. Conference
5. Conference Difficulty per Season
6. Schedule Difficulty
7. NCAA Tourney Seed Differential