# Add Features

In [1]:
import os

import numpy as np
import pandas as pd
import psycopg2 as pg
from psycopg2.extras import execute_values

1. Regular Season Win Percentage
2. Regular Season Opponents Win Percentage
3. Regular Season Win Percentage Differential
4. Conference
5. Conference Difficulty per Season
6. Schedule Difficulty
7. NCAA Tourney Seed Differential

## Win Percentage

### Regular Season (Cumulative Average)

In [12]:
# Pull every regular-season game outcome. The ORDER BY keeps each team-season
# contiguous and chronological, which the lagged running totals computed from
# this frame rely on.
query = '''
    SELECT "Season", "DayNum", "Team", "Outcome"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
    ORDER BY "Season", "Team", "DayNum"
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

In [13]:
# Only the last expression of a cell is rendered, so print the shape explicitly.
print(df.shape)
df.head()

Unnamed: 0,Season,DayNum,Team,Outcome
0,2003,19,1102,0
1,2003,22,1102,1
2,2003,25,1102,1
3,2003,27,1102,0
4,2003,31,1102,1


In [14]:
group_keys = ['Season', 'Team']

# Shift outcomes down by one game within each team-season so a row only sees
# results of games listed before it; the first game of a group becomes NaN.
df['LagOutcome'] = df.groupby(group_keys)['Outcome'].shift()

# running totals over the lagged outcomes (sum = prior wins, count = prior games)
prior_games = df.groupby(group_keys)['LagOutcome'].expanding()
cumsum = prior_games.sum()
cumcount = prior_games.count()

# insert back into df
# The expanding results are indexed by (Season, Team, original row label).
# Dropping only the group levels keeps the original labels, so the assignment
# aligns by index instead of depending on df's row order.
df['CumWins'] = cumsum.reset_index(level=group_keys, drop=True)
df['CumCount'] = cumcount.reset_index(level=group_keys, drop=True)
# 0 prior games gives NaN / 0 -> NaN for the first game of each team-season
df['winpct'] = df['CumWins'] / df['CumCount']

In [15]:
# Preview the first rows, now including the lagged outcome and running totals.
df.head()

Unnamed: 0,Season,DayNum,Team,Outcome,LagOutcome,CumWins,CumCount,winpct
0,2003,19,1102,0,,,0.0,
1,2003,22,1102,1,0.0,0.0,1.0,0.0
2,2003,25,1102,1,1.0,1.0,2.0,0.5
3,2003,27,1102,0,1.0,2.0,3.0,0.666667
4,2003,31,1102,1,0.0,2.0,4.0,0.5


In [35]:
# pull regular season matchups (one row per Season / Team / Opponent / DayNum)
query = '''
    SELECT DISTINCT "Season", "Team", "Opponent", "DayNum"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    matchups = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

In [36]:
# Only the last expression of a cell is rendered, so print the shape explicitly
# and leave the frame preview as the cell's output.
print(matchups.shape)
matchups.head()

(164082, 4)

In [23]:
# Attach each team's own running record to its matchup row.
wins_matchup = matchups.merge(df, how='left', on=['Season', 'Team', 'DayNum'])

# The same win percentages, relabelled so they can be joined from the opponent's side.
df2 = (
    df[['Season', 'Team', 'DayNum', 'winpct']]
    .rename(columns={'Team': 'Opponent', 'winpct': 'OpponentWinpct'})
)

wins_matchup2 = wins_matchup.merge(df2, how='left', on=['Season', 'Opponent', 'DayNum'])
wins_matchup2.head()

Unnamed: 0,Season,Team,Opponent,DayNum,Outcome,LagOutcome,CumWins,CumCount,winpct,OpponentWinpct
0,2015,1105,1115,70,1,1.0,2.0,12.0,0.166667,0.1875
1,2006,1110,1159,104,1,0.0,7.0,22.0,0.318182,0.318182
2,2018,1457,1149,117,0,0.0,15.0,25.0,0.6,0.4
3,2007,1232,1320,98,1,0.0,10.0,21.0,0.47619,0.681818
4,2005,1395,1394,26,0,0.0,2.0,3.0,0.666667,0.5


In [69]:
# Keep the join keys plus both win percentages and derive their difference.
to_load = wins_matchup2[['Season', 'Team', 'Opponent', 'DayNum','winpct', 'OpponentWinpct']].copy()
to_load['winpctDiff'] = to_load['winpct'] - to_load['OpponentWinpct']

# Replace missing values with None so they are sent to the database as NULL.
# The cast to object comes first because on current pandas a float column
# cannot hold None: `where(..., None)` alone would leave the NaN in place.
to_load = to_load.astype(object).where(to_load.notna(), None)

In [70]:
# Only the last expression of a cell is rendered, so print the shape explicitly.
print(to_load.shape)
to_load.head()

Unnamed: 0,Season,Team,Opponent,DayNum,winpct,OpponentWinpct,winpctDiff
0,2015,1105,1115,70,0.166667,0.1875,-0.0208333
1,2006,1110,1159,104,0.318182,0.318182,0.0
2,2018,1457,1149,117,0.6,0.4,0.2
3,2007,1232,1320,98,0.47619,0.681818,-0.205628
4,2005,1395,1394,26,0.666667,0.5,0.166667


In [81]:
# One tuple per matchup row, in the column order of the UPDATE's VALUES list.
# Key columns are coerced to built-in ints; the percentages pass through as-is.
datarows = [
    (int(rec.Season), int(rec.Team), int(rec.Opponent), int(rec.DayNum),
     rec.winpct, rec.OpponentWinpct, rec.winpctDiff)
    for rec in to_load.itertuples(index=False)
]

In [82]:
# Number of rows that will be sent to the database.
len(datarows)

164082

In [83]:
# Make sure the three win-percentage columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "WinPct" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentWinPct" REAL,
      ADD COLUMN IF NOT EXISTS "WinPctDiff" REAL
'''

# Bulk update: the VALUES list is filled by execute_values and joined back to
# regular-season rows on Season / Team / Opponent / DayNum.
update = '''
    UPDATE prod.features as f
       SET "WinPct" = data."WinPct",
           "OpponentWinPct" = data."OpponentWinPct",
           "WinPctDiff" = data."WinPctDiff"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "Opponent",
             "DayNum",
             "WinPct",
             "OpponentWinPct",
             "WinPctDiff"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
       and f."DayNum" = data."DayNum"
       and f."Season Type" = 'Regular'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

### Tournament

In [84]:
# Pull regular-season outcomes only; they are aggregated into a full-season
# record per team for the tournament rows.
query = '''
    SELECT "Season", "Team", "Outcome"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

In [85]:
# Wins, games played and win percentage per team-season.
wins = (
    df.groupby(["Season", "Team"])['Outcome']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'wins', 'count': 'games'})
    .reset_index()
)
wins['winpct'] = wins['wins'] / wins['games']

In [86]:
# pull tourney matchups (one row per Season / Team / Opponent)
query = '''
    SELECT DISTINCT "Season", "Team", "Opponent"
    FROM prod.features
    WHERE "Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    matchups = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

In [87]:
# Only the last expression of a cell is rendered, so print the shape explicitly
# and leave the frame preview as the cell's output.
print(matchups.shape)
matchups.head()

(6518, 3)

In [88]:
# Attach the team's own season record to each tournament matchup.
wins_matchup = matchups.merge(wins, how='left', on=['Season', 'Team'])

# Season win percentages relabelled for the opponent-side join.
wins2 = (
    wins[['Season', 'Team', 'winpct']]
    .rename(columns={'Team': 'Opponent', 'winpct': 'OpponentWinpct'})
)
wins_matchup2 = wins_matchup.merge(wins2, how='left', on=['Season', 'Opponent'])
wins_matchup2.head()

Unnamed: 0,Season,Team,Opponent,wins,games,winpct,OpponentWinpct
0,2018,1277,1355,29,33,0.878788,0.8
1,2018,1181,1293,26,33,0.787879,0.827586
2,2006,1261,1417,22,30,0.733333,0.818182
3,2014,1124,1304,22,33,0.666667,0.612903
4,2015,1246,1214,34,34,1.0,0.484848


In [89]:
# Keep the join keys and both win percentages, then add their difference.
to_load = (
    wins_matchup2
    .loc[:, ['Season', 'Team', 'Opponent', 'winpct', 'OpponentWinpct']]
    .assign(winpctDiff=lambda frame: frame['winpct'] - frame['OpponentWinpct'])
)

In [90]:
# (rows, columns) of the frame about to be converted into update tuples.
to_load.shape

(6518, 6)

In [91]:
# One tuple per tournament matchup, in the column order of the UPDATE's VALUES list.
# Keys are coerced to built-in ints and the percentages to built-in floats.
datarows = [
    (
        int(rec.Season),
        int(rec.Team),
        int(rec.Opponent),
        float(rec.winpct),
        float(rec.OpponentWinpct),
        float(rec.winpctDiff),
    )
    for rec in to_load.itertuples(index=False)
]

In [92]:
# Number of rows that will be sent to the database.
len(datarows)

6518

In [93]:
# Make sure the three win-percentage columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "WinPct" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentWinPct" REAL,
      ADD COLUMN IF NOT EXISTS "WinPctDiff" REAL
'''

# Bulk update: the VALUES list is filled by execute_values and joined back to
# tournament rows on Season / Team / Opponent.
update = '''
    UPDATE prod.features as f
       SET "WinPct" = data."WinPct",
           "OpponentWinPct" = data."OpponentWinPct",
           "WinPctDiff" = data."WinPctDiff"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "Opponent",
             "WinPct",
             "OpponentWinPct",
             "WinPctDiff"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

## Points For / Against

- Average points scored in the regular season
- Average points scored by opponents faced in the regular season
- Average difference between points scored and points scored by opponents in the regular season
  - Note: this is not the difference between the two teams in the tournament matchup
  - We can calculate Average Points For differential if we want to compare the two teams in the tournament
- Do all of the above for both Team and Opponent
- Find Differential


### Regular Season (Cumulative Avg)

In [101]:
# Running regular-season scoring averages per team. The window frame ends at
# "1 PRECEDING", so each row averages only the games listed before it and the
# first game of a team-season has no value.
query = '''
    SELECT "Season", "Team", "DayNum",
        AVG("Score") OVER w as "AvgPointsFor",
        AVG("OpponentScore") OVER w as "AvgPointsAgainst",
        AVG("Score" - "OpponentScore") OVER w as "AvgNetPointsFor"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
    WINDOW w as (
        PARTITION BY "Season", "Team"
        ORDER BY "DayNum"
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    )
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

# convert nan to None so missing averages are written as NULL.
# The cast to object comes first because on current pandas a float column
# cannot hold None: `where(..., None)` alone would leave the NaN in place.
df = df.astype(object).where(df.notna(), None)

In [102]:
# Only the last expression of a cell is rendered, so print the shape explicitly.
print(df.shape)
df.head()

Unnamed: 0,Season,Team,DayNum,AvgPointsFor,AvgPointsAgainst,AvgNetPointsFor
0,2003,1102,19,,,
1,2003,1102,22,47.0,65.0,-18.0
2,2003,1102,25,59.5,54.0,5.5
3,2003,1102,27,58.6667,53.3333,5.33333
4,2003,1102,31,55.75,55.0,0.75


In [105]:
# One tuple per team-game: integer keys followed by the three running averages,
# which are passed through unchanged.
datarows = [
    (int(rec.Season), int(rec.Team), int(rec.DayNum),
     rec.AvgPointsFor, rec.AvgPointsAgainst, rec.AvgNetPointsFor)
    for rec in df.itertuples(index=False)
]

In [106]:
# Number of rows that will be sent to the database.
len(datarows)

164082

In [107]:
# Make sure the team-side scoring columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "AvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "AvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "AvgNetPointsFor" REAL
'''

# Bulk update joined back to regular-season rows on Season / Team / DayNum.
update = '''
    UPDATE prod.features as f
       SET "AvgPointsFor" = data."AvgPointsFor",
           "AvgPointsAgainst" = data."AvgPointsAgainst",
           "AvgNetPointsFor" = data."AvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "DayNum",
             "AvgPointsFor",
             "AvgPointsAgainst",
             "AvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."DayNum" = data."DayNum"
       and f."Season Type" = 'Regular'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

In [108]:
# Make sure the opponent-side scoring columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgNetPointsFor" REAL
'''

# This reuses the same `datarows` as the team-side update. VALUES columns are
# positional, so the team id in each tuple is simply labelled "Opponent" here
# and that team's averages land on the rows where it appears as the opponent.
update = '''
    UPDATE prod.features as f
       SET "OpponentAvgPointsFor" = data."OpponentAvgPointsFor",
           "OpponentAvgPointsAgainst" = data."OpponentAvgPointsAgainst",
           "OpponentAvgNetPointsFor" = data."OpponentAvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Opponent",
             "DayNum",
             "OpponentAvgPointsFor",
             "OpponentAvgPointsAgainst",
             "OpponentAvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Opponent" = data."Opponent"
       and f."DayNum" = data."DayNum"
       and f."Season Type" = 'Regular'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

### Tournament

In [None]:
# Full regular-season scoring averages per team-season (no window: every
# regular-season game is included), used for the tournament rows.
query = '''
    SELECT "Season", "Team",
        AVG("Score") as "AvgPointsFor",
        AVG("OpponentScore") as "AvgPointsAgainst",
        AVG("Score" - "OpponentScore") as "AvgNetPointsFor"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
    GROUP BY "Season", "Team"
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

In [None]:
# Preview the per-team season scoring averages.
df.head()

In [None]:
# One tuple per team-season: integer keys followed by the three season
# averages coerced to built-in floats.
datarows = [
    (
        int(rec.Season),
        int(rec.Team),
        float(rec.AvgPointsFor),
        float(rec.AvgPointsAgainst),
        float(rec.AvgNetPointsFor),
    )
    for rec in df.itertuples(index=False)
]

In [None]:
# Make sure the team-side scoring columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "AvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "AvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "AvgNetPointsFor" REAL
'''

# Bulk update joined back to tournament rows on Season / Team.
update = '''
    UPDATE prod.features as f
       SET "AvgPointsFor" = data."AvgPointsFor",
           "AvgPointsAgainst" = data."AvgPointsAgainst",
           "AvgNetPointsFor" = data."AvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "AvgPointsFor",
             "AvgPointsAgainst",
             "AvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

In [None]:
# Make sure the opponent-side scoring columns exist before filling them.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgNetPointsFor" REAL
'''

# This reuses the same `datarows` as the team-side update. VALUES columns are
# positional, so the team id in each tuple is simply labelled "Opponent" here
# and that team's averages land on the rows where it appears as the opponent.
update = '''
    UPDATE prod.features as f
       SET "OpponentAvgPointsFor" = data."OpponentAvgPointsFor",
           "OpponentAvgPointsAgainst" = data."OpponentAvgPointsAgainst",
           "OpponentAvgNetPointsFor" = data."OpponentAvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Opponent",
             "OpponentAvgPointsFor",
             "OpponentAvgPointsAgainst",
             "OpponentAvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()

## Seed

- Pre-Tournament Seed Differential
- A negative number indicates that the team is seeded higher (has a lower seed number) than the opponent

In [None]:
# Pull the tournament seed label of every seeded team, renaming TeamID to
# Team so it joins directly against the features table's naming.
query = '''
    SELECT "Season", "TeamID" as "Team", "Seed"
    FROM prod."TourneySeeds"
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # release the connection even if the query raises
    conn.close()

def seed_to_int(seed: str) -> int:
    """Return the numeric part of a seed label as an int.

    The characters at positions 1 and 2 are parsed as the seed number; the
    leading character and anything after position 2 are ignored.
    """
    return int(seed[1:3])

# Numeric seed for every row; the original string label is no longer needed afterwards.
df['seed_int'] = df['Seed'].map(seed_to_int)
df.drop(columns=['Seed'], inplace=True)
df.head()

In [None]:
# pull tourney matchups (one row per Season / Team / Opponent)
query = '''
    SELECT DISTINCT "Season", "Team", "Opponent"
    FROM prod.features
    WHERE "Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    matchups = pd.read_sql_query(query, conn)
finally:
    # this cell previously never closed its connection
    conn.close()

matchups.shape

In [None]:
# Team-side seed for every tournament matchup.
seed_matchup = matchups.merge(df, how='left', on=['Season', 'Team'])

# The same seeds relabelled for the opponent-side join.
seed2 = df.rename(columns={'Team': 'Opponent', 'seed_int': 'opponent_seed_int'})
seed_matchup2 = seed_matchup.merge(seed2, how='left', on=['Season', 'Opponent'])

# Team seed minus opponent seed.
seed_matchup2['SeedDiff'] = seed_matchup2['seed_int'].sub(seed_matchup2['opponent_seed_int'])
seed_matchup2.head()

In [None]:
# One tuple per tournament matchup, in the column order of the UPDATE's VALUES list.
datarows = []
for i in seed_matchup2.itertuples(index=False):
    # The seeds come from left merges, so a team missing from the seed table
    # leaves SeedDiff as NaN; int(NaN) raises, so send NULL for that row instead.
    seed_diff = None if pd.isna(i.SeedDiff) else int(i.SeedDiff)
    row = (int(i.Season), int(i.Team), int(i.Opponent), seed_diff)
    datarows.append(row)

In [None]:
# Number of rows that will be sent to the database.
len(datarows)

In [None]:
# Make sure the seed-differential column exists before filling it.
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "SeedDiff" int
'''

# Bulk update joined back to tournament rows on Season / Team / Opponent.
update = '''
    UPDATE prod.features as f
       SET "SeedDiff" = data."SeedDiff"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "Opponent",
             "SeedDiff"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# The password comes from the environment instead of being committed with the notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password=os.environ['PGPASSWORD'],
                  host='35.185.225.167')
try:
    # `with conn` commits if the block succeeds and rolls back if it raises
    with conn, conn.cursor() as c:
        c.execute(alter)
        execute_values(c, update, datarows, page_size=8000)
finally:
    conn.close()