# Add Features

In [1]:
import psycopg2 as pg
import pandas as pd
import numpy as np

from psycopg2.extras import execute_values

1. Regular Season Win Percentage
2. Regular Season Opponents Win Percentage
3. Regular Season Win Percentage Differential
4. Conference
5. Conference Difficulty per Season
6. Schedule Difficulty
7. NCAA Tourney Seed Differential

## Win Percentage

In [2]:
# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store before sharing this notebook.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# pull regular-season rows only; tourney rows are excluded by the WHERE clause
query = '''
    SELECT "Season", "DayNum", "Team", "Outcome"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
'''

# release the connection even when the query raises, so a failed run does not
# leave a session open on the database server
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [3]:
# (rows, columns) of the frame returned by the regular-season query
df.shape

(164082, 4)

In [None]:
# per (Season, Team): sum and count of "Outcome", exposed as wins / games,
# plus the resulting win ratio
wins = (
    df.groupby(['Season', 'Team'])['Outcome']
      .agg(['sum', 'count'])
      .rename(columns={'sum': 'wins', 'count': 'games'})
      .reset_index()
)
wins['winpct'] = wins['wins'] / wins['games']

In [None]:
# a cell only renders its final expression, so print the shape explicitly and
# leave the preview as the last line (previously the head() result was discarded)
print(wins.shape)
wins.head()

In [None]:
# pull tourney matchups: distinct (Season, Team, Opponent) rows from NCAA Tourney games
# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

query = '''
    SELECT DISTINCT "Season", "Team", "Opponent"
    FROM prod.features
    WHERE "Season Type" = 'NCAA Tourney'
'''

# the connection was previously never closed; release it whether or not the
# query succeeds
try:
    matchups = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [None]:
# a cell only renders its final expression, so print the shape explicitly and
# leave the preview as the last line (previously the head() result was discarded)
print(matchups.shape)
matchups.head()

In [None]:
# build the opponent-keyed view of the win percentages here so this preview
# works on a fresh kernel (it was previously referenced before being defined)
wins2 = wins[['Season', 'Team', 'winpct']].rename(columns={'Team': 'Opponent', 'winpct': 'OpponentWinpct'})
wins2.head()

In [None]:
# opponent-keyed copy of the win percentages, used for the second join
wins2 = wins.loc[:, ['Season', 'Team', 'winpct']].rename(
    columns={'Team': 'Opponent', 'winpct': 'OpponentWinpct'}
)

# first attach the team's own record, then the opponent's win percentage;
# left joins keep every matchup row even when no record is found
wins_matchup = matchups.merge(wins, how='left', on=['Season', 'Team'])
wins_matchup2 = wins_matchup.merge(wins2, how='left', on=['Season', 'Opponent'])
wins_matchup2.head()

In [None]:
# keep only the columns that get written back, and add team-minus-opponent win percentage
load_columns = ['Season', 'Team', 'Opponent', 'winpct', 'OpponentWinpct']
to_load = wins_matchup2[load_columns].assign(
    winpctDiff=lambda frame: frame['winpct'] - frame['OpponentWinpct']
)

In [None]:
# convert each frame row to a tuple of plain Python ints/floats for the bulk update
datarows = [
    (
        int(rec.Season),
        int(rec.Team),
        int(rec.Opponent),
        float(rec.winpct),
        float(rec.OpponentWinpct),
        float(rec.winpctDiff),
    )
    for rec in to_load.itertuples(index=False)
]

In [None]:
# add the three win-percentage columns if they are not there yet
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "WinPct" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentWinPct" REAL,
      ADD COLUMN IF NOT EXISTS "WinPctDiff" REAL
'''

# bulk update: execute_values expands %s into the VALUES list built from datarows,
# and rows are matched on (Season, Team, Opponent) within NCAA Tourney games only
update = '''
    UPDATE prod.features as f
       SET "WinPct" = data."WinPct",
           "OpponentWinPct" = data."OpponentWinPct",
           "WinPctDiff" = data."WinPctDiff"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "Opponent",
             "WinPct",
             "OpponentWinPct",
             "WinPctDiff"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# always release the connection; closing without a commit discards the pending
# statements, so a failure part-way through does not leave a half-applied change
try:
    c = conn.cursor()
    c.execute(alter)
    execute_values(c, update, datarows, page_size=8000)
    conn.commit()
finally:
    conn.close()

## Points For / Against

- Average points scored in the regular season
- Average points scored by opponents faced in the regular season
- Average difference between points scored and points scored by opponents in the regular season
  - Note: this is not the difference between the two teams in the tourney matchup
  - We can calculate an Average Points For differential if we want to compare the two teams in the tournament
- Do all of the above for both Team and Opponent
- Find Differential


In [4]:
# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# per (Season, Team) regular-season averages: points scored, points allowed,
# and the per-game scoring margin; the aggregation is done in SQL
query = '''
    SELECT "Season", "Team",
        AVG("Score") as "AvgPointsFor",
        AVG("OpponentScore") as "AvgPointsAgainst",
        AVG("Score" - "OpponentScore") as "AvgNetPointsFor"
    FROM prod.features
    WHERE "Season Type" = 'Regular'
    GROUP BY "Season", "Team"
'''

# release the connection even when the query raises
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

In [5]:
# preview the per-(Season, Team) averages returned by the query
df.head()

Unnamed: 0,Season,Team,AvgPointsFor,AvgPointsAgainst,AvgNetPointsFor
0,2011,1104,66.65625,59.25,7.40625
1,2003,1358,71.777778,65.259259,6.518519
2,2003,1276,69.0,67.266667,1.733333
3,2004,1433,70.8,63.566667,7.233333
4,2007,1205,63.172414,72.758621,-9.586207


In [6]:
# convert each frame row to a tuple of plain Python ints/floats for the bulk update
datarows = [
    (
        int(rec.Season),
        int(rec.Team),
        float(rec.AvgPointsFor),
        float(rec.AvgPointsAgainst),
        float(rec.AvgNetPointsFor),
    )
    for rec in df.itertuples(index=False)
]

In [None]:
# add the team-side average-points columns if they are not there yet
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "AvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "AvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "AvgNetPointsFor" REAL
'''

# bulk update keyed on (Season, Team), restricted to NCAA Tourney rows
update = '''
    UPDATE prod.features as f
       SET "AvgPointsFor" = data."AvgPointsFor",
           "AvgPointsAgainst" = data."AvgPointsAgainst",
           "AvgNetPointsFor" = data."AvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "AvgPointsFor",
             "AvgPointsAgainst",
             "AvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Season Type" = 'NCAA Tourney'
'''

# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# always release the connection; closing without a commit discards the pending
# statements, so a failure part-way through does not leave a half-applied change
try:
    c = conn.cursor()
    c.execute(alter)
    execute_values(c, update, datarows, page_size=8000)
    conn.commit()
finally:
    conn.close()

In [8]:
# add the opponent-side average-points columns if they are not there yet
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsFor" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgPointsAgainst" REAL,
      ADD COLUMN IF NOT EXISTS "OpponentAvgNetPointsFor" REAL
'''

# bulk update keyed on (Season, Opponent): the second value of each data row is
# matched against f."Opponent" rather than f."Team"
update = '''
    UPDATE prod.features as f
       SET "OpponentAvgPointsFor" = data."OpponentAvgPointsFor",
           "OpponentAvgPointsAgainst" = data."OpponentAvgPointsAgainst",
           "OpponentAvgNetPointsFor" = data."OpponentAvgNetPointsFor"
      FROM (VALUES %s) AS data (
             "Season",
             "Opponent",
             "OpponentAvgPointsFor",
             "OpponentAvgPointsAgainst",
             "OpponentAvgNetPointsFor"
             )
     WHERE f."Season" = data."Season"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# NOTE(review): this cell does not rebuild `datarows`; it assumes the list still
# holds the (Season, Team, averages) tuples from the Points For / Against query
# -- confirm before running cells out of order.
# always release the connection; closing without a commit discards the pending
# statements, so a failure part-way through does not leave a half-applied change
try:
    c = conn.cursor()
    c.execute(alter)
    execute_values(c, update, datarows, page_size=8000)
    conn.commit()
finally:
    conn.close()

## Seed

- Pre-Tournament Seed Differential
- A negative number indicates that the team has the better (numerically lower) seed than the opponent

In [None]:
# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# pull the tournament seed label for every (Season, Team)
query = '''
    SELECT "Season", "TeamID" as "Team", "Seed"
    FROM prod."TourneySeeds"
'''

# release the connection even when the query raises
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

def seed_to_int(seed):
    """Return the numeric part of a seed label as an int.

    Skips the first character of `seed`, takes the next (up to) two
    characters and converts them with int(); anything after that is ignored.
    """
    without_prefix = seed[1:]
    return int(without_prefix[:2])

# numeric seed for every row, then remove the original string label column
df['seed_int'] = df['Seed'].map(seed_to_int)
del df['Seed']
df.head()

In [None]:
# pull tourney matchups
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

query = '''
    SELECT DISTINCT "Season", "Team", "Opponent"
    FROM prod.features
    WHERE "Season Type" = 'NCAA Tourney'
'''

matchups = pd.read_sql_query(query, conn)
matchups.shape

In [None]:
# opponent-keyed copy of the seed table, used for the second join
seed2 = df.rename(columns={'Team': 'Opponent', 'seed_int': 'opponent_seed_int'})

# attach the team's seed, then the opponent's seed; left joins keep every matchup row
seed_matchup = matchups.merge(df, how='left', on=['Season', 'Team'])
seed_matchup2 = seed_matchup.merge(seed2, how='left', on=['Season', 'Opponent'])

# team seed minus opponent seed
seed_matchup2['SeedDiff'] = seed_matchup2['seed_int'].sub(seed_matchup2['opponent_seed_int'])
seed_matchup2.head()

In [None]:
# convert each frame row to a tuple of plain Python ints for the bulk update
# NOTE(review): int() raises if SeedDiff is NaN, which the left joins could
# produce for a team with no seed row -- confirm every matchup has seeds.
datarows = [
    (int(rec.Season), int(rec.Team), int(rec.Opponent), int(rec.SeedDiff))
    for rec in seed_matchup2.itertuples(index=False)
]

In [None]:
# number of row tuples that will be sent to the bulk update
len(datarows)

In [None]:
# add the seed-differential column if it is not there yet
alter = '''
    ALTER TABLE prod.features
      ADD COLUMN IF NOT EXISTS "SeedDiff" int
'''

# bulk update keyed on (Season, Team, Opponent), restricted to NCAA Tourney rows
update = '''
    UPDATE prod.features as f
       SET "SeedDiff" = data."SeedDiff"
      FROM (VALUES %s) AS data (
             "Season",
             "Team",
             "Opponent",
             "SeedDiff"
             )
     WHERE f."Season" = data."Season"
       and f."Team" = data."Team"
       and f."Opponent" = data."Opponent"
       and f."Season Type" = 'NCAA Tourney'
'''

# NOTE(review): connection credentials are hard-coded here; consider loading
# them from environment variables or a secrets store.
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

# always release the connection; closing without a commit discards the pending
# statements, so a failure part-way through does not leave a half-applied change
try:
    c = conn.cursor()
    c.execute(alter)
    execute_values(c, update, datarows, page_size=8000)
    conn.commit()
finally:
    conn.close()