# Preliminary Models

In [109]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2 as pg

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler

## Data

In [103]:
# Pull the NCAA Tourney rows of prod.features that are not flagged holdout_s2.
query = '''
    SELECT *
    FROM prod.features
    WHERE "Season Type" = 'NCAA Tourney' and
        holdout_s2 = 0
'''

# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST) so that the password is not stored
# in the notebook. Non-secret settings fall back to the previous values.
# psycopg2 drops keyword arguments that are None, so an unset PGPASSWORD simply
# defers to libpq's own lookup (e.g. ~/.pgpass).
conn = pg.connect(database=os.environ.get('PGDATABASE', 'postgres'),
                  user=os.environ.get('PGUSER', 'postgres'),
                  password=os.environ.get('PGPASSWORD'),
                  host=os.environ.get('PGHOST', '35.185.225.167'))

# Always release the connection, even if the query fails. (`with conn:` would
# only end the transaction, not close the connection.)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# "Diff" Features: (new column, team column, opponent column).
# Each new column is team value minus opponent value.
diff_columns = [
    ('AvgRankDiff', 'AvgRank', 'OpponentAvgRank'),
    ('AvgNetPointsForDiff', 'AvgNetPointsFor', 'OpponentAvgNetPointsFor'),
    ('TwoPointPctDiff', 'TwoPointPct_Team', 'TwoPointPct_Opponent'),
    ('ThreePointPctDiff', 'ThreePointPct_Team', 'ThreePointPct_Opponent'),
    ('FreeThrowPctDiff', 'FreeThrowPct_Team', 'FreeThrowPct_Opponent'),
    ('OffensiveReboundsDiff', 'OffensiveRebounds_Team', 'OffensiveRebounds_Opponent'),
    ('DefensiveReboundsDiff', 'DefensiveRebounds_Team', 'DefensiveRebounds_Opponent'),
]
for diff_col, team_col, opponent_col in diff_columns:
    df[diff_col] = df[team_col] - df[opponent_col]

# Split on the holdout_s1 flag: 0 -> training rows, 1 -> test rows.
df_train = df[df['holdout_s1'] == 0]
df_test = df[df['holdout_s1'] == 1]

In [104]:
# Preview the first rows of the training split (rows with holdout_s1 == 0).
df_train.head()

Unnamed: 0,Season,DayNum,Team,Opponent,Outcome,Score,OpponentScore,NumOT,WLoc,Season Type,...,OpponentAvgPointsFor,OpponentAvgPointsAgainst,OpponentAvgNetPointsFor,AvgRankDiff,AvgNetPointsForDiff,TwoPointPctDiff,ThreePointPctDiff,FreeThrowPctDiff,OffensiveReboundsDiff,DefensiveReboundsDiff
0,2003,139,1448,1120,0,62,68,0,N,NCAA Tourney,...,70.1,65.5667,4.53333,-32.7316,6.25977,-0.011875,-0.018186,0.104738,2.786946,4.45197
1,2003,144,1328,1139,1,65,54,0,N,NCAA Tourney,...,68.5862,60.931,7.65517,-34.94485,3.34483,-0.066387,-0.003997,-0.003401,4.417488,4.75
2,2003,137,1280,1139,0,46,47,0,N,NCAA Tourney,...,68.5862,60.931,7.65517,-24.4154,2.34483,-0.012475,-0.049262,-0.057372,4.727833,4.543103
3,2003,138,1328,1143,1,74,65,0,N,NCAA Tourney,...,74.4828,69.7586,4.72414,-30.28855,6.27586,-0.028375,0.012736,0.028606,1.203202,0.642857
4,2003,136,1211,1153,1,74,69,0,N,NCAA Tourney,...,67.3214,61.5,5.82143,0.6563,3.04954,0.082245,0.019794,0.03656,-0.185185,2.137037


In [105]:
# Preview the first rows of the test split (rows with holdout_s1 == 1).
df_test.head()

Unnamed: 0,Season,DayNum,Team,Opponent,Outcome,Score,OpponentScore,NumOT,WLoc,Season Type,...,OpponentAvgPointsFor,OpponentAvgPointsAgainst,OpponentAvgNetPointsFor,AvgRankDiff,AvgNetPointsForDiff,TwoPointPctDiff,ThreePointPctDiff,FreeThrowPctDiff,OffensiveReboundsDiff,DefensiveReboundsDiff
1369,2014,134,1291,1107,0,64,71,0,N,NCAA Tourney,...,66.0312,63.8438,2.1875,20.885,-3.90625,0.029961,0.022103,0.0099,-0.322581,-2.129032
1370,2014,136,1196,1107,1,67,55,0,N,NCAA Tourney,...,66.0312,63.8438,2.1875,-189.89169,10.6066,0.050423,0.028743,-0.087985,1.184751,-0.223851
1371,2014,139,1211,1112,0,61,84,0,N,NCAA Tourney,...,73.0588,58.1471,14.9118,20.18678,-3.6391,0.028233,0.031169,0.03325,-2.883523,-0.024621
1372,2014,143,1361,1112,0,64,70,0,N,NCAA Tourney,...,73.0588,58.1471,14.9118,14.38458,-4.3957,-0.049669,-0.010373,-0.026477,0.106061,-2.045455
1373,2014,136,1400,1113,1,87,85,0,N,NCAA Tourney,...,75.0312,68.875,6.15625,-7.5944,-2.06534,-0.01497,-0.068176,-0.012095,6.167339,-0.449597


## Baseline Metrics

- Log loss when every predicted probability is 0.5
- Log loss when every prediction is wrong with 100% confidence

In [85]:
# Reference log-loss values on the test split, used to judge the models below.
target = 'Outcome'

# baseline of 0.5 -- one prediction per test row, sized from the data instead
# of a hard-coded row count so the cell keeps working if the split changes
baseline = np.full(len(df_test), 0.5)
print("Baseline: ",log_loss(df_test[target],baseline))

# worst case scenario -- every prediction was wrong with 100% confidence
print("Completely Wrong: ",log_loss(df_test[target],1-df_test[target]))

Baseline:  0.69314718056
Completely Wrong:  34.5391761936


In [86]:
# Single-feature reference model: logistic regression on SeedDiff alone.
target = 'Outcome'
features = ['SeedDiff']

X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]

lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
probs = lr.predict_proba(X_test)

print(classification_report(y_test, pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(y_test, probs[:,1]))

             precision    recall  f1-score   support

          0       0.69      0.75      0.72       268
          1       0.73      0.67      0.70       268

avg / total       0.71      0.71      0.71       536

Log Loss:  0.571490292973


## Logistic Regression

In [98]:
# Logistic regression on the separate team / opponent columns.
target = 'Outcome'
features = [
    'SeedDiff',
    'WinPct', 'OpponentWinPct',
    'AvgNetPointsFor', 'OpponentAvgNetPointsFor',
    'AvgRank', 'OpponentAvgRank',
    'TwoPointPct_Team', 'TwoPointPct_Opponent',
    'ThreePointPct_Team', 'ThreePointPct_Opponent',
    'FreeThrowPct_Team', 'FreeThrowPct_Opponent',
    'OffensiveRebounds_Team', 'OffensiveRebounds_Opponent',
    'DefensiveRebounds_Team', 'DefensiveRebounds_Opponent',
]

train_X, test_X = df_train[features], df_test[features]
train_y, test_y = df_train[target], df_test[target]

lr = LogisticRegression(penalty='l2')
lr.fit(train_X, train_y)
pred = lr.predict(test_X)
probs = lr.predict_proba(test_X)

print(classification_report(test_y, pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(test_y, probs[:,1]))

             precision    recall  f1-score   support

          0       0.72      0.72      0.72       268
          1       0.72      0.72      0.72       268

avg / total       0.72      0.72      0.72       536

Log Loss:  0.543473089265


In [99]:
# One row per feature: name left-aligned, fitted coefficient to 4 decimals.
for feature_name, coefficient in zip(features, lr.coef_.flatten()):
    print("{:<30} {:>8.4f}".format(feature_name, coefficient))

SeedDiff                        -0.0710
WinPct                          -0.4502
OpponentWinPct                   0.4502
AvgNetPointsFor                  0.0890
OpponentAvgNetPointsFor         -0.0890
AvgRank                         -0.0117
OpponentAvgRank                  0.0117
TwoPointPct_Team                 0.1337
TwoPointPct_Opponent            -0.1337
ThreePointPct_Team              -0.7135
ThreePointPct_Opponent           0.7135
FreeThrowPct_Team                0.4088
FreeThrowPct_Opponent           -0.4088
OffensiveRebounds_Team           0.0694
OffensiveRebounds_Opponent      -0.0694
DefensiveRebounds_Team          -0.0913
DefensiveRebounds_Opponent       0.0913


### "Diff" Features

In [107]:
# Logistic regression on the team-minus-opponent "Diff" columns.
target = 'Outcome'
features = [
    'SeedDiff',
    'WinPctDiff',
    'AvgNetPointsForDiff',
    'AvgRankDiff',
    'TwoPointPctDiff',
    'ThreePointPctDiff',
    'FreeThrowPctDiff',
    'OffensiveReboundsDiff',
    'DefensiveReboundsDiff',
]

train_X, test_X = df_train[features], df_test[features]
train_y, test_y = df_train[target], df_test[target]

lr = LogisticRegression(penalty='l2')
lr.fit(train_X, train_y)
pred = lr.predict(test_X)
probs = lr.predict_proba(test_X)

print(classification_report(test_y, pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(test_y, probs[:,1]))

             precision    recall  f1-score   support

          0       0.72      0.72      0.72       268
          1       0.72      0.72      0.72       268

avg / total       0.72      0.72      0.72       536

Log Loss:  0.543233549517


In [108]:
# One row per feature: name left-aligned, fitted coefficient to 4 decimals.
for feature_name, coefficient in zip(features, lr.coef_.flatten()):
    print("{:<30} {:>8.4f}".format(feature_name, coefficient))

SeedDiff                        -0.0719
WinPctDiff                      -0.6195
AvgNetPointsForDiff              0.0931
AvgRankDiff                     -0.0116
TwoPointPctDiff                  0.2425
ThreePointPctDiff               -1.2849
FreeThrowPctDiff                 0.6975
OffensiveReboundsDiff            0.0687
DefensiveReboundsDiff           -0.0914


### Normalized

In [111]:
# Logistic regression on standardized "Diff" columns.
target = 'Outcome'
features = [
    'SeedDiff',
    'WinPctDiff',
    'AvgNetPointsForDiff',
    'AvgRankDiff',
    'TwoPointPctDiff',
    'ThreePointPctDiff',
    'FreeThrowPctDiff',
    'OffensiveReboundsDiff',
    'DefensiveReboundsDiff',
]

# fit means/std on the training rows only, then apply to both splits
ss = StandardScaler()
ss.fit(df_train[features])
train_scaled = ss.transform(df_train[features])
test_scaled = ss.transform(df_test[features])

lr = LogisticRegression(penalty='l2')
lr.fit(train_scaled, df_train[target])
pred = lr.predict(test_scaled)
probs = lr.predict_proba(test_scaled)

print(classification_report(df_test[target], pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(df_test[target], probs[:,1]))

             precision    recall  f1-score   support

          0       0.74      0.74      0.74       268
          1       0.74      0.74      0.74       268

avg / total       0.74      0.74      0.74       536

Log Loss:  0.546836443034


In [112]:
# One row per feature: name left-aligned, fitted coefficient to 4 decimals.
for feature_name, coefficient in zip(features, lr.coef_.flatten()):
    print("{:<30} {:>8.4f}".format(feature_name, coefficient))

SeedDiff                        -0.5855
WinPctDiff                      -0.1711
AvgNetPointsForDiff              0.7110
AvgRankDiff                     -0.6650
TwoPointPctDiff                  0.0590
ThreePointPctDiff               -0.2500
FreeThrowPctDiff                 0.1308
OffensiveReboundsDiff            0.1473
DefensiveReboundsDiff           -0.2275


## Naive Bayes

In [34]:
# Gaussian Naive Bayes on a mix of diff and team / opponent columns.
target = 'Outcome'
# NOTE(review): 'AvgNetPointsFor' is listed without 'OpponentAvgNetPointsFor',
# unlike the other team/opponent pairs -- confirm whether this is intentional.
features = [
    'SeedDiff', 'WinPctDiff', 'AvgNetPointsFor',
    'AvgRank', 'OpponentAvgRank',
    'TwoPointPct_Team', 'TwoPointPct_Opponent',
    'ThreePointPct_Team', 'ThreePointPct_Opponent',
    'FreeThrowPct_Team', 'FreeThrowPct_Opponent',
    'OffensiveRebounds_Team', 'OffensiveRebounds_Opponent',
    'DefensiveRebounds_Team', 'DefensiveRebounds_Opponent',
]

train_X, test_X = df_train[features], df_test[features]
train_y, test_y = df_train[target], df_test[target]

nb = GaussianNB()
nb.fit(train_X, train_y)

pred = nb.predict(test_X)
probs = nb.predict_proba(test_X)

print(classification_report(test_y, pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(test_y, probs[:,1]))

             precision    recall  f1-score   support

          0       0.71      0.70      0.71       268
          1       0.70      0.71      0.71       268

avg / total       0.71      0.71      0.71       536

Log Loss:  0.885715789799


## Decision Tree

In [44]:
# Depth-limited decision tree on the team / opponent columns.
target = 'Outcome'
# NOTE(review): 'AvgNetPointsFor' is listed without 'OpponentAvgNetPointsFor',
# unlike the other team/opponent pairs -- confirm whether this is intentional.
features = [
    'SeedDiff', 'WinPct', 'OpponentWinPct', 'AvgNetPointsFor',
    'AvgRank', 'OpponentAvgRank',
    'TwoPointPct_Team', 'TwoPointPct_Opponent',
    'ThreePointPct_Team', 'ThreePointPct_Opponent',
    'FreeThrowPct_Team', 'FreeThrowPct_Opponent',
    'OffensiveRebounds_Team', 'OffensiveRebounds_Opponent',
    'DefensiveRebounds_Team', 'DefensiveRebounds_Opponent',
]

train_X, test_X = df_train[features], df_test[features]
train_y, test_y = df_train[target], df_test[target]

dt = DecisionTreeClassifier(max_depth=3)
dt.fit(train_X, train_y)

pred = dt.predict(test_X)
probs = dt.predict_proba(test_X)

print(classification_report(test_y, pred))

# Log loss is scored on the second predict_proba column.
print("Log Loss: ",log_loss(test_y, probs[:,1]))

             precision    recall  f1-score   support

          0       0.64      0.70      0.67       268
          1       0.67      0.61      0.64       268

avg / total       0.66      0.66      0.66       536

Log Loss:  0.616227641986


## Test Data