# This Demonstrates Creating an Ensemble using Data from Approach 1 and 2

In [64]:
import psycopg2 as pg
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error

In [65]:
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

### Get the train/test data from the database

In [66]:
approach1TrainingDataQuery = '''  SELECT * FROM  prod."features" f WHERE f."Season Type"='NCAA Tourney' AND f."Season" < 2014 ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_training = pd.read_sql_query(approach1TrainingDataQuery, conn)

In [67]:
approach1TestingDataQuery = '''  SELECT * FROM  prod."features" f WHERE (f."Season Type"='NCAA Tourney') AND (f."Season">2013 and f."Season"<2018) ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_test = pd.read_sql_query(approach1TestingDataQuery, conn)

In [68]:
approach2TrainingDataQuery = '''  SELECT * FROM prod."features" f WHERE (f."Season Type"='Regular') OR (f."Season" < 2014) ORDER BY "Season", "DayNum", "Team"  '''
df_approach2_training = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [69]:
approach2TestingDataQuery = ''' SELECT * FROM prod."features" f WHERE (f."Season Type"='NCAA Tourney') AND (f."Season" > 2013) AND (f."Season" < 2018) order by "Season", "DayNum", "Team" '''
df_approach2_test = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [70]:
conn.close()

### Remove NaNs

In [71]:
df_approach1_training.dropna(how='any', inplace=True, subset=['Outcome', 'AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'SeedDiff', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent'])

In [72]:
df_approach1_test.dropna(how='any', inplace=True, subset=['Outcome', 'AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'SeedDiff', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent'])

In [73]:
df_approach2_training.dropna(how='any', inplace=True, subset=['Outcome', 'AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent'])

In [74]:
df_approach2_test.dropna(how='any', inplace=True, subset=['Outcome', 'AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent'])

### Create Train and Test Sets

In [75]:
target1 = 'Outcome'
features1 = ['AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'SeedDiff', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent']

target2 = 'Outcome'
features2 = ['AvgRank', 'OpponentAvgRank', 'TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff', 'FieldGoalPct_Team', 'TwoPointAttPct_Team', 'ThreePointAttPct_Team', 'FieldGoalAtt_Team', 'TwoPointAtt_Team', 'ThreePointAtt_Team', 'FreeThrowAtt_Team', 'Assists_Team', 'Turnovers_Team', 'Steals_Team', 'Blocks_Team', 'PersonalFouls_Team', 'FieldGoalPct_Opponent', 'TwoPointAttPct_Opponent', 'ThreePointAttPct_Opponent', 'FieldGoalAtt_Opponent', 'TwoPointAtt_Opponent', 'ThreePointAtt_Opponent', 'FreeThrowAtt_Opponent', 'Assists_Opponent', 'Turnovers_Opponent', 'Steals_Opponent', 'Blocks_Opponent', 'PersonalFouls_Opponent']

In [76]:
X_train1 = df_approach1_training[features1]
y_train1 = df_approach1_training[target1]
X_test1 = df_approach1_test[features1]
y_test1 = df_approach1_test[target1]

In [77]:
X_train2 = df_approach2_training[features2]
y_train2 = df_approach2_training[target2]
X_test2 = df_approach2_test[features2]
y_test2 = df_approach2_test[target2]

#### This is the same data as X_test1, but has narrower features to support Approach 2 in the ensemble

In [78]:
X_test1_2 = df_approach1_test[features2]
y_test1_2 = df_approach1_test[target2]

### Approach 1  - (note that this includes 'SeedDiff')

In [79]:
lrc1 = LogisticRegression(penalty='l1')
params = {'C': [25,50,75,100,125,150,175,200,225,250,275,300]}
gscv1 = GridSearchCV(lrc1, params, scoring='neg_log_loss', refit=True)
gscv1.fit(X_train1, y_train1)
print('Best neg log-loss: {:.4}, with best C: {}'.format(gscv1.best_score_, gscv1.best_params_['C']))

Best accuracy: 0.7048, with best C: 300


In [80]:
lrc1 = LogisticRegression(penalty='l1', C=gscv1.best_params_['C'])
lrc1.fit(X_train1, y_train1)
lrc1.score(X_test1, y_test1)

0.73880597014925375

In [81]:
from sklearn.metrics import classification_report
y_pred1 = lrc1.predict(X_test1)
y_pred_prob1 = lrc1.predict_proba(X_test1)
print(classification_report(y_test1, y_pred1))

             precision    recall  f1-score   support

          0       0.74      0.74      0.74       268
          1       0.74      0.74      0.74       268

avg / total       0.74      0.74      0.74       536



In [82]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test1, y_pred_prob1[:, 1]))


ROC-AUC score: 0.807


In [83]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test1, y_pred_prob1[:,1]))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test1, y_pred_prob1[:,1]))

Log loss: 0.530689611158
RMSE: 0.179117924015


### Approach 2

In [84]:
lrc2 = LogisticRegression(penalty='l1')
params = {'C': [50,75,100,125,150,175,200,225]}
gscv2 = GridSearchCV(lrc2, params, scoring='neg_log_loss', refit=True)
gscv2.fit(X_train2, y_train2)
print('Best neg log-loss: {:.4}, with best C: {}'.format(gscv2.best_score_, gscv2.best_params_['C']))

Best accuracy: 0.7099, with best C: 75


In [85]:
lrc2 = LogisticRegression(penalty='l1', C=gscv2.best_params_['C'])
lrc2.fit(X_train2, y_train2)
lrc2.score(X_test2, y_test2)

0.7102643856920684

In [86]:
from sklearn.metrics import classification_report
y_pred2 = lrc2.predict(X_test2)
y_pred_prob2 = lrc2.predict_proba(X_test2)
print(classification_report(y_test2, y_pred2))

             precision    recall  f1-score   support

          0       0.71      0.71      0.71     77160
          1       0.71      0.71      0.71     77160

avg / total       0.71      0.71      0.71    154320



In [87]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test2, y_pred_prob2[:, 1]))


ROC-AUC score: 0.788


In [88]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test2, y_pred_prob2[:,1]))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test2, y_pred_prob2[:,1]))

Log loss: 0.551180443636
RMSE: 0.18737933068


### Simple Ensemble of Approach 1 and Approach 2

In [89]:
# Get prediction probabilities from both models
probs1 = lrc1.predict_proba(X_test1)
probs2 = lrc2.predict_proba(X_test1_2)

In [90]:
# Get the mean of the probabilities
loss_prob_df = pd.DataFrame(data={'approach1':probs1[:,0], 'approach2':probs2[:,0]})
win_prob_df = pd.DataFrame(data={'approach1':probs1[:,1], 'approach2':probs2[:,1]})
loss_win_df = pd.DataFrame(data={'loss_prob':loss_prob_df.mean(axis=1), 'win_prob':win_prob_df.mean(axis=1), 'predict':0})

In [91]:
# Determine loss / win depending on which probability is larger
ensemble_predictions = []
ensemble_probs = []

for index in range(len(win_prob_df)):
    if loss_win_df['loss_prob'][index] > loss_win_df['win_prob'][index] :
        ensemble_predictions.append(0)
        ensemble_probs.append(1-loss_win_df['loss_prob'][index])
    else:
        ensemble_predictions.append(1)
        ensemble_probs.append(loss_win_df['win_prob'][index])

In [92]:
print(classification_report(y_test1, ensemble_predictions))

             precision    recall  f1-score   support

          0       0.75      0.75      0.75       268
          1       0.75      0.75      0.75       268

avg / total       0.75      0.75      0.75       536



In [93]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test1, ensemble_probs))


ROC-AUC score: 0.816


In [94]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test1, ensemble_probs))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test1, ensemble_probs))

Log loss: 0.524006109785
RMSE: 0.175857375446
