# This Demonstrates Creating an Ensemble using Data from Approach 1 and 2

In [104]:
import psycopg2 as pg
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error

In [105]:
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

### Get the train/test data from the database

In [106]:
approach1TrainingDataQuery = '''  SELECT * FROM  prod."features" f WHERE f."Season Type"='NCAA Tourney' AND f."Season" < 2014 ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_training = pd.read_sql_query(approach1TrainingDataQuery, conn)

In [107]:
approach1TestingDataQuery = '''  SELECT * FROM  prod."features" f WHERE (f."Season Type"='NCAA Tourney') AND (f."Season">2013 and f."Season"<2018) ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_test = pd.read_sql_query(approach1TestingDataQuery, conn)

In [108]:
approach2TrainingDataQuery = '''  SELECT * FROM prod."features" f WHERE (f."Season Type"='Regular') OR (f."Season" < 2014) ORDER BY "Season", "DayNum", "Team"  '''
df_approach2_training = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [109]:
approach2TestingDataQuery = ''' SELECT * FROM prod."features" f WHERE (f."Season Type"='NCAA Tourney') AND (f."Season" > 2013) AND (f."Season" < 2018) order by "Season", "DayNum", "Team" '''
df_approach2_test = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [110]:
conn.close()

### Remove NaNs

In [111]:
df_approach1_training = df_approach1_training.drop( df_approach1_training[
                                                                    (np.isnan(df_approach1_training.Outcome)) |    
                                                                    (np.isnan(df_approach1_training.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach1_training.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach1_training.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach1_training.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_training.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_training.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_training.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_training.AvgRank)) |
                                                                    (np.isnan(df_approach1_training.OpponentAvgRank)) |
                                                                    (np.isnan(df_approach1_training.WinPct)) |
                                                                    (np.isnan(df_approach1_training.OpponentWinPct)) |
                                                                    (np.isnan(df_approach1_training.WinPctDiff)) |
                                                                    (np.isnan(df_approach1_training.AvgPointsFor)) |
                                                                    (np.isnan(df_approach1_training.AvgPointsAgainst)) |
                                                                    (np.isnan(df_approach1_training.AvgNetPointsFor)) |
                                                                    (np.isnan(df_approach1_training.OpponentAvgPointsFor)) |
                                                                    (np.isnan(df_approach1_training.OpponentAvgPointsAgainst)) |
                                                                    (np.isnan(df_approach1_training.OpponentAvgNetPointsFor)) |
                                                                    (np.isnan(df_approach1_training.TourWins)) |
                                                                    (np.isnan(df_approach1_training.OpponentTourWins)) |
                                                                    (np.isnan(df_approach1_training.TourWinsDiff))
].index)

In [112]:
df_approach1_test = df_approach1_test.drop( df_approach1_test[
                                                                    (np.isnan(df_approach1_test.Outcome)) |  
                                                                    (np.isnan(df_approach1_test.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach1_test.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach1_test.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach1_test.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_test.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_test.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_test.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_test.AvgRank)) |
                                                                    (np.isnan(df_approach1_test.OpponentAvgRank)) |
                                                                    (np.isnan(df_approach1_test.WinPct)) |
                                                                    (np.isnan(df_approach1_test.OpponentWinPct)) |
                                                                    (np.isnan(df_approach1_test.WinPctDiff)) |
                                                                    (np.isnan(df_approach1_test.AvgPointsFor)) |
                                                                    (np.isnan(df_approach1_test.AvgPointsAgainst)) |
                                                                    (np.isnan(df_approach1_test.AvgNetPointsFor)) |
                                                                    (np.isnan(df_approach1_test.OpponentAvgPointsFor)) |
                                                                    (np.isnan(df_approach1_test.OpponentAvgPointsAgainst)) |
                                                                    (np.isnan(df_approach1_test.OpponentAvgNetPointsFor)) |
                                                                    (np.isnan(df_approach1_test.TourWins)) |
                                                                    (np.isnan(df_approach1_test.OpponentTourWins)) |
                                                                    (np.isnan(df_approach1_test.TourWinsDiff))
].index)

In [113]:
df_approach2_training = df_approach2_training.drop( df_approach2_training[
                                                                    (np.isnan(df_approach2_training.Outcome)) |  
                                                                    (np.isnan(df_approach2_training.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach2_training.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach2_training.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach2_training.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_training.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_training.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_training.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_training.AvgRank)) |
                                                                    (np.isnan(df_approach2_training.OpponentAvgRank)) |
                                                                    (np.isnan(df_approach2_training.WinPct)) |
                                                                    (np.isnan(df_approach2_training.OpponentWinPct)) |
                                                                    (np.isnan(df_approach2_training.WinPctDiff)) |
                                                                    (np.isnan(df_approach2_training.AvgPointsFor)) |
                                                                    (np.isnan(df_approach2_training.AvgPointsAgainst)) |
                                                                    (np.isnan(df_approach2_training.AvgNetPointsFor)) |
                                                                    (np.isnan(df_approach2_training.OpponentAvgPointsFor)) |
                                                                    (np.isnan(df_approach2_training.OpponentAvgPointsAgainst)) |
                                                                    (np.isnan(df_approach2_training.OpponentAvgNetPointsFor)) |
                                                                    (np.isnan(df_approach2_training.TourWins)) |
                                                                    (np.isnan(df_approach2_training.OpponentTourWins)) |
                                                                    (np.isnan(df_approach2_training.TourWinsDiff))                                                                     
                                                                     
].index)

In [114]:
df_approach2_test = df_approach2_test.drop( df_approach2_test[
                                                                    (np.isnan(df_approach2_test.Outcome)) |  
                                                                    (np.isnan(df_approach2_test.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach2_test.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach2_test.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach2_test.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_test.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_test.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_test.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_test.AvgRank)) |
                                                                    (np.isnan(df_approach2_test.OpponentAvgRank)) |
                                                                    (np.isnan(df_approach2_test.WinPct)) |
                                                                    (np.isnan(df_approach2_test.OpponentWinPct)) |
                                                                    (np.isnan(df_approach2_test.WinPctDiff)) |
                                                                    (np.isnan(df_approach2_test.AvgPointsFor)) |
                                                                    (np.isnan(df_approach2_test.AvgPointsAgainst)) |
                                                                    (np.isnan(df_approach2_test.AvgNetPointsFor)) |
                                                                    (np.isnan(df_approach2_test.OpponentAvgPointsFor)) |
                                                                    (np.isnan(df_approach2_test.OpponentAvgPointsAgainst)) |
                                                                    (np.isnan(df_approach2_test.OpponentAvgNetPointsFor)) |
                                                                    (np.isnan(df_approach2_test.TourWins)) |
                                                                    (np.isnan(df_approach2_test.OpponentTourWins)) |
                                                                    (np.isnan(df_approach2_test.TourWinsDiff))
].index)

### Create Train and Test Sets

In [115]:
target1 = 'Outcome'
features1 = ['TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'SeedDiff', 'AvgRank', 'OpponentAvgRank', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff']

target2 = 'Outcome'
features2 = ['TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'AvgRank', 'OpponentAvgRank', 'WinPct', 'OpponentWinPct', 'WinPctDiff', 'AvgPointsFor', 'AvgPointsAgainst', 'AvgNetPointsFor', 'OpponentAvgPointsFor', 'OpponentAvgPointsAgainst', 'OpponentAvgNetPointsFor', 'TourWins', 'OpponentTourWins', 'TourWinsDiff']

In [116]:
X_train1 = df_approach1_training[features1]
y_train1 = df_approach1_training[target1]
X_test1 = df_approach1_test[features1]
y_test1 = df_approach1_test[target1]

In [117]:
X_train2 = df_approach2_training[features2]
y_train2 = df_approach2_training[target2]
X_test2 = df_approach2_test[features2]
y_test2 = df_approach2_test[target2]

#### This is the same data as X_test1, but has narrower features to support Approach 2 in the ensemble

In [118]:
X_test1_2 = df_approach1_test[features2]
y_test1_2 = df_approach1_test[target2]

### Approach 1  - (note that this includes 'SeedDiff')

In [119]:
lrc1 = LogisticRegression(penalty='l1')
params = {'C': [25,50,75,100,125,150,175,200,225,250,275,300]}
gscv1 = GridSearchCV(lrc1, params, scoring='accuracy', refit=True)
gscv1.fit(X_train1, y_train1)
print('Best log_loss: {:.4}, with best C: {}'.format(gscv1.best_score_, gscv1.best_params_['C']))

Best log_loss: 0.7006, with best C: 75


In [120]:
lrc1 = LogisticRegression(penalty='l1', C=gscv1.best_params_['C'])
lrc1.fit(X_train1, y_train1)
lrc1.score(X_test1, y_test1)

0.73880597014925375

In [121]:
from sklearn.metrics import classification_report
y_pred1 = lrc1.predict(X_test1)
y_pred_prob1 = lrc1.predict_proba(X_test1)
print(classification_report(y_test1, y_pred1))

             precision    recall  f1-score   support

          0       0.74      0.74      0.74       268
          1       0.74      0.74      0.74       268

avg / total       0.74      0.74      0.74       536



In [122]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test1, y_pred_prob1[:, 1]))


ROC-AUC score: 0.801


In [123]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test1, y_pred_prob1[:,1]))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test1, y_pred_prob1[:,1]))

Log loss: 0.536094091343
RMSE: 0.181579832793


### Approach 2

In [124]:
lrc2 = LogisticRegression(penalty='l1')
params = {'C': [50,75,100,125,150,175,200,225]}
gscv2 = GridSearchCV(lrc2, params, scoring='accuracy', refit=True)
gscv2.fit(X_train2, y_train2)
print('Best log_loss: {:.4}, with best C: {}'.format(gscv2.best_score_, gscv2.best_params_['C']))

Best log_loss: 0.7104, with best C: 150


In [125]:
lrc2 = LogisticRegression(penalty='l1', C=gscv2.best_params_['C'])
lrc2.fit(X_train2, y_train2)
lrc2.score(X_test2, y_test2)

0.71069206842923793

In [126]:
from sklearn.metrics import classification_report
y_pred2 = lrc2.predict(X_test2)
y_pred_prob2 = lrc2.predict_proba(X_test2)
print(classification_report(y_test2, y_pred2))

             precision    recall  f1-score   support

          0       0.71      0.71      0.71     77160
          1       0.71      0.71      0.71     77160

avg / total       0.71      0.71      0.71    154320



In [127]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test2, y_pred_prob2[:, 1]))


ROC-AUC score: 0.787


In [128]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test2, y_pred_prob2[:,1]))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test2, y_pred_prob2[:,1]))

Log loss: 0.55208464103
RMSE: 0.187713908139


### Simple Ensemble of Approach 1 and Approach 2

In [129]:
# Get prediction probabilities from both models
probs1 = lrc1.predict_proba(X_test1)
probs2 = lrc2.predict_proba(X_test1_2)

In [130]:
# Get the mean of the probabilities
loss_prob_df = pd.DataFrame(data={'approach1':probs1[:,0], 'approach2':probs2[:,0]})
win_prob_df = pd.DataFrame(data={'approach1':probs1[:,1], 'approach2':probs2[:,1]})
loss_win_df = pd.DataFrame(data={'loss_prob':loss_prob_df.mean(axis=1), 'win_prob':win_prob_df.mean(axis=1), 'predict':0})

In [131]:
# Determine loss / win depending on which probability is larger
ensemble_predictions = []
ensemble_probs = []

for index in range(len(win_prob_df)):
    if loss_win_df['loss_prob'][index] > loss_win_df['win_prob'][index] :
        ensemble_predictions.append(0)
        ensemble_probs.append(1-loss_win_df['loss_prob'][index])
    else:
        ensemble_predictions.append(1)
        ensemble_probs.append(loss_win_df['win_prob'][index])

In [132]:
print(classification_report(y_test1, ensemble_predictions))

             precision    recall  f1-score   support

          0       0.72      0.72      0.72       268
          1       0.72      0.72      0.72       268

avg / total       0.72      0.72      0.72       536



In [133]:
print("\nROC-AUC score: %.3f" % roc_auc_score(y_test1, ensemble_probs))


ROC-AUC score: 0.813


In [134]:
# Determine log-loss
print 'Log loss: {}'.format(log_loss(y_test1, ensemble_probs))
# Determine RMSE
print 'RMSE: {}'.format(mean_squared_error(y_test1, ensemble_probs))

Log loss: 0.527962493173
RMSE: 0.177670652146
