# This Demonstrates Creating an Ensemble using Data from Approach 1 and 2

In [1]:
import psycopg2 as pg
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
conn = pg.connect(database='postgres',
                  user='postgres',
                  password='w207final',
                  host='35.185.225.167')

### Get the train/test data from the database

In [3]:
approach1TrainingDataQuery = '''  SELECT * FROM  prod."features" f WHERE f."Season Type"='NCAA Tourney' AND f."Season" < 2014 ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_training = pd.read_sql_query(approach1TrainingDataQuery, conn)

In [4]:
approach1TestingDataQuery = '''  SELECT * FROM  prod."features" f WHERE (f."Season Type"='NCAA Tourney') AND (f."Season">2013 and f."Season"<2018) ORDER BY "Season", "DayNum", "Team"  '''
df_approach1_test = pd.read_sql_query(approach1TestingDataQuery, conn)

In [5]:
approach2TrainingDataQuery = '''  SELECT * FROM prod."features" f WHERE (f."Season Type"='Regular') AND (f."Season" < 2018) ORDER BY "Season", "DayNum", "Team"  '''
df_approach2_training = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [6]:
approach2TestingDataQuery = ''' SELECT * FROM prod."features" f WHERE (f."Season Type"='Regular') AND (f."Season" > 2002) order by "Season", "DayNum", "Team" '''
df_approach2_test = pd.read_sql_query(approach2TrainingDataQuery, conn)

In [7]:
conn.close()

### Remove NaNs

In [8]:
df_approach1_training = df_approach1_training.drop( df_approach1_training[
                                                                    (np.isnan(df_approach1_training.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach1_training.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach1_training.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach1_training.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_training.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_training.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach1_training.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_training.DefensiveRebounds_Opponent))

].index)

In [9]:
df_approach1_test = df_approach1_test.drop( df_approach1_test[
                                                                    (np.isnan(df_approach1_test.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach1_test.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach1_test.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach1_test.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_test.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach1_test.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach1_test.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach1_test.DefensiveRebounds_Opponent))

].index)

In [10]:
df_approach2_training = df_approach2_training.drop( df_approach2_training[
                                                                    (np.isnan(df_approach2_training.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach2_training.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach2_training.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach2_training.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_training.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_training.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach2_training.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_training.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_training.AvgRank)) |
                                                                    (np.isnan(df_approach2_training.OpponentAvgRank))
].index)

In [11]:
df_approach2_test = df_approach2_test.drop( df_approach2_test[
                                                                    (np.isnan(df_approach2_test.TwoPointPct_Team)) |
                                                                    (np.isnan(df_approach2_test.ThreePointPct_Team)) |
                                                                    (np.isnan(df_approach2_test.FreeThrowPct_Team)) |
                                                                    (np.isnan(df_approach2_test.OffensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_test.DefensiveRebounds_Team)) |
                                                                    (np.isnan(df_approach2_test.TwoPointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.ThreePointPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.FreeThrowPct_Opponent)) |
                                                                    (np.isnan(df_approach2_test.OffensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_test.DefensiveRebounds_Opponent)) |
                                                                    (np.isnan(df_approach2_test.AvgRank)) |
                                                                    (np.isnan(df_approach2_test.OpponentAvgRank))
].index)

### Create Train and Test Sets

In [12]:
target1 = 'Outcome'
features1 = ['TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'SeedDiff']

target2 = 'Outcome'
features2 = ['TwoPointPct_Team', 'ThreePointPct_Team', 'FreeThrowPct_Team', 'OffensiveRebounds_Team', 'DefensiveRebounds_Team', 'TwoPointPct_Opponent', 'ThreePointPct_Opponent', 'FreeThrowPct_Opponent', 'OffensiveRebounds_Opponent', 'DefensiveRebounds_Opponent', 'AvgRank', 'OpponentAvgRank']

In [13]:
X_train1 = df_approach1_training[features1]
y_train1 = df_approach1_training[target1]
X_test1 = df_approach1_test[features1]
y_test1 = df_approach1_test[target1]

In [15]:
X_train2 = df_approach2_training[features2]
y_train2 = df_approach2_training[target2]
X_test2 = df_approach2_test[features2]
y_test2 = df_approach2_test[target2]

#### This is the same data as X_test1, but has narrower features to support Approach 2 in the ensemble

In [17]:
X_test1_2 = df_approach1_test[features2]
y_test1_2 = df_approach1_test[target2]

### Approach 1  - (note that this includes 'SeedDiff')

In [18]:
lrc1 = LogisticRegression(penalty='l1')
params = {'C': [50,75,100,125,150,175,200,225]}
gscv1 = GridSearchCV(lrc1, params, scoring='accuracy', refit=True)
gscv1.fit(X_train1, y_train1)
print('Best log_loss: {:.4}, with best C: {}'.format(gscv1.best_score_, gscv1.best_params_['C']))

Best log_loss: 0.716, with best C: 75


In [19]:
lrc1 = LogisticRegression(penalty='l1', C=75)
lrc1.fit(X_train1, y_train1)
lrc1.score(X_test1, y_test1)

0.70522388059701491

In [20]:
from sklearn.metrics import classification_report
y_pred1 = lrc1.predict(X_test1)
print(classification_report(y_test1, y_pred1))

             precision    recall  f1-score   support

          0       0.71      0.71      0.71       268
          1       0.71      0.71      0.71       268

avg / total       0.71      0.71      0.71       536



### Approach 2   - (note that this includes 'AvgRank' and 'OpponentAvgRank')

In [21]:
lrc2 = LogisticRegression(penalty='l1')
params = {'C': [50,75,100,125,150,175,200,225]}
gscv2 = GridSearchCV(lrc2, params, scoring='accuracy', refit=True)
gscv2.fit(X_train2, y_train2)
print('Best log_loss: {:.4}, with best C: {}'.format(gscv2.best_score_, gscv2.best_params_['C']))

Best log_loss: 0.7083, with best C: 100


In [22]:
lrc2 = LogisticRegression(penalty='l1', C=100)
lrc2.fit(X_train2, y_train2)
lrc2.score(X_test2, y_test2)

0.70832281941290265

In [23]:
from sklearn.metrics import classification_report
y_pred2 = lrc2.predict(X_test2)
print(classification_report(y_test2, y_pred2))

             precision    recall  f1-score   support

          0       0.71      0.71      0.71     71334
          1       0.71      0.71      0.71     71334

avg / total       0.71      0.71      0.71    142668



### Simple Ensemble of Approach 1 and Approach 2

In [24]:
# Get prediction probabilities from both models
probs1 = lrc1.predict_proba(X_test1)
probs2 = lrc2.predict_proba(X_test1_2)

In [25]:
# Get the mean of the probabilities
loss_prob_df = pd.DataFrame(data={'approach1':probs1[:,0], 'approach2':probs2[:,0]})
win_prob_df = pd.DataFrame(data={'approach1':probs1[:,1], 'approach2':probs2[:,1]})
loss_win_df = pd.DataFrame(data={'loss_prob':loss_prob_df.mean(axis=1), 'win_prob':win_prob_df.mean(axis=1), 'predict':0})

In [26]:
# Determine loss / win depending on which probability is larger
predictions = []

for index in range(len(win_prob_df)):
    if loss_win_df['loss_prob'][index] > loss_win_df['win_prob'][index] :
        predictions.append(0)
    else:
        predictions.append(1)

In [27]:
print(classification_report(y_test1, predictions))

             precision    recall  f1-score   support

          0       0.72      0.72      0.72       268
          1       0.72      0.72      0.72       268

avg / total       0.72      0.72      0.72       536

