In [None]:
import psycopg2

import pandas as pds

import sqlalchemy

import numpy as np

import matplotlib.pyplot as plt

import scipy.stats as scipy

import statistics as stats

import math as math

import yellowbrick as yb

# Create Database Engine and Dataframes

In [None]:
import os

# Connection URL for the NFL stats PostgreSQL database. Set NFL_STATS_DB_URL to
# supply credentials from the environment instead of editing the notebook.
# NOTE(review): the fallback still embeds a password in the notebook source;
# rotate it and drop the fallback once every user has the variable set.
DB_URL = os.environ.get(
    'NFL_STATS_DB_URL',
    'postgresql+psycopg2://postgres:georgetown@nflstats.cb6meldrm5db.us-east-1.rds.amazonaws.com:5432/nfl_stats',
)

# pool_recycle=3600 asks SQLAlchemy to replace pooled connections older than one hour.
Engine = sqlalchemy.create_engine(DB_URL, pool_recycle=3600)

# Kept open at module level; the commented-out queries below reuse it.
dbConnection = Engine.connect()

# Full modelling table: every row and column of final_table_joined.
df_table = pds.read_sql("""select * from final_table_joined""", dbConnection)

# Use for testing 2018 data vs prior years
#df_table_train = pds.read_sql("""select * from game_data_2000_to_2017""", dbConnection);
#df_table_test = pds.read_sql("""select * from game_data_2018""", dbConnection);

# QC

In [None]:
# Quality check: summary statistics of df_table. This is the only statement in
# the cell, so its result is what the notebook renders as the cell output.
df_table.describe()

# Use for testing 2018 data vs prior years
#df_table_train.describe()
#df_table_test.describe()

In [None]:
# Quality check: show the first 10 rows of df_table as the cell output.
df_table.head(10)

# Use for testing 2018 data vs prior years
#df_table_train.head(10)
#df_table_test.head(10)

In [None]:
# Quality check: number of missing values in each column of df_table.
df_table.isna().sum()

# Use for testing 2018 data vs prior years
#df_table_train.isna().sum()
#df_table_test.isna().sum()

# Features for Modeling

In [None]:
# Model inputs: eight statistics for the home team followed by the same eight
# statistics for the away team.
features = [
    # home side
    'home_point_differential',
    'home_passing_yards',
    'home_rushing_yards',
    'home_turnover_differential',
    'home_passing_yards_against',
    'home_rushing_yards_against',
    'home_win_percentage',
    'home_power_ranking',
    # away side
    'away_point_differential',
    'away_passing_yards',
    'away_rushing_yards',
    'away_turnover_differential',
    'away_passing_yards_against',
    'away_rushing_yards_against',
    'away_win_percentage',
    'away_power_ranking',
]

# Target vector and feature matrix as plain arrays for scikit-learn.
y = df_table['home_outcome'].values
x = df_table[features].values

# Use for testing 2018 data vs prior years
#X_train = df_table_train[features].values
#y_train = df_table_train['home_outcome'].values
#X_test = df_table_test[features].values
#y_test = df_table_test['home_outcome']

# Decision Tree Classifier

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# reports below are reproducible under Restart & Run All; unseeded, every
# execution scored the model on a different random test set.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Depth-3 Gini tree; a node must hold at least 20% of the training rows to be split.
DTCmodel = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=0.2)

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome map to loss, then win -- confirm.
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(DTCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(DTCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(DTCmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(DTCmodel, classes=classes, cmap='BuPu')
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(DTCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(DTCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for Decision Tree Classifier

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Estimator template. GridSearchCV fits clones of it, never this instance.
# This cell draws no split of its own: X_train / X_test / y_train / y_test are
# the ones left in the kernel by an earlier cell.
DTCoptimal = DecisionTreeClassifier()

# Candidate values
min_samples_split = np.linspace(0.1, 0.5, 5)
# Depths 1..10. The previous np.linspace(1, 10, 1) produced the single value 1,
# so depth was never actually searched.
max_depth = list(range(1, 11))
criterion = ['gini', 'entropy']

# Parameter grid
params = dict(max_depth=max_depth,
              min_samples_split=min_samples_split,
              criterion=criterion)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(DTCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Gaussian Naive Bayes

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# reports below are reproducible under Restart & Run All; unseeded, every
# execution scored the model on a different random test set.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Gaussian naive Bayes with default settings
GNBmodel = GaussianNB()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome map to loss, then win -- confirm.
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(GNBmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(GNBmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(GNBmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(GNBmodel, classes=classes, cmap='BuPu')
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(GNBmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(GNBmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows are held out on every run; unseeded, every execution scored the
# model on a different random test set.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Random forest with default settings
RFCmodel = RandomForestClassifier()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome map to loss, then win -- confirm.
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(RFCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(RFCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(RFCmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(RFCmodel, classes=classes)
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(RFCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(RFCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

# Seeded 80/20 split so the search and its final report are reproducible.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Estimator template. GridSearchCV fits clones of it, never this instance.
RFCoptimal = RandomForestClassifier()

# Fractions of the training rows. A float min_samples_split must lie in (0, 1];
# the previous np.linspace(0.5, 2.5, 5) included 1.5, 2.0 and 2.5, which
# scikit-learn rejects, so most candidates could never be fitted.
min_samples_split = np.linspace(0.1, 0.5, 5)
max_depth = list(range(1, 11))
criterion = ['gini', 'entropy']
n_estimators = [100, 150]
# 'auto' was an alias of 'sqrt' for classifiers (and is not accepted by recent
# scikit-learn releases), so it is replaced by a genuinely different option.
max_features = ['sqrt', 'log2']

# Parameter grid. The original dict(...) call was closed after
# min_samples_split, which made the whole cell a syntax error.
params = dict(max_depth=max_depth,
              min_samples_split=min_samples_split,
              criterion=criterion,
              n_estimators=n_estimators,
              max_features=max_features)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(RFCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# No new split is drawn here: X_train / X_test / y_train / y_test are whatever
# an earlier cell left in the kernel. Uncomment the two lines below to re-split.
#splits = train_test_split(x, y, test_size=0.2)
#X_train, X_test, y_train, y_test = splits

# Display labels for the visualizers
classes = ['loss', 'win']

# Gradient boosting with default settings
GBCmodel = GradientBoostingClassifier()

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(GBCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(GBCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(GBCmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(GBCmodel, classes=classes)
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(GBCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(GBCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Estimator template. GridSearchCV fits clones of it, never this instance.
# This cell draws no split of its own: X_train / X_test / y_train / y_test are
# the ones left in the kernel by an earlier cell.
# NOTE(review): criterion='mae' is only accepted by older scikit-learn releases
# -- confirm against the pinned version before upgrading.
GBCoptimal = GradientBoostingClassifier(max_features='sqrt', criterion='mae', max_depth=4,
                                        validation_fraction=0.675, subsample=0.96875)

# Candidate values
learning_rate = np.linspace(0.1, 1, 5)
max_depth = list(range(1, 6))
min_impurity_decrease = np.linspace(0, 1, 5)
# NOTE(review): validation_fraction is documented as used only when
# n_iter_no_change is set, which it is not here; searching it likely multiplies
# the run time by five without changing any score -- consider dropping it.
validation_fraction = np.linspace(0.65, 0.7, 5)
min_samples_leaf = [1, 2, 3]
subsample = np.linspace(0.875, 1, 5)

# Parameter grid. The original referenced an undefined min_weight_fraction_leaf
# (NameError) while min_impurity_decrease above was defined but never used.
params = dict(learning_rate=learning_rate,
              subsample=subsample,
              min_samples_leaf=min_samples_leaf,
              validation_fraction=validation_fraction,
              min_impurity_decrease=min_impurity_decrease,
              max_depth=max_depth)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(GBCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Voting Classifier

In [None]:
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
# PrecisionRecallCurve and ClassPredictionError are used below but were not
# imported by this cell; they only resolved if an earlier cell had been run.
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# No new split is drawn here: X_train / X_test / y_train / y_test are whatever
# an earlier cell left in the kernel. Uncomment the two lines below to re-split.
#splits = train_test_split(x, y, test_size=0.2)
#X_train, X_test, y_train, y_test = splits

# Members of the ensemble
DTCinput = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=0.2)
GNBinput = GaussianNB()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier()

# Majority-vote ensemble over the five members
VCmodel = VotingClassifier(
    estimators=[('dtc', DTCinput), ('gnb', GNBinput), ('rfc', RFCinput),
                ('etc', ETCinput), ('gbc', GBCinput)],
    voting='hard',
)

# Display labels for the visualizers
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(VCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(VCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(VCmodel, classes=classes)
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve. It needs continuous scores (decision_function or
# predict_proba); a hard-voting ensemble exposes neither, so the curve is only
# drawn when the ensemble is switched to soft voting.
if VCmodel.voting == 'soft':
    prc_viz = PrecisionRecallCurve(VCmodel)
    prc_viz.fit(X_train, y_train)
    prc_viz.score(X_test, y_test)
    prc_viz.show()
else:
    print("Skipping precision-recall curve: VotingClassifier(voting='hard') has no predict_proba.")

# Class prediction error on the held-out split (needs predicted labels only)
cpe_visualizer = ClassPredictionError(VCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for Voting Classifier

In [None]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoostClassifier and LogisticRegression are used below but were imported
# only by later cells, so this cell raised NameError when run top to bottom.

# Seeded 80/20 split so the search and its final report are reproducible.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Members of the ensemble
DTCinput = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=0.2)
GNBinput = GaussianNB()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier()
BCinput = BaggingClassifier(GaussianNB())
ABCinput = AdaBoostClassifier(RandomForestClassifier())
SCinput = StackingClassifier(
    estimators=[('dtc', DTCinput), ('gnb', GNBinput), ('rfc', RFCinput), ('etc', ETCinput),
                ('gbc', GBCinput), ('bc', BCinput), ('abc', ABCinput)],
    final_estimator=LogisticRegression(),
    cv=12,
)

# Estimator template. GridSearchCV fits clones of it, never this instance.
VCoptimal = VotingClassifier(
    estimators=[('dtc', DTCinput), ('gnb', GNBinput), ('rfc', RFCinput), ('etc', ETCinput),
                ('gbc', GBCinput), ('bc', BCinput), ('abc', ABCinput), ('sc', SCinput)]
)

# Candidate values
voting = ['soft', 'hard']
# NOTE(review): flatten_transform is documented as shaping the output of
# transform(); it probably has no effect on the predictions scored here -- confirm.
flatten_transform = [True, False]

# Parameter grid
params = dict(voting=voting,
              flatten_transform=flatten_transform)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(VCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Bagging Classifier

In [None]:
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# PrecisionRecallCurve and ClassPredictionError are used below but were not
# imported by this cell; they only resolved if an earlier cell had been run.
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# Hold out 20% of the rows for testing; seeded so the same rows are held out
# on every run.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Bagged Gaussian naive Bayes. A fresh GaussianNB() is used as the base so the
# cell no longer depends on GNBmodel left behind by an earlier cell.
BCmodel = BaggingClassifier(GaussianNB())

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome map to loss, then win -- confirm.
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(BCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(BCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(BCmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix for the bagging model. The original passed DTCmodel here,
# so the plot showed the decision tree from an earlier cell instead.
cm_visualizer = ConfusionMatrix(BCmodel, classes=classes, cmap='BuPu')
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(BCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(BCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for Bagging Classifier

In [None]:
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Seeded 80/20 split so the search and its final report are reproducible.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Estimator template. GridSearchCV fits clones of it, never this instance.
BCoptimal = BaggingClassifier()

# Candidate base estimators
DTCinput = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=0.2)
GNBinput = GaussianNB()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
# NOTE(review): criterion='mae' is only accepted by older scikit-learn releases
# -- confirm against the pinned version before upgrading.
GBCinput = GradientBoostingClassifier(criterion='mae', max_features='sqrt', max_depth=4,
                                      validation_fraction=0.675, subsample=0.96875)

# Candidate values
# NOTE(review): newer scikit-learn releases name this parameter `estimator`
# instead of `base_estimator` -- confirm against the pinned version.
base_estimator = [DTCinput, GNBinput, RFCinput, ETCinput, GBCinput]
n_estimators = list(range(10, 101, 10))
#max_samples = np.linspace(0.1, 1, 5)
max_features = np.linspace(0.1, 1, 5)
#bootstrap = [True, False]
#bootstrap_features = [True, False]
#oob_score = [True, False]
#warm_start = [True, False]

# Parameter grid (commented entries are optional extra dimensions)
params = dict(base_estimator=base_estimator,
              n_estimators=n_estimators,
              #max_samples = max_samples,
              max_features=max_features)
              #bootstrap = bootstrap,
              #bootstrap_features = bootstrap_features,
              #oob_score = oob_score,
              #warm_start = warm_start)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(BCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# ROCAUC, ConfusionMatrix, PrecisionRecallCurve and ClassPredictionError are
# used below but were not imported by this cell; they only resolved if an
# earlier cell had been run.
from yellowbrick.classifier import (
    ClassificationReport,
    ClassPredictionError,
    ConfusionMatrix,
    PrecisionRecallCurve,
    ROCAUC,
)
from yellowbrick.model_selection import CVScores, LearningCurve

# Hold out 20% of the rows for testing; seeded so the same rows are held out
# on every run.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# AdaBoost over random forests. A fresh RandomForestClassifier() is used as the
# base so the cell no longer depends on RFCmodel left behind by an earlier cell.
ABCmodel = AdaBoostClassifier(RandomForestClassifier())

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome map to loss, then win -- confirm.
classes = ['loss', 'win']

# Classification report (with per-class support) on the held-out split
cr_visualizer = ClassificationReport(ABCmodel, classes=classes, support=True)
cr_visualizer.fit(X_train, y_train)
cr_visualizer.score(X_test, y_test)
cr_visualizer.show()

# 12-fold stratified cross-validation over the full data set, scored by weighted F1
cv = StratifiedKFold(n_splits=12)
cv_visualizer = CVScores(ABCmodel, cv=cv, scoring='f1_weighted')
cv_visualizer.fit(x, y)
cv_visualizer.show()

# ROC curves / AUC on the held-out split
roc_visualizer = ROCAUC(ABCmodel, classes=classes)
roc_visualizer.fit(X_train, y_train)
roc_visualizer.score(X_test, y_test)
roc_visualizer.show()

# Confusion matrix: score() predicts on the test rows and tabulates the result
cm_visualizer = ConfusionMatrix(ABCmodel, classes=classes)
cm_visualizer.fit(X_train, y_train)
cm_visualizer.score(X_test, y_test)
cm_visualizer.show()

# Precision-recall curve on the held-out split
prc_viz = PrecisionRecallCurve(ABCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Class prediction error on the held-out split
cpe_visualizer = ClassPredictionError(ABCmodel, classes=classes)
cpe_visualizer.fit(X_train, y_train)
cpe_visualizer.score(X_test, y_test)
cpe_visualizer.show()

# Hyperparameter Tuning for AdaBoost Classifier

In [None]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Seeded 80/20 split so the search and its final report are reproducible.
splits = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Estimator template. GridSearchCV fits clones of it, never this instance.
ABCoptimal = AdaBoostClassifier()

# Possible base estimators; only the ones listed in base_estimator are searched.
DTCinput = DecisionTreeClassifier()
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier()
RFCinput = RandomForestClassifier()
# Instantiated: the original assigned the ExtraTreesClassifier class itself
# (missing parentheses), which is not usable as an estimator.
ETCinput = ExtraTreesClassifier()
# NOTE(review): criterion='mae' is only accepted by older scikit-learn releases
# -- confirm against the pinned version before upgrading.
GBCinput = GradientBoostingClassifier(criterion='mae', max_features='sqrt', max_depth=4,
                                      validation_fraction=0.675, subsample=0.96875)
BCinput = BaggingClassifier(GaussianNB())

# Candidate values
# NOTE(review): newer scikit-learn releases name this parameter `estimator`
# instead of `base_estimator`, and no longer accept 'SAMME.R' -- confirm
# against the pinned version.
base_estimator = [RFCinput]
n_estimators = list(range(10, 101, 10))
learning_rate = np.linspace(0.1, 1, 5)
algorithm = ['SAMME', 'SAMME.R']

# Parameter grid
params = dict(base_estimator=base_estimator,
              n_estimators=n_estimators,
              learning_rate=learning_rate,
              algorithm=algorithm)

scores = ['precision', 'recall']

# One grid search per metric. Each search is scored with the macro-averaged
# form of the metric, which does not depend on how the labels are encoded.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    gridF = GridSearchCV(ABCoptimal, params, scoring='%s_macro' % score,
                         cv=12, verbose=1, n_jobs=-1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable is `candidate`, not `params`, so it cannot overwrite
    # the grid that the next pass of the outer loop hands to GridSearchCV.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict with the fitted search (its refit best estimator); the template
    # estimator itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Stacking Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve
from sklearn.ensemble import BaggingClassifier

# Visualizers used further down that the import list of this cell does not
# cover; importing them here keeps the cell runnable on a fresh kernel.
from yellowbrick.classifier import ROCAUC, ConfusionMatrix, PrecisionRecallCurve, ClassPredictionError

# Hold out 20% of the rows for scoring the visualizers.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# Base learners that feed the stacking classifier.
DTCinput = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, min_samples_split = 0.2)
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GNBinput = GaussianNB()
GBCinput = GradientBoostingClassifier()
# Bag a fresh GaussianNB rather than a model object left over from another
# cell, matching how the bagging estimator is built in the tuning cells.
BCinput = BaggingClassifier(GaussianNB())
ABCinput = AdaBoostClassifier(RandomForestClassifier())

estimators = [('dtc', DTCinput), ('gnb', GNBinput), ('rfc', RFCinput), ('etc', ETCinput), ('gbc', GBCinput), ('bc', BCinput), ('abc', ABCinput)]

# Logistic regression combines the base learners' outputs.
SCmodel = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 12, stack_method = 'auto')

# Display labels passed to the visualizers.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(SCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(SCmodel, cv = cv, scoring = 'f1_weighted')

# Cross-validation is run on the full data set, not just the training split.
cv_visualizer.fit(x, y)
cv_visualizer.show()

# Instantiate the ROC visualizer with the classification model
roc_visualizer = ROCAUC(SCmodel, classes=classes)

roc_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
roc_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
roc_visualizer.show()                       # Finalize and show the figure

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(SCmodel, classes=classes, cmap='BuPu')

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Create the Precision-Recall visualizer, fit, score, and show it
prc_viz = PrecisionRecallCurve(SCmodel)
prc_viz.fit(X_train, y_train)
prc_viz.score(X_test, y_test)
prc_viz.show()

# Instantiate the classification model and Class Prediction Error visualizer
cpe_visualizer = ClassPredictionError(
    SCmodel, classes=classes
)

# Fit the training data to the visualizer
cpe_visualizer.fit(X_train, y_train)

# Evaluate the model on the test data
cpe_visualizer.score(X_test, y_test)

# Draw visualization
cpe_visualizer.show()

# Hyperparameter Tuning for Stacking Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Base learners, configured with the hyperparameters hard-coded below.
DTCinput = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, min_samples_split = 0.2)
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier(algorithm = 'auto', leaf_size = 5, n_neighbors = 100, weights = 'uniform')
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier(criterion = 'mae', max_features = 'sqrt', max_depth = 4, validation_fraction = 0.675, subsample = 0.96875)
BCinput = BaggingClassifier(GaussianNB(), max_features = 0.325, n_estimators = 70)
ABCinput = AdaBoostClassifier(RandomForestClassifier(), learning_rate = 0.325, n_estimators = 10, algorithm = 'SAMME')

estimators = [('dtc', DTCinput), ('gnb', GNBinput), ('knnc', KNNCinput), ('rfc', RFCinput), ('etc', ETCinput), ('gbc', GBCinput), ('bc', BCinput), ('abc', ABCinput)]

# Stacking classifier whose `stack_method` is searched below.
SCoptimal = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())

stack_method = ['auto', 'predict_proba', 'decision_function', 'predict']

params = dict(stack_method = stack_method)

scores = ['precision', 'recall']

# Run one grid search per metric. X_train / X_test / y_train / y_test are not
# created in this cell; they come from a split made in an earlier cell.
for score in scores:

    print("# Tuning hyper-parameters for %s" % score)
    print()

    # `scoring` must be passed explicitly, otherwise every pass optimises the
    # estimator's default score. The macro-averaged variant is used so the
    # scorer does not depend on a positive-label convention.
    gridF = GridSearchCV(SCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # Use a distinct loop name: rebinding `params` here would replace the grid
    # with a single candidate dict before the next metric is searched.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict through the fitted search object (the refitted best estimator).
    # GridSearchCV fits clones, so SCoptimal itself is never fitted.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Close Database Connection

In [None]:
# Close the connection opened when the dataframes were loaded, then dispose of
# the engine so its connection pool does not keep any connections open.
dbConnection.close()
Engine.dispose()