In [None]:
import psycopg2

import pandas as pds

import sqlalchemy

import numpy as np

import matplotlib.pyplot as plt

import scipy.stats as scipy

import statistics as stats

import math as math

import seaborn as sns

import yellowbrick as yb

# Create Database Engine

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

# Connection settings are taken from the environment so that the database
# password is never stored in the notebook. The non-secret settings default to
# the values this notebook has always used; the password has no default and is
# prompted for interactively when NFL_STATS_DB_PASSWORD is not set.
db_user = os.environ.get('NFL_STATS_DB_USER', 'postgres')
db_password = os.environ.get('NFL_STATS_DB_PASSWORD') or getpass('Password for the nfl_stats database: ')
db_host = os.environ.get('NFL_STATS_DB_HOST', 'nflstats.cb6meldrm5db.us-east-1.rds.amazonaws.com')
db_port = os.environ.get('NFL_STATS_DB_PORT', '5432')
db_name = os.environ.get('NFL_STATS_DB_NAME', 'nfl_stats')

# quote_plus keeps special characters in the user name / password from
# corrupting the URL.
db_url = 'postgresql+psycopg2://{user}:{password}@{host}:{port}/{name}'.format(
    user=quote_plus(db_user),
    password=quote_plus(db_password),
    host=db_host,
    port=db_port,
    name=db_name,
)

# pool_recycle=3600 asks SQLAlchemy to replace pooled connections that are
# more than an hour old.
Engine = sqlalchemy.create_engine(db_url, pool_recycle=3600)

dbConnection = Engine.connect()

# Create Dataframe and QC

In [None]:
# Load every row of the joined table into a DataFrame over the open connection.
df_table = pds.read_sql(
    sql="""select * from final_table_joined""",
    con=dbConnection,
)

In [None]:
# QC: display summary statistics (count, mean, std, min, quartiles, max) for the loaded table.
df_table.describe()

In [None]:
# QC: display the first 10 rows to eyeball column names and values.
df_table.head(10)

In [None]:
# QC: count missing values per column.
df_table.isna().sum()

# Features for RFC Modeling

In [None]:
# Model inputs: the same eight team statistics for the home side and for the
# away side. Column order here is the column order of the feature matrix `x`.
features = [
    'home_point_differential',
    'home_passing_yards',
    'home_rushing_yards',
    'home_turnover_differential',
    'home_passing_yards_against',
    'home_rushing_yards_against',
    'home_win_percentage',
    'home_power_ranking',
    'away_point_differential',
    'away_passing_yards',
    'away_rushing_yards',
    'away_turnover_differential',
    'away_passing_yards_against',
    'away_rushing_yards_against',
    'away_win_percentage',
    'away_power_ranking',
]

# Feature matrix and target vector as plain NumPy arrays; every modelling
# cell below reads these two names.
feature_frame = df_table[features]
x = feature_frame.values
y = df_table['home_outcome'].values

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

DTCmodel = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, min_samples_split = 0.2)

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(DTCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(DTCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(DTCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(DTCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over decision-tree hyperparameters, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
DTCoptimal = DecisionTreeClassifier()

min_samples_split = np.linspace(0.1,0.5,5)
# num=10 so that every integer depth from 1 to 10 is searched.
max_depth = [int(x) for x in np.linspace(1,10,10)]
criterion = ['gini', 'entropy']

params = dict(max_depth = max_depth,
             min_samples_split = min_samples_split,
             criterion = criterion)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(DTCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so DTCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

GNBmodel = GaussianNB()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(GNBmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(GNBmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(GNBmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(GNBmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

KNNCmodel = KNeighborsClassifier(algorithm = 'auto', leaf_size = 5, n_neighbors = 100, weights = 'uniform')

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(KNNCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(KNNCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(KNNCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(KNNCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over k-nearest-neighbours hyperparameters, run once per scoring
# metric. Reads X_train / y_train / X_test / y_test from the most recently
# executed train/test split cell.
KNNCoptimal = KNeighborsClassifier()

n_neighbors = [int(x) for x in np.linspace(5,100,20)]
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size = [int(x) for x in np.linspace(5,100,20)]

params = dict(n_neighbors = n_neighbors,
             weights = weights,
             algorithm = algorithm,
             leaf_size = leaf_size)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(KNNCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so KNNCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

RFCmodel = RandomForestClassifier()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(RFCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(RFCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(RFCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(RFCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over random-forest hyperparameters, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
RFCoptimal = RandomForestClassifier()

min_samples_split = np.linspace(0.1,0.5,5)
max_depth = [int(x) for x in np.linspace(1,10,10)]
criterion = ['gini', 'entropy']
n_estimators = [int(x) for x in np.linspace(10,200,20)]
# NOTE(review): 'auto' is only accepted by older scikit-learn releases - confirm
# against the installed version.
max_features = ['auto', 'sqrt', 'log2']

params = dict(max_depth = max_depth,
             min_samples_split = min_samples_split,
             criterion = criterion, n_estimators = n_estimators,
             max_features = max_features)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(RFCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so RFCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

ETCmodel = ExtraTreesClassifier()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(ETCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(ETCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(ETCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(ETCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over extra-trees hyperparameters, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
ETCoptimal = ExtraTreesClassifier()

min_samples_split = np.linspace(0.1,0.5,5)
max_depth = [int(x) for x in np.linspace(1,20,10)]
criterion = ['gini', 'entropy']
n_estimators = [int(x) for x in np.linspace(10,200,20)]
# NOTE(review): 'auto' is only accepted by older scikit-learn releases - confirm
# against the installed version.
max_features = ['auto', 'sqrt', 'log2']

params = dict(max_depth = max_depth,
             min_samples_split = min_samples_split,
             criterion = criterion, n_estimators = n_estimators,
             max_features = max_features)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Search the extra-trees estimator defined in this cell (not the random
    # forest from the previous section), optimising the macro-averaged form
    # of the current metric.
    gridF = GridSearchCV(ETCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so ETCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# This estimator is also reused as the base estimator of the Bagging and
# AdaBoost sections further down.
GBCmodel = GradientBoostingClassifier()

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(GBCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(GBCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(GBCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(GBCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over gradient-boosting hyperparameters, run once per scoring
# metric. Reads X_train / y_train / X_test / y_test from the most recently
# executed train/test split cell.
GBCoptimal = GradientBoostingClassifier()

# NOTE(review): 'deviance' and 'mae' are only accepted by older scikit-learn
# releases - confirm against the installed version.
loss = ['deviance', 'exponential']
learning_rate = np.linspace(0.1, 1, 5)
n_estimators = [int(x) for x in np.linspace(50,1000,5)]
max_depth = [int(x) for x in np.linspace(5,100,5)]
max_features = ['log2', 'sqrt']
criterion = ['friedman_mse', 'mae']

params = dict(loss = loss, learning_rate = learning_rate, n_estimators = n_estimators,
             max_depth = max_depth,max_features = max_features,
             criterion = criterion)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(GBCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so GBCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Voting Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# Ensemble members. The tree and k-NN members use the same settings as the
# stand-alone models earlier in the notebook; the rest use library defaults.
DTCinput = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, min_samples_split = 0.2)
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier(algorithm = 'auto', leaf_size = 5, n_neighbors = 100, weights = 'uniform')
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier()

# Hard voting: the predicted class is the majority vote of the six members.
VCmodel = VotingClassifier(estimators = [('dtc', DTCinput), ('gnb', GNBinput), ('knnc', KNNCinput), ('rfc', RFCinput), ('etc', ETCinput), ('gbc', GBCinput)], voting = 'hard')

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(VCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(VCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(VCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(VCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Bagging Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# Bag the gradient-boosting estimator created in the Gradient Boosting
# Classifier cell; that cell must have been run first.
BCmodel = BaggingClassifier(GBCmodel)

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(BCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(BCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(BCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(BCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Bagging Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over bagging hyperparameters, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
BCoptimal = BaggingClassifier()

# Candidate base estimators: the *input instances created in the most
# recently executed cell that defined them (the Voting Classifier cell when the
# notebook is run top to bottom).
# NOTE(review): newer scikit-learn releases name this parameter `estimator`
# instead of `base_estimator` - confirm against the installed version.
base_estimator = [DTCinput, GNBinput, RFCinput, ETCinput, GBCinput]
n_estimators = [int(x) for x in np.linspace(10,100,10)]
max_samples = np.linspace(0.1, 1, 5)
max_features = np.linspace(0.1, 1, 5)
bootstrap = [True, False]
bootstrap_features = [True, False]
# NOTE(review): BaggingClassifier rejects oob_score=True together with
# bootstrap=False or warm_start=True, so those grid points cannot be fitted
# and only add failed fits to the search - consider pruning them.
oob_score = [True, False]
warm_start = [True, False]

params = dict(base_estimator = base_estimator,
             n_estimators = n_estimators,
             max_samples = max_samples,
             max_features = max_features,
             bootstrap = bootstrap,
             bootstrap_features = bootstrap_features,
             oob_score = oob_score,
             warm_start = warm_start)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(BCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so BCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# AdaBoost Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# Boost the gradient-boosting estimator created in the Gradient Boosting
# Classifier cell; that cell must have been run first.
ABCmodel = AdaBoostClassifier(GBCmodel)

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(ABCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(ABCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(ABCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(ABCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for AdaBoost Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over AdaBoost hyperparameters, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
ABCoptimal = AdaBoostClassifier()

DTCinput = DecisionTreeClassifier()
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()      # an instance, not the class itself
GBCinput = GradientBoostingClassifier()

# KNNCinput is deliberately left out of the candidates: AdaBoost fits its base
# estimator with per-sample weights, and KNeighborsClassifier.fit does not
# accept sample_weight, so every such candidate would fail to fit.
# NOTE(review): newer scikit-learn releases name this parameter `estimator`
# instead of `base_estimator` - confirm against the installed version.
base_estimator = [DTCinput, GNBinput, RFCinput, ETCinput, GBCinput]
n_estimators = [int(x) for x in np.linspace(5,100,20)]
learning_rate = np.linspace(0.1, 1, 10)
algorithm = ['SAMME', 'SAMME.R']

params = dict(base_estimator = base_estimator,
             n_estimators = n_estimators,
             learning_rate = learning_rate,
             algorithm = algorithm)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(ABCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so ABCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Stacking Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import CVScores
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import LearningCurve

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and the scores below) differ from run to run.
splits = train_test_split(x, y, test_size=0.2)
X_train, X_test, y_train, y_test = splits

# First-level estimators, all with library-default settings.
DTCinput = DecisionTreeClassifier()
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier()

estimators = [('dtc', DTCinput), ('gnb', GNBinput), ('knnc', KNNCinput), ('rfc', RFCinput), ('etc', ETCinput), ('gbc', GBCinput)]

# A logistic regression combines the first-level predictions, which are
# produced with 12-fold cross-validation inside the stacker.
SCmodel = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 12, stack_method = 'auto')

# Display labels for the visualizers.
# NOTE(review): assumes the sorted values of home_outcome mean loss, then win - confirm.
classes = ['loss', 'win']

# Classification Report
cr_visualizer = ClassificationReport(SCmodel, classes=classes, support=True)

cr_visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
cr_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
cr_visualizer.show()

# Cross-Validation: 12 stratified folds over the full data set, weighted F1
cv = StratifiedKFold(n_splits = 12)

cv_visualizer = CVScores(SCmodel, cv = cv, scoring = 'f1_weighted')

cv_visualizer.fit(x, y)
cv_visualizer.show()

# Learning Curve: 20 training-set fractions from 10% to 100%
sizes = np.linspace(0.1, 1, 20)

lc_visualizer = LearningCurve(SCmodel, cv = cv, scoring='f1_weighted', train_sizes = sizes)

lc_visualizer.fit(x, y)
lc_visualizer.show()

# The ConfusionMatrix visualizer takes a model
cm_visualizer = ConfusionMatrix(SCmodel, classes=classes)

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm_visualizer.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm_visualizer.score(X_test, y_test)

# How did we do?
cm_visualizer.show()

# Hyperparameter Tuning for Stacking Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Grid search over the stacking method, run once per scoring metric.
# Reads X_train / y_train / X_test / y_test from the most recently executed
# train/test split cell.
DTCinput = DecisionTreeClassifier()
GNBinput = GaussianNB()
KNNCinput = KNeighborsClassifier()
RFCinput = RandomForestClassifier()
ETCinput = ExtraTreesClassifier()
GBCinput = GradientBoostingClassifier()

estimators = [('dtc', DTCinput), ('gnb', GNBinput), ('knnc', KNNCinput), ('rfc', RFCinput), ('etc', ETCinput), ('gbc', GBCinput)]

SCoptimal = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())

# NOTE(review): 'decision_function' can only be fitted when every first-level
# estimator implements that method; several of the estimators above appear
# not to, so that candidate is expected to fail - confirm.
stack_method = ['auto', 'predict_proba', 'decision_function', 'predict']

params = dict(stack_method = stack_method)

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Optimise the macro-averaged form of the current metric; without an
    # explicit `scoring` both passes would use the estimator's default score.
    gridF = GridSearchCV(SCoptimal, params, scoring = '%s_macro' % score,
                      cv = 12, verbose = 1, n_jobs = -1)

    bestF = gridF.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(bestF.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = bestF.cv_results_['mean_test_score']
    stds = bestF.cv_results_['std_test_score']
    # The loop variable must not be called `params`: that would overwrite the
    # grid and break the next pass of the outer loop.
    for mean, std, candidate in zip(means, stds, bestF.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # GridSearchCV fits clones, so SCoptimal itself is never fitted. Predict
    # through the search object, which delegates to the refit best estimator.
    y_true, y_pred = y_test, bestF.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Close Database Connection

In [None]:
# Release the connection, then dispose of the engine so that its connection
# pool does not keep idle database connections open for the life of the kernel.
dbConnection.close()
Engine.dispose()