## Imports & Info

In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

pd.options.display.float_format = '{:.3f}'.format

%matplotlib inline

# sklearn imports
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

test_file_url = '~/Documents/Data Science/Data Projects/nfl_db/sausage_factory/TESTING_combine_imputed.csv'

# set random_state SEED variable
SEED = 42

In [2]:
from matplotlib import rcParams
import matplotlib as mpl

#svg.fonttype: path

blue = '#3498DB'
gray = '#95A5A6'
red = '#E74C3C'
dark_gray = '#34495E'
green = '#2ECC71'
purple = '#9B59B6'
flatui = [blue, gray, red, dark_gray, green, purple]

#rcParams['axes.prop_cycle'] = cycler('color', [blue, gray, red, dark_gray, green, purple])

# Patches
mpl.rc('patch', 
       linewidth=0.5, 
       facecolor=dark_gray, 
       edgecolor='w', 
       force_edgecolor=True, 
       antialiased=True)    
  
# Figure
mpl.rc('figure', 
       figsize= (15, 9),
       dpi= 200,
       facecolor='w', 
       edgecolor='w', 
       titlesize='xx-large',
       titleweight=700)

# Grid
mpl.rc('grid', 
       color=dark_gray,
       alpha=0.5, 
       linewidth=0.5, 
       linestyle='-')

# Axes
mpl.rc('axes', 
       facecolor='w',
       edgecolor=dark_gray,
       linewidth=0.5,
       grid=True,
       titlesize='large',
       labelsize='large',
       labelcolor=dark_gray,
       axisbelow=True)

mpl.rc('axes.spines',
       right=False,
       top=False)

# Ticks
mpl.rc('xtick', 
       direction='out',
       color=dark_gray)

mpl.rc('xtick.major', 
       size=0.0)

mpl.rc('xtick.minor', 
       size=0.0)

mpl.rc('ytick', 
       direction='out',
       color=dark_gray)

mpl.rc('ytick.major', 
       size=0.0)

mpl.rc('ytick.minor', 
       size=0.0)

mpl.rc('legend', 
       frameon=False,
       numpoints=1,
       scatterpoints=1)

mpl.rc('font', 
       size=13,
       weight=400,
       family='sans-serif')

rcParams['font.sans-serif']: ['Helvetica', 'Verdana', 'Lucida Grande']

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns

In [3]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# training set breakdown
train_success = y_train.sum()
train_total = len(y_train)
train_percent = train_success / train_total
print('Training Set\nSuccesses:\t{}\nTotal:\t\t{}\nPercent:\t{:.3f}\n'.format(train_success, train_total, train_percent))

# test set breakdown
test_success = y_test.sum()
test_total = len(y_test)
test_percent = test_success / test_total
print('Test Set\nSuccesses:\t{}\nTotal:\t\t{}\nPercent:\t{:.3f}\n\n'.format(test_success, test_total, test_percent))

FileNotFoundError: File b'/Users/brad/Documents/Data Science/Data Projects/nfl_db/sausage_factory/TESTING_combine_imputed.csv' does not exist

In [1]:
target_count = df.success.value_counts()
print('Class 0: {}'.format(target_count[0]))
print('Class 1: {}'.format(target_count[1]))
print('Proportion: {}'.format(round(target_count[0] / target_count[1]), 2), ': 1')

target_count.plot(kind='bar', title = 'Count (target)');

NameError: name 'df' is not defined

## Dummy Classifier

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

X.shape
y.shape

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit a dummy classifier
dummy = DummyClassifier(random_state = SEED)
dummy.fit(X_train, y_train)

# make predictions
y_pred = dummy.predict(X_test)

# SCORING
# accuracy
dummy_accuracy = dummy.score(X_test, y_test)
print('Dummy Classifier accuracy: {:.4f}\n\n'.format(dummy_accuracy))

# classification report
print(classification_report(y_test,
                           y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Logistic Regression

In [None]:
# imports
from sklearn.linear_model import LogisticRegression

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate & fit model
model = LogisticRegression(random_state = SEED, class_weight = "balanced")
model.fit(X_train, y_train)

# make predictions on test features
y_pred = model.predict(X_test)

# score predictions
accuracy = model.score(X_test, y_test)
print('Logistic Regression Accuracy:\t{:.4f}\n'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### Logistic Regression - Tuning

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the learning algorithm
model = LogisticRegression(random_state = SEED, class_weight = "balanced")

# create a params dict
penalty = ['l1', 'l2']
C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
hyperparameters = dict(C = C, penalty = penalty)

# instantiate & fit grid search
gridsearch = GridSearchCV(model, hyperparameters, cv = 5, verbose = 0)
best_model = gridsearch.fit(X_train, y_train)

# print the best hyperparameters
best_penalty = best_model.best_estimator_.get_params()['penalty']
best_C = best_model.best_estimator_.get_params()['C']
print('Best penalty: {}'.format(best_penalty))
print('Best C: {}'.format(best_C))

# build & fit a tuned model
tuned_model = LogisticRegression(C = best_C, penalty = best_penalty)
tuned_model.fit(X_train, y_train)

# make predictions on test features
y_pred = tuned_model.predict(X_test)

# score predictions
accuracy = tuned_model.score(X_test, y_test)
print('Tuned Accuracy:\t{:.4f}\n'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## LinearSVC

In [None]:
#imports
from sklearn.svm import LinearSVC

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit the learning algorithm
model = LinearSVC(random_state=SEED)
model.fit(X_train, y_train)

# make predictions on test features
y_pred = model.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### LinearSVC - Tuning

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit the learning algorithm
model = LinearSVC(random_state=SEED)
model.fit(X_train, y_train)

# make predictions on test features
y_pred = model.predict(X_test)

# create a params dict
C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
hyperparameters = dict(C = C)

# instantiate & fit grid search
gridsearch = GridSearchCV(model, hyperparameters, cv = 5, verbose = 0)
best_model = gridsearch.fit(X_train, y_train)

# print the best hyperparameters
best_C = best_model.best_estimator_.get_params()['C']
print('Best C: {}'.format(best_C))

# build & fit a tuned model
tuned_model = LinearSVC(C = best_C)
tuned_model.fit(X_train, y_train)

# make predictions on test features
y_pred = tuned_model.predict(X_test)

# score predictions
accuracy = tuned_model.score(X_test, y_test)
print('Tuned Accuracy:\t{:.4f}\n'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## SVC

In [None]:
from sklearn.svm import SVC

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the learning algorithm
model = SVC(random_state=SEED)

# create a params dict
model.fit(X_train, y_train)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### SVC - Tuning

In [None]:
from sklearn.svm import SVC

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)


# instantiate the learning algorithm
model = SVC(random_state = SEED)

# create a params dict
C = [100, 150, 200, 300]
gamma = [0.00001, 0.0001, 0.001, 0.01, 0.1]
hyperparameters = dict(C = C, gamma = gamma)

# instantiate & fit grid search
gridsearch = GridSearchCV(model, hyperparameters, cv = 5, verbose = 0)
best_model = gridsearch.fit(X_train, y_train)

# print the best hyperparameters
best_gamma = best_model.best_estimator_.get_params()['gamma']
best_C = best_model.best_estimator_.get_params()['C']
print('Best gamma: {}'.format(best_gamma))
print('Best C: {}'.format(best_C))


# BUILD MODEL WITH BEST HYPERPARAMETERS

# instantiate & fit model
model = SVC(gamma = best_gamma, C = best_C, random_state = SEED)
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_test)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}'.format(accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Decision Tree

In [None]:
# imports
from sklearn.tree import DecisionTreeClassifier

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate & train a DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = SEED)
dt.fit(X_train, y_train)

# make predictions
y_pred = dt.predict(X_test)

# SCORING
# accuracy
accuracy_score(y_test, y_pred)

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### Decision Tree - Tuning

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the learning algorithm
model = DecisionTreeClassifier()

# create a params dict
depth = [2, 3, 4, 5, 6, 10, 15, 20]
min_samples = [.01, .025, .05, .075, .1, .2]
hyperparameters = dict(max_depth = depth, min_samples_leaf = min_samples)

# instantiate grid search
gridsearch = GridSearchCV(model, hyperparameters, cv = 5, verbose = 0)

# fit grid search
best_model = gridsearch.fit(X_train, y_train)

# print the best hyperparameters
best_depth = best_model.best_estimator_.get_params()['max_depth']
best_min_samples = best_model.best_estimator_.get_params()['min_samples_leaf']
print('Best max_depth: {}'.format(best_depth))
print('Best min_samples_leaf: {}'.format(best_min_samples))


# RUN MODEL WITH BEST HYPERPARAMETERS

# instantiate a DecisionTreeClassifier
tuned_model = DecisionTreeClassifier(max_depth = best_depth,
                                     min_samples_leaf = best_min_samples,
                                     random_state = SEED)

# train the model
tuned_model.fit(X_train, y_train)

# make predictions
y_pred = tuned_model.predict(X_test)

# score accuracy
tuned_model_accuracy = accuracy_score(y_test, y_pred)
print('Tuned Decision Tree Accuracy:\t{}'.format(tuned_model_accuracy))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate & fit
model = RandomForestClassifier(random_state = SEED)
model.fit(X_train, y_train)

# make predictions on the test set
y_pred = model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### Random Forest - Tuning

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the learning algorithm
model = RandomForestClassifier(random_state = SEED)

# create a params dict
depth = [2, 3, 4, 5, 6, 10, 15, 20]
min_samples = [.01, .025, .05, .075, .1, .2]
est = [5, 10, 50, 100, 500]
hyperparameters = dict(max_depth = depth, 
                       min_samples_leaf = min_samples,
                       n_estimators = est)

# instantiate grid search
gridsearch = GridSearchCV(model, hyperparameters, cv = 5, verbose = 0)

# fit grid search
best_model = gridsearch.fit(X_train, y_train)

# print the best hyperparameters
best_depth = best_model.best_estimator_.get_params()['max_depth']
best_min_samples = best_model.best_estimator_.get_params()['min_samples_leaf']
best_est = best_model.best_estimator_.get_params()['n_estimators']
print('Best max_depth: {}'.format(best_depth))
print('Best min_samples_leaf: {}'.format(best_min_samples))
print('Best n_estimators: {}'.format(best_est))


# BUILD MODEL WITH BEST HYPERPARAMETERS

# instantiate a RandomForestClassifier
tuned_model = RandomForestClassifier(max_depth = best_depth,
                                     min_samples_leaf = best_min_samples,
                                     n_estimators = best_est,
                                     random_state = SEED)

# train the model
tuned_model.fit(X_train, y_train)

# make predictions
tuned_model_y_pred = tuned_model.predict(X_test)

# SCORING
# accuracy
accuracy_score(y_test, tuned_model_y_pred)

# classification report
print(classification_report(y_test, tuned_model_y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = tuned_model_y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Voting Classifier

In [None]:
# Imports
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)


# instantiate individual classifiers
lr = LogisticRegression(random_state = SEED)

svc = SVC(random_state = SEED)

knn = KNN()

rf = RandomForestClassifier(random_state = SEED)

# Define a list called classifier that contains the tuples (classifier_name, classifier)
classifiers = [('Logistic Regression', lr),
               ('SVM', svc),
               ('KNN', knn),
               ('Classification Tree', rf)]

# iterate over the defined list of tuples containing the classifiers 
for clf_name, clf in classifiers:
    # fit clf to the training set
    clf.fit(X_train, y_train)
    
    # predict the labels of the test set
    y_pred = clf.predict(X_test)
    
    # evaluate the accuracy of clf on the test set
    print('{:s} : {:.3f}'.format(clf_name, precision_score(y_test, y_pred)))
    
# instantiate a VotingClassifier 'vc'
vc = VotingClassifier(estimators=classifiers)

# fit 'vc' to the training set
vc.fit(X_train, y_train)

# predict test set labels
y_pred = vc.predict(X_test)

# SCORING
# accuracy
accuracy_score(y_test, y_pred)

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)


# instantiate a Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=SEED)

# instantiate & fit a Bagging Classifier
bc = BaggingClassifier(base_estimator=dt, 
                       n_estimators=300, 
                       n_jobs=-1)
bc.fit(X_train, y_train)

# make predictions on the test set
y_pred = bc.predict(X_test)

# SCORING
# accuracy
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(acc * 100))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## AdaBoost

In [None]:
#imports
from sklearn.ensemble import AdaBoostClassifier

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate a Decision Tree
dt = DecisionTreeClassifier(max_depth = 2,
                            random_state=SEED)

# instantate and fit an AdaBoost Classifier
adb_clf = AdaBoostClassifier(base_estimator = dt, 
                             n_estimators = 500,
                             random_state = SEED)
adb_clf.fit(X_train, y_train)

# make predictions on the test set
y_pred = adb_clf.predict(X_test)

# score accuracy
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(acc * 100))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)


# drop binary features
nb_features = X.drop(['d1'], axis = 1)

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(nb_features, y, test_size = .3, random_state = SEED)

# instantiate and train Gaussian Naive Bayes Classifier
model = GaussianNB()
model.fit(X_train, y_train)

# make predictions on test features
y_pred = model.predict(X_test)

# score accuracy
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(acc * 100))

print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

## XGBoost

In [None]:
#imports
import xgboost as xgb

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit an XGBoost Classifier
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=SEED)
model.fit(X_train, y_train)

# make predictions on the test set
y_pred = model.predict(X_test)

# score accuracy
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.4f}%'.format(acc * 100))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### XGBoost - KFold CV

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the model
model = xgb.XGBClassifier()

# instantiate KFoldCV
kfold = KFold(n_splits = 10, random_state = SEED)

# score and print results
results = cross_val_score(model, X_train, y_train, cv = kfold)
print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))

model.fit(X_train, y_train)

# plot feature importance
xgb.plot_importance(model)

# make predictions on the test set
y_pred = model.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### Feature Selection Using XGBoost Feature Importance

In [None]:
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit a model
model = xgb.XGBClassifier(random_state = SEED)
model.fit(X_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(X_test)

# score accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Fit model using each importance as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    
    # train model
    selection_model = xgb.XGBClassifier(random_state = SEED)
    selection_model.fit(select_X_train, y_train)
    
    # transform X_test to match selection
    select_X_test = selection.transform(X_test)
    
    # make predictions using transformed test feature matrix
    y_pred = selection_model.predict(select_X_test)
    
    # score performance
    precision = precision_score(y_test, y_pred)
    print("Thresh=%.3f, n=%d, Precision: %.2f%%" % (thresh, select_X_train.shape[1], precision*100.0));
    
    print(classification_report(y_test, y_pred))
    


### Use Feature-Selected X_train with GridSearch

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit a model
model = xgb.XGBClassifier(random_state = SEED)
model.fit(X_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(X_test)

# score precision
precision = precision_score(y_test, y_pred)
print("Precision using all features: %.2f%%" % (precision * 100.0))

# select features using threshold
selection = SelectFromModel(model, threshold=0.008, prefit=True)

# transform training feature set
select_X_train = selection.transform(X_train)

print(select_X_train.shape)

# instantiate the model
selection_model = xgb.XGBClassifier(random_state = SEED)

# fit the model to selected features
selection_model.fit(select_X_train, y_train)
    
# select test features
select_X_test = selection.transform(X_test)

# make predictions using selected features of test set
y_pred = selection_model.predict(select_X_test)

# evaluate the model
precision = precision_score(y_test, y_pred);
print("Precision using selected features: %.2f%%" % (precision*100.0))
print(classification_report(y_test, y_pred));

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### XGBoost - RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)



# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', silent=True, nthread=1)

folds = 5
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = SEED)

random_search = RandomizedSearchCV(xgb, 
                                   param_distributions = params,
                                   n_iter = param_comb,
                                   scoring = 'roc_auc',
                                   n_jobs = 4,
                                   cv = skf.split(X,y),
                                   verbose = 3,
                                   random_state=SEED )

random_search.fit(X, y)

subsample=1.0, min_child_weight=5, max_depth=5, gamma=5, colsample_bytree=0.6, score=0.7734394124847003, total=   0.5s

In [None]:
#imports
import xgboost as xgb

# load data
df = pd.read_csv(test_file_url, index_col = 0)

# split
X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate the model
model = xgb.XGBClassifier(gamma = 5,
                          subsample = 1.0,
                          min_child_weight = 5,
                          colsample_bytree = 0.6,
                          max_depth = 5,
                          random_state = SEED)

# fit 
model.fit(X, y)

# SCORE
#accuracy
model.score(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

In [None]:
#imports
import xgboost as xgb
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

# split
X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# instantiate and fit a model
model = xgb.XGBClassifier(random_state = SEED)
model.fit(X_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(X_test)

# score accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Fit model using each importance as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    
    # train model
    selection_model = xgb.XGBClassifier(random_state = SEED)
    selection_model.fit(select_X_train, y_train)
    
    # transform X_test to match selection
    select_X_test = selection.transform(X_test)
    
    # make predictions using transformed test feature matrix
    y_pred = selection_model.predict(select_X_test)
    
    # score performance
    accuracy = accuracy_score(y_test, y_pred);
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0));
    print(classification_report(y_test,
                               y_pred))
    


### XGBoost - Tuning

In [None]:
# load data
df = pd.read_csv(test_file_url, index_col = 0)

# split
X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# create the param grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 0.9],
    'n_estimators': [200, 400, 800],
    'max_depth': [2, 3, 4, 5, 6],
    'subsample': [0.3, 0.5, 0.9],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# instantiate the model
model = xgb.XGBClassifier()

# configure GridSearchCV
grid = GridSearchCV(estimator = model, 
                    param_grid = param_grid,
                    scoring = 'roc_auc', 
                    cv = 4, 
                    verbose = 1)

# fit 
grid.fit(X, y)

print("Best parameters found: ",grid.best_params_)


In [None]:
# load data
df = pd.read_csv(test_file_url, index_col = 0)

# split
X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# create dmatrix
dback_dmatrix = xgb.DMatrix(data=X,label=y)


# instantiate the model
model = xgb.XGBClassifier(learning_rate = 0.01, 
                          n_estimators = 200,
                          max_depth = 3,
                          subsample = 0.3,
                          colsample_bytree = 1.0,
                          random_state = SEED)


# fit 
model.fit(X, y)

# SCORE
#accuracy
model.score(X_test, y_test)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');

### XGBoost - Best Model

In [None]:
# load data
df = pd.read_csv(test_file_url, index_col = 0)

# split
X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]


# create the param grid
param_grid = {
    'n_estimators': [1000],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.5, 0.9],
    'subsample': [0.25, 0.5, 0.75, 1.0],
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'max_depth': [4, 6, 8, 10],
    'gamma': [0]
}

# instantiate the model
model = xgb.XGBClassifier()

# configure GridSearchCV
grid = GridSearchCV(estimator = model, 
                    param_grid = param_grid,
                    scoring = 'roc_auc', 
                    cv = 4, 
                    verbose = 1)

# fit 
grid.fit(X, y)

print("Best parameters found: ",grid.best_params_)

# instantiate the model
tuned_model = xgb.XGBClassifier(params = grid.best_params_, random_state = SEED)

# fit 
tuned_model.fit(X, y)

# SCORE
y_pred = tuned_model.predict(X_test)
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')


# plot feature importance
xgb.plot_importance(tuned_model);

## Neural Network

In [None]:
from keras import models, layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# set random seed
np.random.seed(42)

# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# number of features
number_of_features = 29

# create function returning a compiled network
def create_network(optimizer = 'rmsprop'):
    
    # start neural network
    network = models.Sequential()
    
    # add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units = 16,
                            activation = 'relu',
                            input_shape = (number_of_features,)))
    
    # add fully connected layer with a signmoid activation function
    network.add(layers.Dense(units = 1, activation = 'sigmoid'))
    
    # compile neural network
    network.compile(loss = 'binary_crossentropy',
                   optimizer = optimizer,
                   metrics = ['accuracy'])
    
    # return compiled network
    return network

# wrap Keras model so it can be used by sklearn
neural_network = KerasClassifier(build_fn = create_network, verbose = 0)

# create hyperparameters
epochs = [5, 10]
batches = [5, 10, 100]
optimizers = ['rmsprop', 'adam']

# create hyperparameter options
hyperparameters = dict(optimizer = optimizers, epochs = epochs, batch_size = batches)

# create grid search
grid = GridSearchCV(estimator = neural_network, param_grid = hyperparameters)

# fit grid search
grid_result = grid.fit(X, y)

# best hyperparameters
grid_result.best_estimator_
grid_result.best_score_
grid_result.best_params_

In [None]:
# import data and split into features matrix and target vector
df = pd.read_csv(test_file_url, index_col = 0)

X, y = df[df.columns.tolist()[:-1]], df[df.columns.tolist()[-1]]

# split features and target into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = SEED)

# number of features
number_of_features = 29

# instantiate and fit a neural_network
tuned_nn = KerasClassifier(build_fn = create_network, verbose = 0)
tuned_nn.fit(X_train,
            y_train,
            epochs = 10,
            verbose = 1,
            batch_size = 5,
            validation_data = (X_test, y_test))

y_pred = tuned_nn.predict(X_test)

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

fig, ax = plt.subplots(figsize = (6, 6))
ax.matshow(cm, cmap = plt.cm.Blues, alpha = 0.3)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label');