#  Mini Project 2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, auc
import sklearn.metrics as metrics

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter

from sklearn.utils import resample

ModuleNotFoundError: No module named 'imblearn'

## Reading the Data

In [None]:
# Relative path to the stroke dataset CSV that the next cell loads.
stroke_prediction_data = '../DATA/healthcare-dataset-stroke-data.csv'

In [None]:
# Load the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(stroke_prediction_data)

### EDA

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Inspect the dtype of every column to see which ones are categorical
# (object) and will need encoding before modelling.

df.dtypes

In [None]:
# Data shape as (rows, columns).

df.shape

In [None]:
# Count the null values in each column; the next cell imputes 'bmi'.

df.isnull().sum()

In [None]:
# Fill missing BMI values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['bmi'] selection: that chained in-place form
# is deprecated in pandas and does not update `df` under Copy-on-Write.
# NOTE(review): the mean is taken over the whole frame, i.e. before the
# train/test split - move this into a pipeline if strict separation matters.
bmi_avg = df['bmi'].mean()

df['bmi'] = df['bmi'].fillna(bmi_avg)

In [None]:
# Preview the frame again after the BMI imputation.
df.head()

# EDA: Visualisations

In [None]:
# Row counts per (stroke, gender) pair. After unstack(1) the x axis holds the
# stroke outcome and each gender becomes one bar series in the legend, so the
# axes are labelled with what they actually show (outcome vs. row count).
# The font size is set before plotting so that it applies to this figure.
plt.rcParams.update({'font.size': 20})
ax = (
    df.groupby(['stroke', 'gender'])['stroke']
    .count()
    .unstack(1)
    .plot.bar(figsize = (20, 20))
)
ax.set_xlabel('Stroke', fontsize = 20)
ax.set_ylabel('Count', fontsize = 20)
ax.set_title('Stroke Prediction By Gender', fontsize = 20)

In [None]:
# Row counts per (stroke, work_type) pair. The x axis is the stroke outcome,
# the legend holds the work types, and the bar height is a row count.
# The font size is set before plotting so that it applies to this figure.
plt.rcParams.update({'font.size': 15})
ax = (
    df.groupby(['stroke', 'work_type'])['stroke']
    .count()
    .unstack(1)
    .plot.bar(figsize = (20, 20))
)
ax.set_xlabel('Stroke', fontsize = 20)
ax.set_ylabel('Count', fontsize = 20)
ax.set_title('Stroke Prediction by Work Type', fontsize = 20)

In [None]:
# Bar chart of the 'stroke' column per gender, split by heart-disease history.
plt.figure(figsize = (20, 20))
ax = sns.barplot(data = df, x = 'gender', y = 'stroke', hue = 'heart_disease')
plt.rcParams.update({'font.size': 30})
ax.set_ylabel('Stroke', fontsize = 20)
ax.set_title('Stroke Prediction based on Heart Disease History', fontsize = 20)

In [None]:
# Row counts per (stroke, heart_disease) pair. The x axis is the stroke
# outcome, the legend holds the heart-disease flag, and the bar height is a
# row count. The font size is set before plotting so it applies to this figure.
plt.rcParams.update({'font.size': 30})
ax = (
    df.groupby(['stroke', 'heart_disease'])['stroke']
    .count()
    .unstack(1)
    .plot.bar(figsize = (20, 20))
)
ax.set_xlabel('Stroke', fontsize = 20)
ax.set_ylabel('Count', fontsize = 20)
ax.set_title("Stroke Prediction by Heart Disease", fontsize = 20)

In [None]:
# Age distribution for each stroke outcome, drawn as a box plot.
plt.figure(figsize = (20, 20))
plt.rcParams.update({'font.size': 30})
ax = sns.boxplot(data = df, x = 'stroke', y = 'age')
ax.set_xlabel('Stroke', fontsize = 20)
ax.set_ylabel('Age', fontsize = 20)
ax.set_title('Stroke Prediction based on Age', fontsize = 20)

In [None]:
# Row counts per (stroke, hypertension) pair. The x axis is the stroke
# outcome, the legend holds the hypertension flag, and the bar height is a
# row count. The font size is set before plotting so it applies to this figure.
plt.rcParams.update({'font.size': 30})
ax = (
    df.groupby(['stroke', 'hypertension'])['stroke']
    .count()
    .unstack(1)
    .plot.bar(figsize = (20, 20))
)
ax.set_xlabel('Stroke', fontsize = 20)
ax.set_ylabel('Count', fontsize = 20)
ax.set_title('Stroke Prediction by Hypertension History', fontsize = 20)

In [None]:
# Bar chart of the 'stroke' column for each smoking status.
plt.figure(figsize = (20, 20))
ax = sns.barplot(x = 'smoking_status', y = 'stroke', data = df)
plt.rcParams.update({'font.size': 20})
plt.title('Chance of Getting Stroke Based on Smoking Behavior', y=1.1, weight='bold', fontsize=25)
plt.xlabel('Smoking Status', fontsize = 20)
plt.ylabel('Stroke', fontsize = 20)

In [None]:
# List the column names before choosing which ones to drop for modelling.
df.columns

# Logistic Regression

In [None]:
df_drop_columns = ['id', 'stroke', 'ever_married', 'work_type', 'Residence_type']
X_drop_columns = [
    'gender_Male',
    'gender_Other',
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked'
]

X_rename_columns = {
    'gender_female' : 'is_female',
    'smoking_status_smokes' : 'smokes'    
}

In [None]:
# Build the feature matrix: drop unused columns, one-hot encode what is left,
# discard the unwanted dummy columns and apply the rename mapping.
X = (
    pd.get_dummies(df.drop(columns = df_drop_columns))
    .drop(columns = X_drop_columns)
    .rename(columns = X_rename_columns)
)
y = df['stroke']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.2, random_state = 42
)

In [None]:
# Class-weighted logistic regression fitted on the original training split.
log = LogisticRegression(solver = 'liblinear', max_iter = 2000, class_weight = 'balanced')
log.fit(X_train, y_train)

# Mean accuracy on both splits.
train_accuracy_score = log.score(X_train, y_train)
test_accuracy_score = log.score(X_test, y_test)

print('Train accuracy score ', train_accuracy_score)
print('Test accuracy score ', test_accuracy_score)

In [None]:
# Class probabilities and hard class predictions for the test split.
prob_preds = log.predict_proba(X_test)
y_pred = log.predict(X_test)

In [None]:
# With labels = [1, 0] the flattened matrix reads TP, FN, FP, TN
# (rows are actual classes, columns are predicted classes, stroke first).
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values \n', tp, fn, fp, tn)

In [None]:
# Labelled confusion matrix: rows are the actual classes, columns the
# predicted classes. confusion_matrix expects (y_true, y_pred) and orders
# classes as [0, 1] by default, so labels = [1, 0] is passed to put the stroke
# class first and make the counts line up with the row/column names below.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
cm = pd.DataFrame(cm, index = cm_index, columns = cm_columns)
cm

In [None]:
# Cross-validate on the training split; the held-out test split is reserved
# for the final evaluation and should not be used for model assessment folds.
print('Logistic Regression Cross Validations Score ', np.mean(cross_val_score(log, X_train, y_train, cv = 5)))

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported the wrong way round.
print(classification_report(y_test, y_pred))

In [None]:
# Undersample the majority class of the training split only; the test split
# is left untouched. A fixed random_state makes the drawn subset (and every
# score derived from it) reproducible across runs.
undersample = RandomUnderSampler(sampling_strategy = 'majority', random_state = 42)

# Fit Predictor and target

X_train_undersample, y_train_undersample = undersample.fit_resample(X_train, y_train)

print('Original dataset shape ', Counter(y_train))
print('Resample dataset shape ', Counter(y_train_undersample))

In [None]:
# Refit the logistic regression on the undersampled training data.
log = LogisticRegression(max_iter = 2000, solver = 'liblinear', class_weight = 'balanced')

log.fit(X_train_undersample, y_train_undersample)
train_accuracy_score = log.score(X_train_undersample, y_train_undersample)
print('Train accuracy score ', train_accuracy_score)

# Test accuracy is measured on the original, non-resampled test split.
test_accuracy = log.score(X_test, y_test)
print('Test Accuracy ', test_accuracy)

In [None]:
# Test-split probabilities and class predictions from the refitted model.
prob_preds = log.predict_proba(X_test)
y_pred = log.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values: \n', tp, fn, fp, tn)

In [None]:
# Accuracy = correct predictions / all predictions

accuracy = (tn + tp) / float(fn + fp + tn + tp)
print('Classification Accuracy: ', accuracy)

In [None]:
# Misclassification rate = wrong predictions / all predictions

misclassification_rate = (fn + fp) / float(fn + fp + tn + tp)
print('Misclassification rate is ' , misclassification_rate)

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)

tpr = tp / float(tp + fn)
print('tpr is ', tpr)

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)
tnr = tn / float(fp + tn)
print('Correct Negative Prediction Rate: ', tnr)

In [None]:
# False positive rate = FP / (FP + TN)

fpr = fp / float(fp + tn)
print('False Postive Rate: ', fpr)

In [None]:
# Precision = TP / (TP + FP)

precision = tp / float(fp + tp)
print('Precision: ', precision)

In [None]:
# Labelled confusion matrix for the logistic regression: rows are the actual
# classes, columns the predicted classes. confusion_matrix expects
# (y_true, y_pred) and defaults to [0, 1] ordering, so labels = [1, 0] puts
# the stroke class first to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
log_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
log_cm = pd.DataFrame(log_cm, index = cm_index, columns = cm_columns)
log_cm

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true = y_test, y_pred = y_pred))

In [None]:
# Cross-validate on the training split; the held-out test split is reserved
# for the final evaluation and should not be used for model assessment folds.
print('Logistic Regression Cross Validations Score ', np.mean(cross_val_score(log, X_train, y_train, cv = 5)))

# KNN

In [None]:
# Columns removed from the raw frame before one-hot encoding.
df_drop_columns = [
    'id',
    'stroke',
    'ever_married',
    'work_type',
    'Residence_type',
]

# Dummy columns discarded after encoding.
X_drop_columns = ['gender_Male', 'gender_Other']
X_drop_columns += [
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked',
]

# NOTE(review): the remaining gender dummy is presumably 'gender_Female'
# (capital F), in which case the 'gender_female' key never matches and that
# rename is a silent no-op. Confirm against X.columns and fix every copy of
# this mapping together so all models keep identical feature names.
X_rename_columns = dict(gender_female = 'is_female', smoking_status_smokes = 'smokes')

# Rebuild the feature matrix and the stratified 80/20 split for this section.
X = (
    pd.get_dummies(df.drop(columns = df_drop_columns))
    .drop(columns = X_drop_columns)
    .rename(columns = X_rename_columns)
)
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.2, random_state = 42
)

In [None]:
training_data_rows = len(X_train.index)

# Pick k with the sqrt(n) rule of thumb, forced odd so a two-class vote
# cannot tie. Using every training row as a neighbour (the previous setting)
# makes each prediction the overall majority class.
# NOTE(review): the features are not standardised here; consider scaling,
# since KNN is distance based.
n_neighbors = max(1, int(np.sqrt(training_data_rows)))
if n_neighbors % 2 == 0:
    n_neighbors += 1

knn = KNeighborsClassifier(n_neighbors = n_neighbors)

# Fit train Data 
knn.fit(X_train, y_train)
train_accuracy_score = knn.score(X_train, y_train)
print('Train Accuracy Score ', train_accuracy_score)

test_accuracy_score = knn.score(X_test, y_test)
print('Test Accuracy Score: ', test_accuracy_score)

In [None]:
# Test-split probabilities and class predictions from the KNN model.
prob_preds = knn.predict_proba(X_test)
y_pred = knn.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
knn_tp, knn_fn, knn_fp, knn_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values \n', knn_tp, knn_fn, knn_fp, knn_tn)

In [None]:
# Labelled confusion matrix for KNN: rows are the actual classes, columns the
# predicted classes. confusion_matrix expects (y_true, y_pred) and defaults
# to [0, 1] ordering, so labels = [1, 0] puts the stroke class first to match
# the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
cm = pd.DataFrame(cm, index = cm_index, columns = cm_columns)
cm

In [None]:
# Undersample the training split with sampling_strategy = 0.5; the test split
# is left untouched. A fixed random_state makes the drawn subset reproducible.
undersample = RandomUnderSampler(sampling_strategy = 0.5, random_state = 42)

# Fit Predictor and target

X_train_undersample, y_train_undersample = undersample.fit_resample(X_train, y_train)

# The second line reports the resampled counts, so it is labelled accordingly.
print('Original dataset shape ', Counter(y_train))
print('Resample dataset shape ', Counter(y_train_undersample))

In [None]:
training_data_rows = len(X_train_undersample.index)

# Pick k with the sqrt(n) rule of thumb, forced odd so a two-class vote
# cannot tie. Using every training row as a neighbour (the previous setting)
# makes each prediction the overall majority class.
n_neighbors = max(1, int(np.sqrt(training_data_rows)))
if n_neighbors % 2 == 0:
    n_neighbors += 1

knn = KNeighborsClassifier(n_neighbors = n_neighbors)

# Fit train Data 
knn.fit(X_train_undersample, y_train_undersample)
train_accuracy_score = knn.score(X_train_undersample, y_train_undersample)
print('Train Accuracy Score ', train_accuracy_score)

test_accuracy_score = knn.score(X_test, y_test)
print('Test Accuracy Score: ', test_accuracy_score)

In [None]:
# Test-split probabilities and class predictions from the refitted KNN model.
prob_preds = knn.predict_proba(X_test)
y_pred = knn.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
knn_tp, knn_fn, knn_fp, knn_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values: \n', knn_tp, knn_fn, knn_fp, knn_tn)

## Accuracy

In [None]:
# Accuracy = correct predictions / all predictions
accuracy = (knn_tn + knn_tp) / float(knn_fn + knn_fp + knn_tn + knn_tp)
print('Classification accuracy ', accuracy)

## Classification Error

In [None]:
# Misclassification rate = wrong predictions / all predictions
misclassification_rate = (knn_fn + knn_fp) / float(knn_fn + knn_fp + knn_tn + knn_tp)
print('Misclassification rate is ',misclassification_rate)

## True Positive Rate

In [None]:
# True positive rate = TP / (TP + FN)
knn_tpr = knn_tp / float(knn_tp + knn_fn)

print('True Positive Rate ', knn_tpr)

## Specificity (True Negative Rate)

In [None]:
# True negative rate = TN / (TN + FP)
knn_tnr = knn_tn / float(knn_fp + knn_tn)

print('Specifity (True Negative Rate) ', knn_tnr)

## Calculate False Positive Rate

In [None]:
# False positive rate = FP / (FP + TN)
knn_fpr = knn_fp / float(knn_tn + knn_fp)

# Print the KNN value computed above, not the `fpr` left over from the
# logistic-regression section.
print('False Positive Rate: ', knn_fpr)

## Calculate Precision

In [None]:
# Precision = TP / (TP + FP)
precision = knn_tp / float(knn_fp + knn_tp)
print('Precision: ', precision)

In [None]:
# Labelled confusion matrix for KNN: rows are the actual classes, columns the
# predicted classes. confusion_matrix expects (y_true, y_pred) and defaults
# to [0, 1] ordering, so labels = [1, 0] puts the stroke class first to match
# the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']

knn_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
knn_cm = pd.DataFrame(knn_cm, index = cm_index, columns = cm_columns)
knn_cm

In [None]:
# Per-class precision / recall / F1 for KNN on the test split.
print(classification_report(y_true = y_test, y_pred = y_pred))

# Random Forest

In [None]:
# Class-weighted random forest. A fixed random_state makes the bootstrap
# samples and feature draws - and therefore every score below - reproducible.
rfc = RandomForestClassifier(n_estimators = 300, class_weight = 'balanced', random_state = 42)

# Columns removed from the raw frame before one-hot encoding.
df_drop_columns = ['id', 'stroke', 'ever_married', 'work_type', 'Residence_type']

# Dummy columns discarded after encoding.
X_drop_columns = [
    'gender_Male',
    'gender_Other',
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked'
]

# NOTE(review): the remaining gender dummy is presumably 'gender_Female'
# (capital F), in which case the 'gender_female' key never matches and that
# rename is a silent no-op. Confirm against X.columns and fix every copy of
# this mapping together so all models keep identical feature names.
X_rename_columns = {
    'gender_female' : 'is_female',
    'smoking_status_smokes' : 'smokes'    
}

# Rebuild the feature matrix without mutating intermediate frames in place.
X = (
    pd.get_dummies(df.drop(columns = df_drop_columns))
    .drop(columns = X_drop_columns)
    .rename(columns = X_rename_columns)
)
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

In [None]:
# Fit the forest on the original training split and score both splits.
rfc.fit(X_train, y_train)

train_score = rfc.score(X_train, y_train)
test_score = rfc.score(X_test, y_test)

print('Train Score Accuracy ', train_score)
print('Test Score Accuracy ', test_score)

In [None]:
# Test-split probabilities and class predictions from the random forest.
prob_preds = rfc.predict_proba(X_test)
y_pred = rfc.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
rfc_tp, rfc_fn, rfc_fp, rfc_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values \n', rfc_tp, rfc_fn, rfc_fp, rfc_tn)

In [None]:
# Labelled confusion matrix for the random forest: rows are the actual
# classes, columns the predicted classes. confusion_matrix expects
# (y_true, y_pred) and defaults to [0, 1] ordering, so labels = [1, 0] puts
# the stroke class first to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
cm = pd.DataFrame(cm, index = cm_index, columns = cm_columns)
cm

In [None]:
# Undersample the majority class of the training split only; the test split
# is left untouched. A fixed random_state makes the drawn subset (and every
# score derived from it) reproducible across runs.
undersample = RandomUnderSampler(sampling_strategy = 'majority', random_state = 42)

# Fit Predictor and target

X_train_undersample, y_train_undersample = undersample.fit_resample(X_train, y_train)

print('Original dataset shape ', Counter(y_train))
print('Resample dataset shape ', Counter(y_train_undersample))

In [None]:
# Refit the same forest on the undersampled training data, then score it on
# that data and on the original test split.
rfc.fit(X_train_undersample, y_train_undersample)

train_score = rfc.score(X_train_undersample, y_train_undersample)
test_score = rfc.score(X_test, y_test)

print('Train Score Accuracy ', train_score)
print('Test Score Accuracy ', test_score)

In [None]:
# Predictions must come from the random forest fitted above; the previous
# code called log.predict, so every Random Forest metric below was really
# describing the logistic regression.
y_pred = rfc.predict(X_test)

# `probs_preds` is kept as an alias of the original (misspelled) name.
prob_preds = probs_preds = rfc.predict_proba(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
rfc_tp, rfc_fn, rfc_fp, rfc_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).reshape(-1)
print('Outcome values: \n', rfc_tp, rfc_fn, rfc_fp, rfc_tn)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true = y_test, y_pred = y_pred))

## Accuracy

In [None]:
# Accuracy = correct predictions / all predictions
accuracy = (rfc_tn + rfc_tp) / float(rfc_fn + rfc_fp + rfc_tn + rfc_tp)
print('Classification Accuracy ', accuracy)

## Classification Error

In [None]:
# Misclassification rate = wrong predictions / all predictions
misclassification_rate = (rfc_fn + rfc_fp) / float(rfc_fn + rfc_fp + rfc_tn + rfc_tp)
print('Misclassification rate ',misclassification_rate )

## True Positive Rate

In [None]:
# True positive rate = TP / (TP + FN)
rfc_tpr = rfc_tp / float(rfc_tp + rfc_fn)
print('True positive rate ', rfc_tpr)

## Specificity (True Negative Rate)

In [None]:
# True negative rate = TN / (TN + FP)
rfc_tnr = rfc_tn / float(rfc_fp + rfc_tn)
print('True Negative Rate: ', rfc_tnr)

## False Positive Rate

In [None]:
# False positive rate = FP / (FP + TN)
rfc_fpr = rfc_fp / float(rfc_fp + rfc_tn)
print('False positive rate ', rfc_fpr)

## Calculate Precision

In [None]:
# Precision = TP / (TP + FP)
precision = rfc_tp / float(rfc_fp + rfc_tp)
print('Precision: ', precision)

## Confusion Matrix

In [None]:
# Labelled confusion matrix built from the current y_pred: rows are the
# actual classes, columns the predicted classes. confusion_matrix expects
# (y_true, y_pred) and defaults to [0, 1] ordering, so labels = [1, 0] puts
# the stroke class first to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']

rfc_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
rfc_cm = pd.DataFrame(rfc_cm, index = cm_index, columns = cm_columns)
rfc_cm

# Support Vector Machine

In [2]:
df_drop_columns = ['id', 'stroke', 'ever_married', 'work_type', 'Residence_type']
X_drop_columns = [
    'gender_Male',
    'gender_Other',
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked'
]

X_rename_columns = {
    'gender_female' : 'is_female',
    'smoking_status_smokes' : 'smokes'    
}

In [3]:
# Build the feature matrix: drop unused columns, one-hot encode what is left,
# discard the unwanted dummy columns and apply the rename mapping.
X = (
    pd.get_dummies(df.drop(columns = df_drop_columns))
    .drop(columns = X_drop_columns)
    .rename(columns = X_rename_columns)
)
y = df['stroke']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.2, random_state = 42
)

NameError: name 'df' is not defined

In [None]:
# Class-weighted linear SVC with probability estimates enabled, fitted on the
# original training split.
svc = SVC(class_weight = 'balanced', kernel = 'linear', probability = True)
svc.fit(X_train, y_train)

# Mean accuracy on both splits.
train_accuracy_score = svc.score(X_train, y_train)
print('Train Accuracy Score ', train_accuracy_score)

test_accuracy_score = svc.score(X_test, y_test)
print('Test Accuracy Score ', test_accuracy_score)

In [None]:
y_pred = svc.predict(X_test)

# predict test probability
prob_preds = svc.predict_proba(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
svc_tp, svc_fn, svc_fp, svc_tn = confusion_matrix(y_test, y_pred, labels = [1,0]).reshape(-1)
print(svc_tp, svc_fn, svc_fp, svc_tn)

# Cross-validate the SVC itself; the previous code scored `log`, the logistic
# regression from an earlier section. Each fold refits the SVC, so this line
# can be slow.
print(np.mean(cross_val_score(svc, X_train, y_train, cv = 5)))

In [None]:
# Labelled confusion matrix for the SVC: rows are the actual classes, columns
# the predicted classes. confusion_matrix expects (y_true, y_pred) and
# defaults to [0, 1] ordering, so labels = [1, 0] puts the stroke class first
# to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
svc_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
svc_cm = pd.DataFrame(svc_cm, index = cm_index, columns = cm_columns)
svc_cm

In [None]:
# Undersample the majority class of the training split only; the test split
# is left untouched. A fixed random_state makes the drawn subset (and every
# score derived from it) reproducible across runs.
under = RandomUnderSampler(sampling_strategy = 'majority', random_state = 42)

# Fit Predictor and target

X_train_under, y_train_under = under.fit_resample(X_train, y_train)

print('Original dataset shape ', Counter(y_train))
print('Resample dataset shape ', Counter(y_train_under))

In [None]:
# Refit a fresh linear SVC on the undersampled training data.
svc = SVC(class_weight = 'balanced', kernel = 'linear', probability = True)
svc.fit(X_train_under, y_train_under)

# Accuracy on the resampled training data and on the original test split.
train_accuracy_score = svc.score(X_train_under, y_train_under)
print('Train Accuracy Score ', train_accuracy_score)

test_accuracy_score = svc.score(X_test, y_test)
print('Test Accuracy Score ', test_accuracy_score)

In [None]:
# Test-split probabilities and class predictions from the refitted SVC.
prob_preds = svc.predict_proba(X_test)
y_pred = svc.predict(X_test)

In [None]:
# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
svc_tp, svc_fn, svc_fp, svc_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values: \n', svc_tp, svc_fn, svc_fp, svc_tn)

In [None]:
# Accuracy = correct predictions / all predictions

accuracy = (svc_tn + svc_tp) / float(svc_fn + svc_fp + svc_tn + svc_tp)
print('Classification Accuracy: ', accuracy)

In [None]:
# Misclassification rate = wrong predictions / all predictions

misclassification_rate = (svc_fn + svc_fp) / float(svc_fn + svc_fp + svc_tn + svc_tp)
print('Misclassification rate is ', misclassification_rate)

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)

svc_tpr = svc_tp / float(svc_tp + svc_fn)
print('tpr is ', svc_tpr)

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)
svc_tnr = svc_tn / float(svc_fp + svc_tn)
print('Correct Negative Prediction Rate: ', svc_tnr)

In [None]:
# False positive rate = FP / (FP + TN)

svc_fpr = svc_fp / float(svc_fp + svc_tn)
print('False positive rate: ', svc_fpr)

In [None]:
# Precision = TP / (TP + FP)

svc_precision = svc_tp / float(svc_fp + svc_tp)
print('Precision: ', svc_precision)

In [None]:
# Labelled confusion matrix for the SVC: rows are the actual classes, columns
# the predicted classes. confusion_matrix expects (y_true, y_pred) and
# defaults to [0, 1] ordering, so labels = [1, 0] puts the stroke class first
# to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
svc_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
svc_cm = pd.DataFrame(svc_cm, index = cm_index, columns = cm_columns)
svc_cm

In [None]:
# Per-class precision / recall / F1 for the SVC on the test split.
print(classification_report(y_true = y_test, y_pred = y_pred))

# Naive Bayes

In [None]:
df_drop_columns = ['id', 'stroke', 'ever_married', 'work_type', 'Residence_type']
X_drop_columns = [
    'gender_Male',
    'gender_Other',
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked'
]

X_rename_columns = {
    'gender_female' : 'is_female',
    'smoking_status_smokes' : 'smokes'    
}

In [None]:
# Build the feature matrix: drop unused columns, one-hot encode what is left,
# discard the unwanted dummy columns and apply the rename mapping.
X = (
    pd.get_dummies(df.drop(columns = df_drop_columns))
    .drop(columns = X_drop_columns)
    .rename(columns = X_rename_columns)
)
y = df['stroke']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.2, random_state = 42
)

In [None]:

# Gaussian Naive Bayes fitted on the original training split.
nb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on both splits.
train_accuracy = nb.score(X_train, y_train)
test_accuracy = nb.score(X_test, y_test)

print('Train Accuracy ', train_accuracy)
print('Test Accuracy ', test_accuracy)

In [None]:
# Test-split probabilities and class predictions from Naive Bayes.
prob_preds = nb.predict_proba(X_test)
y_pred = nb.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
nb_tp, nb_fn, nb_fp, nb_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values \n', nb_tp, nb_fn, nb_fp, nb_tn)

In [None]:
# Labelled confusion matrix for Naive Bayes: rows are the actual classes,
# columns the predicted classes. confusion_matrix expects (y_true, y_pred)
# and defaults to [0, 1] ordering, so labels = [1, 0] puts the stroke class
# first to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
nb_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
nb_cm = pd.DataFrame(nb_cm, index = cm_index, columns = cm_columns)
nb_cm

In [None]:
# Per-class precision / recall / F1 for Naive Bayes on the test split.
print(classification_report(y_true = y_test, y_pred = y_pred))

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.
# A fixed random_state makes the synthetic samples (and every score derived
# from them) reproducible across runs.
smote = SMOTE(random_state = 42)

# Fit Predictor and target

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Original dataset shape ', Counter(y_train))
print('Resample dataset shape ', Counter(y_train_smote))

In [None]:
# Refit Gaussian Naive Bayes on the SMOTE-resampled training data.
nb = GaussianNB().fit(X_train_smote, y_train_smote)

# Accuracy on the resampled training data and on the original test split.
train_accuracy = nb.score(X_train_smote, y_train_smote)
test_accuracy = nb.score(X_test, y_test)

print('Train Accuracy ', train_accuracy)
print('Test Accuracy ', test_accuracy)

In [None]:
# Test-split probabilities and class predictions from the refitted model.
prob_preds = nb.predict_proba(X_test)
y_pred = nb.predict(X_test)

# Flattened with labels = [1, 0], the matrix reads TP, FN, FP, TN.
nb_tp, nb_fn, nb_fp, nb_tn = confusion_matrix(y_test, y_pred, labels = [1, 0]).ravel()
print('Outcome values: \n', nb_tp, nb_fn, nb_fp, nb_tn)

In [None]:
# Labelled confusion matrix for Naive Bayes: rows are the actual classes,
# columns the predicted classes. confusion_matrix expects (y_true, y_pred)
# and defaults to [0, 1] ordering, so labels = [1, 0] puts the stroke class
# first to match the row/column names.
cm_index = ['is_stroke', 'is_healthy']
cm_columns = ['predicted_stroke', 'predicted_healthy']
nb_cm = confusion_matrix(y_test, y_pred, labels = [1, 0])
nb_cm = pd.DataFrame(nb_cm, index = cm_index, columns = cm_columns)
nb_cm

In [None]:
# Per-class precision / recall / F1 for the refitted Naive Bayes model.
print(classification_report(y_true = y_test, y_pred = y_pred))

In [None]:
# Accuracy = correct predictions / all predictions

accuracy = (nb_tn + nb_tp) / float(nb_fn + nb_fp + nb_tn + nb_tp)
print('Accuracy: ', accuracy )

In [None]:
# Misclassification rate = wrong predictions / all predictions

misclassification_rate = (nb_fn + nb_fp) / float(nb_fn + nb_fp + nb_tn + nb_tp)
print('Misclassification Rate: ', misclassification_rate)

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)

nb_tpr = nb_tp / float(nb_fn + nb_tp)
print('Sensitivity(True Positive Rate): ', nb_tpr)

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)

nb_tnr = nb_tn / float(nb_fp + nb_tn)
print('Specifity (True Negative Rate): ', nb_tnr)

In [None]:
# False positive rate = FP / (FP + TN)

nb_fpr = nb_fp / float(nb_fp + nb_tn)
print('False Positive Rate: ', nb_fpr)

In [None]:
# Precision = TP / (TP + FP)

precision = nb_tp / float(nb_fp + nb_tp)

print('Precision: ', precision)

In [None]:
from sklearn.model_selection import GridSearchCV

# Best Estimator for Logistic Regression

In [None]:
# Search space: regularisation type and inverse regularisation strength.
lr_params = dict(penalty = ['l1', 'l2'], C = [1, 10, 100])

# Grid search with GridSearchCV's default cross-validation and scoring.
lr_grid = GridSearchCV(LogisticRegression(solver = 'liblinear'), lr_params)

lr_grid.fit(X, y)

In [None]:
lr_best_score = lr_grid.best_score_
# Despite its name this holds only the selected C value.
lr_best_estimator = lr_grid.best_estimator_.C

print('Best score for logistic Regression ', lr_best_score)
print('Best estimator for Logistic Regression ', lr_best_estimator)
# The search also tunes the penalty, so report the full winning combination.
print('Best parameters for Logistic Regression ', lr_grid.best_params_)

# Best Estimator for Support Vector Machine

In [None]:
# gamma only affects the rbf kernel, so it is searched for rbf alone; crossing
# it with the linear kernel fitted the identical linear model once per gamma.
# C is not searched and stays at the estimator default.
svc_params = [
    {'kernel' : ['linear']},
    {'kernel' : ['rbf'], 'gamma' : [0.001, 0.0001]}
]

# A fresh class-weighted SVC is used as the template instead of the fitted
# `svc`: the search only needs class predictions, so the extra probability
# calibration (probability = True) would just slow every fit down.
svc_grid = GridSearchCV(estimator = SVC(class_weight = 'balanced'),
                        param_grid = svc_params)
svc_grid.fit(X, y)

In [None]:
svc_grid_best_score = svc_grid.best_score_
# C is not part of the search grid, so this is simply the estimator's C.
svc_grid_best_estimator = svc_grid.best_estimator_.C

print('SVC best score is ', svc_grid_best_score)
# Report the parameters the search actually selected, not the untuned C.
print('SVC best parameters are ', svc_grid.best_params_)

# Best Estimator for Naive Bayes

In [None]:
# 100 log-spaced var_smoothing candidates from 1 down to 1e-9.
nb_params = dict(var_smoothing = np.logspace(0, -9, num = 100))

# 10-fold grid search, run in a single process.
nb_grid = GridSearchCV(
    estimator = GaussianNB(),
    param_grid = nb_params,
    cv = 10,
    n_jobs = 1,
    verbose = 1,
)

nb_grid.fit(X, y)

print('Best NB estimator ',  nb_grid.best_estimator_)
print('Best NB Score ',  nb_grid.best_score_)

# KNN Best Estimator

In [None]:
# Search space: neighbour weighting scheme and distance metric.
knn_params = dict(
    weights = ['uniform', 'distance'],
    metric = ['euclidean', 'manhattan'],
)

# 3-fold grid search using all available cores.
knn_grid = GridSearchCV(
    estimator = KNeighborsClassifier(),
    param_grid = knn_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1,
)

knn_grid.fit(X, y)

print('Best KNN Estimator ', knn_grid.best_estimator_)
print('Best KNN score ', knn_grid.best_score_)

# Best Estimator for Random Forest

In [None]:
# 'auto' is no longer an accepted max_features value in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so it is dropped from the grid
# to keep the search from failing on parameter validation.
rfc_params = {
    'n_estimators' : [200, 700],
    'max_features' : ['sqrt', 'log2']
}

# The template's n_estimators = 400 is overridden by the grid values above.
rfc_grid = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 400, class_weight = 'balanced'), param_grid = rfc_params, cv = 5)
rfc_grid.fit(X, y)
print('Random Forest Best Estimator ', rfc_grid.best_estimator_)
print('Random Forest Best Score ', rfc_grid.best_score_)

# ROC curves for Logistic Regression, SVM, Naive Bayes, KNN and Random Forest on the same plot

In [4]:
# Class-probability matrices on the test split, one per fitted model.
log_preds, svc_preds, nb_preds, knn_preds, rfc_preds = (
    model.predict_proba(X_test) for model in (log, svc, nb, knn, rfc)
)

# ROC points from the positive-class column, and the area under each curve.
log_fpr, log_tpr, _ = roc_curve(y_test, log_preds[:, 1])
log_auc = auc(log_fpr, log_tpr)

svc_fpr, svc_tpr, _ = roc_curve(y_test, svc_preds[:, 1])
svc_auc = auc(svc_fpr, svc_tpr)

nb_fpr, nb_tpr, _ = roc_curve(y_test, nb_preds[:, 1])
nb_auc = auc(nb_fpr, nb_tpr)

knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_preds[:, 1])
knn_auc = auc(knn_fpr, knn_tpr)

rfc_fpr, rfc_tpr, _ = roc_curve(y_test, rfc_preds[:, 1])
rfc_auc = auc(rfc_fpr, rfc_tpr)

# Legend entries carry the AUC of each model.
log_label = 'ROC Curve Logistic Regression (area = %0.5f)'%log_auc
svc_label = 'ROC Curve SVC (area = %0.5f)'%svc_auc
nb_label = 'ROC Curve Gaussian NB (area = %0.5f)' % nb_auc
knn_label = 'K Nearest Neighbour (area = %0.5f)' % knn_auc
rfc_label = 'Random Forest Classifier (area = %0.5f)' % rfc_auc

plt.figure(figsize = (20, 20))
plt.rcParams.update({'font.size': 25})

plt.plot(log_fpr, log_tpr, linewidth = 4, label = log_label)
plt.plot(svc_fpr, svc_tpr, linewidth = 4, label = svc_label)
plt.plot(nb_fpr, nb_tpr, linewidth = 4, label = nb_label)
plt.plot(knn_fpr, knn_tpr, linewidth = 4, label = knn_label)
plt.plot(rfc_fpr, rfc_tpr, linewidth = 4, label = rfc_label)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)

plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])

plt.title('Receiver Operating Characteristic: M', fontsize = 25)
plt.xlabel('False Positive Rate', fontsize = 25)
plt.ylabel('True Positive Rate', fontsize = 25)
plt.legend(loc = 'lower right')
plt.show()

NameError: name 'log' is not defined

# Precision Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

# Predict probabilities, keeping only the positive-class (stroke) column.
log_probs = log.predict_proba(X_test)[:, 1]
svc_probs = svc.predict_proba(X_test)[:, 1]
nb_probs = nb.predict_proba(X_test)[:, 1]
knn_probs = knn.predict_proba(X_test)[:, 1]
rfc_probs = rfc.predict_proba(X_test)[:, 1]


# Predict class values (used for the F1 scores below).
log_y_preds = log.predict(X_test)
svc_y_preds = svc.predict(X_test)
# The Naive Bayes predictions must come from `nb`; the previous code called
# svc.predict here, so the reported Naive Bayes F1 was the SVC's F1.
nb_y_preds = nb.predict(X_test)
knn_y_preds = knn.predict(X_test)
rfc_y_preds = rfc.predict(X_test)


# Precision Recall
log_precision, log_recall, _ = precision_recall_curve(y_test, log_probs)
svc_precision, svc_recall, _ = precision_recall_curve(y_test, svc_probs)
nb_precision, nb_recall, _ = precision_recall_curve(y_test, nb_probs)
knn_precision, knn_recall, _ = precision_recall_curve(y_test, knn_probs)
rfc_precision, rfc_recall, _ = precision_recall_curve(y_test, rfc_probs)


# F1 and AUC
# Note: the *_auc names now hold the area under the precision-recall curve
# and overwrite any ROC AUC values stored under the same names earlier.
log_f1, log_auc = f1_score(y_test, log_y_preds), auc(log_recall, log_precision)
svc_f1, svc_auc = f1_score(y_test, svc_y_preds), auc(svc_recall, svc_precision)
nb_f1, nb_auc = f1_score(y_test, nb_y_preds), auc(nb_recall, nb_precision)
knn_f1, knn_auc = f1_score(y_test, knn_y_preds), auc(knn_recall, knn_precision)
rfc_f1, rfc_auc = f1_score(y_test, rfc_y_preds), auc(rfc_recall, rfc_precision)


# Summarize scores
print('Logistic: f1=%.5f auc=%.9f' % (log_f1, log_auc))
print('Support Vector Machine: f1=%.9f auc=%.5f' % (svc_f1, svc_auc))
print('Naive Bayes: f1=%.9f auc=%.5f' % (nb_f1, nb_auc))
print('K Nearest Neighbours: f1=%.5f auc=%.5f' % (knn_f1, knn_auc))
print('Random Forest Classifier: f1=%.5f auc=%.5f' % (rfc_f1, rfc_auc))


# plot the precision-recall curves

plt.figure(figsize = (20, 20))
# Baseline precision: the share of positive (stroke) rows in the test split.
no_skill = len(y_test[y_test == 1]) / len(y_test)

plt.plot(log_recall, log_precision, marker = '.', label = 'Logistic')
plt.plot(svc_recall, svc_precision, marker = '.', label = 'Support Vector Machine')
plt.plot(nb_recall, nb_precision, marker = '.', label = 'Naive Bayes')
plt.plot(knn_recall, knn_precision, marker = '.', label = 'K Nearest Neighbour')
plt.plot(rfc_recall, rfc_precision, marker = '.', label = 'Random Forest Classifier')
plt.plot([0, 1], [no_skill, no_skill], linestyle = '--', label = 'No Skill')

# Axis labels

plt.xlabel('Recall')
plt.ylabel('Precision')

# Show Legend
plt.legend()

# Show the plot

plt.show()