In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [None]:
# NOTE(review): read_csv is called without a file path, so this cell raises a
# TypeError as written - supply the dataset's path or URL here.
# Presumably the frame should contain the feature columns and the 'Survived'
# target declared in the preprocessing cell - TODO confirm against the real file.
data = pd.read_csv()

# Modeling 

Logistic Regression, Naive Bayes, KNN, SVM, Decision Tree. 

Use the cross validation function to run each model 10 times and calculate an average performance. Remember to use F1 score in the cross validation function.

In [None]:
# Build the preprocessing step: one-hot encode the categorical features and
# standardise the numeric ones, combined in a single ColumnTransformer.
target = 'Survived'
cat_columns = ['Pclass', 'Sex', 'Embarked']
num_columns = ['Age', 'SibSp', 'Parch', 'Fare']

# handle_unknown='ignore' keeps transform() from failing on a category that
# was not present when the encoder was fitted.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_columns),
        ('num', num_transformer, num_columns),
    ]
)

In [None]:
# Candidate classifiers to compare; library defaults unless stated otherwise.
log = LogisticRegression()
nb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)  # 5 is also the library default
svc = SVC()
dt = DecisionTreeClassifier(random_state=123)  # seeded so the tree is repeatable


In [None]:
# Split the data.
# x / y were never defined anywhere in the notebook, so build them from the
# loaded frame using the column lists declared next to the preprocessor.
x = data[cat_columns + num_columns]
y = data[target]

# Hold out 20% for testing; random_state makes the split repeatable across
# kernel restarts.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123
)

# Fit the encoder/scaler on the training rows only, so no test-set statistics
# leak into the features, then apply the same fitted transform to the test rows.
# NOTE(review): assumes the feature columns contain no missing values - confirm.
x_train = preprocessor.fit_transform(x_train_raw)
x_test = preprocessor.transform(x_test_raw)

In [None]:
# Logistic Regression: 10-fold cross-validation on the training split, scored by F1
cv_scores_log = cross_val_score(log, x_train, y_train, cv=10, scoring='f1')
print("Cross-validation scores for Logistic Regression: ", cv_scores_log)
# Average over the folds so the models can be compared on a single number
print(f"Mean F1 for Logistic Regression: {cv_scores_log.mean():.3f} (std {cv_scores_log.std():.3f})")

# cross_val_score only fits internal copies, so fit the model itself before
# predicting on the held-out split
log.fit(x_train, y_train)
y_pred_log = log.predict(x_test)

In [None]:
# Gaussian Naive Bayes: 10-fold cross-validation on the training split, scored by F1
cv_scores_nb = cross_val_score(nb, x_train, y_train, cv=10, scoring='f1')
# Label corrected from the misspelled "Navie Bayes"
print("Cross-validation scores for Naive Bayes: ", cv_scores_nb)
# Average over the folds so the models can be compared on a single number
print(f"Mean F1 for Naive Bayes: {cv_scores_nb.mean():.3f} (std {cv_scores_nb.std():.3f})")

# cross_val_score only fits internal copies, so fit the model itself before
# predicting on the held-out split
nb.fit(x_train, y_train)
y_pred_nb = nb.predict(x_test)

In [None]:
# KNN: 10-fold cross-validation on the training split, scored by F1
cv_scores_knn = cross_val_score(knn, x_train, y_train, cv=10, scoring='f1')
print("Cross-validation scores for KNN: ", cv_scores_knn)
# Average over the folds so the models can be compared on a single number
print(f"Mean F1 for KNN: {cv_scores_knn.mean():.3f} (std {cv_scores_knn.std():.3f})")

# cross_val_score only fits internal copies, so fit the model itself before
# predicting on the held-out split
knn.fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

In [None]:
# SVC: 10-fold cross-validation on the training split, scored by F1
cv_scores_svc = cross_val_score(svc, x_train, y_train, cv=10, scoring='f1')
print("Cross-validation scores for SVC: ", cv_scores_svc)
# Average over the folds so the models can be compared on a single number
print(f"Mean F1 for SVC: {cv_scores_svc.mean():.3f} (std {cv_scores_svc.std():.3f})")

# cross_val_score only fits internal copies, so fit the model itself before
# predicting on the held-out split
svc.fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)

In [None]:
# Decision Tree: 10-fold cross-validation on the training split, scored by F1
cv_scores_dt = cross_val_score(dt, x_train, y_train, cv=10, scoring='f1')
print("Cross-validation scores for Decision Tree: ", cv_scores_dt)
# Average over the folds so the models can be compared on a single number
print(f"Mean F1 for Decision Tree: {cv_scores_dt.mean():.3f} (std {cv_scores_dt.std():.3f})")

# cross_val_score only fits internal copies, so fit the model itself before
# predicting on the held-out split
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

Compare the F1 scores, choose the best model, and start tuning it.

# Model Tuning

Decide whether you want to optimize based on Precision or Recall. You will need to explain your choice in relation to the business objective.

In [None]:
# Logistic Regression Tuning
# Probability estimates from the fitted logistic model on the test split;
# column 1 is what the ROC step that follows uses as the score.
pred_l_prob = log.predict_proba(x_test)

In [None]:
# ROC curve for the logistic model, scored with the second probability column
positive_scores_log = pred_l_prob[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores_log)

In [None]:
# ROC curve of the logistic regression model on the held-out split.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"Logistic Regression (AUC = {auc(fpr, tpr):.3f})")
# Diagonal = a classifier that guesses at random, drawn for visual reference
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC curve - Logistic Regression",
)
# Trailing semicolon suppresses the Legend repr in the cell output
ax.legend(loc="lower right");

Ideally we want this curve to be towards the top left; but in a non-ideal world, we want to find the optimal threshold. One way we can do this is by calculating the geometric mean (G-mean) which will find the balance between Sensitivity and Specificity. As a refresher:

Sensitivity = True Positive Rate \
Specificity = 1 - False Positive Rate

In other words:

Sensitivity = TP / (TP + FN) \
Specificity = TN / (FP + TN)

To calculate the G-mean, we simply take the square root of the Sensitivity multiplied by the Specificity.

In [None]:
# G-mean = sqrt(sensitivity * specificity), evaluated at every ROC threshold
gmeans = np.sqrt(tpr * (1 - fpr))
# Index of the threshold with the highest G-mean (computed once, reused below)
best_idx = np.argmax(gmeans)
print(thresholds[best_idx])
print(gmeans[best_idx])

In [None]:
# KNN tuning: randomised search over the neighbourhood size, scored by F1
knn_params = {'n_neighbors': range(50, 300)}
# random_state fixes which 100 candidates get sampled, so the selected
# n_neighbors is the same on every Restart & Run All
rs_knn = RandomizedSearchCV(
    knn, knn_params, n_iter=100, cv=5, scoring='f1', random_state=123
)
rs_knn.fit(x_train, y_train)
rs_knn.best_estimator_

In [None]:
# Score the tuned KNN on the held-out split.
# best_estimator_ is the search winner, already refit on all of x_train
# (the search's default refit=True), so neither a hand-copied n_neighbors
# value nor a second fit is needed.
knn_best = rs_knn.best_estimator_
pred_knn_best = knn_best.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order
print(f1_score(y_test, pred_knn_best))

In [None]:
# SVC tuning: only three kernels to compare, so search them exhaustively.
# A randomised search with n_iter=100 over 3 candidates merely warns and
# falls back to trying each candidate once.
svc_params = {'kernel': ['linear', 'poly', 'rbf']}

# Variable name kept as rs_svc so any later cell referring to it still works
rs_svc = GridSearchCV(svc, svc_params, cv=5, scoring='f1')
rs_svc.fit(x_train, y_train)
rs_svc.best_estimator_

In [None]:
# Gaussian NB tuning: find the probability threshold with the best G-mean
yhat_nb = nb.predict_proba(x_test)
nb_positive_scores = yhat_nb[:, 1]
fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_test, nb_positive_scores)

# G-mean = sqrt(TPR * (1 - FPR)) at each candidate threshold
gmeans_nb = np.sqrt(tpr_nb * (1 - fpr_nb))
best_idx_nb = gmeans_nb.argmax()
print(thresholds_nb[best_idx_nb])
print(gmeans_nb[best_idx_nb])

In [None]:
# Final test-set report.
# `pred` was never assigned and `data['cardio']` is not this notebook's target,
# so score the logistic-regression test predictions against y_test instead.
# Swap in y_pred_nb / y_pred_knn / y_pred_svc / y_pred_dt to report another model.
pred = y_pred_log

# scikit-learn metrics expect (y_true, y_pred); reversing them swaps
# precision and recall.
print('accuracy_score:', accuracy_score(y_test, pred))
print('f1_score', f1_score(y_test, pred))
print('precision_score:', precision_score(y_test, pred))
print('recall_score:', recall_score(y_test, pred))
# AUC computed from hard labels reflects a single operating point; pass
# predict_proba scores instead for the threshold-independent value.
# TODO: check whether multiple cases need handling here.
print('roc_auc score', roc_auc_score(y_test, pred))