In [None]:
import numpy as np
import scipy.optimize as opt
import seaborn as sns
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Read the diabetes CSV from the Colab workspace and preview its first eight rows.
df_diabetes = pd.read_csv("/content/diabetes-dataset.csv", sep=",")
df_diabetes.head(8)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df_diabetes.shape

In [None]:
# Collapse the per-cell null mask twice (per column, then overall):
# the result is True if any cell in the frame is missing.
df_diabetes.isnull().any().any()

In [None]:
# Descriptive statistics table for the dataset's columns.
df_diabetes.describe()

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(16, 10))
sns.set(font_scale=2.0)
correlations = df_diabetes.corr()
sns.heatmap(correlations, ax=ax, annot=True, linewidths=3)

In [None]:
# Pairwise relationships between the three candidate features and the target.
pairplot_columns = ['Glucose', 'BMI', 'Age', 'Outcome']
cdf = df_diabetes[pairplot_columns]
sns.set(font_scale=0.2)
sns.pairplot(data=cdf)

In [None]:
# Box plots of the main numeric columns, drawn with pandas' own
# DataFrame.boxplot on the figure created here. No seaborn import is needed:
# the name `boxplot` is bound to the Axes object returned by the call.
plt.figure(figsize=(10,8))
boxplot= df_diabetes.boxplot(column=['Glucose', 'BMI', 'Age' , 'BloodPressure'])

In [None]:
# Model inputs (three numeric predictors) and the binary target column.
feature_columns = ['Glucose', 'BMI', 'Age']
target_column = 'Outcome'
x = df_diabetes[feature_columns]
y = df_diabetes[target_column]

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# cross_val_score returns one accuracy value per fold. Its default is 5-fold
# cross-validation (scikit-learn >= 0.22; it was 3-fold before that); the cv
# argument is used here to request ten folds instead.
N_FOLDS = 10
logreg = LogisticRegression()
scores = cross_val_score(logreg, x, y, cv=N_FOLDS)
print("cross-validation scores: ", scores)
mean_cv_accuracy = scores.mean()
print('Mean of cross-validation accuracy: {:.2f}' .format(mean_cv_accuracy))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

# Normalization: learn the per-feature min/max on the training rows only,
# then apply that same mapping to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Manual grid search over (gamma, C) for an RBF-kernel SVC, scored by the mean
# 5-fold cross-validation accuracy on the training partition.
#
# SVMs are not scale invariant, so each candidate is a scaler + SVC pipeline:
# the min/max scaling is re-fitted inside every CV fold (no leakage from the
# validation fold) and the final `svm` object still accepts the raw,
# unscaled feature frames (X_train / X_test) used by the cells below.
PARAM_GRID = [0.001, 0.01, 0.1, 1, 10, 100]
best_score = -np.inf
best_parameters = None
for gamma in PARAM_GRID:
    for C in PARAM_GRID:
        # for each combination of parameters build a scaled SVC
        candidate = make_pipeline(MinMaxScaler(), SVC(gamma=gamma, C=C))
        # perform cross-validation
        scores = cross_val_score(candidate, X_train, y_train, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
svm = make_pipeline(MinMaxScaler(), SVC(**best_parameters))
svm.fit(X_train, y_train)

In [None]:
# Mean accuracy of the tuned SVC on the rows it was fitted on vs. the held-out rows.
svc_train_accuracy = svm.score(X_train, y_train)
svc_test_accuracy = svm.score(X_test, y_test)
print('Accuracy of SVC on train set: {:.2f}' .format(svc_train_accuracy))
print('Accuracy of SVC on test set: {:.2f}' .format(svc_test_accuracy))

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc

# Compare logistic regression and the tuned SVC on the held-out set using
# their signed decision scores.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_scores_logreg = logreg.decision_function(X_test)
precision_logreg, recall_logreg, thresholds_logreg = precision_recall_curve(y_test, y_scores_logreg)

# Operating point whose threshold is nearest 0, i.e. the default decision
# boundary of decision_function.
# NOTE(review): the logistic-regression point is computed but not drawn below.
closest_zero_logreg = np.argmin(np.abs(thresholds_logreg))
closest_zero_p_logreg = precision_logreg[closest_zero_logreg]
closest_zero_r_logreg = recall_logreg[closest_zero_logreg]

y_scores = svm.decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]

# Precision is on the x-axis and recall on the y-axis, matching the labels.
plt.figure(figsize=(12, 8))
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.plot(precision_logreg, recall_logreg, color='green', label='LogisticRegression Precision-Recall Curve')
plt.plot(precision, recall, label='SVC Precision-Recall Curve')
# Red ring: the SVC's default-threshold operating point.
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle ='none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
plt.title('Precision-Recall_Curve_Comparison', fontsize=16)
plt.legend(loc='lower left', fontsize=13)
# Use the current Axes: a bare plt.axes() creates a new, empty Axes on
# Matplotlib >= 3.4 and would cover the curves drawn above.
plt.gca().set_aspect('equal')
plt.show()

In [None]:
# ROC curves and AUC for both classifiers, from the decision scores computed
# in the precision-recall cell. (`fpr_lr` / `tpr_lr` / `roc_auc_lr` hold the
# SVC's values despite the `_lr` suffix.)
fpr_lr_logreg, tpr_lr_logreg, _ = roc_curve(y_test, y_scores_logreg)
roc_auc_lr_logreg = auc(fpr_lr_logreg, tpr_lr_logreg)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_scores)
roc_auc_lr = auc(fpr_lr, tpr_lr)
plt.figure(figsize=(12, 8))
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr_logreg, tpr_lr_logreg, lw=3, color='green', label='LogisticRegression ROC curve (area = {:0.2f})'.format(roc_auc_lr_logreg))
plt.plot(fpr_lr, tpr_lr, lw=3, label='SVC ROC curve (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC_Curve_Comparison', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
# Diagonal reference line; drawn after legend() so it gets no legend entry.
plt.plot([0, 1], [0, 1], color='red', lw=3, linestyle='--')
# Use the current Axes: a bare plt.axes() creates a new, empty Axes on
# Matplotlib >= 3.4 and would cover the curves drawn above.
plt.gca().set_aspect('equal')
plt.show()

In [None]:
# DTClf = Decision Tree Classifier, seeded so repeated runs give the same fit,
# trained on the unscaled training features.
DTClf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
DTClf

In [None]:
# Class labels predicted by the fitted decision tree for the held-out rows.
Model_Prediction = DTClf.predict(X_test)


In [None]:
# Side-by-side view of the true labels and the decision tree's predictions.
d = {
    'real or true values(y_test)': y_test,
    'Model_Prediction': Model_Prediction,
}
dfC = pd.DataFrame(d)
dfC.iloc[:10]

In [None]:
# Mean accuracy of the decision tree on the training rows vs. the held-out rows.
tree_train_accuracy = DTClf.score(X_train, y_train)
tree_test_accuracy = DTClf.score(X_test, y_test)
print('Accuracy of Decision Tree classifier on train set: {:.2f}' .format(tree_train_accuracy))
print('Accuracy of Decision Tree classifier on test set: {:.2f}' .format(tree_test_accuracy))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNClf = KNeighborsClassifier with a single neighbour, trained on the
# min-max scaled training features.
KNClf = KNeighborsClassifier(n_neighbors=1).fit(X_train_scaled, y_train)
KNClf


In [None]:
# Labels predicted by the 1-nearest-neighbour model for the scaled held-out
# rows; the bare name on the last line displays the array as the cell output.
Model_Prediction_knn = KNClf.predict(X_test_scaled)
Model_Prediction_knn

In [None]:
# Mean accuracy of the KNN model on the scaled training rows vs. scaled held-out rows.
knn_train_accuracy = KNClf.score(X_train_scaled, y_train)
knn_test_accuracy = KNClf.score(X_test_scaled, y_test)
print('Accuracy of KNeighborsClassifier on train set: {:.2f}' .format(knn_train_accuracy))
print('Accuracy of KNeighborsClassifier on test set: {:.2f}' .format(knn_test_accuracy))

In [None]:
from sklearn import metrics

# Fit one KNN model per k in 1 .. Ks-1 and record, for each, the held-out
# accuracy and the standard error of the per-row hit/miss indicator.
Ks = 15
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

for n in range(1, Ks):
    slot = n - 1  # arrays are 0-based, k starts at 1

    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train_scaled, y_train)
    yhat = neigh.predict(X_test_scaled)

    hits = yhat == y_test
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(hits) / np.sqrt(yhat.shape[0])

mean_acc

In [None]:
# Held-out accuracy as a function of k, with +/-1 and +/-3 standard-error bands.
k_values = range(1, Ks)
plt.figure(figsize=(10, 6))
plt.rcParams.update({'font.size': 16})
plt.plot(k_values, mean_acc, 'g')
# Narrow band first (default colour), then the wide band in blue.
for width, band_style in ((1, {}), (3, {'color': 'blue'})):
    lower = mean_acc - width * std_acc
    upper = mean_acc + width * std_acc
    plt.fill_between(k_values, lower, upper, alpha=0.10, **band_style)
plt.legend(('Accuracy', '+/- 1xstd', '+/- 3xstd'))
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

In [None]:
# argmax is 0-based while k starts at 1, hence the +1.
best_k = mean_acc.argmax() + 1
best_accuracy = mean_acc.max()
print("The best accuracy is {:.2f}" .format(best_accuracy), "with k =", best_k)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix 
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current matplotlib figure/axes; the caller is
    responsible for creating the figure and for showing or saving it.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true classes on the rows and predicted classes
        on the columns. Must hold integers when ``normalize`` is False,
        because the cells are then formatted with ``'d'``.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, divide every row by its total so each row shows the
        fraction of that true class assigned to each prediction. Rows whose
        total is zero are shown as all zeros.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the cell shading.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guarded division: a class with no true samples has a zero row
        # total, which would otherwise produce NaN cells and a warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than the mid-point get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True Outcome')
    plt.xlabel('Predicted Outcome')
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()
In [None]:
# Confusion matrix of the 1-NN predictions. labels=[1, 0] puts the positive
# class first, which is the order the tick labels below are listed in.
class_names = ['Outcome = (diabetes or 1)', 'Outcome = (no diabetes or 0)']
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=Model_Prediction_knn, labels=[1, 0])
np.set_printoptions(precision=3)

plt.figure(figsize=(14, 8))
plt.grid(False)
plt.rcParams.update({'font.size': 18})
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=False)

In [None]:
# Text report of per-class metrics for the 1-NN predictions on the held-out set.
print(classification_report(y_test, Model_Prediction_knn))
