In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For data preprocessing and splitting
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For classification models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# For evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, classification_report

In [4]:
# Load the dataset (adjust the file path as needed).
df = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')

# Display the first few rows to inspect the data.
print(df.head())

# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Number of missing values per column.
print(df.isnull().sum())

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [6]:
# Rows without a label cannot be used for supervised learning, and a
# mean-imputed label would not be a valid class, so drop them before imputing.
df = df.dropna(subset=['Diabetes_012'])

# Impute remaining missing values in numeric columns with the column mean.
# Rebinding `df` (instead of inplace=True) keeps this cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))

# Separate features and target.
# The evaluation cells below (2x2 confusion matrix, ROC curve) require a binary
# target, so any non-zero code in 'Diabetes_012' is collapsed to the positive
# class 1 and code 0 becomes the negative class 0.
# NOTE(review): the column name suggests codes 0/1/2 (dataset documentation:
# 0 = no diabetes, 1 = prediabetes, 2 = diabetes) - confirm that grouping
# prediabetes with diabetes is the intended positive class.
X = df.drop('Diabetes_012', axis=1)
y = (df['Diabetes_012'] > 0).astype(int)

# Standardize the features to zero mean and unit variance.
# NOTE(review): this scaler is fit on every row, including rows that later end
# up in the test split; for an unbiased evaluation the scaler should be fit on
# the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Split the raw (unscaled) features first. Stratifying on y keeps the class
# proportions equal in both partitions; the seed and the 80/20 ratio are
# unchanged, so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so that no test-set information leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [8]:
# k-nearest-neighbours classifier voting over the 5 closest training points.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Class labels predicted for the held-out test rows.
y_pred_knn = knn.predict(X_test)

In [None]:
# Build SVM classifier with RBF kernel.
#
# Kernel SVM training time grows at least quadratically with the number of
# training rows, and probability=True adds an internal 5-fold cross-validation
# on top of that, so fitting on a very large training set may never finish.
# The SVM is therefore trained on a stratified subsample when the training set
# exceeds the cap below; smaller training sets are used in full.
# NOTE(review): the cap is a pragmatic default - tune it to the time budget.
SVM_MAX_TRAIN_SAMPLES = 20_000

if len(X_train) > SVM_MAX_TRAIN_SAMPLES:
    # Stratify so the subsample keeps the class proportions of the full
    # training set; the fixed seed keeps the subsample reproducible.
    X_train_svm, _, y_train_svm, _ = train_test_split(
        X_train,
        y_train,
        train_size=SVM_MAX_TRAIN_SAMPLES,
        random_state=42,
        stratify=y_train,
    )
else:
    X_train_svm, y_train_svm = X_train, y_train

# probability=True is kept because the ROC cell calls svm.predict_proba().
svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train_svm, y_train_svm)

# Predict on test data (the full test set is still used for evaluation).
y_pred_svm = svm.predict(X_test)

In [None]:
def evaluate_model(y_true, y_pred, model_name='Model', labels=None):
    """Print and return binary-classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    model_name : str, default 'Model'
        Name used in the printed confusion-matrix header.
    labels : sequence of two labels, optional
        ``[negative, positive]`` label order for the confusion matrix. When
        omitted, the sorted distinct labels found in ``y_true`` and ``y_pred``
        are used. Pass it explicitly when one of the two classes may be absent
        from the data being scored.

    Returns
    -------
    tuple
        ``(cm, accuracy, sensitivity, specificity)`` where ``cm`` is the 2x2
        confusion matrix laid out as ``[[TN, FP], [FN, TP]]``.

    Raises
    ------
    ValueError
        If the labels do not form exactly two classes. Sensitivity and
        specificity are only defined here for a binary problem.
    """
    if labels is None:
        # Same label set confusion_matrix would infer on its own: the sorted
        # distinct values across both inputs.
        labels = np.unique(
            np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
        )
    labels = list(labels)

    # Fail early with an actionable message instead of an opaque unpacking
    # error from cm.ravel() further down.
    if len(labels) != 2:
        raise ValueError(
            f"evaluate_model expects a binary problem but found {len(labels)} "
            f"class label(s): {labels}. Binarize the target, or pass "
            f"labels=[negative, positive] explicitly."
        )

    # Confusion matrix; with two labels ravel() yields TN, FP, FN, TP in order.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    TN, FP, FN, TP = cm.ravel()

    # Compute metrics. A ratio is reported as NaN when its denominator is zero
    # (no actual positives / no actual negatives) rather than dividing by zero.
    accuracy = accuracy_score(y_true, y_pred)
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else float('nan')
    specificity = TN / (TN + FP) if (TN + FP) > 0 else float('nan')

    print(f"{model_name} Confusion Matrix:\n{cm}\n")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Sensitivity (Recall): {sensitivity:.3f}")
    print(f"Specificity: {specificity:.3f}\n")

    return cm, accuracy, sensitivity, specificity

# Score both classifiers against the same held-out labels. Each call prints a
# report and hands back (confusion matrix, accuracy, sensitivity, specificity).

# kNN results
cm_knn, acc_knn, sens_knn, spec_knn = evaluate_model(
    y_test, y_pred_knn, 'kNN'
)

# SVM results
cm_svm, acc_svm, sens_svm, spec_svm = evaluate_model(
    y_test, y_pred_svm, 'SVM'
)

In [None]:
# ROC analysis for both classifiers, drawn on one shared set of axes.

# kNN: score = second column of predict_proba, then ROC points and their area.
y_scores_knn = knn.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_scores_knn)
roc_auc_knn = auc(fpr_knn, tpr_knn)

# SVM: same procedure (predict_proba is available because the model was
# built with probability=True).
y_scores_svm = svm.predict_proba(X_test)[:, 1]
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_scores_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

# Draw both curves plus the diagonal of a random classifier for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_knn, tpr_knn, label=f'kNN ROC curve (AUC = {roc_auc_knn:.2f})')
ax.plot(fpr_svm, tpr_svm, label=f'SVM ROC curve (AUC = {roc_auc_svm:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('ROC Curve Comparison')
ax.legend(loc='lower right')
plt.show()