In [28]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("/content/binary_diabetes_data.csv")

# Step 1: Remove duplicates
df = df.drop_duplicates()

# Step 2: Select features and target
selected_features = ['HighBP', 'HighChol', 'BMI', 'Stroke', 'HeartDiseaseorAttack',
       'PhysActivity', 'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth',
       'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
target = 'Diabetes_012'

# Separate the features and target variable
X = df[selected_features]
y = df['Diabetes_012']

# Apply SMOTE to balance the classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 3: Apply train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize KNN model with best parameters
knn_model_tuned = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan')

# Train the KNN model using training data
knn_model_tuned.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred_tuned = knn_model_tuned.predict(X_test)

# Evaluate the KNN model on test data
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f"Accuracy (Tuned KNN with Selected Features): {accuracy_tuned}")
print("Classification Report (Tuned KNN with Selected Features):")
print(classification_report(y_test, y_pred_tuned))
print("Confusion Matrix (Tuned KNN with Selected Features):")
print(confusion_matrix(y_test, y_pred_tuned))


Accuracy (Tuned KNN with Selected Features): 0.8778774565257426
Classification Report (Tuned KNN with Selected Features):
              precision    recall  f1-score   support

           0       0.91      0.83      0.87     37998
           1       0.85      0.92      0.88     38024

    accuracy                           0.88     76022
   macro avg       0.88      0.88      0.88     76022
weighted avg       0.88      0.88      0.88     76022

Confusion Matrix (Tuned KNN with Selected Features):
[[31719  6279]
 [ 3005 35019]]


In [60]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("/content/binary_diabetes_data.csv")

# Step 1: Remove duplicates
df = df.drop_duplicates()

# Step 2: Select features and target
selected_features = ['HighBP', 'HighChol', 'BMI', 'Stroke', 'HeartDiseaseorAttack',
       'PhysActivity', 'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth',
       'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
target = 'Diabetes_012'

# Separate the features and target variable
X = df[selected_features]
y = df['Diabetes_012']

# Apply SMOTE to balance the classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 3: Apply train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize CatBoost model with specified parameters
catboost_model = CatBoostClassifier(
    learning_rate=0.05,
    l2_leaf_reg=1,
    iterations=1000,
    depth=10,
    border_count=32,
    cat_features=[],  # If there are categorical features, list them here
    random_state=42,
    verbose=200  # Display output every 200 iterations
)

# Train the CatBoost model using training data
catboost_model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the CatBoost model on test data
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"Accuracy (CatBoost with Selected Features): {accuracy_catboost}")
print("Classification Report (CatBoost with Selected Features):")
print(classification_report(y_test, y_pred_catboost))
print("Confusion Matrix (CatBoost with Selected Features):")
print(confusion_matrix(y_test, y_pred_catboost))


0:	learn: 0.6444520	total: 200ms	remaining: 3m 19s
200:	learn: 0.2379151	total: 37.6s	remaining: 2m 29s
400:	learn: 0.2210449	total: 1m 13s	remaining: 1m 49s
600:	learn: 0.2097887	total: 1m 47s	remaining: 1m 11s
800:	learn: 0.2000777	total: 2m 22s	remaining: 35.4s
999:	learn: 0.1919211	total: 2m 56s	remaining: 0us
Accuracy (CatBoost with Selected Features): 0.8905053800215728
Classification Report (CatBoost with Selected Features):
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     37998
           1       0.94      0.83      0.88     38024

    accuracy                           0.89     76022
   macro avg       0.90      0.89      0.89     76022
weighted avg       0.90      0.89      0.89     76022

Confusion Matrix (CatBoost with Selected Features):
[[36118  1880]
 [ 6444 31580]]


In [59]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [163]:
#Check

In [161]:
import numpy as np
def predict_diabetes():
  # # Get feature input from the user
  # HighBP = int(input("Enter HighBP (0 or 1): "))
  # HighChol = int(input("Enter HighChol (0 or 1): "))
  # BMI = int(input("Enter BMI: "))
  # Stroke = int(input("Enter Stroke (0 or 1): "))
  # HeartDiseaseorAttack = int(input("Enter HeartDiseaseorAttack (0 or 1): "))
  # PhysActivity = int(input("Enter PhysActivity (0 or 1): "))
  # HvyAlcoholConsump = int(input("Enter HvyAlcoholConsump (0 or 1): "))
  # AnyHealthcare = int(input("Enter AnyHealthcare (0 or 1): "))
  # GenHlth = int(input("Enter GenHlth (1-5): "))
  # PhysHlth = int(input("Enter PhysHlth (0-30): "))
  # DiffWalk = int(input("Enter DiffWalk (0 or 1): "))
  # Sex = int(input("Enter Sex (0 for Female, 1 for Male): "))
  # Age = int(input("Enter Age (1-13): "))
  # Education = int(input("Enter Education (1-6): "))
  # Income = int(input("Enter Income (1-8): "))

  # Create a list of features
  # features = [HighBP, HighChol, BMI, Stroke, HeartDiseaseorAttack, PhysActivity,
  #             HvyAlcoholConsump, AnyHealthcare, GenHlth, PhysHlth, DiffWalk,
  #             Sex, Age, Education, Income]
  features=X.iloc[60].values

  # Convert features to a NumPy array and reshape for prediction
  features_array = np.array(features).reshape(1, -1)

  # Use the trained Random Forest model to make a prediction
  prediction = catboost_model.predict(features_array)[0]

  # Return the prediction result as a string
  return 'Diabetic' if prediction == 1 else 'Non-Diabetic'


# Example usage:
prediction_result = predict_diabetes()
print(f"Prediction: {prediction_result}")


Prediction: Non-Diabetic


In [162]:
  features=X.iloc[60].values

  # Convert features to a NumPy array and reshape for prediction
  features_array = np.array(features).reshape(1, -1)

  # Use the trained Random Forest model to make a prediction
  prediction =knn_model_tuned.predict(features_array)[0]
  prediction



1

In [57]:
with open('model_binary_working.pkl', 'wb') as f:
    pickle.dump(knn_model_tuned, f)

print("Model saved successfully!")

Model saved successfully!


In [133]:
X.iloc[2].values

array([ 1.,  1., 28.,  0.,  0.,  0.,  0.,  1.,  5., 30.,  1.,  0.,  9.,
        4.,  8.])

In [160]:
y.iloc[60]


1