In [None]:
import pandas as pd
import numpy as np

In [None]:
# Colab-specific: mount Google Drive at /content/drive so files stored on the
# Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the diabetes CSV on the mounted Drive; adjust when running
# outside Colab or with a different Drive layout.
file_path = '/content/drive/My Drive/DataSets/diabetes.csv'
# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Size of the loaded frame as (rows, columns).
df.shape

(768, 9)

In [None]:
# Spot-check five randomly chosen rows. The seed keeps the preview identical
# across "Restart & Run All", so the displayed rows always match the code.
df.sample(5, random_state=2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
32,3,88,58,11,54,24.8,0.267,22,0
282,7,133,88,15,155,32.4,0.262,37,0
121,6,111,64,39,0,34.2,0.26,24,0
436,12,140,85,33,0,37.4,0.244,41,0
515,3,163,70,18,105,31.6,0.268,28,1


In [None]:
# Separate the label from the predictors: `y` is the 'Outcome' column,
# `X` is every other column of the frame.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler().fit(X_train)
X_train_trf = scaler.transform(X_train)
X_test_trf = scaler.transform(X_test)

**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with scikit-learn's default settings,
# fitted on the scaled training features.
log_model = LogisticRegression().fit(X_train_trf, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = log_model.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (using Logistic Regression):", accuracy)


Accuracy (using Logistic Regression): 0.7662337662337663


**RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline random forest with default hyperparameters; seeded so the
# randomized tree construction gives the same model on every run.
rf_model = RandomForestClassifier(random_state=2).fit(X_train_trf, y_train)

# Held-out accuracy of the fitted forest.
y_pred = rf_model.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (using Random Forest):", accuracy)


Accuracy (using Random Forest): 0.7792207792207793


**SUPPORT VECTOR CLASSIFICATION**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Baseline support vector classifier with a linear kernel
# (other kernels such as 'rbf' or 'poly' can be tried here as well).
svc_model = SVC(kernel='linear').fit(X_train_trf, y_train)

# Held-out accuracy of the fitted classifier.
y_pred = svc_model.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (using SVC):", accuracy)


Accuracy (using SVC): 0.7662337662337663


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Baseline k-nearest-neighbours classifier voting over the 5 closest
# training points (k can be changed as needed).
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_trf, y_train)

# Held-out accuracy of the fitted classifier.
y_pred = knn_model.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (using KNN):", accuracy)


Accuracy (using KNN): 0.7402597402597403


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Baseline Gaussian Naive Bayes with default settings.
nb_model = GaussianNB().fit(X_train_trf, y_train)

# Held-out accuracy of the fitted classifier.
y_pred = nb_model.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (using Naive Bayes):", accuracy)


Accuracy (using Naive Bayes): 0.7597402597402597


All five models in one shot:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# (display name, unfitted estimator) pairs — the same five baselines as the
# individual cells above, evaluated in one pass.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier(random_state=2)),
    ("SVC", SVC(kernel='linear')),  # the kernel type can be changed here
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("Naive Bayes", GaussianNB())
]

# Test-set accuracy per model, keyed by display name (insertion order = evaluation order).
accuracies = {}

# Fit each estimator on the scaled training split and score it on the scaled test split.
for name, model in models:
    y_pred = model.fit(X_train_trf, y_train).predict(X_test_trf)
    accuracies[name] = accuracy_score(y_test, y_pred)

# Summary, rounded to four decimals.
print("Model Accuracies:")
for name, accuracy in accuracies.items():
    print(f"{name}: {accuracy:.4f}")


Model Accuracies:
Logistic Regression: 0.7662
Random Forest: 0.7792
SVC: 0.7662
KNN: 0.7403
Naive Bayes: 0.7597


**1. Logistic Regression with Hyperparameter Tuning**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyperparameter grid for Logistic Regression: the inverse regularization
# strength C crossed with two solvers (5 x 2 = 10 candidates).
param_grid_logreg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# Step 1: Create Logistic Regression model.
# Both solvers in the grid shuffle the data internally, so the estimator is
# seeded (same seed as the train/test split and the random forest) to make the
# search give the same result on every run.
logreg_model = LogisticRegression(random_state=2)

# Step 2: Apply GridSearchCV for hyperparameter tuning
# (5-fold cross-validation on the training split, all CPU cores).
grid_search_logreg = GridSearchCV(logreg_model, param_grid_logreg, cv=5, n_jobs=-1, verbose=1)
grid_search_logreg.fit(X_train_trf, y_train)

# Step 3: Best model and accuracy.
# best_estimator_ is the winning configuration refit on the whole training
# split (GridSearchCV's default refit behaviour); it is scored on the test split.
best_logreg_model = grid_search_logreg.best_estimator_
y_pred_logreg = best_logreg_model.predict(X_test_trf)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

# Step 4: Print results
print(f"Best parameters for Logistic Regression: {grid_search_logreg.best_params_}")
print(f"Accuracy (Logistic Regression): {accuracy_logreg:.4f}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'saga'}
Accuracy (Logistic Regression): 0.7727


**2. Random Forest with Hyperparameter Tuning**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Search space for the forest: number of trees, depth limit (None = no limit)
# and the minimum samples required to split a node — 3 x 4 x 3 = 36 candidates.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Seeded base estimator that the search configures for each candidate.
rf_model = RandomForestClassifier(random_state=2)

# 5-fold cross-validated grid search on the training split, using all CPU cores.
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_trf, y_train)

# Score the best configuration found by the search on the held-out test split.
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test_trf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report the selected hyperparameters and the test accuracy.
print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Accuracy (Random Forest): {accuracy_rf:.4f}")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Accuracy (Random Forest): 0.7597


**3. Support Vector Classifier (SVC) with Hyperparameter Tuning**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyperparameter grid for SVC.
# gamma is a kernel coefficient that the linear kernel does not use, so it is
# searched only together with the RBF kernel. A single flat grid would fit
# every linear candidate once per gamma value and get identical models each
# time; the two sub-grids below cover the same search space with 9 candidates
# instead of 12.
param_grid_svc = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
]

# Step 1: Create SVC model
svc_model = SVC()

# Step 2: Apply GridSearchCV for hyperparameter tuning
# (5-fold cross-validation on the training split, all CPU cores).
grid_search_svc = GridSearchCV(svc_model, param_grid_svc, cv=5, n_jobs=-1, verbose=1)
grid_search_svc.fit(X_train_trf, y_train)

# Step 3: Best model and accuracy — the best configuration found by the
# search is scored on the held-out test split.
best_svc_model = grid_search_svc.best_estimator_
y_pred_svc = best_svc_model.predict(X_test_trf)
accuracy_svc = accuracy_score(y_test, y_pred_svc)

# Step 4: Print results
print(f"Best parameters for SVC: {grid_search_svc.best_params_}")
print(f"Accuracy (SVC): {accuracy_svc:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy (SVC): 0.7597


**4. K-Nearest Neighbors (KNN) with Hyperparameter Tuning**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Search space for KNN: neighbourhood size crossed with the vote weighting
# scheme — 4 x 2 = 8 candidates.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance']
}

# Base estimator that the search configures for each candidate.
knn_model = KNeighborsClassifier()

# 5-fold cross-validated grid search on the training split, using all CPU cores.
grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train_trf, y_train)

# Score the best configuration found by the search on the held-out test split.
best_knn_model = grid_search_knn.best_estimator_
y_pred_knn = best_knn_model.predict(X_test_trf)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Report the selected hyperparameters and the test accuracy.
print(f"Best parameters for KNN: {grid_search_knn.best_params_}")
print(f"Accuracy (KNN): {accuracy_knn:.4f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for KNN: {'n_neighbors': 9, 'weights': 'uniform'}
Accuracy (KNN): 0.7273


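**5. Naive Bayes (No Hyperparameter Tuning)** — Naive Bayes is fit as-is, without a grid search. The cell below combines the tuning of all the models above, together with the untuned Naive Bayes, into a single piece of code.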

In [None]:
# Import everything this cell uses so it does not depend on the import
# statements of earlier cells having been executed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Hyperparameter grids for each model, keyed by the display names used in
# `models` below. An empty (or missing) grid means "fit with the estimator's
# current settings, no search".
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    # gamma is not used by the linear kernel, so it is searched only together
    # with the RBF kernel; this avoids fitting each linear candidate twice.
    "SVC": [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
    ],
    "KNN": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    # Naive Bayes is deliberately left untuned in this comparison
    # (GaussianNB's var_smoothing could be searched if desired).
    "Naive Bayes": {}
}

# List of models to evaluate, as (display name, unfitted estimator) pairs.
# Logistic regression is seeded because the solvers in its grid shuffle the
# data; the seed matches the train/test split and the random forest.
models = [
    ("Logistic Regression", LogisticRegression(random_state=2)),
    ("Random Forest", RandomForestClassifier(random_state=2)),
    ("SVC", SVC()),
    ("KNN", KNeighborsClassifier()),
    ("Naive Bayes", GaussianNB())
]

# Dictionary to store test-set accuracy per model name
accuracies = {}

# Step 1: Train and evaluate each model, tuning the ones that have a grid
for name, model in models:
    grid = param_grids.get(name)
    if grid:
        # 5-fold cross-validated search on the training split, all CPU cores;
        # best_estimator_ is the winning configuration refit on that split.
        print(f"Tuning {name}...")
        grid_search = GridSearchCV(model, grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train_trf, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")
    else:
        # Nothing to search: train the estimator as configured.
        best_model = model.fit(X_train_trf, y_train)

    # Score the selected model on the held-out test split.
    y_pred = best_model.predict(X_test_trf)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy

# Step 2: Print all accuracies after tuning
print("\nModel Accuracies after Hyperparameter Tuning:")
for name, accuracy in accuracies.items():
    print(f"{name}: {accuracy:.4f}")


Tuning Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'saga'}
Tuning Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Tuning SVC...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Tuning KNN...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for KNN: {'n_neighbors': 9, 'weights': 'uniform'}

Model Accuracies after Hyperparameter Tuning:
Logistic Regression: 0.7727
Random Forest: 0.7597
SVC: 0.7597
KNN: 0.7273
Naive Bayes: 0.7597
