In [15]:
#libraries

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_openml
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

In [17]:
def load_data():
    diabetes_df = pd.read_csv('dataset/diabetes_prediction_dataset.csv')
    y = diabetes_df['diabetes'].values
    X = diabetes_df.drop('diabetes', axis=1)  
    return X, y

def one_hot_encode(X, columns):
    for col in columns:
        one_hot = pd.get_dummies(X[col], prefix=col)
        X = pd.concat([X, one_hot], join='inner', axis='columns').drop(col, axis=1)
    return X



def remove_outliers(X):
    outlier_detection = IsolationForest()
    outliers = outlier_detection.fit_predict(X)
   
    X = X[outliers == 1]
    return X

def standardize_data(X, columns):
    scaler = MinMaxScaler()
    X[columns] = scaler.fit_transform(X[columns])
    return X

def prepare_data(X, columns_to_encode, columns_to_scale, columns_no_scale):
    X_encoded = one_hot_encode(X, columns_to_encode)
    
    X_scaled = standardize_data(X_encoded, columns_to_scale)
    
   
    X_final = pd.concat([X_scaled, X_encoded[columns_no_scale]], join='inner', axis='columns')
    return X_final

def split_train_dev_test(X, y):
    total_examples = len(X)

   
    train_size = int(total_examples * 0.8)
    dev_size = int(total_examples * 0.1) 
    Xtrain, Xdev, Xtest = X[:train_size], X[train_size:train_size+dev_size], X[train_size+dev_size:]
    ytrain, ydev, ytest = y[:train_size], y[train_size:train_size+dev_size], y[train_size+dev_size:]

    return Xtrain, Xdev, Xtest, ytrain, ydev, ytest
def train_mnist_SGD(X_train, y_train):
    sgd_classifier = SGDClassifier(loss='hinge', max_iter=1, verbose=3, random_state=42)
    sgd_classifier.fit(X_train, y_train)
    return sgd_classifier


def train_mnist_Logistic_Regression(X_train, y_train):
    LR = LogisticRegression(multi_class='ovr', solver='saga', max_iter=100, verbose=3, random_state=42, n_jobs=-1)
    LR.fit(X_train, y_train)
    return LR


def train_mnist_SVC(Xtrain, ytrain):
    
    SVCclassifier= SVC( max_iter=2000, decision_function_shape='ovr', verbose = 3, random_state= 42)
    
    SVCclassifier.fit (Xtrain,ytrain)
    return SVCclassifier


def train_mnist_KNeighbors(X_train, y_train):
    
    knn_classifier = KNeighborsClassifier(n_neighbors=3, weights='uniform')
    knn_classifier.fit(X_train, y_train)
    return knn_classifier


def train_mnist_DecisionTree(X_train, y_train):
    
    dt_classifier = DecisionTreeClassifier(max_depth=None, criterion='gini')
    dt_classifier.fit(X_train, y_train)
    return dt_classifier

    

def evaluate(model, Xdev, ydev):
    ydev_pred = model.predict(Xdev)

    accuracy = accuracy_score(ydev, ydev_pred)
    labels = np.unique(ydev_pred)
    precision = precision_score(ydev, ydev_pred, labels=labels, average=None)
    recall = recall_score(ydev, ydev_pred, labels=labels, average=None)
    f1 = f1_score(ydev, ydev_pred, labels=labels, average=None)
    
    dfscore = pd.DataFrame(data=[[accuracy]*len(labels), precision, recall, f1], columns=["Class" + str(l) for l in labels], index=['Accuracy', 'Precision', 'Recall', 'F1_score'])
    dfscore['mean'] = [accuracy, precision.mean(), recall.mean(), f1.mean()]
    
    return dfscore


def main():
    X, y = load_data()

    columns_to_encode = ['gender', 'smoking_history']
    columns_to_scale = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
    columns_no_scale = ['hypertension', 'heart_disease']

    X_prep = prepare_data(X, columns_to_encode, columns_to_scale, columns_no_scale)
    
    
    Xtrain, Xdev, Xtest, ytrain, ydev, ytest = split_train_dev_test(X_prep, y)
    print(Xtrain.shape, ytrain.shape)
    print(Xtest.shape, ytest.shape)
    print(Xdev.shape, ydev.shape)
    
    # Train models
    model_Logistic = train_mnist_Logistic_Regression(Xtrain, ytrain)
    model_SGD = train_mnist_SGD(Xtrain, ytrain)
    model_SVC = train_mnist_SVC(Xtrain, ytrain)
    model_KNeighbors = train_mnist_KNeighbors(Xtrain, ytrain)
    model_DecisionTree = train_mnist_DecisionTree(Xtrain, ytrain)
    
    # Evaluate models
    dfscore1 = evaluate(model_SGD, Xdev, ydev)
    dfscore2 = evaluate(model_Logistic, Xdev, ydev)
    dfscore3 = evaluate(model_SVC, Xdev, ydev)
    dfscore4 = evaluate(model_KNeighbors, Xdev, ydev)
    dfscore5 = evaluate(model_DecisionTree, Xdev, ydev)

    # Combine the evaluation results
    evaluation_results = pd.concat([dfscore1.add_suffix('_SGD'), 
                                     dfscore2.add_suffix('_Logistic'), 
                                     dfscore3.add_suffix('_SVC'), 
                                     dfscore4.add_suffix('_KNeighbors'), 
                                     dfscore5.add_suffix('_DecisionTree')], axis=1)
    
    return evaluation_results




dfscore = main()
dfscore














(80000, 17) (80000,)
(10000, 17) (10000,)
(10000, 17) (10000,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.


convergence after 64 epochs took 2 seconds
-- Epoch 1
Norm: 11.60, NNZs: 16, Bias: -12.469790, T: 80000, Avg. loss: 0.183592
Total training time: 0.01 seconds.
[LibSVM]



Epoch 1, change: 1.00000000
Epoch 2, change: 0.08726354
Epoch 3, change: 0.04274949
Epoch 4, change: 0.02766213
Epoch 5, change: 0.02635183
Epoch 6, change: 0.02095191
Epoch 7, change: 0.01771921
Epoch 8, change: 0.01605991
Epoch 9, change: 0.01406865
Epoch 10, change: 0.01248383
Epoch 11, change: 0.01143417
Epoch 12, change: 0.01029218
Epoch 13, change: 0.00949221
Epoch 14, change: 0.00852810
Epoch 15, change: 0.00769081
Epoch 16, change: 0.00705600
Epoch 17, change: 0.00647130
Epoch 18, change: 0.00581424
Epoch 19, change: 0.00536393
Epoch 20, change: 0.00483909
Epoch 21, change: 0.00441604
Epoch 22, change: 0.00405784
Epoch 23, change: 0.00366573
Epoch 24, change: 0.00337949
Epoch 25, change: 0.00307534
Epoch 26, change: 0.00281781
Epoch 27, change: 0.00258303
Epoch 28, change: 0.00232413
Epoch 29, change: 0.00214576
Epoch 30, change: 0.00195468
Epoch 31, change: 0.00177914
Epoch 32, change: 0.00162059
Epoch 33, change: 0.00149571
Epoch 34, change: 0.00137306
Epoch 35, change: 0.001



Unnamed: 0,Class0_SGD,Class1_SGD,mean_SGD,Class0_Logistic,Class1_Logistic,mean_Logistic,Class0_SVC,Class1_SVC,mean_SVC,Class0_KNeighbors,Class1_KNeighbors,mean_KNeighbors,Class0_DecisionTree,Class1_DecisionTree,mean_DecisionTree
Accuracy,0.9409,0.9409,0.9409,0.9599,0.9599,0.9599,0.8833,0.8833,0.8833,0.9578,0.9578,0.9578,0.9532,0.9532,0.9532
Precision,0.974837,0.613419,0.794128,0.966281,0.85413,0.910206,0.983151,0.392667,0.687909,0.968296,0.802215,0.885256,0.976211,0.700957,0.838584
Recall,0.960526,0.716418,0.838472,0.990974,0.604478,0.797726,0.888321,0.825871,0.857096,0.986407,0.630597,0.808502,0.972814,0.728856,0.850835
F1_score,0.967629,0.660929,0.814279,0.978472,0.707939,0.843205,0.933333,0.532265,0.732799,0.977268,0.706128,0.841698,0.97451,0.714634,0.844572
