In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import seaborn as sn
from sklearn import preprocessing
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle
from sklearn.svm import SVC

%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

import warnings
# Ignore every warning from here on. NOTE(review): this also hides scikit-learn
# convergence warnings from the models fitted below — confirm that is intended.
warnings.filterwarnings("ignore")


In [None]:
def logreg_classifier_012_train_evaluate(train_data_file, test_data_file,
                                         target_col='Diabetes_012', model_path=None):
    """Fit an L1-penalised logistic regression on one CSV and predict on another.

    Parameters
    ----------
    train_data_file, test_data_file : str or path-like
        CSV files read with ``pd.read_csv``. Both must contain ``target_col``;
        every other column is used as a feature.
    target_col : str, default 'Diabetes_012'
        Name of the label column.
    model_path : str or path-like, optional
        When given, the fitted model is pickled to this path. The default
        ``None`` writes nothing.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the label column of the test CSV and whatever
        the fitted model's ``predict`` returns for the test features.

    Raises
    ------
    KeyError
        If ``target_col`` is missing from either CSV.
    ValueError
        If the two CSVs do not have the same set of feature columns.
    """
    train_data = pd.read_csv(train_data_file)
    test_data = pd.read_csv(test_data_file)

    # Split both frames into features and label.
    X_train = train_data.drop(columns=[target_col])
    y_train = train_data[target_col]

    X_test = test_data.drop(columns=[target_col])
    y_test = test_data[target_col]

    # Validate the schema before fitting so a mismatch is reported up front
    # with a clear message instead of surfacing at predict time.
    mismatched = set(X_train.columns) ^ set(X_test.columns)
    if mismatched:
        raise ValueError(
            "train and test CSVs must have the same feature columns; "
            "differing columns: {}".format(sorted(mismatched, key=str))
        )
    # Feed the test features in the same column order the model was fitted on.
    X_test = X_test[X_train.columns]

    log_reg = LogisticRegression(random_state=42,
                                 C=0.1, penalty='l1', solver='saga')
    log_reg.fit(X_train, y_train)

    # Persisting the model is opt-in.
    if model_path is not None:
        with open(model_path, 'wb') as file:
            pickle.dump(log_reg, file)

    y_pred = log_reg.predict(X_test)

    return (y_test, y_pred)

def logreg_classifier_binary_split_train_evaluate(train_data_file, test_data_file,
                                                  target_col='Diabetes_binary', model_path=None):
    """Fit an L1-penalised logistic regression (liblinear) and predict on a test CSV.

    Parameters
    ----------
    train_data_file, test_data_file : str or path-like
        CSV files read with ``pd.read_csv``. Both must contain ``target_col``;
        every other column is used as a feature.
    target_col : str, default 'Diabetes_binary'
        Name of the label column.
    model_path : str or path-like, optional
        When given, the fitted model is pickled to this path. The default
        ``None`` writes nothing.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the label column of the test CSV and whatever
        the fitted model's ``predict`` returns for the test features.

    Raises
    ------
    KeyError
        If ``target_col`` is missing from either CSV.
    ValueError
        If the two CSVs do not have the same set of feature columns.
    """
    train_data = pd.read_csv(train_data_file)
    test_data = pd.read_csv(test_data_file)

    # Split both frames into features and label.
    X_train = train_data.drop(columns=[target_col])
    y_train = train_data[target_col]

    X_test = test_data.drop(columns=[target_col])
    y_test = test_data[target_col]

    # Validate the schema before fitting so a mismatch is reported up front
    # with a clear message instead of surfacing at predict time.
    mismatched = set(X_train.columns) ^ set(X_test.columns)
    if mismatched:
        raise ValueError(
            "train and test CSVs must have the same feature columns; "
            "differing columns: {}".format(sorted(mismatched, key=str))
        )
    # Feed the test features in the same column order the model was fitted on.
    X_test = X_test[X_train.columns]

    log_reg = LogisticRegression(random_state=42,
                                 C=1, penalty='l1', solver='liblinear')
    log_reg.fit(X_train, y_train)

    # Persisting the model is opt-in.
    if model_path is not None:
        with open(model_path, 'wb') as file:
            pickle.dump(log_reg, file)

    y_pred = log_reg.predict(X_test)

    return (y_test, y_pred)

In [None]:
# Evaluate the three-class logistic regression on the held-out CSV.
train_data_file_012 = "Dataset/train_012.csv"
test_data_file_012 = "Dataset/test_012.csv"

y_test_logreg, y_pred_logreg = logreg_classifier_012_train_evaluate(train_data_file_012, test_data_file_012)

print(classification_report(y_test_logreg, y_pred_logreg))

# confusion_matrix puts true classes on rows and predicted classes on columns.
# NOTE(review): assumes the sorted labels 0/1/2 correspond to these names — confirm against the dataset.
class_names = ['Non-Diabetic', 'Pre-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test_logreg, y_pred_logreg)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

# Draw on an explicit Axes and title the figure so it can be told apart from
# the other confusion matrices in this notebook.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm_df, cmap="YlGnBu", fmt='g', annot=True, ax=ax)
ax.set_title('Logistic regression: Diabetes_012', fontsize=13)
ax.set_xlabel('Predicted label', fontsize=13)
ax.set_ylabel('True label', fontsize=13)
plt.show()





In [None]:
# Evaluate the binary logistic regression on the held-out CSV.
train_data_file_binary_split = "Dataset/train_binary_split.csv"
test_data_file_binary_split = "Dataset/test_binary_split.csv"

y_test_logreg, y_pred_logreg = logreg_classifier_binary_split_train_evaluate(train_data_file_binary_split, test_data_file_binary_split)

print(classification_report(y_test_logreg, y_pred_logreg))

# confusion_matrix puts true classes on rows and predicted classes on columns.
# NOTE(review): assumes the sorted labels 0/1 correspond to these names — confirm against the dataset.
class_names = ['Non-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test_logreg, y_pred_logreg)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

# Draw on an explicit Axes and title the figure so it can be told apart from
# the other confusion matrices in this notebook.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm_df, cmap="YlGnBu", fmt='g', annot=True, ax=ax)
ax.set_title('Logistic regression: Diabetes_binary', fontsize=13)
ax.set_xlabel('Predicted label', fontsize=13)
ax.set_ylabel('True label', fontsize=13)
plt.show()

In [None]:
def svm_classifier_012_train_evaluate(train_data_file, test_data_file,
                                      target_col='Diabetes_012', model_path='svm-012.pkl'):
    """Fit an RBF-kernel SVC on one CSV, optionally pickle it, and predict on another.

    Parameters
    ----------
    train_data_file, test_data_file : str or path-like
        CSV files read with ``pd.read_csv``. Both must contain ``target_col``;
        every other column is used as a feature.
    target_col : str, default 'Diabetes_012'
        Name of the label column.
    model_path : str or path-like or None, default 'svm-012.pkl'
        Where the fitted model is pickled. Pass ``None`` to skip writing a file.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the label column of the test CSV and whatever
        the fitted model's ``predict`` returns for the test features.

    Raises
    ------
    KeyError
        If ``target_col`` is missing from either CSV.
    ValueError
        If the two CSVs do not have the same set of feature columns.
    """
    train_data = pd.read_csv(train_data_file)
    test_data = pd.read_csv(test_data_file)

    # Split both frames into features and label.
    X_train = train_data.drop(columns=[target_col])
    y_train = train_data[target_col]

    X_test = test_data.drop(columns=[target_col])
    y_test = test_data[target_col]

    # Validate the schema before fitting so a mismatch is reported up front
    # rather than after the model has already been trained.
    mismatched = set(X_train.columns) ^ set(X_test.columns)
    if mismatched:
        raise ValueError(
            "train and test CSVs must have the same feature columns; "
            "differing columns: {}".format(sorted(mismatched, key=str))
        )
    # Feed the test features in the same column order the model was fitted on.
    X_test = X_test[X_train.columns]

    svm = SVC(random_state=42, C=1, kernel='rbf', gamma='scale')
    svm.fit(X_train, y_train)

    # Persist the fitted model unless the caller opted out.
    if model_path is not None:
        with open(model_path, 'wb') as file:
            pickle.dump(svm, file)

    y_pred = svm.predict(X_test)

    return (y_test, y_pred)

def svm_classifier_binary_split_train_evaluate(train_data_file, test_data_file,
                                               target_col='Diabetes_binary',
                                               model_path='svm-binary_split.pkl'):
    """Fit an RBF-kernel SVC on one CSV, optionally pickle it, and predict on another.

    Parameters
    ----------
    train_data_file, test_data_file : str or path-like
        CSV files read with ``pd.read_csv``. Both must contain ``target_col``;
        every other column is used as a feature.
    target_col : str, default 'Diabetes_binary'
        Name of the label column.
    model_path : str or path-like or None, default 'svm-binary_split.pkl'
        Where the fitted model is pickled. Pass ``None`` to skip writing a file.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the label column of the test CSV and whatever
        the fitted model's ``predict`` returns for the test features.

    Raises
    ------
    KeyError
        If ``target_col`` is missing from either CSV.
    ValueError
        If the two CSVs do not have the same set of feature columns.
    """
    train_data = pd.read_csv(train_data_file)
    test_data = pd.read_csv(test_data_file)

    # Split both frames into features and label.
    X_train = train_data.drop(columns=[target_col])
    y_train = train_data[target_col]

    X_test = test_data.drop(columns=[target_col])
    y_test = test_data[target_col]

    # Validate the schema before fitting so a mismatch is reported up front
    # rather than after the model has already been trained.
    mismatched = set(X_train.columns) ^ set(X_test.columns)
    if mismatched:
        raise ValueError(
            "train and test CSVs must have the same feature columns; "
            "differing columns: {}".format(sorted(mismatched, key=str))
        )
    # Feed the test features in the same column order the model was fitted on.
    X_test = X_test[X_train.columns]

    svm = SVC(random_state=42, C=1, kernel='rbf', gamma='scale')
    svm.fit(X_train, y_train)

    # Persist the fitted model unless the caller opted out.
    if model_path is not None:
        with open(model_path, 'wb') as file:
            pickle.dump(svm, file)

    y_pred = svm.predict(X_test)

    return (y_test, y_pred)

In [None]:
# Evaluate the three-class SVM on the held-out CSV.
train_data_file_012 = "Dataset/train_012.csv"
test_data_file_012 = "Dataset/test_012.csv"

y_test_svm, y_pred_svm = svm_classifier_012_train_evaluate(train_data_file_012, test_data_file_012)

print(classification_report(y_test_svm, y_pred_svm))

# confusion_matrix puts true classes on rows and predicted classes on columns.
# NOTE(review): assumes the sorted labels 0/1/2 correspond to these names — confirm against the dataset.
class_names = ['Non-Diabetic', 'Pre-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test_svm, y_pred_svm)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

# Draw on an explicit Axes and title the figure so it can be told apart from
# the other confusion matrices in this notebook.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm_df, cmap="YlGnBu", fmt='g', annot=True, ax=ax)
ax.set_title('SVM: Diabetes_012', fontsize=13)
ax.set_xlabel('Predicted label', fontsize=13)
ax.set_ylabel('True label', fontsize=13)
plt.show()

In [None]:
# Evaluate the binary SVM on the held-out CSV.
train_data_file_binary_split = "Dataset/train_binary_split.csv"
test_data_file_binary_split = "Dataset/test_binary_split.csv"

y_test_svm, y_pred_svm = svm_classifier_binary_split_train_evaluate(train_data_file_binary_split, test_data_file_binary_split)

print(classification_report(y_test_svm, y_pred_svm))

# confusion_matrix puts true classes on rows and predicted classes on columns.
# NOTE(review): assumes the sorted labels 0/1 correspond to these names — confirm against the dataset.
class_names = ['Non-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test_svm, y_pred_svm)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

# Draw on an explicit Axes and title the figure so it can be told apart from
# the other confusion matrices in this notebook.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm_df, cmap="YlGnBu", fmt='g', annot=True, ax=ax)
ax.set_title('SVM: Diabetes_binary', fontsize=13)
ax.set_xlabel('Predicted label', fontsize=13)
ax.set_ylabel('True label', fontsize=13)
plt.show()