In [11]:
# Logistic Regression: A simple yet effective linear model for binary classification tasks like disease prediction.
# Decision Trees: They can handle both numerical and categorical data, making them suitable for this dataset. Decision trees are interpretable and can capture nonlinear relationships between features and the target variable.
# Random Forest: An ensemble method that builds multiple decision trees and combines their predictions. It often provides improved accuracy compared to a single decision tree and is robust to overfitting.
# Gradient Boosting Machines (e.g., XGBoost, LightGBM): These are powerful ensemble methods that build trees sequentially, each one correcting the errors of the previous model. They are highly effective for classification tasks and often perform well in competitions and real-world applications.
# Support Vector Machines (SVM): SVMs can effectively handle high-dimensional data and are capable of capturing complex relationships between features and the target variable.
# Neural Networks: Deep learning models, such as feedforward neural networks or convolutional neural networks (CNNs), can learn intricate patterns from the data. They are particularly useful when dealing with large and complex datasets, but they require more computational resources and data to train effectively.
# Naive Bayes Classifier
# Ensemble Methods

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

In [12]:
training_set = pd.read_csv('./dataset/Blood_samples_dataset_balanced_2(f).csv')
testing_set = pd.read_csv('./dataset/blood_samples_dataset_test.csv')
training_set.head()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein,Disease
0,0.739597,0.650198,0.713631,0.868491,0.687433,0.529895,0.290006,0.631045,0.001328,0.795829,...,0.502665,0.21556,0.512941,0.064187,0.610827,0.939485,0.095512,0.465957,0.76923,Healthy
1,0.121786,0.023058,0.944893,0.905372,0.507711,0.403033,0.164216,0.307553,0.207938,0.505562,...,0.85681,0.652465,0.106961,0.942549,0.344261,0.666368,0.65906,0.816982,0.401166,Diabetes
2,0.452539,0.116135,0.54456,0.40064,0.294538,0.382021,0.625267,0.295122,0.868369,0.026808,...,0.466795,0.387332,0.421763,0.007186,0.506918,0.431704,0.417295,0.799074,0.779208,Thalasse
3,0.136609,0.015605,0.419957,0.191487,0.081168,0.166214,0.073293,0.668719,0.125447,0.501051,...,0.016256,0.040137,0.826721,0.265415,0.594148,0.225756,0.490349,0.637061,0.354094,Anemia
4,0.176737,0.75222,0.971779,0.785286,0.44388,0.439851,0.894991,0.442159,0.257288,0.805987,...,0.429431,0.146294,0.221574,0.01528,0.567115,0.841412,0.15335,0.794008,0.09497,Thalasse


In [13]:
testing_set.head()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein,Disease
0,0.001827,0.033693,0.114755,0.997927,0.562604,0.866499,0.578042,0.914615,0.026864,0.038641,...,0.65323,0.186104,0.430398,0.016678,0.885352,0.652733,0.788235,0.054788,0.031313,Thalasse
1,0.436679,0.972653,0.084998,0.180909,0.675736,0.563889,0.798382,0.670361,0.376092,0.18489,...,0.83354,0.153001,0.458533,0.401845,0.635969,0.574425,0.047025,0.607985,0.594123,Diabetes
2,0.545697,0.324815,0.584467,0.475748,0.558596,0.661007,0.934056,0.381782,0.500342,0.531829,...,0.678901,0.220479,0.817151,0.690981,0.101633,0.85574,0.551124,0.413294,0.070909,Heart Di
3,0.172994,0.050351,0.736,0.782022,0.069435,0.085219,0.032907,0.460619,0.785448,0.491495,...,0.3815,0.459396,0.420154,0.798537,0.399236,0.3246,0.499504,0.436662,0.242766,Diabetes
4,0.758534,0.739968,0.597868,0.772683,0.87572,0.860265,0.486189,0.486686,0.621048,0.191756,...,0.993381,0.272338,0.663579,0.265227,0.918847,0.80491,0.571119,0.188368,0.750848,Heart Di


In [14]:
from matplotlib import pyplot as plt
training_set.describe()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,Triglycerides,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein
count,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,...,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0,2351.0
mean,0.362828,0.393648,0.58619,0.504027,0.511086,0.50659,0.507152,0.4922,0.484459,0.562273,...,0.374373,0.439112,0.421777,0.546079,0.434972,0.452138,0.582255,0.425075,0.454597,0.430308
std,0.251889,0.239449,0.271498,0.303347,0.27727,0.266565,0.285537,0.275735,0.315618,0.273281,...,0.256981,0.263779,0.252124,0.269511,0.267388,0.242075,0.250915,0.229298,0.251189,0.243034
min,0.010994,0.012139,0.003021,0.012594,0.010139,0.044565,0.011772,0.046942,0.000554,0.006947,...,0.005217,0.016256,0.033037,0.039505,0.007186,0.013013,0.11455,0.021239,0.00749,0.004867
25%,0.129198,0.195818,0.346092,0.200865,0.259467,0.263589,0.288132,0.287532,0.207938,0.355774,...,0.184604,0.18875,0.217757,0.307132,0.211078,0.239659,0.339125,0.213026,0.288961,0.196192
50%,0.351722,0.397083,0.609836,0.533962,0.527381,0.467431,0.493428,0.453052,0.420723,0.603635,...,0.317857,0.466375,0.413071,0.512941,0.373235,0.486317,0.61086,0.417295,0.426863,0.481601
75%,0.582278,0.582178,0.791215,0.754841,0.743164,0.74367,0.753657,0.722293,0.77816,0.741381,...,0.57233,0.652514,0.604753,0.779378,0.710319,0.616181,0.800666,0.606719,0.682164,0.631426
max,0.96846,0.905026,0.983306,0.999393,0.990786,1.0,0.97752,0.995263,0.963235,0.975586,...,0.973679,0.950218,0.983826,0.989411,0.942549,0.99446,0.996873,0.925924,0.972803,0.797906


In [15]:
testing_set.describe()

Unnamed: 0,Glucose,Cholesterol,Hemoglobin,Platelets,White Blood Cells,Red Blood Cells,Hematocrit,Mean Corpuscular Volume,Mean Corpuscular Hemoglobin,Mean Corpuscular Hemoglobin Concentration,...,Triglycerides,HbA1c,LDL Cholesterol,HDL Cholesterol,ALT,AST,Heart Rate,Creatinine,Troponin,C-reactive Protein
count,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0,...,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0,486.0
mean,0.490044,0.506797,0.485502,0.528136,0.509783,0.504347,0.501042,0.516185,0.510145,0.501105,...,0.498538,0.489365,0.501706,0.487933,0.500615,0.499675,0.493794,0.515541,0.510077,0.517365
std,0.284196,0.282871,0.298818,0.29261,0.290887,0.302865,0.294501,0.279954,0.285213,0.290614,...,0.288848,0.290849,0.286729,0.292726,0.285374,0.301451,0.2924,0.287223,0.294011,0.291645
min,0.001827,0.003088,0.000719,6e-06,-0.000206,0.000552,0.004556,0.000309,-0.000614,0.000719,...,0.001885,-0.000991,0.001036,-0.000546,-0.000312,0.002047,0.002145,0.008831,0.005714,-0.000991
25%,0.236664,0.268021,0.201994,0.276155,0.264944,0.218573,0.246255,0.287755,0.259967,0.255839,...,0.238772,0.239302,0.262516,0.229095,0.251076,0.239369,0.228581,0.262924,0.267101,0.263192
50%,0.496471,0.502397,0.477706,0.538642,0.511102,0.518103,0.496275,0.533319,0.498332,0.493929,...,0.503164,0.481549,0.511373,0.481533,0.515321,0.497846,0.507581,0.526299,0.508479,0.546542
75%,0.727144,0.754638,0.750028,0.789486,0.767896,0.768466,0.761107,0.746765,0.761956,0.734817,...,0.749527,0.725472,0.756136,0.741867,0.739248,0.775458,0.751124,0.777839,0.781128,0.77694
max,0.991742,0.999606,0.997876,0.999507,0.999646,0.997267,1.000857,0.996362,0.999234,0.999666,...,0.999826,0.999265,0.999567,1.000744,1.000326,1.000402,0.999857,0.999348,1.000125,0.999488


In [16]:
X_train = training_set.drop('Disease', axis=1)
y_train = training_set['Disease']

X_test = testing_set.drop('Disease', axis=1)
y_test = testing_set['Disease']

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Logistic Regression
logisticRegression = LogisticRegression(max_iter=1000 )

# Decision Tree
decisionTree = tree.DecisionTreeClassifier()

# Random Forest
randomForest = RandomForestClassifier()

# Gradient Boosting
gradientBoosting = GradientBoostingClassifier()

# Naive Bayes
naiveBayes = GaussianNB()

# KNN
knn = KNeighborsClassifier(n_neighbors=3)

# SVM
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))

In [18]:
# Logistic Regression
logisticRegression.fit(X_train, y_train)
logisticRegression_predictions = logisticRegression.predict(X_test)
logisticRegression_accuracy = accuracy_score(y_test, logisticRegression_predictions)
logisticRegression_precision = precision_score(y_test, logisticRegression_predictions, average='weighted', zero_division=1)
logisticRegression_recall = recall_score(y_test, logisticRegression_predictions, average='weighted')
logisticRegression_f1 = f1_score(y_test, logisticRegression_predictions, average='weighted')
print('Logistic Regression Accuracy: ', logisticRegression_accuracy * 100, '%')
print('Logistic Regression Precision: ', logisticRegression_precision * 100, '%')
print('Logistic Regression Recall: ', logisticRegression_recall * 100, '%')
print('Logistic Regression F1 Score: ', logisticRegression_f1 * 100, '%')

print("\n")
# Decision Tree
decisionTree.fit(X_train, y_train)
decisionTree_predictions = decisionTree.predict(X_test)
decisionTree_accuracy = accuracy_score(y_test, decisionTree_predictions)
decisionTree_precision = precision_score(y_test, decisionTree_predictions, average='weighted', zero_division=1)
decisionTree_recall = recall_score(y_test, decisionTree_predictions, average='weighted')
decisionTree_f1 = f1_score(y_test, decisionTree_predictions, average='weighted')
print('Decision Tree Accuracy: ', decisionTree_accuracy * 100, '%')
print('Decision Tree Precision: ', decisionTree_precision * 100, '%')
print('Decision Tree Recall: ', decisionTree_recall * 100, '%')
print('Decision Tree F1 Score: ', decisionTree_f1 * 100, '%')

print("\n")
# Random Forest
randomForest.fit(X_train, y_train)
randomForest_predictions = randomForest.predict(X_test)
randomForest_accuracy = accuracy_score(y_test, randomForest_predictions)
randomForest_precision = precision_score(y_test, randomForest_predictions, average='weighted', zero_division=1)
randomForest_recall = recall_score(y_test, randomForest_predictions, average='weighted')
randomForest_f1 = f1_score(y_test, randomForest_predictions, average='weighted')
print('Random Forest Accuracy: ', randomForest_accuracy * 100, '%')
print('Random Forest Precision: ', randomForest_precision * 100, '%')
print('Random Forest Recall: ', randomForest_recall * 100, '%')
print('Random Forest F1 Score: ', randomForest_f1 * 100, '%')

print("\n")
# Gradient Boosting
gradientBoosting.fit(X_train, y_train)
gradientBoosting_predictions = gradientBoosting.predict(X_test)
gradientBoosting_accuracy = accuracy_score(y_test, gradientBoosting_predictions)
gradientBoosting_precision = precision_score(y_test, gradientBoosting_predictions, average='weighted', zero_division=1)
gradientBoosting_recall = recall_score(y_test, gradientBoosting_predictions, average='weighted')
gradientBoosting_f1 = f1_score(y_test, gradientBoosting_predictions, average='weighted')
print('Gradient Boosting Accuracy: ', gradientBoosting_accuracy * 100, '%')
print('Gradient Boosting Precision: ', gradientBoosting_precision * 100, '%')
print('Gradient Boosting Recall: ', gradientBoosting_recall * 100, '%')
print('Gradient Boosting F1 Score: ', gradientBoosting_f1 * 100, '%')

print("\n")
# Naive Bayes
naiveBayes.fit(X_train, y_train)
naiveBayes_predictions = naiveBayes.predict(X_test)
naiveBayes_accuracy = accuracy_score(y_test, naiveBayes_predictions)
naiveBayes_precision = precision_score(y_test, naiveBayes_predictions, average='weighted', zero_division=1)
naiveBayes_recall = recall_score(y_test, naiveBayes_predictions, average='weighted')
naiveBayes_f1 = f1_score(y_test, naiveBayes_predictions, average='weighted')
print('Naive Bayes Accuracy: ', naiveBayes_accuracy * 100, '%')
print('Naive Bayes Precision: ', naiveBayes_precision * 100, '%')
print('Naive Bayes Recall: ', naiveBayes_recall * 100, '%')
print('Naive Bayes F1 Score: ', naiveBayes_f1 * 100, '%')

print("\n")
# KNN
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions, average='weighted', zero_division=1)
knn_recall = recall_score(y_test, knn_predictions, average='weighted')
knn_f1 = f1_score(y_test, knn_predictions, average='weighted')
print('KNN Accuracy: ', knn_accuracy * 100, '%')
print('KNN Precision: ', knn_precision * 100, '%')
print('KNN Recall: ', knn_recall * 100, '%')
print('KNN F1 Score: ', knn_f1 * 100, '%')

print("\n")
# SVM
svm.fit(X_train, y_train)
svm_predictions = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted', zero_division=1)
svm_recall = recall_score(y_test, svm_predictions, average='weighted')
svm_f1 = f1_score(y_test, svm_predictions, average='weighted')
print('SVM Accuracy: ', svm_accuracy * 100, '%')
print('SVM Precision: ', svm_precision * 100, '%')
print('SVM Recall: ', svm_recall * 100, '%')
print('SVM F1 Score: ', svm_f1 * 100, '%')


Logistic Regression Accuracy:  34.36213991769547 %
Logistic Regression Precision:  56.322504600174675 %
Logistic Regression Recall:  34.36213991769547 %
Logistic Regression F1 Score:  38.631067798823395 %


Decision Tree Accuracy:  41.1522633744856 %
Decision Tree Precision:  60.41431506040675 %
Decision Tree Recall:  41.1522633744856 %
Decision Tree F1 Score:  42.96835782825475 %

Random Forest Accuracy:  46.913580246913575 %
Random Forest Precision:  61.68474142062485 %
Random Forest Recall:  46.913580246913575 %
Random Forest F1 Score:  47.98721930662494 %

Gradient Boosting Accuracy:  45.06172839506173 %
Gradient Boosting Precision:  58.94951941288075 %
Gradient Boosting Recall:  45.06172839506173 %
Gradient Boosting F1 Score:  46.59251265845462 %


Naive Bayes Accuracy:  53.70370370370371 %
Naive Bayes Precision:  63.52330374712912 %
Naive Bayes Recall:  53.70370370370371 %
Naive Bayes F1 Score:  52.33447447966617 %


KNN Accuracy:  18.51851851851852 %
KNN Precision:  55.787023499