# Data Preprocessing


In [72]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score , confusion_matrix , precision_score , recall_score

In [73]:
df = pd.read_csv('./diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [74]:
df.shape
#768 data
#9 coloumn

(768, 9)

In [75]:
# Labeled column
# 0: Healthy
# 1: Diabetic
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [76]:
#x -> features => 8 columns
#y -> Outcome
x = df.drop('Outcome',axis=1)
y = df['Outcome']
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [77]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [78]:
x = np.array(x)
y = np.array(y)

In [79]:
x

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [80]:
y

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

# Standardization

In [81]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(x)
# We scale our data between 0 and 1 to make it easier to work with and compare
X

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

# Train/Test

In [82]:
from sklearn.model_selection import train_test_split

# test_size=0.2 means 20% of the data is used for testing and 80% for training
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2)

In [83]:
X_train.shape , y_train.shape

((614, 8), (614,))

In [84]:
X_test.shape , y_test.shape

((154, 8), (154,))

# Now our data is ready to use!
# Next sections we are going to learn several models ;) 

# First Algorithm : Naive Bayes

In [85]:
from sklearn.naive_bayes import GaussianNB

model_naive_bayes = GaussianNB()
model_naive_bayes.fit(X_train , y_train)

In [86]:
y_pred_train_naive_bayes = model_naive_bayes.predict(X_train)
y_pred_test_naive_bayes = model_naive_bayes.predict(X_test)

In [87]:
acc_train_naive_bayes = accuracy_score(y_true= y_train , y_pred= y_pred_train_naive_bayes)
acc_test_naive_bayes = accuracy_score(y_true= y_test , y_pred= y_pred_test_naive_bayes)

acc_train_naive_bayes , acc_test_naive_bayes

(0.760586319218241, 0.7662337662337663)

In [88]:
confusion_matrix(y_test , y_pred_test_naive_bayes)

array([[85, 16],
       [20, 33]], dtype=int64)

In [89]:
precision_naive_bayes = precision_score(y_test , y_pred_test_naive_bayes)
recall_naive_bayes = recall_score(y_test , y_pred_test_naive_bayes)

precision_naive_bayes , recall_naive_bayes

(0.673469387755102, 0.6226415094339622)

# Second Algorithm: SVM (Support Vector Machine)

In [90]:
from sklearn import svm

# Kernel: 'rbf' can be used for more complex decision boundaries compared to linear
# Sigmoid kernel: mimics neural nets; good for certain non-linear binary classifications
model_svm = svm.SVC(kernel='linear')
model_svm.fit(X_train,y_train)

In [91]:
y_pred_train_svm = model_svm.predict(X_train)
y_pred_test_svm = model_svm.predict(X_test)

In [92]:
accuracy_svm_train = accuracy_score(y_true= y_train , y_pred= y_pred_train_svm)
accuracy_svm_test = accuracy_score(y_true= y_test , y_pred= y_pred_test_svm)

accuracy_svm_train , accuracy_svm_test

(0.7833876221498371, 0.7727272727272727)

In [93]:
precision_svm = precision_score(y_test , y_pred_test_svm)
recall_svm = recall_score(y_test , y_pred_test_svm)

precision_svm , recall_svm

(0.7142857142857143, 0.5660377358490566)

# Third Algorithm: KNN (K-Nearest Neighbors)

In [94]:
from sklearn.neighbors import KNeighborsClassifier

## You can play with n_neighbors to see how it impacts accuracy, precision, and recall
model_knn = KNeighborsClassifier(n_neighbors=4)
model_knn.fit(X_train,y_train)

In [95]:
y_pred_train_knn = model_knn.predict(X_train)
y_pred_test_knn = model_knn.predict(X_test)

In [96]:
confusion_matrix(y_test , y_pred_test_knn)

array([[84, 17],
       [25, 28]], dtype=int64)

In [97]:
acc_train_knn = accuracy_score(y_true= y_train , y_pred= y_pred_train_knn)
acc_test_knn = accuracy_score(y_true= y_test , y_pred= y_pred_test_knn)

acc_train_knn , acc_test_knn

(0.8208469055374593, 0.7272727272727273)

In [98]:
precision_knn = precision_score(y_test , y_pred_test_knn)
recall_knn = recall_score(y_test , y_pred_test_knn)

precision_knn , recall_knn

(0.6222222222222222, 0.5283018867924528)

# Fourth Algorithm: Decision Tree

In [99]:
from sklearn.tree import DecisionTreeClassifier

# We doubled the depth of the decision tree (max_depth=16) and reduced the number of trees (n_estimators) in the random forest to one-fifth (n_estimators=20).
# In this case, the decision tree performed better; otherwise, the random forest usually gave better results.
#
# Increasing the depth of a decision tree can lead to overfitting, but it often improves training accuracy.
# On the other hand, reducing the number of estimators in a random forest weakens the ensemble effect,
# which can lower its overall performance.
# So, in cases where the random forest is under-optimized, a well-tuned decision tree might outperform it.


model_dt = DecisionTreeClassifier(max_depth= 8 , min_samples_split= 4 , min_samples_leaf= 2 )
model_dt.fit(X_train , y_train)

# Fifth Algorithm: Random Forest

In [100]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators: number of trees
# A deeper tree doesn't necessarily mean better results!

model_rf = RandomForestClassifier(n_estimators= 100 , max_depth= 8)
model_rf.fit(X_train , y_train)

In [101]:
#Train
y_pred_train_decision_tree = model_dt.predict(X_train)
y_pred_train_Random_forest = model_rf.predict(X_train)

acc_train_decision_tree = accuracy_score(y_train , y_pred_train_decision_tree)
acc_train_random_forest = accuracy_score(y_train , y_pred_train_Random_forest)

acc_train_decision_tree , acc_train_random_forest

(0.9071661237785016, 0.9755700325732899)

In [102]:
#Test
y_pred_test_decision_tree = model_dt.predict(X_test)
y_pred_test_Random_forest = model_rf.predict(X_test)

acc_test_decision_tree = accuracy_score(y_test , y_pred_test_decision_tree)
acc_test_random_forest = accuracy_score(y_test , y_pred_test_Random_forest)

acc_test_decision_tree , acc_test_random_forest

(0.6688311688311688, 0.7532467532467533)

In [103]:
precision_decision_tree = precision_score(y_test , y_pred_test_decision_tree)
precision_random_forest = precision_score(y_test , y_pred_test_Random_forest)

precision_decision_tree , precision_random_forest

(0.5208333333333334, 0.6470588235294118)

In [104]:
recall_decision_tree = recall_score(y_test , y_pred_test_decision_tree)
recall_random_forest = recall_score(y_test , y_pred_test_Random_forest)

recall_decision_tree , recall_random_forest

(0.4716981132075472, 0.6226415094339622)