<a href="https://colab.research.google.com/github/cicily19/ClassificationMachineLearning/blob/main/ClassificationMachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CLASSIFICATION: SUPERVISED MACHINE LEARNING
#  DISEASE PREDICTION MODEL FOR DIABETIC PATIENTS
# This model will predict whether or not a female patient is developing signs of diabetes
# Constraints: FEMALES ARE AT LEAST 21 YEARS OLD
# lITERATURE REVIEW: It is proven that diadnosing diabetes patients at an early age reduces the risks associated with diabetes


# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# read the data
diabetes_data = pd.read_csv('https://msi.martial.co.ke/datasets/pima.csv')
diabetes_data.head(10)

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,Diabetic
1,1,85,66,29,0,26.6,0.351,31,Not Diabetic
2,8,183,64,0,0,23.3,0.672,32,Diabetic
3,1,89,66,23,94,28.1,0.167,21,Not Diabetic
4,0,137,40,35,168,43.1,2.288,33,Diabetic
5,5,116,74,0,0,25.6,0.201,30,Not Diabetic
6,3,78,50,32,88,31.0,0.248,26,Diabetic
7,10,115,0,0,0,35.3,0.134,29,Not Diabetic
8,2,197,70,45,543,30.5,0.158,53,Diabetic
9,8,125,96,0,0,0.0,0.232,54,Diabetic


In [None]:
# Statistical Analysis
diabetes_data.describe()

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [None]:
# MACHINE LEARNNG
# STEP 1: DIVIDE THE DATASET INTO FEATURES(X) AND LABELS(Y)
array = diabetes_data.values
array

array([[6, 148, 72, ..., 0.627, 50, 'Diabetic'],
       [1, 85, 66, ..., 0.351, 31, 'Not Diabetic'],
       [8, 183, 64, ..., 0.672, 32, 'Diabetic'],
       ...,
       [5, 121, 72, ..., 0.245, 30, 'Not Diabetic'],
       [1, 126, 60, ..., 0.349, 47, 'Diabetic'],
       [1, 93, 70, ..., 0.315, 23, 'Not Diabetic']], dtype=object)

In [None]:
X = array[:, 0:8]
print(X.shape)

Y = array[:, 8]
print(Y.shape)

(768, 8)
(768,)


In [None]:
# b) Split the Dataset(X and Y) into Training and Testing Sets
# The Training Dataset will be used for Training the Model and Normally Takes 70% of records
# The Testing Dataset will be later used for Evaluation, and cover at least 30% of the records

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)


# training sets
print(X_train.shape, Y_train.shape)

# testing sets
print(X_test.shape, Y_test.shape)

(537, 8) (537,)
(231, 8) (231,)


In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Support Vector Machine
from sklearn.svm import SVC

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [None]:
# d) CROSS VALIDATION
# Hyperparameter Tuning
# The process that is used to determine the best learning algorithm for a particular problem
# This process validates and picks the right algorithm that will produce a higher accuracy
import warnings
warnings.filterwarnings('ignore')

models = []
models.append(('D Trees', DecisionTreeClassifier()))
models.append(('KNR', KNeighborsClassifier()))
models.append(('LR', LogisticRegression()))
models.append(('NB', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('Gradient Boosting', GradientBoostingClassifier()))
models.append(('SVM', SVC()))

# Import Cross Validation and KFOLD
from sklearn.model_selection import cross_val_score, KFold
# Create a for loop so that each model is tested in turn
for name, model in models:
          # Here we do a 10 split K-FOLD
          kfold = KFold(n_splits = 10, random_state=42, shuffle=True)

          # We get the results for each Fold, Train and Test using the Folds
          cv_results = cross_val_score(model, X_train, Y_train, cv = kfold,
          scoring='accuracy')

          # Get the average of all Folds for current model in the loop
          print(name, ' Results:= ', cv_results.mean())


D Trees  Results:=  0.732040531097135
KNR  Results:=  0.7208944793850453
LR  Results:=  0.7710342417889586
NB  Results:=  0.7507686932215234
Random Forest  Results:=  0.7671907756813418
Gradient Boosting  Results:=  0.7691125087351502
SVM  Results:=  0.759958071278826


In [None]:
# e) Model Training
# The is the Learning Process where the Model uses the best to learn using the Training set
import warnings
warnings.filterwarnings('ignore')

model = LogisticRegression()
model.fit(X_train, Y_train)

In [None]:
# f) Model Evaluation
model_predictions = model.predict(X_test)
print("The Model Predicted: ", model_predictions)

print("\n\n")

print("The Correct Answer is : ", Y_test)

print("\n\n")

The Model Predicted:  ['Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Diabetic' 'Diabetic'
 'Diabetic' 'Not Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Diabetic'
 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Not Diabetic' 'Diabetic' 'Diabetic' 'Diabetic' 'Diabetic' 'Diabetic'
 'Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Diabetic'
 'Not Diabetic' 'Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Diabetic'
 'Not Diabetic' 'Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Not Diabetic' 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Diabetic'
 'Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic' 'Not Diabetic'
 'Diabetic' 'Not Diabetic' 'Diabetic' 'Not Diabetic' 'Diabetic' 'Diabetic'
 'Not Diabetic' 'No

In [None]:
# Therefore, we will a scientific metric called accuracy_score, which is the ratio between the model_prediction and Y_test result
from sklearn.metrics import accuracy_score
print("The Accuracy of the Model is: ", accuracy_score(Y_test, model_predictions) * 100,  "%")

The Accuracy of the Model is:  74.02597402597402 %


In [None]:
diabetes_data.describe()

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [None]:
# g) Predicting Results
# The model was trained on 8 features(Children	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age)
# We will be able to predicted the condition

new_features = [[2, 100, 72,  79, 5, 56, 2.0, 40]]
predicted_condition = model.predict(new_features)

print(predicted_condition)

['Diabetic']
