In [1]:
import pandas as pd
import numpy as np

# Import the Dataset
This notebook uses the Early Stage Diabetes Risk Prediction dataset, which can be found [here](https://archive.ics.uci.edu/ml/datasets/Early+stage+diabetes+risk+prediction+dataset.).

In [2]:
# Read the dataset CSV (the path is relative to the working directory) and
# preview its first five rows.
data = pd.read_csv(filepath_or_buffer='diabetes_data_upload.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


# Check for NULL values

In [3]:
# Count the missing entries in each column.
data.isna().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

There are no null values, so no values need to be filled in.

# Split the data columns into an X and Y variable

In [4]:
# Target: the 'class' column. Features: every remaining column.
y = data['class']
X = data.drop(columns='class')

# Data Preprocessing
Apply some preprocessing to the data so it fits the model better. Apart from `Age`, every column contains string values, so they need to be converted into a numerical format.

In [5]:
from sklearn import preprocessing

# Every feature except the numeric 'Age' column holds string categories.
categorical_columns = [column for column in X.columns if column != 'Age']

# OrdinalEncoder is scikit-learn's encoder for input features (LabelEncoder is
# documented for target values only). Each column's categories are sorted, so
# the integer codes are the same ones a per-column LabelEncoder produces.
featureEncoder = preprocessing.OrdinalEncoder(dtype=int)
X[categorical_columns] = pd.DataFrame(
    featureEncoder.fit_transform(X[categorical_columns]),
    index=X.index,
    columns=categorical_columns,
)

# Encode the target from the untouched data['class'] column rather than from
# `y` itself, so re-running this cell does not fail on already-encoded labels.
# Classes are sorted when fitting: 'Negative' -> 0, 'Positive' -> 1.
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit(['Positive', 'Negative'])
y = labelEncoder.transform(data['class'])

# Split the data
The data will be split into a training dataset and a testing dataset so that the model's accuracy can be measured on data it has not seen during training.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [7]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with the lbfgs solver and an iteration cap of 3000,
# then fit it on the training split (the fitted estimator is the cell output).
lg_model = LogisticRegression(max_iter=3000, solver='lbfgs')
lg_model.fit(X=X_train, y=Y_train)

LogisticRegression(max_iter=3000)

# Make predictions with the Logistic Regression Model

In [8]:
# Keep the held-out labels under a descriptive name, then predict a class for
# every row of the test features.
actual_values = Y_test
y_predicted_values = lg_model.predict(X=X_test)

In [9]:
# Put predictions and ground truth side by side, one row per test sample.
test_dict = dict(Predicted=y_predicted_values, Actual=actual_values)
test_result = pd.DataFrame(test_dict)

In [10]:
# Show the first five prediction/actual pairs.
test_result.head(n=5)

Unnamed: 0,Predicted,Actual
0,1,1
1,1,1
2,1,1
3,0,0
4,0,0


The first five predictions all match the actual values; however, further checks are needed to measure the model's real accuracy.

# Check the accuracy of the Model

In [11]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted class equals the actual class.
accuracy_score(y_true=actual_values, y_pred=y_predicted_values)

0.9519230769230769

As can be seen above, the accuracy of the model is very high, at roughly 95%.

## Classification Report

In [12]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1-score for the test-set predictions.
classif_report = classification_report(
    y_true=actual_values,
    y_pred=y_predicted_values,
)
print(classif_report)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        40
           1       0.95      0.97      0.96        64

    accuracy                           0.95       104
   macro avg       0.95      0.95      0.95       104
weighted avg       0.95      0.95      0.95       104



The classification report shows that the model scores highly on both classes, with precision, recall, and F1-score all between 0.93 and 0.97.