In [None]:
# Some magic functions for the notebook to work efficiently
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Downloading the dataset
# Actual source of the dataset: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database?resource=download
!pip install --upgrade gdown -qq
!gdown 1_eVroNLDwiGaOsw9N59LPTHEUUp3_o9K

Downloading...
From: https://drive.google.com/uc?id=1_eVroNLDwiGaOsw9N59LPTHEUUp3_o9K
To: /content/diabetes.csv
  0% 0.00/23.9k [00:00<?, ?B/s]100% 23.9k/23.9k [00:00<00:00, 41.2MB/s]


In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [None]:
# Read the downloaded CSV into a DataFrame and preview its first five rows
csv_path = 'diabetes.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Shorten the column names using an explicit old -> new mapping.
# Renaming by name (instead of assigning a positional list to df.columns)
# cannot silently attach a label to the wrong column if the CSV column order
# ever changes, and it leaves the cell safe to re-run: on a second run none of
# the old names exist any more, so rename() is a no-op.
COLUMN_RENAMES = {
    'Pregnancies': 'pregnant',
    'Glucose': 'glucose',
    'BloodPressure': 'bp',
    'SkinThickness': 'skin',
    'Insulin': 'insulin',
    'BMI': 'bmi',
    'DiabetesPedigreeFunction': 'pedigree',
    'Age': 'age',
    'Outcome': 'label',
}
df = df.rename(columns=COLUMN_RENAMES)
df

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Predictor columns fed to the model (note: 'skin' is not among them) and the
# target column holding the binary outcome.
feat_col = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
X = df.loc[:, feat_col]
Y = df.loc[:, 'label']
X

Unnamed: 0,pregnant,insulin,bmi,age,glucose,bp,pedigree
0,6,0,33.6,50,148,72,0.627
1,1,0,26.6,31,85,66,0.351
2,8,0,23.3,32,183,64,0.672
3,1,94,28.1,21,89,66,0.167
4,0,168,43.1,33,137,40,2.288
...,...,...,...,...,...,...,...
763,10,180,32.9,63,101,76,0.171
764,2,0,36.8,27,122,70,0.340
765,5,112,26.2,30,121,72,0.245
766,1,0,30.1,47,126,60,0.349


In [None]:
# Display the target Series (the 'label' column of df) to check its values and length
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: label, Length: 768, dtype: int64

In [None]:
# Hold out 20% of the rows as a test set so the model is evaluated on data it
# was not trained on; the fixed random_state makes the split reproducible.
splits = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

In [None]:
# Sanity-check the split: report the row count of the full feature frame and
# of each partition.
for name, frame, when in (
    ("X", X, "before"),
    ("X_train", X_train, "after"),
    ("X_test", X_test, "after"),
):
    print(f"Actual length of {name} {when} splitting:{len(frame)}")

Actual length of X before splitting:768
Actual length of X_train after splitting:614
Actual length of X_test after splitting:154


In [None]:
# Configure the logistic-regression classifier: allow up to 1000 solver
# iterations and fix the random seed.
logreg_params = dict(max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)

In [None]:
# Train the classifier on the training split. fit() updates the estimator in
# place and returns it, so `model` is the fitted estimator afterwards; the
# trailing semicolon keeps the cell from echoing the estimator repr.
model.fit(X_train, y_train);

In [None]:
# Predict a class label for every row of the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
# Arguments follow the scikit-learn convention (y_true first, then y_pred),
# matching the confusion_matrix and classification_report calls in this
# notebook. Accuracy is symmetric, so the value is the same as with the
# arguments swapped.
accuracy_score(y_test, y_pred)

0.7402597402597403

In [None]:
# Cross-tabulate actual labels against predicted labels on the test set
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf

array([[78, 21],
       [19, 36]])

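This is a 2x2 matrix because the label in our dataset is binary. Rows are the actual classes and columns are the predicted classes, so the diagonal (78 and 36) counts correct predictions and the off-diagonal entries (21 and 19) count the errors.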

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cnf), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

NameError: name 'sns' is not defined

In [None]:
# Confusion matrix of actual vs. predicted test labels (same computation as
# the earlier confusion-matrix cell)
cnf = confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf

array([[78, 21],
       [19, 36]])

In [None]:
# Per-class precision, recall and F1-score on the test set, with readable
# names for the two label values (0 and 1, in that order)
class_names = ['without_diabetes', 'with_diabetes']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

NameError: name 'classification_report' is not defined

References:
1. https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
2. https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
3. https://seaborn.pydata.org/generated/seaborn.heatmap.html
4. https://www.datacamp.com/tutorial/understanding-logistic-regression-python