# Diabetes Prediction using LightGBMClassifier

Installing dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

Data preprocessing

In [2]:
diabetes_dataset = pd.read_csv("diabetes_prediction_dataset.csv")

In [3]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [4]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [5]:
diabetes_dataset.shape

(100000, 9)

In [6]:
diabetes_dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


Checking null values

In [7]:
diabetes_dataset.isna().sum() 

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Dealing with duplicate values

In [8]:
diabetes_dataset.duplicated().sum()

3854

In [9]:
diabetes_dataset = diabetes_dataset.drop_duplicates()

In [10]:
diabetes_dataset.duplicated().sum()

0

Performing Label Encoding on categorical values

In [11]:
for col in ['gender','smoking_history', 'diabetes'] :
    print (f'Number of Column {col} is: ' , diabetes_dataset[col].nunique())
    print (f'Number of Column {col} is: ' , diabetes_dataset[col].unique())

Number of Column gender is:  3
Number of Column gender is:  ['Female' 'Male' 'Other']
Number of Column smoking_history is:  6
Number of Column smoking_history is:  ['never' 'No Info' 'current' 'former' 'ever' 'not current']
Number of Column diabetes is:  2
Number of Column diabetes is:  [0 1]


In [12]:
diabetes_dataset['gender'].value_counts()

gender
Female    56161
Male      39967
Other        18
Name: count, dtype: int64

In [13]:
diabetes_dataset['smoking_history'].value_counts()

smoking_history
never          34398
No Info        32887
former          9299
current         9197
not current     6367
ever            3998
Name: count, dtype: int64

In [14]:
diabetes_dataset['diabetes'].value_counts()

diabetes
0    87664
1     8482
Name: count, dtype: int64

In [15]:
label = LabelEncoder() # label encoding
diabetes_dataset['gender'] = label.fit_transform(diabetes_dataset['gender'])
diabetes_dataset['smoking_history'] = label.fit_transform(diabetes_dataset['smoking_history'])

In [16]:
diabetes_dataset['gender'].unique()

array([0, 1, 2])

In [17]:
diabetes_dataset['smoking_history'].unique()

array([4, 0, 1, 3, 2, 5])

In [18]:
diabetes_dataset['smoking_history'].value_counts()

smoking_history
4    34398
0    32887
3     9299
1     9197
5     6367
2     3998
Name: count, dtype: int64

In [19]:
diabetes_dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [20]:
diabetes_dataset.shape

(96146, 9)

Removing outliers

In [21]:
from collections import Counter
def detect_outliers(diabetes_dataset,features):
    outlier_indices = []

    for c in features:
        # 1st quartile
        Q1 = np.percentile(diabetes_dataset[c],25)
        # 3rd quartile
        Q3 = np.percentile(diabetes_dataset[c],75)
        # IQR
        IQR = Q3 - Q1
        # Outlier step
        outlier_step = IQR * 1.5
        # detect outlier and their indeces
        outlier_list_col = diabetes_dataset[(diabetes_dataset[c] < Q1 - outlier_step) | (diabetes_dataset[c] > Q3 + outlier_step)].index
        # store indeces
        outlier_indices.extend(outlier_list_col)

    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(i for i, v in outlier_indices.items() if v > 2)

    return multiple_outliers

In [22]:
diabetes_dataset.loc[detect_outliers(diabetes_dataset,["age","bmi","HbA1c_level","blood_glucose_level"])]

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
1135,0,62.0,1,1,4,43.16,8.8,280,1
1765,0,25.0,0,0,5,41.65,9.0,280,1
2622,0,57.0,1,0,1,45.13,8.8,280,1
5928,1,65.0,1,0,3,43.41,8.8,300,1
7480,1,38.0,0,0,1,69.66,8.8,300,1
8793,1,42.0,0,0,4,49.2,9.0,300,1
11400,0,64.0,0,0,4,41.92,9.0,260,1
12082,0,63.0,0,0,5,41.81,8.8,280,1
12942,0,62.0,0,0,4,43.49,9.0,300,1
21320,0,40.0,0,0,5,47.65,8.8,300,1


In [23]:
#Let's drop them.
diabetes_dataset = diabetes_dataset.drop(detect_outliers(diabetes_dataset,["age","bmi","HbA1c_level","blood_glucose_level"]),axis = 0).reset_index(drop=True)


In [24]:
diabetes_dataset.loc[detect_outliers(diabetes_dataset,["age","bmi","HbA1c_level","blood_glucose_level"])] #Check again if we have any outlier values.

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes


In [25]:
diabetes_dataset.shape

(96104, 9)

In [26]:
diabetes_dataset['diabetes'].value_counts()

diabetes
0    87664
1     8440
Name: count, dtype: int64

Train Test split

In [27]:
x= diabetes_dataset.drop(columns=["diabetes"], axis= 1)
y= diabetes_dataset["diabetes"] # Target

In [28]:
x

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155
...,...,...,...,...,...,...,...,...
96099,0,36.0,0,0,0,24.60,4.8,145
96100,0,2.0,0,0,0,17.37,6.5,100
96101,1,66.0,0,0,3,27.83,5.7,155
96102,0,24.0,0,0,4,35.42,4.0,100


In [29]:
x.dtypes


gender                   int32
age                    float64
hypertension             int64
heart_disease            int64
smoking_history          int32
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
dtype: object

In [30]:
for columns in x.columns:
    print(columns)

gender
age
hypertension
heart_disease
smoking_history
bmi
HbA1c_level
blood_glucose_level


In [31]:
x_train , x_test , y_train , y_test = train_test_split(x, y, test_size=0.2, random_state=0,stratify=y)

Model training and Classification report

In [32]:
# Create the LightGBM Classifier
model = lgb.LGBMClassifier(random_state=42)

model.fit(x_train, y_train)

# Predict on the test set
y_pred = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("LightGBM Model Accuracy:", accuracy)
print("Classification Report:\n", report)

[LightGBM] [Info] Number of positive: 6752, number of negative: 70131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 76883, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.087822 -> initscore=-2.340526
[LightGBM] [Info] Start training from score -2.340526
LightGBM Model Accuracy: 0.9705010145153738
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     17533
           1       0.97      0.68      0.80      1688

    accuracy                           0.97     19221
   macro avg       0.97      0.84      0.89     19221
weighted avg       0.97      0.97      0.97     19221



Making a predictive system

In [33]:
input_data = (0,67.0,0,0,4,63.48,8.8,155,)
# changing the input data to numpy array
input_data_as_numpy_array = np.asarray(input_data,dtype=float)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

print(input_data_reshaped)
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[  0.    67.     0.     0.     4.    63.48   8.8  155.  ]]
[1]
The person is diabetic


Saving a Model for app

In [34]:
import pickle

In [35]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename,'wb'))

In [36]:
# loading the saved model
loaded_model =pickle.load(open('trained_model.sav','rb'))

In [37]:
model