In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
# ********** Load and clean dataset ************
# Download the dataset here: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset?resource=download
# The path is relative, so the CSV must sit in the notebook's working directory.
data = pd.read_csv("diabetes_prediction_dataset.csv")
print('The shape of our data is:', data.shape)
# Only the last expression of a cell is rendered, so a single head() call is
# enough to preview the first rows.
data.head()

The shape of our data is: (100000, 9)


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Column names exactly as loaded. This is captured before the label is split
# off, so the list still contains the 'diabetes' column.
feature_list = list(data.columns)

# Distinct category values, in order of first appearance in the data.
# NOTE: `educations` holds the smoking-history categories; the name is kept
# unchanged so any other cell that refers to it keeps working.
educations = list(data['smoking_history'].unique())
genders = list(data['gender'].unique())

# Encode the categorical data using sequential numbers: every category is
# replaced by its position in the corresponding list above.
smoking_codes = {category: code for code, category in enumerate(educations)}
gender_codes = {category: code for code, category in enumerate(genders)}

# Assign the encoded column back to the frame instead of calling
# `data[col].replace(..., inplace=True)`. The in-place call operates on the
# object returned by `data[col]`; recent pandas versions warn about this
# pattern, and with Copy-on-Write enabled it leaves `data` unmodified.
# Every value present in a column is a key of its mapping (the mapping is
# built from that column's unique values), so no entry is left unmapped.
data['smoking_history'] = data['smoking_history'].map(smoking_codes)
data['gender'] = data['gender'].map(gender_codes)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,1,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,2,23.45,5.0,155,0
4,1,76.0,1,1,2,20.14,4.8,155,0


In [5]:
# Separate the prediction target from the predictor columns.
target_column = 'diabetes'

# Target values as a NumPy array.
labels = np.array(data[target_column])

# Everything except the target, converted to a NumPy array.
# NOTE: `data` changes from a DataFrame to an ndarray here, so this cell
# cannot be re-run without first re-running the cells that build the frame.
data = np.array(data.drop(columns=target_column))

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each of the four resulting arrays.
for caption, split in (
    ('Training Features Shape:', train_data),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_data),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split.shape)

Training Features Shape: (80000, 8)
Training Labels Shape: (80000,)
Testing Features Shape: (20000, 8)
Testing Labels Shape: (20000,)


In [7]:
# Instantiate model with 1000 decision trees; the fixed random_state makes
# training repeatable.
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_data, train_labels)

# Persist the fitted model. The `with` block closes the file handle as soon
# as the dump finishes (or fails), instead of leaving an open handle behind.
# NOTE(review): the integer codes used for 'gender' and 'smoking_history' are
# not stored alongside the model; whoever loads model.pkl presumably needs the
# same mapping -- confirm how the consumer encodes its inputs.
with open("model.pkl", 'wb') as model_file:
    pickle.dump(rf, model_file)

# Use the forest's predict method on the test data
predictions = rf.predict(test_data)

In [8]:
# Count the test rows whose predicted class differs from the true class.
# (The printed label says "Absolute Error", but the value is a count of
# misclassified rows.)
errors = np.count_nonzero(predictions != test_labels)
print('Absolute Error:', errors)

# Share of misclassified test rows, as a percentage.
n_predictions = predictions.shape[0]
error_rate = 100 * (errors / n_predictions)

# Accuracy is the complement of the misclassification rate.
accuracy = 100 - error_rate
print('Accuracy:', round(accuracy, 2), '%')

Absolute Error: 600
Accuracy: 97.0 %
