In [1]:
# Import our dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Import our input dataset
df = pd.read_csv('Resources/diabetes_prediction_dataset.csv')
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
df["diabetes"].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [4]:
# Select categorical columns
categorical_cols = ['gender', 'smoking_history']

# Convert categorical columns to one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols).astype("int")
df_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80,0,1,25,6,140,0,1,0,0,0,0,0,0,1,0
1,54,0,0,27,6,80,0,1,0,0,1,0,0,0,0,0
2,28,0,0,27,5,158,0,0,1,0,0,0,0,0,1,0
3,36,0,0,23,5,155,0,1,0,0,0,1,0,0,0,0
4,76,1,1,20,4,155,0,0,1,0,0,1,0,0,0,0


In [5]:
# Separate the Features (X) from the Target (y)
y = df_encoded["diabetes"]
X = df_encoded.drop(columns="diabetes")

In [6]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(75000, 15)

In [7]:
# Create StandardScaler object
scaler = StandardScaler()

# Fit scaler to training data and transform training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Logistic Regression Model

In [8]:
# Create a logistic regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [9]:
# Fit (train) or model using the training data
classifier.fit(X_train_scaled, y_train)

In [10]:
# Score the model using the test data
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.9587333333333333
Testing Data Score: 0.95784


In [11]:
# Make predictions
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.tail(30)

Unnamed: 0,Prediction,Actual
24970,0,0
24971,0,1
24972,0,0
24973,0,0
24974,0,0
24975,0,0
24976,0,0
24977,0,0
24978,0,0
24979,0,0


In [12]:
# Predict the target variable for the test set
y_pred = classifier.predict(X_test)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[    0 22875]
 [    0  2125]]




In [13]:
# Calculate the Accuracy Score
accuracy_score(y_test, predictions)

0.95784

In [14]:
# Classification Report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     22875
           1       0.87      0.60      0.71      2125

    accuracy                           0.96     25000
   macro avg       0.91      0.79      0.84     25000
weighted avg       0.96      0.96      0.95     25000



# Random Forest  Model

In [23]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [24]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [25]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22722,153
Actual 1,740,1385


Accuracy Score : 0.96428
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     22875
           1       0.90      0.65      0.76      2125

    accuracy                           0.96     25000
   macro avg       0.93      0.82      0.87     25000
weighted avg       0.96      0.96      0.96     25000

