In [1]:
# import libraries

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.dummy import DummyClassifier

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
from sklearn.metrics import confusion_matrix, classification_report

In [23]:
# Prepare the independent variables (features) and the dependent variable (target)

In [25]:
# Load the diabetes dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="diabetes_data_corrected.csv")

In [29]:
# Feature matrix: every column except the target column "class".
x = df.drop(columns=["class"])

In [31]:
# Target vector: the "class" column on its own.
y = df.loc[:, "class"]

In [33]:
#split data into train and test

In [34]:
# Hold out 20% of the rows for testing; stratify=y keeps the class ratio the same in both splits.
# random_state pins the shuffle so the split (and every score computed from it) is reproducible
# after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [35]:
#begin our model training

In [37]:
#start with DummyClassifier to establish a baseline

In [38]:
# Baseline model: guesses labels at random according to the class frequencies seen in training.
# The strategy is spelled out because DummyClassifier's default differs between scikit-learn
# versions; the recorded confusion matrix (predictions in both columns) corresponds to a
# random-guess baseline. random_state makes the guesses repeatable on re-run.
dummy = DummyClassifier(strategy="stratified", random_state=42)
dummy.fit(x_train, y_train)
dummy_pred = dummy.predict(x_test)



In [39]:
#assess the DummyClassifier model

In [40]:
#We use a confusion matrix to assess the predictions made by the DummyClassifier

The upper-left quadrant holds the true negatives: the patient does not have diabetes,
and the model predicts that the patient does not have diabetes (this is good).
The lower-right quadrant holds the true positives: the patient actually has diabetes, and the model predicts diabetes (also good).
The upper-right quadrant holds the false positives: a patient without diabetes is predicted to have diabetes (the patient has to go through further tests, which causes anxiety).
The lower-left quadrant holds the false negatives: patients who do have diabetes but are predicted not to have it (the disease goes undiagnosed).

In [41]:
# Baseline confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=dummy_pred)

array([[12, 28],
       [17, 47]], dtype=int64)

In [42]:
#use a classification report

In [43]:
# Per-class precision, recall and F1 for the baseline predictions.
print(classification_report(y_true=y_test, y_pred=dummy_pred))

              precision    recall  f1-score   support

           0       0.41      0.30      0.35        40
           1       0.63      0.73      0.68        64

    accuracy                           0.57       104
   macro avg       0.52      0.52      0.51       104
weighted avg       0.54      0.57      0.55       104



From the report above, the baseline reaches an accuracy of 57%, which is no better than random guessing.
Hence the need to try other models.

In [44]:
#start with logistic regression

In [47]:
# Logistic regression; max_iter=10000 gives the solver a generous iteration budget.
# fit() returns the fitted estimator, so construction and training are chained.
logr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
logr_pred = logr.predict(x_test)

In [48]:
# Let's check the performance of our model

In [49]:
# Logistic-regression confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=logr_pred)

array([[35,  5],
       [ 2, 62]], dtype=int64)

In [50]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=logr_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91        40
           1       0.93      0.97      0.95        64

    accuracy                           0.93       104
   macro avg       0.94      0.92      0.93       104
weighted avg       0.93      0.93      0.93       104



In [57]:
# Try a decision tree.
# random_state pins the estimator's internal randomness so the fitted tree and its scores
# are the same on every re-run.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)

In [58]:
# Decision-tree confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=tree_pred)

array([[40,  0],
       [ 1, 63]], dtype=int64)

In [59]:
# Per-class precision, recall and F1 for the decision-tree predictions.
print(classification_report(y_true=y_test, y_pred=tree_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [60]:
#try RandomForest

In [64]:
# Random forest. random_state pins the bootstrap sampling and feature selection so the
# fitted model, its predictions and its feature importances are reproducible on re-run.
# NOTE(review): the recorded run scores 100% on a 104-row test split; worth confirming with
# cross-validation and a check for duplicated rows shared between train and test.
forest = RandomForestClassifier(random_state=42)
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)


In [65]:
# Random-forest confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=forest_pred)

array([[40,  0],
       [ 0, 64]], dtype=int64)

In [66]:
# Per-class precision, recall and F1 for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=forest_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        64

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104



In [69]:
# Importance score the fitted forest assigns to each feature (one value per column of x).
forest.feature_importances_

array([0.08814336, 0.12228863, 0.22198181, 0.18199886, 0.0627755 ,
       0.01737768, 0.02589422, 0.01907919, 0.0290946 , 0.02834091,
       0.03691234, 0.02869586, 0.05624557, 0.02287917, 0.03760182,
       0.02069046])

In [70]:
# Column names of the feature matrix, to match against the importance values.
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [72]:
# Pair each feature name with its importance and list the most important features first.
(
    pd.DataFrame({"feature": x.columns, "importance": forest.feature_importances_})
    .sort_values(by="importance", ascending=False)
)

Unnamed: 0,feature,importance
2,polyuria,0.221982
3,polydipsia,0.181999
1,ismale,0.122289
0,age,0.088143
4,sudden weight loss,0.062776
12,partial paresis,0.056246
14,alopecia,0.037602
10,irritability,0.036912
8,visual blurring,0.029095
11,delayed healing,0.028696


1. Trained a baseline model.
2. Trained 3 different models - logistic regression, decision tree and random forest.
3. Identified the important features in the best performing model.