In [1]:
import pandas as pd 
import numpy as np

import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv(r"E:\Data Science by SRK\Machine_learning\Classification\Diabetes\cleaned_dataset_diabetes.csv")

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89.0,66.0,23.0,1.460250,28.1,0.861443,1.288798,0
1,0,137.0,40.0,35.0,1.532648,43.1,1.071407,1.338267,1
2,3,78.0,50.0,32.0,1.452246,31.0,0.890303,1.311941,1
3,2,197.0,70.0,45.0,1.690052,30.5,0.857475,1.392162,1
4,1,189.0,60.0,23.0,1.753669,30.1,0.926098,1.404659,1
...,...,...,...,...,...,...,...,...,...
387,0,181.0,88.0,44.0,1.681244,43.3,0.882124,1.311941,1
388,1,128.0,88.0,39.0,1.479504,36.5,1.004630,1.351088,1
389,2,88.0,58.0,26.0,1.259921,28.4,0.978030,1.293804,0
390,10,101.0,76.0,48.0,1.541485,32.9,0.863144,1.412359,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               392 non-null    int64  
 1   Glucose                   392 non-null    float64
 2   BloodPressure             392 non-null    float64
 3   SkinThickness             392 non-null    float64
 4   Insulin                   392 non-null    float64
 5   BMI                       392 non-null    float64
 6   DiabetesPedigreeFunction  392 non-null    float64
 7   Age                       392 non-null    float64
 8   Outcome                   392 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 27.7 KB


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# X and y

In [6]:
X = pd.get_dummies(df.drop('Outcome', axis = 1), drop_first = True)
y = df['Outcome']

# Train_Test_split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = True)

# Modelling and Evaluation

In [8]:
# Random Forest classifire with default parameters
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = True)
model.fit(X_train, y_train)

# Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation
from sklearn.metrics import accuracy_score
print("Train accuracy : ", accuracy_score(ypred_train, y_train))
print("Test accuracy : ", accuracy_score(ypred_test, y_test))

from sklearn.model_selection import cross_val_score
print("cross_val_score : ", cross_val_score(model, X, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  0.7341772151898734
cross_val_score :  0.7910094125284


# Hyper_parameter Tuning

In [9]:
from sklearn.model_selection import GridSearchCV

# model
estimator = RandomForestClassifier(random_state = True)

# Parameters (which you want to tune and identify the best)
param_grid = {"n_estimators" : list(range(1, 10))}

grid = GridSearchCV(estimator, param_grid, scoring = 'accuracy', cv= 5)

grid.fit(X_train, y_train)

grid.best_params_

{'n_estimators': 6}

# Importance of each features

In [10]:
grid.best_estimator_.feature_importances_


array([0.09861464, 0.32872845, 0.07951967, 0.08776356, 0.06597938,
       0.11931882, 0.1383978 , 0.08167768])

In [11]:
feats_imp = pd.DataFrame(data = grid.best_estimator_.feature_importances_, 
                         index = X.columns, 
                         columns = ['Feature Importance'])
important_feats = feats_imp[feats_imp['Feature Importance'] > 0]
important_features_list = important_feats.index.to_list()
important_features_list

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

# Final Random Forest model
**with best HPT AND important features**

In [12]:
X_imp = X[important_features_list]

# Train_Test_Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = True)


# Model with best HPT AND important features
final_rf_model = RandomForestClassifier()
final_rf_model.fit(X_train, y_train)

# Prediction
ypred_train = final_rf_model.predict(X_train)
ypred_test = final_rf_model.predict(X_test)

# Evaluation
from sklearn.metrics import accuracy_score
print("Train accuracy : ", accuracy_score(ypred_train, y_train))
print("Test accuracy : ", accuracy_score(ypred_test, y_test))

from sklearn.model_selection import cross_val_score
print("cross_val_score : ", cross_val_score(model, X, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  0.759493670886076
cross_val_score :  0.7910094125284
