In [1]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
set_config(display="diagram")

In [3]:
seed = 42

In [4]:
df = pd.read_csv('../data/diabetes.csv')
print(df.sample(5))

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
100            1      163             72              0        0  39.0   
546            5      187             76             27      207  43.6   
294            0      161             50              0        0  21.9   
613            6      105             80             28        0  32.5   
282            7      133             88             15      155  32.4   

     DiabetesPedigreeFunction  Age  Outcome  
100                     1.222   33        1  
546                     1.034   53        1  
294                     0.254   65        0  
613                     0.878   26        0  
282                     0.262   37        0  


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
X = df.drop("Outcome", axis=1).values
y = df["Outcome"].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [8]:
scaling = StandardScaler()
pca = PCA()
rf_clf = RandomForestClassifier(n_jobs=-1)

In [9]:
classifier = Pipeline(
    [
        ("scaler", scaling),
        ("decomposer", pca),
        ("random_forest", rf_clf)
    ]
)

In [10]:
n_estimators = [1000, 1500, 2000, 2500, 3000]
max_depth = [i for i in range(3, 10)]
min_samples_split = [i for i in range(3, 8)]
min_samples_leaf = [i for i in range(3, 8)]
criterion = ['entropy', 'gini']

params = {
    'decomposer__n_components': [3, 4, 5],
    'random_forest__n_estimators': n_estimators,
    'random_forest__max_depth': max_depth,
    'random_forest__min_samples_split': min_samples_split,
    'random_forest__min_samples_leaf': min_samples_leaf,
    'random_forest__criterion': criterion
}

In [11]:
rf_clf_grid_cv = GridSearchCV(estimator=classifier, param_grid=params, cv=3, n_jobs=-1, verbose=2, error_score='raise')
rf_clf_grid_cv.fit(X_train, y_train)

Fitting 3 folds for each of 5250 candidates, totalling 15750 fits


In [12]:
rf_clf_grid_cv.best_params_

{'decomposer__n_components': 5,
 'random_forest__criterion': 'gini',
 'random_forest__max_depth': 6,
 'random_forest__min_samples_leaf': 6,
 'random_forest__min_samples_split': 4,
 'random_forest__n_estimators': 1000}

In [13]:
rf_clf_grid_cv.best_estimator_

In [14]:
y_pred = rf_clf_grid_cv.best_estimator_.predict(X_test)

In [15]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[81 18]
 [25 30]]
0.7207792207792207
              precision    recall  f1-score   support

           0       0.76      0.82      0.79        99
           1       0.62      0.55      0.58        55

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.69       154
weighted avg       0.71      0.72      0.72       154

