In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Data Transformation

In [None]:
# Load the raw encounter data. The path is relative to the notebook's working
# directory; forward slashes resolve on Windows as well as on POSIX systems.
df = pd.read_csv('../datasets/diabetic_data.csv')

In [None]:
# Keep only rows whose race, gender and payer code are known. The mask is built
# once from the current frame so the filtering happens in a single step.
known_rows = (
    (df.race != '?')
    & (df.gender != 'Unknown/Invalid')
    & (df.payer_code != '?')
)

# Build the cleaned frame without in-place mutation: every step returns a new
# object, which avoids SettingWithCopyWarning on the filtered slice.
df = (
    df.loc[known_rows]
    # errors='ignore' lets this cell be re-run after the columns are already
    # gone instead of raising KeyError.
    .drop(columns=['encounter_id', 'patient_nbr', 'weight', 'medical_specialty'],
          errors='ignore')
    # Collapse the listed values to a single 'Yes' everywhere in the frame.
    # NOTE(review): the replace is frame-wide, so any column containing
    # '>30'/'<30' (presumably `readmitted`) is rewritten too - confirm intended.
    .replace(to_replace=['Steady', 'Down', 'Up', '>30', '<30'], value='Yes')
)

In [None]:
# Columns to one-hot encode. Order matches the original call so the resulting
# dummy columns are generated in the same sequence.
dummy_columns = [
    "race",
    "gender",
    "age",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "payer_code",
    "diag_1",
    "diag_2",
    "diag_3",
    "max_glu_serum",
    "A1Cresult",
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "examide",
    "citoglipton",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone",
    "change",
    "diabetesMed",
]

# drop_first=True omits the first level of each encoded column.
df_breakout = pd.get_dummies(df, columns=dummy_columns, drop_first=True)

**Split dataset into train and test sets**

In [None]:
# Features: every column except the target.
X = df_breakout.drop(columns='readmitted')

# Target: map both readmission windows onto a single 'YES' label. replace()
# returns a new Series here, so df_breakout itself is never mutated through `y`.
# NOTE(review): the cleaning cell's frame-wide replace already turns
# '>30'/'<30' into 'Yes', so this probably matches nothing and the positive
# label stays 'Yes' - confirm which casing is intended.
y = df_breakout['readmitted'].replace(to_replace=['>30', '<30'], value='YES')

In [None]:
# Hold out a quarter of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Random Forest Classifier Tuning

In [None]:
# GridSearchCV is already imported in the notebook's first cell, so it is not
# re-imported here.

# Candidate values of `max_features` to evaluate.
max_features = [None, 'log2', 'sqrt', 0.2, 0.3, 0.4, 0.5]

# GridSearchCV takes a mapping of parameter name -> list of candidate values.
hyperparameters = dict(max_features=max_features)

# Base random forest: fixed seed for repeatable fits, n_jobs=-1 to use all cores.
forest = RandomForestClassifier(random_state=0, verbose=True, n_jobs=-1)

# Exhaustive search over the grid, scored with 10-fold cross-validation.
clf = GridSearchCV(forest, hyperparameters, cv=10)

# fit() returns the fitted search object, so `best_model` refers to `clf` itself.
best_model = clf.fit(X_train, y_train)

# Read the winning setting straight from the search results.
best_max_features = best_model.best_params_['max_features']

print('Best max_features:', best_max_features)