In [109]:
import os
import pickle
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score , accuracy_score, roc_auc_score, confusion_matrix, roc_curve
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler , MinMaxScaler, OneHotEncoder
from sklearn.metrics import PrecisionRecallDisplay , RocCurveDisplay
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [110]:
def random_forest_012_train_evaluate(train_data, test_data):
    
    ## Read Process data
    X_train , X_test = pd.read_csv(train_data) , pd.read_csv(test_data)
    y_train , y_test  = X_train["Diabetes_012"] , X_test["Diabetes_012"]
    X_train.drop("Diabetes_012", axis = 1, inplace = True)
    X_test.drop("Diabetes_012", axis = 1, inplace = True)

    # Train Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 20, max_depth = 100, min_samples_leaf = 1, min_samples_split = 4)    
    rf_clf.fit(X_train, y_train)
    
    # Evaluate
    y_pred = rf_clf.predict(X_test)
    
#     print(classification_report(y_test, y_pred))
    
    return (y_test, y_pred)

def random_forest_012_train_evaluate_top5(train_data, test_data):
    
    ## Read Process data
    X_train , X_test = pd.read_csv(train_data) , pd.read_csv(test_data)
    y_train , y_test  = X_train["Diabetes_012"] , X_test["Diabetes_012"]
    X_train.drop("Diabetes_012", axis = 1, inplace = True)
    X_test.drop("Diabetes_012", axis = 1, inplace = True)
    target_col = "Diabetes_012"
    
    # Train Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 20, max_depth = 100, min_samples_leaf = 1, min_samples_split = 4)    
    rf_clf.fit(X_train, y_train)
    
    ranked_features = pd.DataFrame( zip(X_train.columns, list(rf_clf.feature_importances_) ) , 
                                       columns = ["feature", "rf_importance"]).sort_values( "rf_importance", ascending = False )

    # Select top 5 features
    subset = list(ranked_features['feature'][0:5])
    X_train = X_train[subset]
    X_test  = X_test[subset]
    
    # Train Again
    rf_clf.fit(X_train, y_train)
    
    # Evaluate
    y_pred = rf_clf.predict(X_test)
    
#     print(classification_report(y_test, y_pred))
    
    
    return (y_test, y_pred)

def random_forest_binary_split_train_evaluate(train_data, test_data):
    
    ## Read Process data
    X_train , X_test = pd.read_csv(train_data) , pd.read_csv(test_data)
    y_train , y_test  = X_train["Diabetes_binary"] , X_test["Diabetes_binary"]
    X_train.drop("Diabetes_binary", axis = 1, inplace = True)
    X_test.drop("Diabetes_binary", axis = 1, inplace = True)

    # Train Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 180, max_depth = 80, min_samples_leaf = 2, min_samples_split = 6) 
    rf_clf.fit(X_train, y_train)
    
    # Evaluate
    y_pred = rf_clf.predict(X_test)
    
#     print(classification_report(y_test, y_pred))
    
    fpr, tpr, _ = roc_curve(y_test, rf_clf.predict_proba(X_test)[:,1])
    roc_auc_val = roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:,1])
    
    return (y_test, y_pred, fpr, tpr, roc_auc_val)

def random_forest_binary_split_train_evaluate_top5(train_data, test_data):
    
    ## Read Process data
    X_train , X_test = pd.read_csv(train_data) , pd.read_csv(test_data)
    y_train , y_test  = X_train["Diabetes_binary"] , X_test["Diabetes_binary"]
    X_train.drop("Diabetes_binary", axis = 1, inplace = True)
    X_test.drop("Diabetes_binary", axis = 1, inplace = True)
    
    # Train Random Forest
    rf_clf = RandomForestClassifier(n_estimators = 180, max_depth = 80, min_samples_leaf = 2, min_samples_split = 6)    
    rf_clf.fit(X_train, y_train)
    
    ranked_features = pd.DataFrame( zip(X_train.columns, list(rf_clf.feature_importances_) ) , 
                                       columns = ["feature", "rf_importance"]).sort_values( "rf_importance", ascending = False )

    # Select top 5 features
    subset = list(ranked_features['feature'][0:5])
    X_train = X_train[subset]
    X_test  = X_test[subset]
    
    # Train Again
    rf_clf.fit(X_train, y_train)
    
    # Evaluate
    y_pred = rf_clf.predict(X_test)
    
#     print(classification_report(y_test, y_pred))
    
    fpr, tpr, _ = roc_curve(y_test, rf_clf.predict_proba(X_test)[:,1])
    roc_auc_val = roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:,1])
    
    return (y_test, y_pred, fpr, tpr, roc_auc_val)


### All Random Forest Models

In [111]:
train_data_file_012 =  "Dataset/train_012.csv"
test_data_file_012  =  "Dataset/test_012.csv"
train_data_file_binary_split = "Dataset/train_binary_split.csv"
test_data_file_binary_split = "Dataset/test_binary_split.csv"

## Multiclass Random Forest
rf_012_vals = fit_evaluate_random_forest(train_data_file_012, test_data_file_012)
## Multiclass Random Forest trained on top 5 features only
rf_012_vals_top5 = fit_evaluate_random_forest(train_data_file_012, test_data_file_012)

## Binary Random Forest
rf_binary_vals =  random_forest_binary_split_train_evaluate(train_data_file_binary_split , test_data_file_binary_split)
## Binary Random Forest trained on top 5 features only
rf_binary_vals_top5 =  random_forest_binary_split_train_evaluate_top5(train_data_file_binary_split , test_data_file_binary_split)

print("done")

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.49      0.21      0.29      7069

    accuracy                           0.84     50736
   macro avg       0.45      0.39      0.40     50736
weighted avg       0.80      0.84      0.81     50736

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.49      0.20      0.29      7069

    accuracy                           0.84     50736
   macro avg       0.45      0.39      0.40     50736
weighted avg       0.80      0.84      0.81     50736

done
