In [1]:
import os
import random
import time

from prediction_functions import cross_validation, predict_test_set
from preprocessing import preprocess_and_split

In [2]:
# Location of the diabetes CSV. When the DIABETES_CSV environment variable is
# set it overrides the author's original machine-specific path, so the notebook
# can be run on another machine without editing this cell.
filepath = os.environ.get(
    "DIABETES_CSV",
    "C:/Users/ericn/Desktop/assignment_eric_nechayev/data/diabetes.csv",
)

# Import, pre-process and split the dataset into train and test frames.
test_size = 0.2
# Seeds Python's built-in `random` module only.
# NOTE(review): if preprocess_and_split (or the models) draw from NumPy or
# another RNG, this seed does not make the split reproducible -- confirm.
random.seed(42)
train_df, test_df = preprocess_and_split(filepath=filepath, test_size=test_size)

In [3]:
# Default settings, forwarded as same-named keyword arguments to
# predict_test_set / cross_validation in the cells below.
# NOTE(review): K, p and s look like the BRAF neighbour count, critical-area
# ratio and forest size -- confirm against prediction_functions.
K = 10
p = 0.5
s = 100
subsample = feature_fraction = 0.8
max_depth = 4
plots = 'no'

In [4]:
# Baseline: score a regular random forest (model_type='RF') on the held-out
# test set, using the default settings defined earlier in the notebook.
print('Regular Random Forest Baseline:\n')
predict_test_set(
    train_df,
    test_df,
    model_type='RF',
    s=s,
    subsample=subsample,
    feature_fraction=feature_fraction,
    max_depth=max_depth,
    plots=plots,
)

Regular Random Forest Baseline:

Test RF Precision = 0.7884615384615384
Test RF Recall = 0.7884615384615384
Test RF AUPRC = 0.8466772819743903
Test RF AUROC = 0.9309954751131222



In [5]:
# Cross-validation of the train set using the biased random forest (BRAF)
# with the default parameters.
# K, p and s are taken from the default-parameter cell rather than repeated as
# literals, so this run cannot drift from the values labelled "default".
# Plots are saved to the plot_images folder (plotting is forced on here,
# independent of the notebook-wide `plots` default).
cross_validation(train_df, model_type='BRAF', K=K, p=p, s=s, subsample=subsample,
                 feature_fraction=feature_fraction, max_depth=max_depth, plots='yes')

CV BRAF Precision = 0.8008073101311484
CV BRAF Recall = 0.8387998776585732
CV BRAF AUPRC = 0.8508305911891944
CV BRAF AUROC = 0.9466273106697771



0.9466273106697771

In [6]:
# Record a wall-clock timestamp; a later cell prints the seconds elapsed since this point.
start = time.time()

In [7]:
# Predicting the test set using the biased random forest (BRAF) with the
# default parameters.
# Plots are saved to the plot_images folder (plotting is forced on here,
# independent of the notebook-wide `plots` default).

# The header uses {} placeholders so the printed values are the ones actually
# passed to the model; previously .format() was called on a string with no
# placeholders, so its arguments were silently ignored.
print("Biased Random Forest Test Set: K={}, p={}, s={}:\n".format(K, p, s))
predict_test_set(train_df, test_df, model_type='BRAF', K=K, p=p, s=s, subsample=subsample,
                 feature_fraction=feature_fraction, max_depth=max_depth, plots='yes')

Biased Random Forest Test Set: K=10, p=0.5, s=100:

Test BRAF Precision = 0.7647058823529411
Test BRAF Recall = 0.75
Test BRAF AUPRC = 0.8298190024070038
Test BRAF AUROC = 0.9325980392156863



In [8]:
# Print the wall-clock seconds elapsed since `start` was recorded.
print("--- %s seconds ---" % (time.time() - start))

--- 86.92306065559387 seconds ---


In [None]:
# Grid search over the BRAF parameters K, p and s using cross-validation on the
# train set. Each combination is scored by the value cross_validation returns
# (treated here as the mean AUROC across folds); the best-scoring combination
# is kept in best_k, best_p, best_s.
# NOTE: the loop variables rebind the notebook-level K, p and s.

# Start below any attainable score so the first evaluated combination is
# always accepted.
max_auroc = float("-inf")

for K in [1, 2, 3, 5, 10]:

    for p in [0.250, 0.375, 0.500, 0.625, 0.750]:

        for s in [50, 100, 150, 200]:

            # Score the current parameter combination.
            print("Biased Random Forest: K={}, p={}, s={}:\n".format(K,p,s))
            current_mean_auroc = cross_validation(train_df, model_type='BRAF', K=K, p=p, s=s, subsample=subsample,
                                                  feature_fraction=feature_fraction, max_depth=max_depth, plots=plots)
            # If strictly better, remember both the score and the parameters.
            # Without updating max_auroc, every later combination would
            # overwrite the best one and the last grid point would always win.
            if current_mean_auroc > max_auroc:
                max_auroc = current_mean_auroc
                best_k, best_p, best_s = K, p, s

In [None]:
# Adopt the tuned parameters for the remaining cells and switch plotting on.
K = best_k
p = best_p
s = best_s
plots = 'yes'

In [None]:
# Re-run cross-validation on the train set with the tuned K, p and s.
cross_validation(
    train_df,
    model_type='BRAF',
    K=K,
    p=p,
    s=s,
    subsample=subsample,
    feature_fraction=feature_fraction,
    max_depth=max_depth,
    plots=plots,
)

In [None]:
# Score the held-out test set with the tuned K, p and s, printing a header
# that states the parameter values in use.
print("Biased Random Forest Test Set: K={}, p={}, s={}:\n".format(K,p,s))
predict_test_set(
    train_df,
    test_df,
    model_type='BRAF',
    K=K,
    p=p,
    s=s,
    subsample=subsample,
    feature_fraction=feature_fraction,
    max_depth=max_depth,
    plots=plots,
)