# Test Random Forest with uncertainty

In [1]:
import dataset
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import feature_selection as fsa 
import predictions as p 
from scipy.stats import entropy


# Parameters

In [2]:
folder_path = '/home/tachennf/Documents/delta-rad/extracted_radiomics/'
table = 'rd_f1_f5_gtv.csv'
outcome_csv = 'outcomes.csv'
outcome = 'Récidive Locale'
feat_sel_algo = 'ANOVA_PERC'
max_features = 2

# Load data

In [3]:
X, y, features_list = dataset.get_xy(os.path.join(folder_path, table), os.path.join(folder_path, outcome_csv), outcome)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y[outcome])

In [4]:
print("{} features are selected after repeatability, reproductibility and correlation analysis".format(len(features_list)))

29 features are selected after repeatability, reproductibility and correlation analysis


# Feature selection

In [5]:
best_features, best_feat_sel_model = fsa.get_best_features(X_train, y_train, feat_sel_algo, features_list=features_list, max_features=max_features)
print(best_features)


['original_shape_Maximum2DDiameterRow']


In [6]:
sel_features, X_filtered = fsa.filter_dataset2(X, best_features, len(best_features), features_list)
X_filtered = X_filtered.loc[X_filtered.index.isin(y_train.index)]


In [7]:
znorm_scaler = StandardScaler()
X_train = znorm_scaler.fit_transform(X_filtered)

# Training

In [8]:
from sklearn.model_selection import GridSearchCV

# These are customized functions: should be in utils.py
def hyper_parameters_search(clf, X, y, param_grid, scorer = 'f1', cv=5):
    
    grid = GridSearchCV(clf, param_grid = param_grid, scoring = scorer, cv = cv, return_train_score = True)
    grid.fit(X, y)

    print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
    print("best parameters: {}".format(grid.best_params_))
    
    return grid

In [9]:
param_grid = [{'max_depth': range(1, 5, 4), 'n_estimators' : range(5, 200, 25)}]
model = RandomForestClassifier()
rf_grid = hyper_parameters_search(model, X_train, y_train, param_grid, scorer = 'f1', cv=5)

best mean cross-validation score: 0.233
best parameters: {'max_depth': 1, 'n_estimators': 30}


In [10]:
optimal_threshold = p.compute_opt_threshold(rf_grid, X_train, y_train) # compute optimal threshold


# Testing

In [11]:
X_test = X_test[sel_features]
print(X_test.columns)

Index(['original_shape_Maximum2DDiameterRow'], dtype='object')


In [12]:
X_test = znorm_scaler.fit_transform(X_test)

In [13]:
print(X_test)

[[ 0.79437167]
 [ 0.24969675]
 [-0.04296344]
 [ 1.08530303]
 [-0.16575844]
 [ 0.24969675]
 [ 0.20917148]
 [ 0.24969675]
 [ 0.24969675]
 [ 0.24969675]
 [-2.79462528]
 [-0.03012244]
 [ 0.24969675]
 [ 0.19924077]
 [ 0.24969675]
 [ 0.21043803]
 [-3.12580138]
 [ 0.47578112]
 [ 0.24969675]
 [ 0.69966576]
 [ 0.48772509]]


In [14]:
test_auc, sensitivity, specificity, brier_loss = p.compute_test_metrics(rf_grid, X_test, y_test, optimal_threshold)
print("Test AUC: {:.3f}".format(test_auc))
print("Sensitivity: {:.3f}".format(sensitivity))
print("Specificity: {:.3f}".format(specificity))
print("Brier loss: {:.3f}".format(brier_loss))

Test AUC: 0.525
Sensitivity: 0.200
Specificity: 0.875
Brier loss: 0.231


# Uncertainty test

In [15]:
rf_model = rf_grid.best_estimator_
tree_probs = np.array([tree.predict_proba(X_test) for tree in rf_model.estimators_])
# Calculate the mean probabilities across all trees
mean_probs = np.mean(tree_probs, axis=0)
# Calculate the entropy of the mean probability distribution
entropies = np.array([entropy(prob) for prob in mean_probs])

In [16]:
print(entropies)

[0.40798673 0.40798673 0.40798673 0.50291593 0.40798673 0.40798673
 0.40798673 0.40798673 0.40798673 0.40798673 0.6085921  0.40798673
 0.40798673 0.40798673 0.40798673 0.40798673 0.6085921  0.40798673
 0.40798673 0.40798673 0.40798673]


In [17]:
uncertain_predictions = entropies > 0.5
print(uncertain_predictions)

[False False False  True False False False False False False  True False
 False False False False  True False False False False]


In [18]:
##removeuncertain predictions
X_test = X_test[~uncertain_predictions]
y_test = y_test[~uncertain_predictions]

In [19]:
test_auc, sensitivity, specificity, brier_loss = p.compute_test_metrics(rf_grid, X_test, y_test, optimal_threshold)
print("Test AUC: {:.3f}".format(test_auc))
print("Sensitivity: {:.3f}".format(sensitivity))
print("Specificity: {:.3f}".format(specificity))
print("Brier loss: {:.3f}".format(brier_loss))

Test AUC: 0.500
Sensitivity: 0.000
Specificity: 1.000
Brier loss: 0.179


# Brier score