In [1]:
import numpy as np
import matplotlib.pyplot as plt
from define_dataset import define_dataset
import time
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
def build_model(X, y):
    params = [{
    'bootstrap': [True, False],
    'max_depth': [100, None],
    'max_features': ['auto', 'log2'],
    'min_samples_leaf': [1, 2],
    #'min_samples_split': [2, 4, 8],
    'n_estimators': [100, 500, 1000]
    }] 
    clf = GridSearchCV(RandomForestClassifier(), params, refit = True, cv=6)
    clf.fit(X, y)
    print("Best parameters set found:")
    print(clf.best_params_)
    print("Best score found:")
    print(clf.best_score_)
    return(clf.best_score_, clf.best_estimator_)

In [5]:
dataset = define_dataset()
tr_set = dataset.tr_set
test_set = dataset.test_set
results_dict = dataset.results_dict
num_bins_arr = [40, 60, 100]
ROI_threshold_arr = [0.85, 0.9, 0.95]
best_clf_score = 0

for ROI_threshold in ROI_threshold_arr:
    for num_bins in num_bins_arr:
        X = np.load('bins/bins_arr_'+str(num_bins)+'r'+str(ROI_threshold)+'.npy')
        y = np.zeros(np.shape(tr_set), dtype='int')
        for i, slide_name in enumerate(tr_set):
            y[i] = results_dict[slide_name]
        print('Num bins:', num_bins, 'ROI threshold:', ROI_threshold)

        (score, clf) = build_model(X, y)
        if score >= best_clf_score:
            best_clf_score = score
            best_clf = clf
            best_bins = num_bins
            best_ROI = ROI_threshold

Num bins: 40 ROI threshold: 0.85
Best parameters set found:
{'bootstrap': True, 'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 1000}
Best score found:
0.85
Num bins: 60 ROI threshold: 0.85
Best parameters set found:
{'bootstrap': False, 'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 1000}
Best score found:
0.85
Num bins: 100 ROI threshold: 0.85
Best parameters set found:
{'bootstrap': True, 'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 500}
Best score found:
0.8222222222222223
Num bins: 40 ROI threshold: 0.9
Best parameters set found:
{'bootstrap': True, 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 1000}
Best score found:
0.8222222222222223
Num bins: 60 ROI threshold: 0.9
Best parameters set found:
{'bootstrap': True, 'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 100}
Best score found:
0.85
Num bins: 100 ROI thresh

In [8]:
print('best bins:', best_bins, ', best ROI threshold:', best_ROI, ', best score:', best_clf_score)
X_test = np.load('./bins/bins_arr_test_'+str(best_bins)+'r'+str(best_ROI)+'.npy')
y_test = np.zeros(np.shape(test_set), dtype='int')
for i, slide_name in enumerate(test_set):
    y_test[i] = results_dict[slide_name]

y_pred = best_clf.predict(X_test)

print('predictions:',y_pred)
print('real y:', y_test)
score = accuracy_score(y_test, y_pred)
errors = np.size(test_set) - accuracy_score(y_test, y_pred, normalize=False)
print('Test result:', score)
print('Errors:', errors)

best bins: 100 , best ROI threshold: 0.95 , best score: 0.8777777777777778
predictions: [1 1 0 1 1 1]
real y: [0 0 0 1 1 1]
Test result: 0.6666666666666666
Errors: 2
