In [1]:
import sys

In [2]:
# Put the local brainfeatures checkout at position 1 of sys.path so the
# `brainfeatures` imports in the following cells can be resolved.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
sys.path.insert(1, "/home/gemeinl/code/brainfeatures/")

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
from brainfeatures.data_set.tuh_abnormal import TuhAbnormal

In [6]:
from brainfeatures.decoding.decode import get_X_y

In [7]:
from autosklearn.classification import AutoSklearnClassifier



In [8]:
# Training split of the pre-computed feature files (.h5).
# n_recordings is forwarded unchanged; None presumably means "no limit" —
# confirm against the brainfeatures TuhAbnormal implementation.
n_recordings = None
ds = TuhAbnormal(
    "/home/gemeinl/data/feats_100_Hz/median/train/",
    extension=".h5",
    n_recordings=n_recordings,
    key="natural",
    subset="train",
)

In [9]:
# Populate the training data set object; what load() actually reads is defined
# in brainfeatures and is not visible in this notebook.
ds.load()

In [10]:
# Extract the inputs X and targets y from the data set. In the recorded run
# both have 2716 entries and X[0] has shape (1, 8631).
X, y = get_X_y(ds)

In [11]:
# Sanity check: entry counts of X and y, and the shape of the first entry of X.
len(X), X[0].shape, len(y)

(2716, (1, 8631), 2716)

In [12]:
# Stack the per-entry arrays (each of shape (1, 8631) in the recorded run)
# along axis 0 into a single 2-D feature matrix.
# The guard makes this cell safe to re-run: concatenating the rows of an
# already-stacked 2-D array would silently flatten it into a 1-D vector.
if not (isinstance(X, np.ndarray) and X.ndim == 2):
    X = np.concatenate(list(X), axis=0)

In [13]:
# Shape of the stacked training matrix; the recorded run shows (2716, 8631).
X.shape

(2716, 8631)

In [103]:
# Directories handed to auto-sklearn below and used as the prefix for the
# result files (pickles, CSVs) written by later cells.
tmp_folder = "/home/gemeinl/tmp/tmp5/"
output_folder = "/home/gemeinl/tmp/out5/"

In [16]:
# Configure the auto-sklearn search. Keyword values are identical to the
# previous version of this cell; they are only regrouped by concern.
# Units of the time/memory limits are not visible here — presumably seconds
# and megabytes; confirm against the auto-sklearn API documentation.
automl = AutoSklearnClassifier(
    # -- time budget --
    time_left_for_this_task=24 * 60 * 60,  # 86400 in total
    per_run_time_limit=4 * 60 * 60,  # 14400 per single run
    # -- memory and parallelism --
    ml_memory_limit=60000,
    ensemble_memory_limit=30000,
    n_jobs=16,
    shared_mode=False,
    # -- search setup --
    seed=4,
    initial_configurations_via_metalearning=25,
    include_estimators=None,
    exclude_estimators=None,
    include_preprocessors=None,
    exclude_preprocessors=None,
    metadata_directory=None,
    get_smac_object_callback=None,
    smac_scenario_args=None,
    # -- ensemble --
    ensemble_size=50,
    ensemble_nbest=50,
    # -- validation: 5 folds, no shuffling --
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 5, "shuffle": False},
    # -- folders and logging; both folders are kept after termination --
    tmp_folder=tmp_folder,
    output_folder=output_folder,
    delete_tmp_folder_after_terminate=False,
    delete_output_folder_after_terminate=False,
    disable_evaluator_output=False,
    logging_config=None,
)

In [None]:
# Run the search. fit() modifies the data it receives in place, while the
# later refit() call needs the original data, so copies are passed here.
# (Reloading the data from disk would be the cleaner alternative.)
automl.fit(
    X.copy(),
    y.copy(),
    dataset_name='auto_diagnosis',
)



In [None]:
# Display the cv_results_ attribute of the fitted classifier.
automl.cv_results_

In [None]:
# Print the textual run summary returned by sprint_statistics() (validation
# score and counts of successful / crashed / timed-out runs).
print(automl.sprint_statistics())

In [15]:
import pickle

In [None]:
# Snapshot the ensemble description before refit() is called.
# NOTE(review): the file name contains a typo ("beore"); it is kept as-is so
# that anything already reading this file keeps working.
x = automl.show_models()
results = {"ensemble": x}
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(output_folder+'show_models_beore_refit.pkl', 'wb') as pkl_file:
    pickle.dump(results, pkl_file)

In [None]:
# Persist the fitted classifier object under output_folder. The context
# manager guarantees the file is flushed and closed once the dump finishes.
with open(output_folder+'automl_object.pkl', 'wb') as pkl_file:
    pickle.dump(automl, pkl_file)

In [None]:
# During fit(), models are only trained on the individual cross-validation
# folds. refit() retrains every model of the final ensemble on the whole
# data set so that all available data is used. Copies are passed so that X
# and y themselves stay unmodified.
automl.refit(X.copy(), y.copy())

In [None]:
# Print the ensemble description returned by show_models() after the refit.
print(automl.show_models())

In [50]:
# Evaluation split, built with the same options as the training data set
# apart from the directory and the subset name.
ds_eval = TuhAbnormal(
    "/home/gemeinl/data/feats_100_Hz/median/eval/",
    extension=".h5",
    n_recordings=None,
    key="natural",
    subset="eval",
)

In [51]:
# Populate the evaluation data set object (same call as for the training set).
ds_eval.load()

In [52]:
# Extract the evaluation inputs and targets from the data set.
X_eval, y_eval = get_X_y(ds_eval)

In [53]:
# Stack the per-entry evaluation arrays along axis 0 into one 2-D matrix
# ((276, 8631) in the recorded run).
# The guard makes this cell safe to re-run: concatenating the rows of an
# already-stacked 2-D array would silently flatten it into a 1-D vector.
if not (isinstance(X_eval, np.ndarray) and X_eval.ndim == 2):
    X_eval = np.concatenate(list(X_eval), axis=0)

In [54]:
# Sanity check: evaluation matrix shape and number of targets.
X_eval.shape, len(y_eval)

((276, 8631), 276)

In [None]:
# Predict on the evaluation feature matrix with the fitted classifier.
predictions = automl.predict(X_eval)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the predictions against the evaluation targets.
print("Accuracy score", accuracy_score(y_eval, predictions))

In [None]:
# Print the run summary again (same call as right after fit()).
print(automl.sprint_statistics())

In [None]:
# Wrap the predictions in a DataFrame so they can be written as CSV.
# NOTE: the name `df` is re-bound to the probabilities a few cells below.
df = pd.DataFrame(predictions)

In [None]:
# Write the predictions to predictions_eval.csv under output_folder.
# NOTE(review): `df` must still hold the predictions here (it is re-bound
# to the probabilities later), so run this right after the cell above.
df.to_csv(output_folder+"predictions_eval.csv")

In [None]:
# Probability output of the classifier for the evaluation matrix.
probas = automl.predict_proba(X_eval)

In [None]:
# Re-binds `df` (previously the predictions) to the probabilities.
df = pd.DataFrame(probas)

In [None]:
# Write the probabilities to probas_eval.csv under output_folder.
df.to_csv(output_folder+"probas_eval.csv")

In [None]:
import pickle

In [None]:
# Ensemble description after refit(), stored under the same "ensemble" key as
# the pre-refit snapshot. Re-binds `results`.
results = {"ensemble": automl.show_models()}

In [None]:
# Persist the post-refit ensemble description. The context manager closes the
# file handle deterministically instead of leaking it.
# NOTE(review): this is written to tmp_folder, whereas the pre-refit snapshot
# and the CSV files go to output_folder — confirm that this is intended.
with open(tmp_folder+'show_models_after_refit.pkl', 'wb') as pkl_file:
    pickle.dump(results, pkl_file)

In [33]:
# Reload the pickled classifiers of runs 2..6, print each run's statistics
# and collect each run's ensemble description keyed by run index.
# NOTE: the loop re-binds the notebook-level names `output_folder` and
# `automl` (they refer to run 6 afterwards), so earlier cells that use them
# behave differently if re-run after this cell.
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by trusted runs.
import pickle  # imported once, not on every loop iteration

ensemble_models = {}
for i in range(2, 7):
    output_folder = '/home/gemeinl/tmp/out{}/'.format(i)
    with open(output_folder+"automl_object.pkl", "rb") as pkl_file:
        automl = pickle.load(pkl_file)
    print(automl.sprint_statistics())
    ensemble_models[i] = automl.show_models()



auto-sklearn results:
  Dataset name: auto_diagnosis
  Metric: accuracy
  Best validation score: 0.852725
  Number of target algorithm runs: 598
  Number of successful target algorithm runs: 510
  Number of crashed target algorithm runs: 39
  Number of target algorithms that exceeded the time limit: 35
  Number of target algorithms that exceeded the memory limit: 14

auto-sklearn results:
  Dataset name: auto_diagnosis
  Metric: accuracy
  Best validation score: 0.851620
  Number of target algorithm runs: 607
  Number of successful target algorithm runs: 468
  Number of crashed target algorithm runs: 66
  Number of target algorithms that exceeded the time limit: 45
  Number of target algorithms that exceeded the memory limit: 28

auto-sklearn results:
  Dataset name: auto_diagnosis
  Metric: accuracy
  Best validation score: 0.849779
  Number of target algorithm runs: 487
  Number of successful target algorithm runs: 378
  Number of crashed target algorithm runs: 56
  Number of target 