In [None]:
from os import path

def _train_and_score(learner, X, y, num_examples):
    """Train a fresh ``learner()`` on a random subset and return its error rate.

    ``num_examples`` row indices are drawn without replacement via the global
    ``random`` module (the caller is responsible for seeding it).  The rows
    that were not drawn, in ascending index order and capped at 10,000, form
    the validation set.

    Args:
        learner: Zero-argument callable returning an object with ``fit`` and
            ``predict`` methods.
        X: Feature container exposing ``shape[0]`` and accepting a list of row
            indices (presumably a NumPy array -- confirm against getDataset).
        y: Label container indexable the same way as ``X``.
        num_examples: Number of training rows to sample.

    Returns:
        ``1 - accuracy`` of the fitted model on the validation rows.
    """
    n = X.shape[0]
    indices_train = random.sample(range(n), num_examples)
    # A set makes each membership test O(1); testing against the list made
    # building the complement quadratic in the dataset size.
    train_lookup = set(indices_train)
    indices_test = [i for i in range(n) if i not in train_lookup]
    indices_test = indices_test[:10000]  # maximum 10k validation instances
    X_train = X[indices_train]
    y_train = y[indices_train]
    X_test = X[indices_test]
    y_test = y[indices_test]

    inst = learner()
    print("Training " + str(learner) + " on data of shape " + str(X_train.shape))
    inst.fit(X_train, y_train)
    print("Training ready. Obtaining predictions for " + str(X_test.shape[0]) + " instances.")
    y_hat = inst.predict(X_test)
    return 1 - sklearn.metrics.accuracy_score(y_test, y_hat)


def gather_learning_curve_results(learners, datasets, train_portions, seeds):
    """Compute (or resume computing) learning-curve points and persist them.

    For every combination of dataset, seed, learner and training portion, a
    fresh learner is trained and its validation error rate is appended to a
    results table.  The table is loaded from ``data/learningcurves.csv`` when
    that file exists, so combinations that already have a row are not
    recomputed.  The table is written back after every 100 new rows and once
    more when the function finishes, is interrupted, or fails.

    Args:
        learners: Sequence of zero-argument callables producing objects with
            ``fit``/``predict``; ``str(learner)`` is the stored identifier.
        datasets: Sequence of dataset ids passed to ``getDataset``.
        train_portions: Sequence of training sizes.  An ``int`` is used as an
            absolute number of examples; any other value is multiplied by the
            dataset size and truncated to an int.
        seeds: Sequence of seeds for the global ``random`` module.

    Returns:
        The results data frame with columns ``openmlid``, ``learner``,
        ``train_size``, ``seed`` and ``error_rate``.

    Notes:
        A combination is skipped when fewer than 100 instances would remain
        outside the training set.  A ``KeyboardInterrupt`` raised while a
        combination is being evaluated stops the run gracefully; any other
        exception in that step is reported and the run continues.
    """
    # read in data frame if exists
    FILENAME = "data/learningcurves.csv"
    cols = ["openmlid", "learner", "train_size", "seed", "error_rate"]
    df = pd.read_csv(FILENAME) if path.exists(FILENAME) else pd.DataFrame([], columns=cols)

    # Track finished combinations in a set that is updated as rows are added.
    # Boolean masks computed before appending go stale and depend on implicit
    # index alignment once the frame has grown.
    done = set(zip(df["openmlid"], df["learner"], df["train_size"], df["seed"]))

    combos_per_dataset = len(learners) * len(train_portions) * len(seeds)
    pbar = tqdm(total=len(datasets) * combos_per_dataset)
    unsaved_changes = 0
    interrupted = False
    try:
        for openmlid in datasets:
            print("DATASET", openmlid)
            X, y = getDataset(openmlid)
            n = X.shape[0]
            if n == 0:
                print("Omit empty dataset!")
                # Keep the progress bar consistent with the announced total.
                pbar.update(combos_per_dataset)
                continue

            # Same iteration order as nested seed -> learner -> portion loops.
            combos = (
                (seed, learner, train_portion)
                for seed in seeds
                for learner in learners
                for train_portion in train_portions
            )
            for seed, learner, train_portion in combos:
                # Re-seed for every point so each sample depends only on
                # (seed, n, num_examples), not on what was computed before.
                random.seed(seed)
                if isinstance(train_portion, int):
                    num_examples = train_portion
                else:
                    num_examples = int(train_portion * n)
                key = (openmlid, str(learner), num_examples, seed)

                if n - num_examples < 100:
                    print("Dataset has only " + str(n) +" instances in total. That is not enough samples to train on " + str(num_examples) + ". Skipping")
                elif key not in done:
                    try:
                        error_rate = _train_and_score(learner, X, y, num_examples)
                        df.loc[len(df)] = [openmlid, str(learner), num_examples, seed, error_rate]
                        done.add(key)
                        unsaved_changes += 1

                        print(unsaved_changes, "unsaved changes")
                        if unsaved_changes >= 100:
                            df.to_csv(FILENAME, index=False)
                            unsaved_changes = 0
                    except KeyboardInterrupt:
                        print("Interrupted")
                        interrupted = True
                        break
                    except Exception as err:
                        # Best effort: report the failure and move on to the
                        # next combination instead of aborting the whole run.
                        print("An error occurred on " + str(openmlid) + " with learner " + str(learner) + " under seed " + str(seed) + " for " + str(num_examples) + " examples.")
                        print("Cause: " + repr(err))
                pbar.update(1)

            if interrupted:
                break
    finally:
        # Runs on normal completion, on interrupt and on unexpected errors,
        # so the progress bar is closed and computed rows are not lost.
        pbar.close()
        if unsaved_changes > 0:
            df.to_csv(FILENAME, index=False)
            unsaved_changes = 0
    return df