In [1]:
# Colab setup: mount Google Drive into the VM, make the project folder
# importable, and switch the working directory into it.
# force_remount=True re-mounts the drive even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Name of the project folder directly under "My Drive".
FOLDERNAME = "ConicalClassificationGithub"
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Add the project folder to sys.path so that modules stored there can be
# imported by later cells.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

# The first %cd uses a relative path, so it depends on the current working
# directory (the recorded output shows it resolving to /content/drive/My Drive).
# NOTE(review): re-running this cell after the directory has already been
# changed will presumably fail to resolve the relative path - confirm.
%cd drive/My\ Drive
%cd $FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive/ConicalClassificationGithub


In [2]:
from sklearn.datasets import load_files
import codecs as cs
from load_dataset import *

# Unpack the six values returned by load_and_split() into train / validation /
# test features and labels (naming per the variables below).
# NOTE(review): load_and_split is not defined in this notebook; presumably it is
# provided by the star import of load_dataset above - confirm.
# X_train, X_valid and y_valid are read as globals by objective() further down.
X_train, y_train, X_valid, y_valid, X_test, y_test = load_and_split()

In [3]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample
import traceback
import math
import pickle
from sklearn.metrics import balanced_accuracy_score
from conical import CorpusClassification

def objective(args):
    """Hyperopt objective: fit a CorpusClassification model and score it on the validation data.

    Args:
        args: dict of sampled hyperparameters. It must contain an ``"idf"`` key
            whose value is a dict of IDF-related options; those options are
            merged into the top level before the parameters are handed to
            ``CorpusClassification``. The dict passed in is NOT modified.

    Returns:
        dict: ``{'loss': -balanced_accuracy, 'status': STATUS_OK}`` on success
        (hyperopt minimises, hence the negation), or
        ``{'loss': inf, 'status': STATUS_FAIL}`` if the score is missing/NaN or
        any exception is raised while fitting or predicting.
    """
    try:
        # Work on a shallow copy so the caller's dict is left intact; popping
        # "idf" from the original would make a second call with the same dict
        # fail with KeyError.
        params = dict(args)
        params.update(params.pop("idf"))

        # Fit
        # NOTE(review): fit() receives X_train only (y_train is not passed);
        # confirm this matches the CorpusClassification API.
        conical = CorpusClassification(params)
        conical.fit(X_train)

        # Evaluate
        y_pred = conical.predict(X_valid)
        score = balanced_accuracy_score(y_valid, y_pred)

        # Test for None before isnan: math.isnan(None) would raise TypeError.
        if score is None or math.isnan(score):
            return {'loss': float('inf'), 'status': STATUS_FAIL}
        return {'loss': -score, 'status': STATUS_OK}

    except Exception:
        # Keep the search running, but report why this trial failed instead of
        # discarding the error silently.
        traceback.print_exc()
        return {'loss': float('inf'), 'status': STATUS_FAIL}

# Hyperopt search space. objective() receives one sample of this structure per
# trial. Each hp.* label is the same string as the dict key it is stored under.
space = {
    'remove_outliers': hp.choice('remove_outliers', [True, False]),
    'use_bns': hp.choice('use_bns', [True, False]),
    'final_norm': hp.choice('final_norm', ['l1', 'l2', 'max', None]),
    'norm': hp.choice('norm', ['l1', 'l2', None]),
    'sublinear_tf': hp.choice('sublinear_tf', [True, False]),
    # quniform with q=1 yields whole-number floats in [1, 10000]; scope.int casts them to int.
    'max_features': ho_scope.int(hp.quniform('max_features', 1, 10000, q=1)),
    # Conditional sub-space: 'smooth_idf' is only sampled when 'use_idf' is True.
    # objective() merges this nested dict into the top-level parameters.
    'idf': hp.choice('idf', [
        {
            'use_idf': True,
            'smooth_idf': hp.choice('smooth_idf', [True, False]),
        },
        {
            'use_idf': False,
        }
    ])
}

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Base name of the checkpoint file: run_trials() loads and saves the pickled
# Trials object at f"{_model}.hyperopt", relative to the current working directory.
_model = "Conical_Demo"

def run_trials(trials_step=1, max_trials=20):
    """Run (or resume) the hyperopt search and checkpoint the Trials object to disk.

    If a checkpoint file ``f"{_model}.hyperopt"`` exists, it is loaded and the
    search is extended by ``trials_step`` evaluations. Otherwise a fresh search
    of ``max_trials`` evaluations is started. The Trials object is pickled back
    to the same file once ``fmin`` returns.

    Args:
        trials_step: how many additional trials to run after loading saved
            trials. 1 means the checkpoint is rewritten after every trial.
        max_trials: number of evaluations for a fresh search (no checkpoint).
            Keep it small to avoid a long wait before the first save.

    Returns:
        None. Prints the best parameters found so far.
    """
    checkpoint_path = f"{_model}.hyperopt"
    try:  # try to load an already saved trials object, and increase the max
        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file; only load checkpoint files you created yourself.
        with open(checkpoint_path, "rb") as f:
            trials = pickle.load(f)
        print("Found saved Trials! Loading...")
        n_done = len(trials.trials)
        max_trials = n_done + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(n_done, max_trials, trials_step))
    except FileNotFoundError:
        # No checkpoint yet: create a new trials object and start searching.
        trials = Trials()
    except Exception as err:
        # Unreadable or incompatible checkpoint: keep the best-effort behaviour
        # of starting over, but say so. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Could not load saved Trials ({!r}); starting a new search.".format(err))
        trials = Trials()

    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_trials, trials=trials)
    print("Best:", space_eval(space, best))

    # save the trials object
    print("Saving...")
    with open(checkpoint_path, "wb") as f:
        pickle.dump(trials, f)
    print("...Done")

# This cell calls run_trials() once. To keep searching, call it in a loop (or
# re-run this cell) and stop whenever you like: every call after the first
# loads the saved Trials file and extends the search by trials_step evaluations.
# For the paper, we stopped once we reached max_trials
run_trials()

100%|██████████| 20/20 [02:39<00:00,  7.97s/it, best loss: -1.0]
Best: {'final_norm': 'l1', 'idf': {'smooth_idf': True, 'use_idf': True}, 'max_features': 8956, 'norm': 'l2', 'remove_outliers': False, 'sublinear_tf': True, 'use_bns': True}
Saving...
...Done
