In [1]:
# Print the running kernel's connection details so another client can attach to it.
# NOTE(review): the printed output includes the kernel's connection key; consider
# clearing this cell's output before sharing the notebook.
%connect_info

{
  "shell_port": 63051,
  "iopub_port": 63052,
  "stdin_port": 63053,
  "control_port": 63055,
  "hb_port": 63054,
  "ip": "127.0.0.1",
  "key": "88f96dd4-a6dec9b03f3505fe8e2ddb95",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-fa16572b-ef4a-40c4-b92e-d143035687a5.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Possible models

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias.

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.

`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters. The multilingual DistilBERT model, distilled from the Multilingual BERT checkpoint `bert-base-multilingual-cased`.

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.hyperopt import HyperOptSearch
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [4]:
# --- Configuration ---------------------------------------------------------
SEED = 42                                   # shared random_state for every estimator
BASE_DIR = os.path.expanduser("~")          # the user's home directory
TRAIN_DIR = os.path.join(BASE_DIR, "ray_results")

# Directory holding the pickled embedding data. Set the MWE_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's location.
DATA_DIR = os.environ.get(
    "MWE_DATA_DIR",
    "/Users/gian/Documents/research/mwe_sharedtask/data",
)

# Which pre-computed embedding file to load.
model_type = 'distilbert-base-multilingual-cased'
# model_type = 'bert-base-multilingual-cased'

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load embedding files that you generated yourself or otherwise trust.
with open(os.path.join(DATA_DIR, '{}.embdata.pkl'.format(model_type)), 'rb') as f:
    data = pickle.load(f)

In [5]:
# Language codes whose per-language splits are pooled into single arrays.
codes = ['DE', 'GA', 'HI', 'PT', 'ZH']

# Stack the training features / labels of every language along the first axis.
x_train, y_train = (
    np.concatenate([data[lang][key] for lang in codes], axis=0)
    for key in ('x_train', 'y_train')
)
print(x_train.shape, y_train.shape)

# Same pooling for the development split.
x_dev, y_dev = (
    np.concatenate([data[lang][key] for lang in codes], axis=0)
    for key in ('x_dev', 'y_dev')
)
print(x_dev.shape, y_dev.shape)

# Drop the reference to the loaded dict so its memory can be reclaimed.
# After this, the loading cell must be re-run before this cell can run again.
del data

(66338, 768) (66338,)
(4330, 768) (4330,)


In [6]:


# Search space for k-nearest neighbours.
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class itself. The first argument of every hp.* call is a hyperopt label, which
# must be unique across the whole search space, hence the 'knn_' prefix.
knn_space = {
    'model': hp.choice('knn_model', [KNeighborsClassifier]),
    # hp.randint's upper bound is exclusive: draws 1..9.
    'n_neighbors': hp.randint('knn_n_neighbors', 1, 10),
    'weights': hp.choice('knn_weights', ['uniform', 'distance']),
    # Previously hp.randint('p', 1, 2), which can only ever return 1 because the
    # upper bound is exclusive; choose explicitly between the two Minkowski powers.
    'p': hp.choice('knn_p', [1, 2]),
    'n_jobs': hp.choice('knn_n_jobs', [-1])
}

# Search space for a linear SVM (LinearSVC).
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries an 'lsvm_' prefix (the bare 'C', 'penalty', 'model', ... labels
# are also needed by the other classifiers' spaces).
# NOTE(review): penalty, loss and dual are drawn independently, but LinearSVC
# does not support every combination (e.g. penalty='l1' with loss='hinge'), so
# some draws will fail when the estimator is fitted.
lsvm_space = {
    'model': hp.choice('lsvm_model', [LinearSVC]),
    'penalty': hp.choice('lsvm_penalty', ['l1', 'l2']),
    'loss': hp.choice('lsvm_loss', ['hinge', 'squared_hinge']),
    # Log-uniform over [1e-6, 1e+4].
    'C': hp.loguniform('lsvm_C', np.log(1e-6), np.log(1e+4)),
    'class_weight': hp.choice('lsvm_class_weight', ['balanced']),
    'random_state': hp.choice('lsvm_random_state', [SEED]),
    'dual': hp.choice('lsvm_dual', [True, False])
}


# Search space for a kernel SVM (SVC).
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries an 'svm_' prefix.
svm_space = {
    'model': hp.choice('svm_model', [SVC]),
    'kernel': hp.choice('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    # Upper bound is exclusive: draws 1 or 2. Only the 'poly' kernel uses it.
    'degree': hp.randint('svm_degree', 1, 3),
    'gamma': hp.choice('svm_gamma', ['scale', 'auto']),
    # Log-uniform over [1e-6, 1e+4].
    'C': hp.loguniform('svm_C', np.log(1e-6), np.log(1e+4)),
    'class_weight': hp.choice('svm_class_weight', ['balanced']),
    'random_state': hp.choice('svm_random_state', [SEED]),
}

# Search space for logistic regression.
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries an 'lreg_' prefix.
lreg_space = {
    'model': hp.choice('lreg_model', [LogisticRegression]),
    'penalty': hp.choice('lreg_penalty', ['l1', 'l2']),
    # Log-uniform over [1e-6, 1e+4].
    'C': hp.loguniform('lreg_C', np.log(1e-6), np.log(1e+4)),
    'solver': hp.choice('lreg_solver', ['liblinear']),
    # NOTE(review): scikit-learn documents n_jobs as having no effect with the
    # 'liblinear' solver - confirm whether this entry is still wanted.
    'n_jobs': hp.choice('lreg_n_jobs', [-1])
}

# Search space for AdaBoost over decision trees.
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries an 'adab_' prefix.
adab_space = {
    'model': hp.choice('adab_model', [AdaBoostClassifier]),
    # The tree depth used to be an hp.randint(...) expression passed straight
    # into the DecisionTreeClassifier constructor. hyperopt cannot see inside an
    # estimator object, so the tree kept an unevaluated expression as max_depth.
    # Instead, choose among concrete trees with depths 1..9 (the same range the
    # original randint(1, 10) described).
    'base_estimator': hp.choice(
        'adab_base_estimator',
        [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 10)],
    ),
    # Multiples of 5 from 5 to 495 (randint upper bound is exclusive).
    'n_estimators': hp.randint('adab_n_estimators', 1, 100) * 5,
    'learning_rate': hp.uniform('adab_learning_rate', 0.1, 1.0),
    'random_state': hp.choice('adab_random_state', [SEED]),
}

# Search space for histogram-based gradient boosting.
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries a 'hist_' prefix.
hist_space = {
    'model': hp.choice('hist_model', [HistGradientBoostingClassifier]),
    'learning_rate': hp.uniform('hist_learning_rate', 0.1, 1.0),
    # randint upper bounds are exclusive: 2..99, 2..99 and 1..49 respectively.
    'max_leaf_nodes': hp.randint('hist_max_leaf_nodes', 2, 100),
    'max_depth': hp.randint('hist_max_depth', 2, 100),
    'min_samples_leaf': hp.randint('hist_min_samples_leaf', 1, 50),
    'l2_regularization': hp.uniform('hist_l2_regularization', 0.0, 10.0),
    # Multiples of 5 from 5 to 255. The previous range reached 295, but the
    # estimator rejects max_bins values above 255.
    'max_bins': hp.randint('hist_max_bins', 1, 52) * 5,
    'random_state': hp.choice('hist_random_state', [SEED]),
}

# Search space for a random forest.
# Dict keys are the estimator's keyword arguments; 'model' holds the estimator
# class. hyperopt labels must be unique across the whole search space, so each
# one carries an 'rf_' prefix.
rf_space = {
    'model': hp.choice('rf_model', [RandomForestClassifier]),
    # Multiples of 5 from 5 to 495 (randint upper bound is exclusive).
    'n_estimators': hp.randint('rf_n_estimators', 1, 100) * 5,
    'criterion': hp.choice('rf_criterion', ['gini', 'entropy']),
    'max_leaf_nodes': hp.randint('rf_max_leaf_nodes', 2, 100),
    'max_depth': hp.randint('rf_max_depth', 2, 100),
    'min_samples_leaf': hp.randint('rf_min_samples_leaf', 1, 50),
    # Lower bound raised from 1 to 2: an integer min_samples_split must be at
    # least 2, so a draw of 1 made the estimator fail at fit time. Draws 2..9.
    'min_samples_split': hp.randint('rf_min_samples_split', 2, 10),
    'max_features': hp.choice('rf_max_features', ['auto', 'sqrt', 'log2', None]),
    'random_state': hp.choice('rf_random_state', [SEED]),
    'n_jobs': hp.choice('rf_n_jobs', [-1])
}


In [7]:
# Top-level space: pick exactly one of the per-classifier sub-spaces; the chosen
# sub-space dict is returned under the 'classifier' key.
search_space = dict(
    classifier=hp.choice(
        'classifier',
        [
            knn_space, lsvm_space, svm_space, lreg_space,
            adab_space, hist_space, rf_space,
        ],
    )
)


In [20]:
# Draw one random configuration from the combined space as a quick sanity check.
# NOTE(review): no rng is passed to sample(), so the draw presumably differs
# between runs - confirm if reproducibility matters here.
_sample = sample(search_space)
_sample

{'classifier': {'C': 123.36800787123339,
  'class_weight': 'balanced',
  'degree': 1,
  'gamma': 'scale',
  'kernel': 'sigmoid',
  'model': sklearn.svm._classes.SVC,
  'random_state': 42}}

In [21]:
# Unwrap the 'classifier' entry so only the estimator settings remain.
# This rebinds _sample, so the cell cannot be re-run without drawing a new sample first.
_sample = _sample['classifier']

In [22]:
# Inspect the unwrapped configuration.
_sample

{'C': 123.36800787123339,
 'class_weight': 'balanced',
 'degree': 1,
 'gamma': 'scale',
 'kernel': 'sigmoid',
 'model': sklearn.svm._classes.SVC,
 'random_state': 42}

In [23]:
# Take the estimator class out of the dict; what remains in _sample is passed
# to that class as keyword arguments in the next cell.
# pop() mutates _sample, so re-running this cell raises KeyError.
clf = _sample.pop('model')
clf

In [24]:
# Instantiate the sampled estimator class with the sampled hyperparameters.
# clf is rebound here from the class to an instance of it.
clf = clf(**_sample)

In [25]:
# Smoke test: fit the sampled estimator.
# NOTE(review): this fits on the dev split (x_dev, y_dev) rather than x_train -
# presumably because it is much smaller; confirm this is intentional.
clf.fit(x_dev, y_dev)

SVC(C=123.36800787123339, class_weight='balanced', degree=1, kernel='sigmoid',
    random_state=42)

In [26]:
# Display the fitted estimator.
clf

SVC(C=123.36800787123339, class_weight='balanced', degree=1, kernel='sigmoid',
    random_state=42)

In [None]:
searc