In [1]:
# Debugging aid: prints this kernel's connection details so another frontend can attach.
# NOTE(review): the printed JSON includes the kernel's message-signing "key";
# consider clearing this cell's output before sharing the notebook.
%connect_info

{
  "shell_port": 56716,
  "iopub_port": 56717,
  "stdin_port": 56718,
  "control_port": 56720,
  "hb_port": 56719,
  "ip": "127.0.0.1",
  "key": "35e558e6-4ac70019b11494fd73c0b9e7",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-7de0b04b-47f1-4309-b3ac-2542eef53c3e.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline

## Possible models

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias.

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.

`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters. The multilingual DistilBERT model, distilled from the Multilingual BERT checkpoint `bert-base-multilingual-cased`.

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [9]:
# --- Configuration -----------------------------------------------------------
SEED = 42
BASE_DIR = os.path.expanduser("~")     # the current user's home directory
TRAIN_DIR = os.path.join(BASE_DIR, "ray_results")

# Directory holding the pickled embedding files. Defaults to the original
# author's location; set the MWE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "MWE_DATA_DIR",
    "/Users/gian/Documents/research/mwe_sharedtask/data",
)

# Encoder whose embeddings are loaded (see "Possible models").
model_type = 'distilbert-base-multilingual-cased'
# model_type = 'bert-base-multilingual-cased'

# --- Load embeddings ---------------------------------------------------------
embeddings_path = os.path.join(DATA_DIR, '{}.embdata.pkl'.format(model_type))

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only point this at embedding files you produced yourself or fully trust.
with open(embeddings_path, 'rb') as f:
    data = pickle.load(f)

In [10]:
# Language codes whose per-language splits are merged into one dataset.
codes = ['DE', 'GA', 'HI', 'PT', 'ZH']

# Training split: stack every language's arrays along the first axis.
x_train, y_train = (
    np.concatenate([data[lang][split] for lang in codes])
    for split in ('x_train', 'y_train')
)
print(x_train.shape, y_train.shape)

# Development split: same stacking, same language order.
x_dev, y_dev = (
    np.concatenate([data[lang][split] for lang in codes])
    for split in ('x_dev', 'y_dev')
)
print(x_dev.shape, y_dev.shape)

# The per-language dict is not used again in this cell; drop the reference.
del data

(66338, 768) (66338,)
(4330, 768) (4330,)


In [None]:
# Hyper-parameter search spaces, one per candidate classifier.
# The 'model' entry swaps the estimator in the pipeline step named 'model';
# 'model__<param>' entries tune that estimator's own parameters.

# k-nearest neighbours: neighbourhood size, vote weighting, and Minkowski
# power (p=1 Manhattan, p=2 Euclidean).
knn_space = {
    'model': [KNeighborsClassifier(n_jobs=-1)],
    'model__n_neighbors': Integer(1, 10),
    'model__weights':  Categorical(['uniform', 'distance']),
    'model__p':  Integer(1, 2)
}

# Linear SVM. The estimator is fixed to the primal problem (dual=False), for
# which scikit-learn only supports the squared hinge loss: sampling
# loss='hinge' raises ValueError for both penalties. The loss is therefore
# pinned on the estimator instead of being searched.
svm_space = {
    'model': [LinearSVC(dual=False, loss='squared_hinge',
                        class_weight='balanced', random_state=SEED)],
    'model__penalty': Categorical(['l1', 'l2']),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
}

# Logistic regression with liblinear, which supports both l1 and l2.
# random_state is set because liblinear shuffles the data, keeping runs
# reproducible like the SVM above.
lreg_space = {
    'model': [LogisticRegression(solver='liblinear', random_state=SEED)],
    'model__penalty': Categorical(['l1', 'l2']),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
}

In [None]:
# Single-step pipeline; the step is named 'model' and starts out as a
# liblinear logistic regression.
pipe = Pipeline(
    steps=[('model', LogisticRegression(solver='liblinear'))],
)

In [None]:
# Bayesian hyper-parameter search over the three candidate model families.
# Each search space carries its own evaluation budget as
# (parameter space, number of evaluations). With tuple-style spaces
# BayesSearchCV uses that per-space budget and ignores the global `n_iter`,
# so the global value is not passed.
opt = BayesSearchCV(
    pipe,
    [(knn_space, 20), (svm_space, 20), (lreg_space, 20)],
    cv=3,                 # 3-fold cross-validation per candidate
    n_jobs=-1,            # use all available cores
    random_state=SEED,
    verbose=2,
)

opt.fit(x_train, y_train)