In [1]:
# Print this kernel's connection details (ports, IP, signing key) so another frontend can attach.
# NOTE(review): the saved output below includes the kernel "key"; consider clearing it before sharing.
%connect_info

{
  "shell_port": 52566,
  "iopub_port": 52567,
  "stdin_port": 52568,
  "control_port": 52570,
  "hb_port": 52569,
  "ip": "127.0.0.1",
  "key": "e610fee3-354b090fdd670f29db3b9d49",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-48648ad9-bee6-4098-b805-43c171777f8a.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# Select the inline matplotlib backend so figures are rendered in the cell output.
# NOTE(review): no plotting call is visible in this part of the notebook — confirm this is still needed.
%matplotlib inline

## Possible models

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias.

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.

`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters. The multilingual DistilBERT model, distilled from the Multilingual BERT checkpoint `bert-base-multilingual-cased`.

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# numpy's MaskedArray is re-exported through sklearn.utils.fixes below.
# NOTE(review): presumably a compatibility shim for skopt, which may look up
# MaskedArray in sklearn.utils.fixes — confirm against the installed versions.
from numpy.ma import MaskedArray
import sklearn.utils.fixes

# Monkey-patch: make sklearn.utils.fixes.MaskedArray refer to numpy's class.
sklearn.utils.fixes.MaskedArray = MaskedArray

import skopt
from skopt.space import Real, Categorical, Integer

In [4]:
# Global configuration: random seed and directories rooted at the user's home.
SEED = 42
BASE_DIR = os.path.expanduser("~")
TRAIN_DIR = "".join((BASE_DIR, "/ray_results"))

# Name of the pre-computed embedding set to load (alternative kept for reference).
model_type = 'avg.distilbert-base-multilingual-cased'
# model_type = 'bert-base-multilingual-cased'

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
embdata_path = 'data/{}.embdata.pkl'.format(model_type)
with open(embdata_path, 'rb') as f:
    data = pickle.load(f)


In [5]:
# Language codes whose per-language arrays are pooled into one train and one dev set.
codes = ['DE', 'GA', 'HI', 'PT', 'ZH']


def _stack_split(key):
    """Concatenate ``data[code][key]`` for every code in ``codes`` along axis 0."""
    per_language = [data[code][key] for code in codes]
    return np.concatenate(per_language, axis=0)


x_train = _stack_split('x_train')
y_train = _stack_split('y_train')
print(x_train.shape, y_train.shape)

x_dev = _stack_split('x_dev')
y_dev = _stack_split('y_dev')
print(x_dev.shape, y_dev.shape)

# Drop the reference to the loaded dictionary; re-running this cell requires reloading it.
del data

(66338, 768) (66338,)
(4330, 768) (4330,)


In [6]:

# Search space for KNeighborsClassifier; n_jobs is pinned to a single value.
knn_space = dict(
    n_neighbors=Integer(1, 10),
    weights=Categorical(['uniform', 'distance']),
    p=Integer(1, 2),
    n_jobs=Categorical([-1]),
)

# Search space for an L1-penalised LinearSVC; only C is actually searched,
# every other entry is a single-value dimension.
l1svm_space = dict(
    penalty=Categorical(['l1']),
    loss=Categorical(['squared_hinge']),
    C=Real(1e-6, 1e+6, prior='log-uniform'),
    dual=Categorical([False]),
    class_weight=Categorical(['balanced']),
    random_state=Categorical([SEED]),
)

# Search space for an L2-penalised LinearSVC: the loss and C are searched,
# the remaining entries are single-value dimensions.
l2svm_space = dict(
    penalty=Categorical(['l2']),
    loss=Categorical(['hinge', 'squared_hinge']),
    C=Real(1e-6, 1e+6, prior='log-uniform'),
    dual=Categorical([True]),
    class_weight=Categorical(['balanced']),
    random_state=Categorical([SEED]),
)


# Search space for a kernel SVC: kernel, degree, gamma and C are searched.
svm_space = dict(
    kernel=Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
    degree=Integer(1, 3),
    gamma=Categorical(['scale', 'auto']),
    C=Real(1e-6, 1e+6, prior='log-uniform'),
    class_weight=Categorical(['balanced']),
    random_state=Categorical([SEED]),
)

# Search space for LogisticRegression with the liblinear solver.
lreg_space = dict(
    penalty=Categorical(['l1', 'l2']),
    C=Real(1e-6, 1e+6, prior='log-uniform'),
    solver=Categorical(['liblinear']),
)

# Search space for AdaBoostClassifier; no base-estimator setting is part of
# the space, only the ensemble size and learning rate are searched.
adab_space = dict(
    n_estimators=Integer(5, 50),
    learning_rate=Real(0.1, 1.0, prior='uniform'),
    random_state=Categorical([SEED]),
)

# Search space for HistGradientBoostingClassifier.
hist_space = {
    'learning_rate': Real(0.1, 1.0, prior='uniform'),
    'max_leaf_nodes': Integer(2, 100),
    'max_depth': Integer(2, 100),
    'min_samples_leaf': Integer(1, 50),
    'l2_regularization': Real(0.0, 10.0, prior='uniform'),
    # The estimator rejects max_bins above 255, so the upper bound is capped there.
    'max_bins': Integer(5, 255),
    'random_state': Categorical([SEED])
}

# Search space for RandomForestClassifier.
# Only parameters the estimator accepts are listed: 'l2_regularization' is not
# a RandomForestClassifier parameter and would be rejected when the search
# sets the sampled parameters on the estimator.
rf_space = {
    'n_estimators': Integer(5, 500),
    'criterion': Categorical(['gini', 'entropy']),
    'max_leaf_nodes': Integer(2, 100),
    'max_depth': Integer(2, 100),
    'min_samples_leaf': Integer(1, 50),
    # An integer min_samples_split must be at least 2.
    'min_samples_split': Integer(2, 10),
    # 'auto' is omitted: for classifiers it is an alias of 'sqrt' and newer
    # scikit-learn releases no longer accept it.
    'max_features': Categorical(['sqrt', 'log2', None]),
    'random_state': Categorical([SEED]),
    'n_jobs': Categorical([-1])
}

In [7]:
# Bayesian hyper-parameter search for k-NN: 32 iterations, 3-fold CV, F1 scoring.
knn_search_kwargs = dict(
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1',
)
opt = skopt.BayesSearchCV(KNeighborsClassifier(n_jobs=-1), knn_space, **knn_search_kwargs)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

# Confusion matrix first, then the per-class report, both on the dev split.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_dev, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 31.9min finished


TypeError: object.__init__() takes exactly one argument (the instance to initialize)

In [None]:
# Bayesian hyper-parameter search for the L1-penalised LinearSVC
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    LinearSVC(dual=False, class_weight='balanced', random_state=SEED),
    l1svm_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for the L2-penalised LinearSVC
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    LinearSVC(dual=True, class_weight='balanced', random_state=SEED),
    l2svm_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for the kernel SVC
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    SVC(class_weight='balanced', random_state=SEED),
    svm_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for LogisticRegression (liblinear)
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    LogisticRegression(solver='liblinear'),
    lreg_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for KNeighborsClassifier
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    KNeighborsClassifier(n_jobs=-1),
    knn_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for AdaBoostClassifier
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    AdaBoostClassifier(),
    adab_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for HistGradientBoostingClassifier
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    HistGradientBoostingClassifier(random_state=SEED),
    hist_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

In [None]:
# Bayesian hyper-parameter search for RandomForestClassifier
# (32 iterations, 3-fold CV, F1 scoring).
# BayesSearchCV is reached through the `skopt` module: the bare name is never
# imported in this notebook and would raise NameError on a fresh kernel.
opt = skopt.BayesSearchCV(
    RandomForestClassifier(random_state=SEED, n_jobs=-1),
    rf_space,
    n_iter=32,
    cv=3,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    scoring='f1'
)
opt.fit(x_train, y_train)

# Best cross-validation score, then the score on the held-out dev split.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(x_dev, y_dev))

y_pred = opt.predict(x_dev)

print(confusion_matrix(y_dev, y_pred))
print(classification_report(y_dev, y_pred))