In [1]:
%connect_info

{
  "shell_port": 63953,
  "iopub_port": 63954,
  "stdin_port": 63955,
  "control_port": 63957,
  "hb_port": 63956,
  "ip": "127.0.0.1",
  "key": "de869a4c-ea5b6308c7335243215523df",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-70585f26-8860-491b-be7b-058c661c86c5.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

## Possible models

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.

`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters The multilingual DistilBERT model distilled from the Multilingual BERT model bert-base-multilingual-cased checkpoint.

In [8]:
import pickle
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


SEED = 42

In [21]:
model_type = 'distilbert-base-multilingual-cased'
with open('data/{}.embdata.pkl'.format(model_type), 'rb') as f:
    data = pickle.load(f)

In [22]:
x_train = np.concatenate([
    data['DE']['x_train'],
    data['GA']['x_train'],
    data['HI']['x_train'],
    data['PT']['x_train'],
    data['ZH']['x_train'],
], axis=0)

y_train = np.concatenate([
    data['DE']['y_train'],
    data['GA']['y_train'],
    data['HI']['y_train'],
    data['PT']['y_train'],
    data['ZH']['y_train'],
], axis=0)

In [23]:
x_train.shape, y_train.shape

((66338, 768), (66338,))

In [24]:
x_dev = np.concatenate([
    data['DE']['x_dev'],
    data['GA']['x_dev'],
    data['HI']['x_dev'],
    data['PT']['x_dev'],
    data['ZH']['x_dev'],
], axis=0)

y_dev = np.concatenate([
    data['DE']['y_dev'],
    data['GA']['y_dev'],
    data['HI']['y_dev'],
    data['PT']['y_dev'],
    data['ZH']['y_dev'],
], axis=0)

In [25]:
x_dev.shape, y_dev.shape

((4330, 768), (4330,))

In [26]:
del data

In [27]:
# x_train, x_val, y_train, y_val = train_test_split(
#     x_train, y_train, test_size=0.33, random_state=SEED)

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
knn = KNeighborsClassifier(class_weight='balanced')
knn.fit(x_train, y_train)

TypeError: __init__() got an unexpected keyword argument 'class_weight'

In [30]:
y_pred = knn.predict(x_dev)

In [32]:
print(confusion_matrix(y_dev, y_pred))

array([[2775,  534],
       [ 560,  461]])

In [34]:
print(classification_report(y_dev, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      3309
           1       0.46      0.45      0.46      1021

    accuracy                           0.75      4330
   macro avg       0.65      0.65      0.65      4330
weighted avg       0.75      0.75      0.75      4330



In [35]:
from sklearn.svm import LinearSVC, SVC

In [38]:
svm = LinearSVC(dual=False, class_weight='balanced')
svm.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [39]:
y_pred = svm.predict(x_dev)

In [40]:
print(confusion_matrix(y_dev, y_pred))

print(classification_report(y_dev, y_pred))

[[3082  227]
 [ 659  362]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      3309
           1       0.61      0.35      0.45      1021

    accuracy                           0.80      4330
   macro avg       0.72      0.64      0.66      4330
weighted avg       0.77      0.80      0.77      4330



In [44]:
svc = SVC(class_weight='balanced')
svc.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
y_pred = svc.predict(x_dev)

In [None]:
print(confusion_matrix(y_dev, y_pred))

print(classification_report(y_dev, y_pred))

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)

In [None]:
y_pred = sgd.predict(x_dev)

In [None]:
print(confusion_matrix(y_dev, y_pred))

print(classification_report(y_dev, y_pred))

In [46]:
from sklearn.ensemble import AdaBoostClassifier

In [47]:
ada = AdaBoostClassifier()
ada.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [48]:
y_pred = ada.predict(x_dev)

In [49]:
print(confusion_matrix(y_dev, y_pred))

print(classification_report(y_dev, y_pred))

[[3108  201]
 [ 791  230]]
              precision    recall  f1-score   support

           0       0.80      0.94      0.86      3309
           1       0.53      0.23      0.32      1021

    accuracy                           0.77      4330
   macro avg       0.67      0.58      0.59      4330
weighted avg       0.73      0.77      0.73      4330

