In [1]:
# Debugging aid: prints this kernel's connection details. The recorded output
# includes the kernel's signing key and ports -- clear it before sharing.
%connect_info

{
  "shell_port": 53932,
  "iopub_port": 53933,
  "stdin_port": 53934,
  "control_port": 53936,
  "hb_port": 53935,
  "ip": "127.0.0.1",
  "key": "7f2b104b-d8c4ef6d3ac3bdc38ef75471",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-2905a8e1-b421-4a7f-b6cd-aab532051d63.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Suggested models


`bert-base-multilingual-uncased`: (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on lower-cased text in the top 102 languages with the largest Wikipedias.

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.


`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters. The multilingual DistilBERT model, distilled from the Multilingual BERT checkpoint `bert-base-multilingual-cased`.

In [8]:
import pickle
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from preprocess import load_dataset, Features, load_word_dataset

# Random seed constant. NOTE(review): not referenced by any of the cells
# visible in this notebook -- confirm whether it is still needed.
SEED = 42

In [4]:
# Paths to the GA (Irish) corpus splits in .cupt format.
# The dev split must be a different file from the training split; pointing both
# at train.cupt makes any dev-set evaluation a measurement on training data.
# NOTE(review): assumes data/GA/dev.cupt is present next to train.cupt -- confirm.
train_files = ['data/GA/train.cupt']
dev_files = ['data/GA/dev.cupt']

In [5]:
# Load each split with the project's load_word_dataset helper; the result is
# unpacked as a (sentences, labels) pair per split.
train_sentences, train_labels = load_word_dataset(train_files)
dev_sentences, dev_labels = load_word_dataset(dev_files)

In [6]:
# Sanity check: number of items loaded for the training split.
len(train_sentences)

257

In [7]:
# Pretrained checkpoint to load (one of the suggested models listed above).
model_type = 'distilbert-base-multilingual-cased'

In [8]:
# Instantiate the tokenizer and the TensorFlow model for the checkpoint named
# by model_type, so both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = TFAutoModel.from_pretrained(model_type)

In [14]:
# Encode the first training sentence (with special tokens) as a TensorFlow
# tensor so it can be fed straight to the model.
first_sentence = train_sentences[0]
input_ids = tokenizer.encode(first_sentence, return_tensors='tf', add_special_tokens=True)

In [15]:
# Forward pass of the encoded sentence through the pretrained model.
output = model(input_ids)

In [17]:
# Shape of the first model output; the recorded result is (1, 37, 768).
# Presumably (batch, tokens, hidden size) -- 768 matches the model card above.
output[0].shape

TensorShape([1, 37, 768])

In [20]:
# Average the first model output over its middle axis with Keras'
# GlobalAveragePooling1D, then inspect the pooled shape.
mean_pool = tf.keras.layers.GlobalAveragePooling1D()
sentence_vector = mean_pool(output[0])
sentence_vector.numpy().shape

(1, 768)

In [5]:
# Checkpoint name used to build the embedding-pickle filenames in the cells
# below (same value as the earlier assignment in this notebook).
model_type = 'distilbert-base-multilingual-cased'

In [9]:
# Language codes whose embedding pickles are combined, in dictionary order.
LANG_CODES = ('DE', 'GA', 'HI', 'PT', 'ZH')


def load_embdata(code, model_type, data_dir='data'):
    """Load the pickled embedding data for one language.

    Reads ``<data_dir>/<code>.<model_type>.embdata.pkl`` and returns whatever
    object was pickled there.

    WARNING: ``pickle.load`` can execute arbitrary code. Only use this on
    files produced by this project, never on untrusted input.

    Raises:
        FileNotFoundError: if the pickle for ``code`` does not exist.
    """
    path = '{}/{}.{}.embdata.pkl'.format(data_dir, code, model_type)
    with open(path, 'rb') as f:
        return pickle.load(f)


# One helper + loop instead of five copy-pasted cells. A plain for-loop is used
# so `code` is still bound at module level afterwards (ending as 'ZH').
data = {}
for code in LANG_CODES:
    data[code] = load_embdata(code, model_type)

# Keep the per-language names that later cells refer to (e.g. pt['x_train']).
de, ga, hi, pt, zh = (data[lang] for lang in LANG_CODES)

In [16]:
# Persist the combined per-language dictionary as a single pickle named after
# the checkpoint, written to the current working directory.
combined_path = '{}.embdata.pkl'.format(model_type)
with open(combined_path, 'wb') as out_file:
    pickle.dump(data, out_file)

In [17]:
import sys

In [18]:
# NOTE: sys.getsizeof is shallow -- it reports only the dict object's own
# footprint, not the objects stored inside it, so the small number below says
# nothing about the real payload size.
sys.getsizeof(data)

248

In [21]:
# In-memory size of pt['x_train'] in MiB (bytes / 1024**2).
pt['x_train'].nbytes / (1024 * 1024)

70.0341796875