In [1]:
%connect_info

{
  "shell_port": 53779,
  "iopub_port": 53780,
  "stdin_port": 53781,
  "control_port": 53783,
  "hb_port": 53782,
  "ip": "127.0.0.1",
  "key": "32ad02a4-a780c28010df68cd5340b8c6",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-6983e239-3446-4436-8c20-4b079ef19695.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

## Suggested models


`bert-base-multilingual-uncased`: (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on lower-cased text in the top 102 languages with the largest Wikipedia

`bert-base-multilingual-cased`: (New, recommended) 12-layer, 768-hidden, 12-heads, 110M parameters. Trained on cased text in the top 104 languages with the largest Wikipedias

`xlm-mlm-100-1280`: 16-layer, 1280-hidden, 16-heads XLM model trained with MLM (Masked Language Modeling) on 100 languages.


`distilbert-base-multilingual-cased`: 6-layer, 768-hidden, 12-heads, 134M parameters The multilingual DistilBERT model distilled from the Multilingual BERT model bert-base-multilingual-cased checkpoint.

In [3]:
import pickle
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from preprocess import load_dataset, Features, load_word_dataset

SEED = 42

In [7]:
train_files = ['../data/GA/train.cupt']
dev_files = ['../data/GA/train.cupt']

In [8]:
train_sentences, train_labels = load_word_dataset(train_files)
dev_sentences, dev_labels = load_word_dataset(dev_files)

In [9]:
len(train_sentences)

257

In [10]:
model_type = 'distilbert-base-multilingual-cased'

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = TFAutoModel.from_pretrained(model_type)

In [19]:
model

<transformers.modeling_tf_distilbert.TFDistilBertModel at 0x10f98b710>

In [32]:
input_ids = tokenizer.encode(
    train_sentences[0],
    add_special_tokens=True,
    return_tensors='tf')

In [37]:
output = model(input_ids),
output[0][0].numpy()

array([[[-0.26731867,  0.04377769, -0.27627617, ...,  0.2619894 ,
          0.26026088, -0.23037311],
        [-0.39187068,  0.13450783,  0.32470304, ...,  0.47873366,
          0.46392375, -0.80128056],
        [-0.31140715,  0.02606569, -0.53775966, ...,  0.29545897,
          0.18285292, -0.42649287],
        ...,
        [-0.8729709 ,  0.06480993, -0.10463849, ...,  0.48023617,
          0.48928964, -0.5513086 ],
        [-0.69835585, -0.08516489, -0.48497486, ...,  0.34225577,
          0.5646079 , -0.31343958],
        [-0.4513158 , -0.06368764, -0.10364896, ...,  0.22472593,
          0.44874197, -0.30655885]]], dtype=float32)

In [38]:
output[0][0].numpy()[:, 0]

array([[-2.67318666e-01,  4.37776893e-02, -2.76276171e-01,
         1.11780502e-01,  6.47455007e-02,  4.28896658e-02,
        -6.91449195e-02, -1.96380332e-01,  1.47280674e-02,
         2.00100541e-01,  1.60346389e-01, -1.76126957e-02,
         9.46132317e-02,  1.15133107e-01, -6.73948646e-01,
         2.19163224e-01, -1.33663416e-05,  3.61494422e-01,
        -1.73927218e-01,  1.87225163e-01,  1.31695569e-02,
         1.31302327e-01, -1.00700006e-01,  2.05842003e-01,
         2.97709644e-01, -3.64328086e-01,  2.63200402e-02,
         4.48846966e-01,  6.78964794e-01, -2.19417810e-01,
        -3.57347727e-02,  1.08223461e-01, -3.86584729e-01,
        -1.12254769e-01, -5.67049682e-02, -1.67787343e-01,
        -1.93233037e+00,  2.74989963e-01, -4.58877906e-02,
        -2.38275558e-01, -4.67977040e-02, -1.53626744e-02,
        -2.80302644e-01,  2.66601145e-01,  1.71466395e-02,
         1.22581851e+00,  1.55617714e-01, -1.73593350e-02,
         1.15643024e+00, -1.63122341e-01, -1.64613888e-0

In [17]:
# tf.keras.layers.GlobalAveragePooling1D()(output[0]).numpy().shape
output[0][1]

IndexError: tuple index out of range

In [5]:
model_type = 'distilbert-base-multilingual-cased'

In [9]:
code = 'DE'
with open('data/{}.{}.embdata.pkl'.format(code, model_type), 'rb') as f:
    de = pickle.load(f)

In [10]:
code = 'GA'
with open('data/{}.{}.embdata.pkl'.format(code, model_type), 'rb') as f:
    ga = pickle.load(f)

In [11]:
code = 'HI'
with open('data/{}.{}.embdata.pkl'.format(code, model_type), 'rb') as f:
    hi = pickle.load(f)

In [12]:
code = 'PT'
with open('data/{}.{}.embdata.pkl'.format(code, model_type), 'rb') as f:
    pt = pickle.load(f)

In [13]:
code = 'ZH'
with open('data/{}.{}.embdata.pkl'.format(code, model_type), 'rb') as f:
    zh = pickle.load(f)

In [14]:
data = {
    'DE': de,
    'GA': ga,
    'HI': hi,
    'PT': pt,
    'ZH': zh,    
}

In [16]:
with open('{}.embdata.pkl'.format(model_type), 'wb') as f:
    pickle.dump(data, f)

In [17]:
import sys

In [18]:
sys.getsizeof(data)

248

In [21]:
(pt['x_train'].nbytes / 1024) / 1024

70.0341796875