In [1]:
# Debugging aid: print the connection details of the running kernel.
# NOTE(review): the printed JSON includes the kernel's message-signing key;
# clear this cell's output before sharing or committing the notebook.
%connect_info

{
  "shell_port": 59971,
  "iopub_port": 59972,
  "stdin_port": 59973,
  "control_port": 59975,
  "hb_port": 59974,
  "ip": "127.0.0.1",
  "key": "a17a4a3f-1c45f4d5ec91773a4dc84304",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-a58010ed-00f5-4dcf-9b50-c095da8d7c91.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import sys
# Keep only the first entry of sys.argv (the program name). argparse's
# parse_args() reads sys.argv[1:] by default, so this makes the parser in the
# configuration cell fall back to its defaults instead of seeing whatever
# arguments the process was started with.
sys.argv = sys.argv[:1]

In [4]:
import argparse
import os
import numpy as np

import torch
import torch.nn as nn

from sklearn.utils.class_weight import compute_class_weight
from skorch.callbacks import ProgressBar, EarlyStopping, Checkpoint
from skorch.helper import predefined_split

from transformers import AutoModel, AutoTokenizer

from models import CNNClassifier
from preprocess import load_tokenized_data
from skorch_custom import SentenceDataset, SkorchBucketIterator
from skorch_custom import IdiomClassifier, CustomScorer
from evaluation import evaluate_model

In [5]:
parser = argparse.ArgumentParser(description='Classifier using CNNs')
parser.add_argument(
    '--bert_type',
    type=str,
    default='distilbert-base-multilingual-cased',
    help='transormer model [should be a miltilingual model]')
parser.add_argument(
    '--bert_device',
    type=str,
    default='gpu',
    help='device to run the transformer model')
parser.add_argument(
    '--metric',
    type=str,
    default='f1',
    help='sklearn metric to evaluate the model while training')
parser.add_argument(
    '--nfilters',
    type=int,
    default=128,
    help='number of convolution filters')
parser.add_argument(
    '--kernels',
    type=list,
    default=[1, 2, 3, 4, 5],
    help='number of convolution filters')
parser.add_argument(
    '--pool_stride',
    type=int,
    default=3,
    help='size of the stride for the pooling operation')
parser.add_argument(
    '--dropout',
    type=float,
    default=0.2,
    help='dropout probability for the dense layer')
parser.add_argument(
    '--output_activation',
    type=str,
    default='sigmoid',
    help='output activation')
parser.add_argument(
    '--batch_size',
    type=int,
    default=32,
    help='training batch size')
parser.add_argument(
    '--max_epochs',
    type=int,
    default=100,
    help='max number of epochs to train the model')

args = parser.parse_args()
transformer_device = torch.device(
    'cuda' if torch.cuda.is_available() and args.bert_device == 'gpu'
    else 'cpu')
ONE_HOT_OUTPUT = args.output_activation == 'softmax'

In [6]:
# Fix the random seeds of NumPy and PyTorch (CPU and current CUDA device) so
# repeated runs start from the same random state.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Disable cuDNN autotuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Working directory of the notebook process; data paths are built from it.
CWD = os.getcwd()
# Language codes handed to the data loader.
LANGUAGE_CODES = ['DE', 'GA', 'HI', 'PT', 'ZH']
# Main compute device: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')     # pylint: disable=no-member
else:
    DEVICE = torch.device('cpu')     # pylint: disable=no-member

In [7]:
# Load the tokenized data file that matches the selected transformer name.
# The loader returns three (inputs, labels) pairs: train, validation and dev.
tokenized_path = '{}/data/{}.tokenized.pkl'.format(CWD, args.bert_type)
train_part, val_part, dev_part = load_tokenized_data(
    datafile=tokenized_path,
    language_codes=LANGUAGE_CODES,
    seed=SEED)
x_train, y_train = train_part
x_val, y_val = val_part
x_dev, y_dev = dev_part

# Flatten all training labels into one 1-D array and derive 'balanced' class
# weights from it, stored as a plain Python list.
targets = np.concatenate(y_train).reshape(-1)
observed_classes = np.unique(targets)
balanced_weights = compute_class_weight(class_weight='balanced',
                                        classes=observed_classes,
                                        y=targets)
class_weights = balanced_weights.tolist()

In [8]:
# Load the tokenizer and the transformer model published under the same
# pretrained name, so both come from the same checkpoint.
pretrained_name = args.bert_type
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
transformer = AutoModel.from_pretrained(pretrained_name)

In [9]:
# Build the CNN classifier around the loaded transformer; the transformer's
# own device is passed separately from the device used for the whole module.
model = CNNClassifier(args, transformer, transformer_device)
# Move the module to the main compute device.
model.to(DEVICE)     # pylint: disable=no-member
# NOTE(review): freeze_transformer() is defined in models.CNNClassifier (not
# visible here); presumably it stops the transformer weights from being
# trained - confirm.
model.freeze_transformer()

In [14]:
# Wrap the CNN model in the project's IdiomClassifier (configured with
# skorch-style `iterator_*__*` keyword arguments), wiring in the bucketed
# batch iterators, a fixed validation split, the optimizer/criterion and the
# training callbacks.
net = IdiomClassifier(
    module=model,
    class_weights=class_weights,
    # --- training batches ---
    iterator_train=SkorchBucketIterator,
    iterator_train__batch_size=args.batch_size,
    iterator_train__sort_key=lambda x: len(x.sentence),
    iterator_train__shuffle=True,
    iterator_train__device=DEVICE,
    iterator_train__one_hot=ONE_HOT_OUTPUT,
    # --- validation batches ---
    # NOTE(review): batch size is hard-coded to 32 here instead of following
    # args.batch_size, and validation batches are shuffled - confirm intended.
    iterator_valid=SkorchBucketIterator,
    iterator_valid__batch_size=32,
    iterator_valid__sort_key=lambda x: len(x.sentence),
    iterator_valid__shuffle=True,
    iterator_valid__device=DEVICE,
    iterator_valid__one_hot=ONE_HOT_OUTPUT,

    # NOTE(review): only the first 50 validation items are used; this looks
    # like a debugging shortcut - confirm before a real training run.
    train_split=predefined_split(SentenceDataset(data=(x_val[0:50], y_val[0:50]))),
    optimizer=torch.optim.Adam,
    criterion=nn.BCELoss,
    callbacks=[
        ProgressBar(batches_per_epoch=len(x_train) // args.batch_size + 1),
        CustomScorer(scoring=None, lower_is_better=False, use_caching=False),
        # Early stopping must watch the numeric `score` history column (higher
        # is better), not the boolean `score_best` flag: with the default
        # lower_is_better=True, monitoring the flag counts patience on the
        # wrong quantity and stops training regardless of real progress.
        EarlyStopping(monitor='score', lower_is_better=False, patience=5),
        # Checkpoint, by contrast, expects the boolean "<name>_best" column.
        Checkpoint(monitor='score_best')
    ],
    device=DEVICE,
)


In [15]:
# Fit for a single epoch on a dataset built from the first two entries of the
# training inputs and labels; labels travel inside the dataset, hence y=None.
smoke_train_set = SentenceDataset(data=(x_train[0:2], y_train[0:2]))
net.fit(smoke_train_set, y=None, epochs=1)

HBox(children=(FloatProgress(value=0.0, max=706.0), HTML(value='')))

Confusion matrix
[[1270    0]
 [  85    0]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      1270
         1.0       0.00      0.00      0.00        85

    accuracy                           0.94      1355
   macro avg       0.47      0.50      0.48      1355
weighted avg       0.88      0.94      0.91      1355



  _warn_prf(average, modifier, msg_start, len(result))


  epoch    score    train_loss    valid_loss    cp     dur
-------  -------  ------------  ------------  ----  ------
      1   [36m0.0000[0m        [32m0.2833[0m        [35m4.3100[0m     +  4.1306


<class 'skorch_custom.IdiomClassifier'>[initialized](
  module_=CNNClassifier(
    (transformer): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [13]:

# Evaluate the fitted network on the dev data of one language.
code = 'GA'
banner = '#' * 20
print(banner)
print('# Evaluating Language: {}'.format(code))
print(banner)

# Dev inputs/labels are indexed by language code. The iterator keeps the data
# order (no sorting, no shuffling) and is built in non-training mode.
dev_dataset = SentenceDataset(data=(x_dev[code], y_dev[code]))
use_one_hot = args.output_activation == 'softmax'
test_iterator = SkorchBucketIterator(
    dataset=dev_dataset,
    batch_size=32,
    sort=False,
    sort_key=lambda example: len(example.sentence),
    shuffle=False,
    train=False,
    one_hot=use_one_hot,
    device=DEVICE)

# The path of the language's dev .cupt file is attached to `args`, which is
# passed whole to evaluate_model (presumably read there - confirm).
args.dev_file = '{}/data/{}/dev.cupt'.format(CWD, code)
evaluate_model(net, test_iterator, tokenizer, args)

####################
# Evaluating Language: GA
####################
0
## Global evaluation
* MWE-based: P=102/497=0.2052 R=102/126=0.8095 F=0.3274
* Tok-based: P=287/7024=0.0409 R=287/287=1.0000 F=0.0785

## Per-category evaluation (partition of Global)
* IAV: MWE-proportion: gold=42/126=33% pred=38/497=8%
* IAV: MWE-based: P=33/38=0.8684 R=33/42=0.7857 F=0.8250
* IAV: Tok-based: P=77/88=0.8750 R=77/86=0.8953 F=0.8851
* LVC.cause: MWE-proportion: gold=22/126=17% pred=22/497=4%
* LVC.cause: MWE-based: P=18/22=0.8182 R=18/22=0.8182 F=0.8182
* LVC.cause: Tok-based: P=49/58=0.8448 R=49/49=1.0000 F=0.9159
* LVC.full: MWE-proportion: gold=29/126=23% pred=27/497=5%
* LVC.full: MWE-based: P=25/27=0.9259 R=25/29=0.8621 F=0.8929
* LVC.full: Tok-based: P=61/66=0.9242 R=61/65=0.9385 F=0.9313
* <unlabeled>: MWE-proportion: gold=0/126=0% pred=381/497=77%
* <unlabeled>: MWE-based: P=0/381=0.0000 R=0/0=0.0000 F=0.0000
* <unlabeled>: Tok-based: P=0/6733=0.0000 R=0/0=0.0000 F=0.0000
* VID: MWE-proportio