In [1]:
%connect_info

{
  "shell_port": 60030,
  "iopub_port": 60031,
  "stdin_port": 60032,
  "control_port": 60034,
  "hb_port": 60033,
  "ip": "127.0.0.1",
  "key": "47146ae0-8dea40da288c9f5a4ff6caf7",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-9bae83af-928d-46de-b5ad-256344c3b240.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

In [3]:
# Keep only the program name in sys.argv. The configuration cell below calls
# parser.parse_args() with no explicit argument list, so argparse reads
# sys.argv; presumably this trim keeps the kernel's own launch arguments from
# being rejected as unknown options -- TODO confirm.
import sys
sys.argv = sys.argv[:1]

In [4]:
import argparse
import os
import numpy as np

import torch
import torch.nn as nn

from sklearn.utils.class_weight import compute_class_weight
from skorch.callbacks import ProgressBar, EarlyStopping, Checkpoint, GradientNormClipping
from skorch.helper import predefined_split

from transformers import AutoModel, AutoTokenizer

# from models import CNNClassifier
from preprocess import load_tokenized_data
from skorch_custom import SentenceDataset, SkorchBucketIterator
from skorch_custom import IdiomClassifier, CustomScorer
from evaluation import evaluate_model

In [5]:
parser = argparse.ArgumentParser(description='Classifier using CNNs')
parser.add_argument(
    '--bert_type',
    type=str,
    default='distilbert-base-multilingual-cased',
    help='transormer model [should be a miltilingual model]')
parser.add_argument(
    '--bert_device',
    type=str,
    default='gpu',
    help='device to run the transformer model')
parser.add_argument(
    '--metric',
    type=str,
    default='f1',
    help='sklearn metric to evaluate the model while training')
parser.add_argument(
    '--nlayers',
    type=int,
    default=2,
    help='number of convolution filters')
parser.add_argument(
    '--lstm_size',
    type=list,
    default=50,
    help='number of convolution filters')
parser.add_argument(
    '--dropout',
    type=float,
    default=0.2,
    help='dropout probability for the dense layer')
parser.add_argument(
    '--initrange',
    type=float,
    default=0.1,
    help='range to initialize the lstm layers')
parser.add_argument(
    '--clipnorm',
    type=float,
    default=5.0,
    help='limit to clip the l2 norm of gradients')
parser.add_argument(
    '--output_activation',
    type=str,
    default='sigmoid',
    help='output activation')
parser.add_argument(
    '--batch_size',
    type=int,
    default=32,
    help='training batch size')
parser.add_argument(
    '--max_epochs',
    type=int,
    default=100,
    help='max number of epochs to train the model')

args = parser.parse_args()
transformer_device = torch.device(
    'cuda' if torch.cuda.is_available() and args.bert_device == 'gpu'
    else 'cpu')
ONE_HOT_OUTPUT = args.output_activation == 'softmax'

In [6]:
# Reproducibility: one shared seed for NumPy and PyTorch (CPU and CUDA), with
# cuDNN pinned to its deterministic mode and auto-tuning switched off.
SEED = 42
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Notebook-wide constants.
CWD = os.getcwd()
LANGUAGE_CODES = ['DE', 'GA', 'HI', 'PT', 'ZH']
DEVICE = (torch.device('cuda')     # pylint: disable=no-member
          if torch.cuda.is_available()
          else torch.device('cpu'))     # pylint: disable=no-member

In [7]:
# Read the pre-tokenized train / validation / dev splits for the selected
# transformer; the pickle path is derived from the working directory and
# --bert_type.
(x_train, y_train), (x_val, y_val), (x_dev, y_dev) = load_tokenized_data(
    seed=SEED,
    language_codes=LANGUAGE_CODES,
    datafile='{}/data/{}.tokenized.pkl'.format(CWD, args.bert_type))

# Flatten every training label into one vector and derive "balanced" class
# weights from it, stored as a plain Python list.
targets = np.concatenate(y_train).ravel()
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(targets),
    y=targets,
).tolist()

In [8]:
# Load the pretrained tokenizer and encoder selected by --bert_type.
# NOTE(review): `tokenizer` is not referenced again in the visible part of the
# notebook (the data is loaded already tokenized) -- confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained(args.bert_type)
transformer = AutoModel.from_pretrained(args.bert_type)

In [9]:
class RNNClassifier(nn.Module):
    """Per-token classifier: transformer encoder -> LSTM -> dropout -> linear.

    The transformer may live on a different device from the LSTM/linear head;
    ``forward`` moves the activations between the two as needed.

    Args:
        config: object exposing ``lstm_size``, ``nlayers``, ``dropout``,
            ``output_activation`` (``'sigmoid'`` gives one output unit, any
            other value gives two units with a softmax) and ``initrange``.
        transformer: encoder module. It must expose
            ``embeddings.word_embeddings.embedding_dim`` and accept
            ``(input_ids, attention_mask=...)``, returning a sequence whose
            first element is fed to the LSTM.
        transformer_device: device (``torch.device`` or string) on which the
            transformer is kept.
    """

    def __init__(self, config, transformer, transformer_device):
        super().__init__()

        # Always store torch.device objects so that both attributes have the
        # same type and keep any device index.
        self.transformer_device = torch.device(transformer_device)
        self.model_device = self.transformer_device
        self.transformer = transformer

        self.lstm = nn.LSTM(
            input_size=transformer.embeddings.word_embeddings.embedding_dim,
            hidden_size=config.lstm_size,
            num_layers=config.nlayers,
            batch_first=True,
            dropout=config.dropout)

        self.dropout = nn.Dropout(config.dropout)
        noutputs = 1 if config.output_activation == 'sigmoid' else 2

        self.fully_connected = nn.Linear(config.lstm_size, noutputs)

        # nn.Softmax(dim=-1) normalises over the class axis. The previous
        # F.softmax reference was never imported, and without an explicit dim
        # softmax on a 3-D tensor falls back to dim 0 (the batch axis).
        self.output_activation = (nn.Sigmoid()
                                  if noutputs == 1
                                  else nn.Softmax(dim=-1))
        self.init_weights(config.initrange)

    def to(self, *args, **kwargs):
        """Move the module, then put the transformer back on its own device.

        Returns:
            ``self``, like ``nn.Module.to``.
        """
        super().to(*args, **kwargs)
        self.transformer.to(self.transformer_device)
        # Remember where the trainable head ended up (full device, with index).
        self.model_device = next(self.fully_connected.parameters()).device
        return self

    def freeze_transformer(self):
        """Disable gradient computation for every transformer parameter."""
        for param in self.transformer.parameters():
            param.requires_grad = False

    def unfreeze_transformer(self):
        """Re-enable gradient computation for every transformer parameter."""
        for param in self.transformer.parameters():
            param.requires_grad = True

    def forward(self, x):
        """Score each position of a batch of token-id sequences.

        Args:
            x: integer tensor of token ids; ids equal to 0 are masked out of
                the transformer's attention.

        Returns:
            The activated outputs with all size-1 dimensions squeezed away.
        """
        x = x.to(self.transformer_device)
        attention_mask = (x > 0).int()
        x = self.transformer(x, attention_mask=attention_mask)[0]

        # No-op when the head and the transformer share a device.
        x = x.to(self.model_device)

        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fully_connected(x)

        # NOTE(review): squeeze() removes *every* size-1 dimension, so a batch
        # of one sequence (or a sequence of one token) also loses that axis.
        # Left unchanged because callers may depend on the resulting shape.
        return self.output_activation(x).squeeze()

    def init_weights(self, initrange):
        """Initialise the LSTM and the linear head.

        LSTM weights and the linear weight are drawn uniformly from
        ``[-initrange, initrange]``; the linear bias is zeroed. The second
        quarter of every LSTM bias vector is set to 1, which in PyTorch's
        (input, forget, cell, output) gate layout is the forget gate.
        """
        with torch.no_grad():
            for name, param in self.lstm.named_parameters():
                if 'bias' in name:
                    n = param.size(0)
                    param[n // 4:n // 2].fill_(1.)
                elif 'weight' in name:
                    param.uniform_(-initrange, initrange)

            self.fully_connected.bias.fill_(0)
            self.fully_connected.weight.uniform_(-initrange, initrange)

In [13]:
# Build the classifier, move it to DEVICE (RNNClassifier.to() places the
# transformer on transformer_device), then freeze the transformer so that its
# parameters no longer require gradients.
model = RNNClassifier(args, transformer, transformer_device)
model.to(DEVICE)     # pylint: disable=no-member
model.freeze_transformer()

In [14]:
# Wrap the module in the project's IdiomClassifier estimator: bucketed batch
# iterators, Adam, BCE loss, and callbacks for progress, scoring, early
# stopping, checkpointing and gradient clipping.
net = IdiomClassifier(
    module=model,
    class_weights=class_weights,
    # Training batches.
    iterator_train=SkorchBucketIterator,
    iterator_train__batch_size=args.batch_size,
    iterator_train__sort_key=lambda x: len(x.sentence),
    iterator_train__shuffle=True,
    iterator_train__device=DEVICE,
    iterator_train__one_hot=ONE_HOT_OUTPUT,
    # Validation batches (one sentence at a time).
    iterator_valid=SkorchBucketIterator,
    iterator_valid__batch_size=1,
    iterator_valid__sort_key=lambda x: len(x.sentence),
    iterator_valid__shuffle=True,
    iterator_valid__device=DEVICE,
    iterator_valid__one_hot=ONE_HOT_OUTPUT,

    # NOTE(review): only the first 5 validation sentences are used here, which
    # looks like a debugging shortcut -- confirm before a real training run.
    train_split=predefined_split(SentenceDataset(data=(x_val[0:5], y_val[0:5]))),
    optimizer=torch.optim.Adam,
    criterion=nn.BCELoss,
    callbacks=[
        # NOTE(review): the batch count is derived from the full x_train, even
        # when fit() is later given a subset of it.
        ProgressBar(batches_per_epoch=len(x_train) // args.batch_size + 1),
        CustomScorer(scoring=None, lower_is_better=False, use_caching=False),
        # EarlyStopping compares successive numeric values of the monitored
        # key, so it has to watch the raw 'score' (higher is better) rather
        # than the boolean 'score_best' flag.
        EarlyStopping(monitor='score', lower_is_better=False, patience=5),
        # Checkpoint, in contrast, expects the boolean "best epoch so far" key.
        Checkpoint(monitor='score_best'),
        GradientNormClipping(gradient_clip_value=args.clipnorm)
    ],
    device=DEVICE,
)


In [15]:
# Smoke-test run: fit on the first 100 training sentences for a single epoch.
# y=None because the labels are already bundled inside the SentenceDataset.
# NOTE(review): args.max_epochs is not used here -- confirm whether the full
# training run is launched elsewhere.
net.fit(SentenceDataset(data=(x_train[0:100], y_train[0:100])), y=None, epochs=1)

HBox(children=(FloatProgress(value=0.0, max=706.0), HTML(value='')))

Confusion matrix
[[ 9 79]
 [ 0  3]]
              precision    recall  f1-score   support

         0.0       1.00      0.10      0.19        88
         1.0       0.04      1.00      0.07         3

    accuracy                           0.13        91
   macro avg       0.52      0.55      0.13        91
weighted avg       0.97      0.13      0.18        91

  epoch    score    train_loss    valid_loss    cp      dur
-------  -------  ------------  ------------  ----  -------
      1   0.0706        0.2707        0.5756     +  11.0456


<class 'skorch_custom.IdiomClassifier'>[initialized](
  module_=RNNClassifier(
    (transformer): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  