In [1]:
%connect_info

{
  "shell_port": 51672,
  "iopub_port": 51673,
  "stdin_port": 51674,
  "control_port": 51676,
  "hb_port": 51675,
  "ip": "127.0.0.1",
  "key": "8c4f4364-7810aae2ab912810b3c675fc",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-3b9fecd6-ee80-42be-b0a5-baf2e7edae8b.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

In [3]:
import sys
# Keep only argv[0] so that the `parser.parse_args()` call further down sees
# no extra command-line arguments and therefore uses the argparse defaults
# (presumably the kernel is launched with its own arguments — TODO confirm).
sys.argv = sys.argv[:1]

In [4]:
import argparse
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import skorch
from sklearn.utils.class_weight import compute_class_weight
from skorch.callbacks import ProgressBar, EarlyStopping, Checkpoint
from skorch.helper import predefined_split

from transformers import AutoModel, AutoTokenizer

# from models import CNNClassifier
from preprocess import load_tokenized_data
from skorch_custom import SentenceDataset, SkorchBucketIterator
from skorch_custom import IdiomClassifier, CustomScorer
from evaluation import evaluate_model

In [5]:

parser = argparse.ArgumentParser(description='Classifier using CNNs')
parser.add_argument(
    '--bert_type',
    type=str,
    default='distilbert-base-multilingual-cased',
    help='transormer model [should be a miltilingual model]')
parser.add_argument(
    '--bert_device',
    type=str,
    default='gpu',
    help='device to run the transformer model')
parser.add_argument(
    '--metric',
    type=str,
    default='f1',
    help='sklearn metric to evaluate the model while training')
parser.add_argument(
    '--nfilters',
    type=int,
    default=768,
    help='number of convolution filters')
parser.add_argument(
    '--kernels',
    type=list,
    default=[1, 3, 5],
    help='number of convolution filters')
parser.add_argument(
    '--pool_stride',
    type=int,
    default=2,
    help='size of the stride for the pooling operation')
parser.add_argument(
    '--nlayers',
    type=int,
    default=2,
    help='number of convolution filters')
parser.add_argument(
    '--lstm_size',
    type=int,
    default=50,
    help='number of convolution filters')
parser.add_argument(
    '--dropout',
    type=float,
    default=0.2,
    help='dropout probability for the dense layer')
parser.add_argument(
    '--initrange',
    type=float,
    default=0.1,
    help='range to initialize the lstm layers')
parser.add_argument(
    '--clipnorm',
    type=float,
    default=5.0,
    help='limit to clip the l2 norm of gradients')
parser.add_argument(
    '--output_activation',
    type=str,
    default='sigmoid',
    help='output activation')
parser.add_argument(
    '--batch_size',
    type=int,
    default=32,
    help='training batch size')
parser.add_argument(
    '--eval_batch_size',
    type=int,
    default=32,
    help='validation/evaluation batch size')
parser.add_argument(
    '--max_epochs',
    type=int,
    default=100,
    help='max number of epochs to train the model')
parser.add_argument(
    "--eval",
    action="store_true",
    help="eval at the end of the training process")


args = parser.parse_args()
args.kernels = [int(i) for i in args.kernels if ',' not in str(i)]
transformer_device = torch.device(
    'cuda' if torch.cuda.is_available() and args.bert_device == 'gpu'
    else 'cpu')
ONE_HOT_OUTPUT = args.output_activation == 'softmax'

In [6]:
# One fixed seed, applied to every random number generator used here.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Working directory, language codes and the device used by the classifier.
CWD = os.getcwd()
LANGUAGE_CODES = ['DE', 'GA', 'HI', 'PT', 'ZH']
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')     # pylint: disable=no-member
else:
    DEVICE = torch.device('cpu')     # pylint: disable=no-member

In [7]:
# Pickle of pre-tokenized sentences produced for the selected transformer.
tokenized_path = '{}/data/{}.tokenized.pkl'.format(CWD, args.bert_type)
train_pair, val_pair, dev_pair = load_tokenized_data(
    datafile=tokenized_path,
    language_codes=LANGUAGE_CODES,
    seed=SEED)
x_train, y_train = train_pair
x_val, y_val = val_pair
x_dev, y_dev = dev_pair

# Flatten all training labels into one vector and derive 'balanced' class
# weights from their frequencies.
targets = np.concatenate(y_train).reshape(-1)
label_set = np.unique(targets)
balanced_weights = compute_class_weight(
    class_weight='balanced', classes=label_set, y=targets)
class_weights = balanced_weights.tolist()

In [8]:
# Load the pretrained tokenizer and model identified by --bert_type.
tokenizer = AutoTokenizer.from_pretrained(args.bert_type)
transformer = AutoModel.from_pretrained(args.bert_type)

In [9]:
class RNNCNNClassifier(nn.Module):
    """Per-token classifier: transformer -> parallel Conv1d -> LSTM -> linear.

    The first element of the transformer output is fed to one ``Conv1d`` per
    kernel size. Each feature map is padded back to the input length,
    max-pooled along the filter axis, concatenated with the others and passed
    through an LSTM and a linear layer that scores every position.

    Args:
        config: namespace providing ``nfilters``, ``kernels``, ``pool_stride``,
            ``lstm_size``, ``nlayers``, ``dropout``, ``output_activation`` and
            ``initrange``.
        transformer: module exposing
            ``embeddings.word_embeddings.embedding_dim`` and callable as
            ``transformer(ids, attention_mask=mask)``; element ``[0]`` of its
            result is assumed to be (batch, seq_len, embedding_dim)
            — TODO confirm for models other than DistilBERT.
        transformer_device: device the transformer is kept on, independent of
            the device of the remaining layers.
    """

    def __init__(self, config, transformer, transformer_device):
        super().__init__()

        self.transformer_device = transformer_device
        # Until `to()` is called the remaining layers are assumed to share the
        # transformer's device.
        self.model_device = transformer_device
        self.transformer = transformer

        embedding_dim = transformer.embeddings.word_embeddings.embedding_dim
        self.convolutions = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=config.nfilters,
                kernel_size=kernel_size,
                stride=1) for kernel_size in config.kernels])

        self.pool_stride = config.pool_stride

        # Pooling runs over the filter axis (see `forward`), so every kernel
        # contributes nfilters // pool_stride features per position.
        ninputs = (config.nfilters // config.pool_stride) * len(config.kernels)
        self.lstm = nn.LSTM(
            input_size=ninputs,
            hidden_size=config.lstm_size,
            num_layers=config.nlayers,
            batch_first=True,
            # Inter-layer dropout has no effect with a single layer and only
            # triggers a warning there.
            dropout=config.dropout if config.nlayers > 1 else 0.0)

        self.dropout = nn.Dropout(config.dropout)

        # One logit for 'sigmoid'; any other activation name means two classes.
        self.noutputs = 1 if config.output_activation == 'sigmoid' else 2
        self.fully_connected = nn.Linear(config.lstm_size, self.noutputs)

        self.output_activation = (torch.sigmoid  # pylint: disable=no-member
                                  if self.noutputs == 1
                                  else F.softmax)
        self.init_weights(config.initrange)

    def to(self, *args, **kwargs):
        """Move the module like ``nn.Module.to`` but pin the transformer.

        Everything is moved as requested first; the transformer is then moved
        back to ``transformer_device`` and ``model_device`` is refreshed from
        the linear layer's parameters.

        Returns:
            The module itself.
        """
        module = super().to(*args, **kwargs)
        module.transformer = module.transformer.to(
            torch.device(module.transformer_device))
        # Stored as a torch.device so it can be handed straight to Tensor.to.
        module.model_device = next(module.fully_connected.parameters()).device
        return module

    def freeze_transformer(self):
        """Exclude every transformer parameter from gradient updates."""
        for param in self.transformer.parameters():
            param.requires_grad_(False)

    def unfreeze_transformer(self):
        """Re-enable gradient updates for every transformer parameter."""
        for param in self.transformer.parameters():
            param.requires_grad_(True)

    @staticmethod
    def _conv_features(conv, x, seq_len):
        """Apply ``conv`` + ReLU and return a (batch, seq_len, nfilters) map.

        Inputs shorter than the kernel are zero-padded on the right so the
        convolution is defined; the result is zero-padded on the right back to
        ``seq_len`` positions.
        """
        kernel_size = conv.kernel_size[0]
        if seq_len < kernel_size:
            x = F.pad(x, (0, kernel_size - seq_len))
        features = F.relu(conv(x)).transpose(1, 2)
        return F.pad(features, (0, 0, 0, seq_len - features.shape[1]))

    def forward(self, x):
        """Score every position of a batch of token ids.

        Args:
            x: integer tensor of token ids, assumed (batch, seq_len); ids equal
                to 0 are treated as padding when building the attention mask.

        Returns:
            For the sigmoid output a (batch, seq_len) tensor of probabilities;
            otherwise a (batch, seq_len, 2) tensor normalised over the last
            dimension.
        """
        x = x.to(self.transformer_device)
        mask = (x > 0).int()
        x = self.transformer(x, attention_mask=mask)[0].transpose(1, 2)
        #
        seq_len = x.shape[-1]
        # Tensor.to is a no-op when the tensor is already on that device.
        x = x.to(self.model_device)
        #
        features = [self._conv_features(conv, x, seq_len)
                    for conv in self.convolutions]
        # The last dimension is the filter axis after the transpose, so this
        # pools over filters, not over positions.
        features = [F.max_pool1d(f, self.pool_stride) for f in features]
        x = torch.cat(features, dim=2)  # pylint: disable=no-member
        x = self.dropout(x)
        #
        x, _ = self.lstm(x)
        x = self.dropout(x)
        #
        x = self.fully_connected(x)
        #
        if self.noutputs == 1:
            # Drop only the trailing singleton so that batches or sequences of
            # length 1 keep their axes.
            return self.output_activation(x).squeeze(-1)
        # Normalise over the class axis; the implicit dim would be 0 (batch).
        return self.output_activation(x, dim=-1)

    def init_weights(self, initrange):
        """Initialise the LSTM parameters in place.

        Weights are drawn uniformly from [-initrange, initrange]. The second
        quarter of every bias vector is set to 1; in ``nn.LSTM``'s documented
        (input, forget, cell, output) layout that is the forget gate. The
        linear layer keeps PyTorch's default initialisation.
        """
        with torch.no_grad():
            for name, param in self.lstm.named_parameters():
                if "bias" in name:
                    n = param.size(0)
                    param[n // 4:n // 2].fill_(1.)
                elif "weight" in name:
                    param.uniform_(-initrange, initrange)



In [10]:
# Build the classifier around the pretrained transformer, move it to DEVICE
# (the class's `to` keeps the transformer on `transformer_device`), and turn
# off gradients for the transformer's parameters.
model = RNNCNNClassifier(args, transformer, transformer_device)
model.to(DEVICE)     # pylint: disable=no-member
model.freeze_transformer()

In [11]:
# skorch wrapper around `model`: bucketed iterators for batching, Adam with
# BCELoss for optimisation, and callbacks for progress display, scoring, early
# stopping and checkpointing.
net = IdiomClassifier(
    module=model,
    class_weights=class_weights,
    print_report=False,
    # Training batches.
    iterator_train=SkorchBucketIterator,
    iterator_train__batch_size=args.batch_size,
    iterator_train__sort_key=lambda x: len(x.sentence),
    iterator_train__shuffle=True,
    iterator_train__device=DEVICE,
    iterator_train__one_hot=ONE_HOT_OUTPUT,
    # Validation batches; the size follows --eval_batch_size (default 32)
    # instead of a hard-coded literal.
    iterator_valid=SkorchBucketIterator,
    iterator_valid__batch_size=args.eval_batch_size,
    iterator_valid__sort_key=lambda x: len(x.sentence),
    iterator_valid__shuffle=True,
    iterator_valid__device=DEVICE,
    iterator_valid__one_hot=ONE_HOT_OUTPUT,
    # NOTE(review): only the first 50 validation items are used here.
    train_split=predefined_split(SentenceDataset(data=(x_val[0:50], y_val[0:50]))),
    optimizer=torch.optim.Adam,
    # NOTE(review): BCELoss fits the 'sigmoid' output; confirm it is still the
    # intended criterion before running with --output_activation softmax.
    criterion=nn.BCELoss,
    callbacks=[
        ProgressBar(batches_per_epoch=len(x_train) // args.batch_size + 1),
        CustomScorer(scoring=None, lower_is_better=False, use_caching=False),
        EarlyStopping(monitor='score_best', patience=5),
        Checkpoint(monitor='score_best')
    ],
    device=DEVICE,
)


In [12]:
# Smoke run: fit on the first two training items only, for a single epoch.
net.fit(SentenceDataset(data=(x_train[0:2], y_train[0:2])), y=None, epochs=1)

HBox(children=(FloatProgress(value=0.0, max=478.0), HTML(value='')))

  epoch    score    train_loss    valid_loss    cp     dur
-------  -------  ------------  ------------  ----  ------
      1   [36m0.1070[0m        [32m0.6578[0m        [35m0.2291[0m     +  6.2206


<class 'skorch_custom.IdiomClassifier'>[initialized](
  module_=RNNCNNClassifier(
    (transformer): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [None]:

# Evaluate the fitted net on the dev split of one language.
code = 'GA'
print('#' * 20)
print('# Evaluating Language: {}'.format(code))
print('#' * 20)
# Unsorted, unshuffled iteration over the dev data. Batch size and one-hot
# flag reuse the shared settings (--eval_batch_size, ONE_HOT_OUTPUT) so this
# cell cannot drift from the training configuration.
test_iterator = SkorchBucketIterator(
    dataset=SentenceDataset(data=(x_dev[code], y_dev[code])),
    batch_size=args.eval_batch_size,
    sort=False,
    sort_key=lambda x: len(x.sentence),
    shuffle=False,
    train=False,
    one_hot=ONE_HOT_OUTPUT,
    device=DEVICE)
# Path of this language's dev file, passed to `evaluate_model` through `args`.
args.dev_file = '{}/data/{}/dev.cupt'.format(CWD, code)
evaluate_model(net, test_iterator, tokenizer, args)

In [None]:
# Iterator that yields one dev item at a time, used by the shape-inspection
# cells that follow.
code = 'GA'
dev_dataset = SentenceDataset(data=(x_dev[code], y_dev[code]))
iterator_options = dict(
    batch_size=1,
    sort=False,
    sort_key=lambda x: len(x.sentence),
    shuffle=False,
    train=False,
    one_hot=(args.output_activation == 'softmax'),
    device=DEVICE,
)
test_iterator = SkorchBucketIterator(dataset=dev_dataset, **iterator_options)

In [None]:
# Take the first batch yielded by `test_iterator`, unpacked into two values
# (presumably token ids and labels — TODO confirm against the iterator).
x, y = next(iter(test_iterator))

In [None]:
# Show the shape of the batch taken from the iterator.
x.shape

In [None]:
# Run the transformer on the batch, masking out positions whose id is 0, keep
# the first element of its output and swap dims 1 and 2 (presumably giving
# (batch, hidden, seq_len), the layout Conv1d expects — TODO confirm).
x = transformer(x, attention_mask=(x > 0).int())[0].transpose(1,2)

In [None]:
# Size of the last dimension after the transpose; used below as the length the
# convolution outputs are padded back to.
seq_len = x.shape[-1]
x.shape

In [None]:
# One Conv1d with 128 filters per kernel size 1..5. The modules are created on
# the CPU, so they are moved to the device of `x` to keep the next cell
# working when the batch lives on the GPU.
convolutions = nn.ModuleList([
    nn.Conv1d(
        in_channels=transformer.embeddings.word_embeddings.embedding_dim,
        out_channels=128,
        kernel_size=kernel_size,
        stride=1) for kernel_size in [1,2,3,4,5]]).to(x.device)

In [None]:
 # Apply every convolution followed by ReLU, then swap dims 1 and 2 of each
 # result; `x` becomes a list with one tensor per kernel size.
 x = [F.relu(conv(x)).transpose(1, 2) for conv in convolutions]

In [None]:
# Print the shape of each per-kernel feature map.
for i in x:
    print(i.shape)

In [None]:
 # Zero-pad dim 1 of every feature map on the right until it has `seq_len`
 # entries; the last dimension is left untouched.
 x = [nn.functional.pad(i, (0, 0, 0, seq_len - i.shape[1])) for i in x]

In [None]:
# Print the shapes again after padding.
for i in x:
    print(i.shape)

In [None]:
# Max-pool each map over its last dimension with a window of 5; after the
# transpose in the convolution cell that dimension is the filter axis.
x = [F.max_pool1d(c, 5) for c in x]

In [None]:
# Print the shapes again after pooling.
for i in x:
    print(i.shape)

In [None]:
# Join the per-kernel maps along dim 2 into a single tensor.
x = torch.cat(x, dim=2)

In [None]:
# Show the shape of the concatenated features.
x.shape

In [None]:
# LSTM input width for this experiment: 128 filters pooled with a window of 5,
# once per kernel size.
kernel_sizes = [1, 2, 3, 4, 5]
pooled_filters = 128 // 5
ninputs = pooled_filters * len(kernel_sizes)
# Fixed to a single output here, i.e. the 'sigmoid' case of the model class.
noutputs = 1

In [None]:
# Single-layer LSTM over the concatenated features. nn.LSTM applies dropout
# only between stacked layers, so with num_layers=1 a non-zero value has no
# effect and merely raises a UserWarning; it is therefore set to 0. The module
# is moved to the device of `x` so the next cell also works on the GPU.
lstm = nn.LSTM(
    input_size=ninputs,
    hidden_size=100,
    num_layers=1,
    batch_first=True,
    dropout=0.0).to(x.device)

In [None]:
# Keep the LSTM's output sequence and discard its second return value.
x, _ = lstm(x)

In [None]:
# Show the shape of the LSTM output.
x.shape

In [None]:
# Linear layer from the LSTM hidden size (100) to `noutputs`, created on the
# device of `x` so it can be applied to the LSTM output directly.
fully_connected = nn.Linear(100, noutputs).to(x.device)

In [None]:
# Apply the linear layer to the LSTM output.
x = fully_connected(x)

In [None]:
# Show the final shape after the linear layer.
x.shape