In [1]:
%connect_info

{
  "shell_port": 64813,
  "iopub_port": 64814,
  "stdin_port": 64815,
  "control_port": 64817,
  "hb_port": 64816,
  "ip": "127.0.0.1",
  "key": "71c91fe2-e00fd3d689b744dd38af60df",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-f764f69d-f605-4fd7-8fef-bd5353052127.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

In [3]:
import sys
sys.argv = sys.argv[:1]

In [4]:
import os
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import BucketIterator

from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# from models import CNNClassifier
from preprocess import load_tokenized_data#, SentenceDataset, SkorchBucketIterator
from utils import build_model_name, convert_flags_to_dict, define_cnn_flags

from transformers import AutoModel, AutoTokenizer

In [5]:
from torchtext.data import Dataset, Field, Example
from torchtext.data.iterator import BucketIterator
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from skorch import NeuralNetClassifier
from skorch.callbacks import ProgressBar, EpochScoring, EarlyStopping, Checkpoint
from skorch.callbacks.scoring import ScoringBase
from skorch.helper import predefined_split

In [6]:
class SentenceDataset(Dataset):

    def __init__(self, data, min_len=5, **kwargs):
        self.min_len = min_len
        text_field = Field(use_vocab=False, pad_token=0, batch_first=True)
        label_field = Field(use_vocab=False, pad_token=0, batch_first=True)
        fields = [("sentence", text_field), ("labels", label_field)]
        examples = []
        for (x, y) in zip(data[0], data[1]):
            if len(x) < self.min_len:  # pad all sequences shorter than this
                x += [0] * (5 - len(x))
                y += [0] * (5 - len(y))
            examples.append(Example.fromlist([x, y], fields))
        super().__init__(examples, fields, **kwargs)

In [7]:
class SkorchBucketIterator(BucketIterator):
    def __iter__(self):
        for batch in super().__iter__():
            # We make a small modification: Instead of just returning batch
            # we return batch.text and batch.label, corresponding to X and y
            # if self.train:
            y =  batch.labels.to('cpu')
            y = to_categorical(y, num_classes=2)#[:, :, 1:]
            y = torch.tensor(y).to(self.device)
            batch.labels = y
            # else:
            #     batch.labels = batch.labels.float()
            yield batch.sentence, batch.labels

In [8]:
class CNNClassifier(nn.Module):
    def __init__(self, config):
        super(CNNClassifier, self).__init__()

        self.transformer_device = config["transformer_device"]
        self.model_device = config["transformer_device"]
        self.transformer = config["bert"]

        self.convolutions = nn.ModuleList([
            nn.Conv1d(
                in_channels=self.transformer.embeddings.word_embeddings.embedding_dim,
                out_channels=config["nfilters"],
                kernel_size=kernel_size,
                stride=1) for kernel_size in config["kernels"]])

        self.pool_stride = config["pool_stride"]

        self.dropout = nn.Dropout(config["dropout"])
        self.fully_connected = nn.Linear(
            (config["nfilters"] // config["pool_stride"]) * len(config["kernels"]), 2)

        self.output_activation = (torch.sigmoid  # pylint: disable=no-member
                                  if config["output_activation"] == 'sigmoid'
                                  else F.softmax)

    def to(self, *args, **kwargs):
        self = super().to(*args, **kwargs)
        self.transformer = self.transformer.to(
            torch.device(self.transformer_device))
        self.model_device = next(self.fully_connected.parameters()).device.type
        return self

    def freeze_transformer(self):
        for param in self.transformer.parameters():
            param.requires_grad = False

    def unfreeze_transformer(self):
        for param in self.transformer.parameters():
            param.requires_grad = True

    def forward(self, x):
        m = (x > 0).int()
        x = self.transformer(x, attention_mask=m)[0]
        #
        x = x * m.unsqueeze(2)
        x = torch.where(x > 0, x, torch.tensor(-1.0))
        x = x.transpose(1, 2)
        seq_len = x.shape[-1]
        #
        if self.transformer_device != self.model_device:
            x = x.to(self.model_device)
        #
        x = [F.relu(conv(x)).transpose(1, 2) for conv in self.convolutions]
        x = [nn.functional.pad(i, (0, 0, 0, seq_len - i.shape[1])) for i in x]
        x = [F.max_pool1d(c, self.pool_stride) for c in x]
        x = torch.cat(x, dim=2)  # pylint: disable=no-member
        x = self.fully_connected(x)
        x = self.dropout(x)

        return self.output_activation(x)


In [9]:
class IdiomClassifier(NeuralNetClassifier):

    def __init__(self, print_report=True, *args, **kwargs):
        self.print_report = print_report
        super(IdiomClassifier, self).__init__(*args, **kwargs)
        self.set_params(callbacks__valid_acc=None)


    def predict(self, X):
        self.module.eval()
        return torch.argmax(self.module(X), dim=2)


    def score(self, X, y=None):
        self.module.eval()
        ds = self.get_dataset(X)
        target_iterator = self.get_iterator(ds, training=False)

        y_true = []
        y_pred = []
        for x, y in target_iterator:
            preds = self.predict(x)
            y_pred.append(preds.view(-1))
            y = torch.argmax(y, dim=2)
            y_true.append(y.view(-1))
        y_true = torch.cat(y_true).detach().numpy()
        y_pred = torch.cat(y_pred).detach().numpy()

        if self.print_report:
            print(classification_report(y_true, y_pred))
        return f1_score(y_true, y_pred, average='binary')

In [10]:
class CustomScorer(EpochScoring):
    def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):
        current_score = net.score(dataset_valid)
        self._record_score(net.history, current_score)


In [11]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [12]:
bert_type = 'distilbert-base-multilingual-cased'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
METRIC = "F1"
language_codes = ['DE', 'GA', 'HI', 'PT', 'ZH']
cwd = os.getcwd()

In [13]:
(x_train, y_train), (x_val, y_val), (x_dev, y_dev) = load_tokenized_data(
    datafile='{}/data/{}.tokenized.pkl'.format(cwd, bert_type),
    language_codes=language_codes,
    seed=SEED)

175
228
99
338
204


In [14]:
# len(y_train)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(bert_type)
transformer = AutoModel.from_pretrained(bert_type)

In [16]:
config = {
    'nfilters': 128,
    'kernels': [1, 2, 3, 4, 5],
    'pool_stride': 3,
    'dropout': 0.2,
    'output_activation': 'sigmoid',
    'transformer_device': 'cpu',
    'bert': transformer
}

In [17]:
model = CNNClassifier(config)
# model.to(DEVICE)   # pylint: disable=no-member
model.freeze_transformer()

In [18]:
net = IdiomClassifier(
    module=model,
    #
    iterator_train=SkorchBucketIterator,    
    iterator_train__batch_size=32,
    iterator_train__sort_key=lambda x: len(x.sentence),
    iterator_train__shuffle=True,
    iterator_train__device=DEVICE,
    #
    iterator_valid=SkorchBucketIterator, 
    iterator_valid__batch_size=1,
    iterator_valid__sort_key=lambda x: len(x.sentence),
    iterator_valid__shuffle=True,
    iterator_valid__device=DEVICE,
    
    train_split=predefined_split(SentenceDataset(data=(x_val[0:10], y_val[0:10]))),
            
    optimizer=torch.optim.Adam,
    
    criterion=nn.BCELoss,
#     criterion__ignore_index=-1,
    
    callbacks=[
        ProgressBar(batches_per_epoch=len(x_train) // 32 + 1),
        CustomScorer(scoring=None, lower_is_better=False, use_caching=False),
        EarlyStopping(monitor='score_best', patience=5),
        Checkpoint(monitor='score_best')
        
    ],

    device=DEVICE,
)
# net.set_params(callbacks__valid_acc=None)
print()




In [19]:
net.callbacks_

[('epoch_timer', <skorch.callbacks.logging.EpochTimer at 0x14e77bb90>),
 ('train_loss', <skorch.callbacks.scoring.PassthroughScoring at 0x14e77bc10>),
 ('valid_loss', <skorch.callbacks.scoring.PassthroughScoring at 0x14e77bcd0>),
 ('valid_acc', None),
 ('ProgressBar', <skorch.callbacks.logging.ProgressBar at 0x14e77b290>),
 ('CustomScorer', <__main__.CustomScorer at 0x14e77b310>),
 ('EarlyStopping', <skorch.callbacks.training.EarlyStopping at 0x14e77b990>),
 ('Checkpoint', <skorch.callbacks.training.Checkpoint at 0x14e77ba10>),
 ('print_log', <skorch.callbacks.logging.PrintLog at 0x14e77bdd0>)]

In [20]:
# net.fit(SentenceDataset(data=(x_train[0:32], y_train[0:32])), y=None)

In [21]:
_x_dev, _y_dev = x_dev['GA'], y_dev['GA']
test_iterator = SkorchBucketIterator(
    dataset=SentenceDataset(data=(_x_dev, _y_dev)),
    batch_size=1,
    sort_key=lambda x: len(x.sentence),
    shuffle=False,
    device=DEVICE)

In [22]:
from evaluation import evaluate_model

In [24]:
for code in language_codes:
    evaluate_model(net, test_iterator, tokenizer, '{}/data/{}/dev.cupt'.format(cwd, code))

0
40
80
120
160
200
240
280
320
## Global evaluation
* MWE-based: P=29/665=0.0436 R=29/126=0.2302 F=0.0733
* Tok-based: P=287/7024=0.0409 R=287/287=1.0000 F=0.0785

## Per-category evaluation (partition of Global)
* IAV: MWE-proportion: gold=42/126=33% pred=17/665=3%
* IAV: MWE-based: P=7/17=0.4118 R=7/42=0.1667 F=0.2373
* IAV: Tok-based: P=35/95=0.3684 R=35/86=0.4070 F=0.3867
* LVC.cause: MWE-proportion: gold=22/126=17% pred=10/665=2%
* LVC.cause: MWE-based: P=6/10=0.6000 R=6/22=0.2727 F=0.3750
* LVC.cause: Tok-based: P=21/37=0.5676 R=21/49=0.4286 F=0.4884
* LVC.full: MWE-proportion: gold=29/126=23% pred=12/665=2%
* LVC.full: MWE-based: P=7/12=0.5833 R=7/29=0.2414 F=0.3415
* LVC.full: Tok-based: P=26/69=0.3768 R=26/65=0.4000 F=0.3881
* <unlabeled>: MWE-proportion: gold=0/126=0% pred=609/665=92%
* <unlabeled>: MWE-based: P=0/609=0.0000 R=0/0=0.0000 F=0.0000
* <unlabeled>: Tok-based: P=0/6728=0.0000 R=0/0=0.0000 F=0.0000
* VID: MWE-proportion: gold=22/126=17% pred=10/665=2%
* VID: MWE-b