In [1]:
%connect_info

{
  "shell_port": 51608,
  "iopub_port": 51609,
  "stdin_port": 51610,
  "control_port": 51612,
  "hb_port": 51611,
  "ip": "127.0.0.1",
  "key": "1633b359-721812679b3a3110cb417a53",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-01ef77db-99f0-4ea4-a65b-ec8d8310d2f3.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [2]:
%matplotlib inline

In [3]:
import sys
# Keep only the program name and drop every other command-line argument.
# NOTE(review): presumably this stops the kernel's own launch arguments from
# reaching the flag parsing in the project helpers imported below — confirm.
sys.argv = sys.argv[:1]

In [4]:
import os
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import BucketIterator

from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report

from torchtext.data.iterator import BucketIterator

from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.contrib.handlers import ProgressBar

from models import CNNClassifier
from preprocess import load_tokenized_data, SentenceDataset
from utils import build_model_name, convert_flags_to_dict, define_cnn_flags

from transformers import AutoModel

In [5]:
# Single seed shared by every random number generator in the notebook.
SEED = 42

# Request deterministic cuDNN kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed PyTorch (CUDA and CPU generators) and NumPy with the same value.
torch.cuda.manual_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [6]:
bert_type = 'distilbert-base-multilingual-cased'

In [7]:
# Load three (inputs, labels) splits from the tokenized file that matches `bert_type`.
# The file is looked up under `<current working directory>/data/`, so the result
# depends on where the kernel was started.
# NOTE(review): the `.pkl` suffix suggests a pickle — only load files you produced
# yourself, since unpickling untrusted data can execute arbitrary code.
(x_train, y_train), (x_val, y_val), (x_dev, y_dev) = load_tokenized_data(
    datafile='{}/data/{}.tokenized.pkl'.format(os.getcwd(), bert_type), 
    language_codes=['DE', 'GA', 'HI', 'PT', 'ZH'],
    seed=SEED)

175
228
99
338
204


In [11]:
import os
import pickle

import numpy as np
import torch
from torchtext.data import Dataset, Field, Example
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [36]:
class SentenceDataset(Dataset):
    """Dataset whose examples hold token ids, an attention mask and labels.

    Args:
        data: indexable pair; ``data[0]`` holds the token-id sequences and
            ``data[1]`` the matching label sequences (they are zipped pairwise).
        **kwargs: forwarded unchanged to the base ``Dataset`` constructor.
    """

    def __init__(self, data, **kwargs):
        # Both fields carry raw integers (no vocabulary), pad with 0 and are batch-first.
        token_field = Field(use_vocab=False, pad_token=0, batch_first=True)
        tag_field = Field(use_vocab=False, pad_token=0, batch_first=True)
        fields = [("sentence", token_field), ("mask", token_field), ("labels", tag_field)]

        sentences, labels = data[0], data[1]
        examples = [
            Example.fromlist(
                # The mask is 1 where the token id is positive and 0 elsewhere.
                [tokens, (np.array(tokens) > 0).astype(int).tolist(), tags],
                fields)
            for tokens, tags in zip(sentences, labels)
        ]
        super().__init__(examples, fields, **kwargs)

In [47]:
class SkorchBucketIterator(BucketIterator):
    """BucketIterator that yields ``(sentence, mask, one_hot_labels)`` tuples.

    Instead of the batch object produced by the parent iterator, each step
    yields ``batch.sentence``, ``batch.mask`` and a float one-hot encoding of
    ``batch.labels`` with ``num_classes`` entries on its last axis.
    """

    # Width of the one-hot target. It is fixed here rather than inferred per batch:
    # inferring it from the largest label present would give a batch that contains
    # only class 0 a one-column target that no longer matches the other batches.
    num_classes = 2

    def __iter__(self):
        for batch in super().__iter__():
            labels = batch.labels.to(self.device)
            # One-hot encode directly in torch; the result is float32 like the
            # previous to_categorical-based encoding, without the NumPy round-trip.
            y = F.one_hot(labels.long(), num_classes=self.num_classes).float()
            yield batch.sentence, batch.mask, y

In [48]:
def _sentence_length(example):
    """Sort key: number of token ids in the example's sentence."""
    return len(example.sentence)


def _build_iterator(split):
    """Wrap an ``(inputs, labels)`` split in a SkorchBucketIterator with the shared settings."""
    return SkorchBucketIterator(
        dataset=SentenceDataset(data=split),
        batch_size=32,
        sort_key=_sentence_length,
        shuffle=False,
        device=torch.device("cpu"))


# One iterator per split; all three use identical batching options.
train_iterator = _build_iterator((x_train, y_train))
valid_iterator = _build_iterator((x_val, y_val))
test_iterator = _build_iterator((x_dev, y_dev))

In [49]:
next(iter(train_iterator))

(tensor([[  101, 11583, 10995,  ...,     0,     0,     0],
         [  101, 10685, 11289,  ...,     0,     0,     0],
         [  101,  2280,  3241,  ...,     0,     0,     0],
         ...,
         [  101, 10190, 19979,  ...,     0,     0,     0],
         [  101,  4489,  2429,  ...,     0,     0,     0],
         [  101,  5639,  3204,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[[1., 0.],
          [1., 0.],
          [1., 0.],
          ...,
          [1., 0.],
          [1., 0.],
          [1., 0.]],
 
         [[1., 0.],
          [1., 0.],
          [1., 0.],
          ...,
          [1., 0.],
          [1., 0.],
          [1., 0.]],
 
         [[1., 0.],
          [1., 0.],
          [1., 0.],
          ...,
          [1., 0.],
          [1., 0.],
          [1.

In [None]:
# Load the pretrained encoder selected by `bert_type`.
transformer = AutoModel.from_pretrained(bert_type)

# Freeze every encoder weight so that no gradients are computed for it.
for frozen_param in transformer.parameters():
    frozen_param.requires_grad_(False)

# tokenizer = AutoTokenizer.from_pretrained(bert_type)
# Keep the encoder on the CPU (the returned module is displayed as the cell output).
transformer.to(torch.device("cpu"))

In [None]:
# Settings consumed by CNNClassifier; the encoder module itself travels under 'bert'.
config = dict(
    nfilters=128,
    kernels=[1, 2, 3, 4, 5],
    pool_stride=3,
    dropout=0.2,
    output_activation='sigmoid',
    bert=transformer,
    # emb_dim=transformer.embeddings.word_embeddings.embedding_dim,
)
# NOTE(review): in top-to-bottom order this uses whichever CNNClassifier is bound
# at this point (the one imported from `models`) — confirm that is intended.
model = CNNClassifier(config)

In [None]:
# Show the device of each model parameter, one per line.
for weight in model.parameters():
    print(weight.device)

In [None]:
class CNNClassifier(nn.Module):
    """Per-position classifier: an encoder followed by parallel 1-D convolutions.

    Args:
        config: mapping with the keys
            ``bert``: encoder module. It must expose
                ``embeddings.word_embeddings.embedding_dim`` and, when called as
                ``bert(x, attention_mask=mask)``, return a sequence whose first
                element holds the hidden states.
            ``nfilters``: output channels of every convolution.
            ``kernels``: iterable of convolution kernel sizes.
            ``pool_stride``: window (and stride) of the max-pool over the filter axis.
            ``dropout``: dropout probability.
            ``output_activation``: ``'sigmoid'`` selects ``torch.sigmoid``;
                any other value selects a softmax over the last axis.
    """

    def __init__(self, config):
        super().__init__()

        self.transformer = config["bert"]

        self.convolutions = nn.ModuleList([
            nn.Conv1d(
                in_channels=self.transformer.embeddings.word_embeddings.embedding_dim,
                out_channels=config["nfilters"],
                kernel_size=kernel_size,
                stride=1) for kernel_size in config["kernels"]])

        self.pool_stride = config["pool_stride"]

        self.dropout = nn.Dropout(config["dropout"])
        # Each convolution contributes nfilters // pool_stride pooled features.
        self.fully_connected = nn.Linear(
            (config["nfilters"] // config["pool_stride"]) * len(config["kernels"]), 2)

        self.output_activation = (torch.sigmoid  # pylint: disable=no-member
                                  if config["output_activation"] == 'sigmoid'
                                  else self._softmax_over_classes)

    @staticmethod
    def _softmax_over_classes(logits):
        """Softmax over the last (class) axis.

        The axis is given explicitly: without ``dim`` the legacy default for a
        3-D input normalises over axis 0, i.e. across the batch.
        """
        return F.softmax(logits, dim=-1)

    def to(self, *args, **kwargs):
        """Move the classifier like ``nn.Module.to`` but keep the encoder on the CPU.

        Returns:
            The module itself, so ``model = model.to(device)`` works as it does
            for any other ``nn.Module``.
        """
        module = super().to(*args, **kwargs)
        # The encoder is deliberately pinned to the CPU regardless of the target device.
        module.transformer = module.transformer.to(torch.device("cpu"))
        return module

    def freeze_transformer(self):
        """Disable gradient computation for every parameter of this instance's encoder."""
        for param in self.transformer.parameters():
            param.requires_grad = False

    def forward(self, x, mask):
        """Return activated per-position scores with 2 entries on the last axis.

        Args:
            x: token ids passed to the encoder.
            mask: attention mask passed to the encoder as ``attention_mask``.
        """
        # Put the hidden dimension on the channel axis expected by Conv1d.
        x = self.transformer(x, attention_mask=mask)[0].transpose(1, 2)
        # The encoder stays on the CPU while the head may live elsewhere (see `to`).
        x = x.to(self.fully_connected.weight.device)
        seq_len = x.shape[-1]

        x = [F.relu(conv(x)).transpose(1, 2) for conv in self.convolutions]
        # Kernels wider than 1 shorten the sequence; zero-pad at the end so that
        # every branch has seq_len positions again and can be concatenated.
        x = [F.pad(i, (0, 0, 0, seq_len - i.shape[1])) for i in x]
        # Max-pool over the filter axis (the last one), not over positions.
        x = [F.max_pool1d(c, self.pool_stride) for c in x]
        x = torch.cat(x, dim=2)  # pylint: disable=no-member
        # Regularise the features feeding the classifier. Dropping the final logits
        # instead would randomly zero the outputs themselves during training.
        x = self.dropout(x)
        x = self.fully_connected(x)
        return self.output_activation(x)

In [None]:
# Build the classifier from `config` and move it to the GPU when one is available.
# The module is moved in place, so the return value of `to` is not used here.
model = CNNClassifier(config)
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))   # pylint: disable=no-member

In [10]:
# Adam over the model's parameters. Parameters with requires_grad=False never
# receive a gradient and are therefore skipped by the update step.
# (The training step below needs `optimizer` to exist.)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)

# Binary cross-entropy on the activated model outputs, placed on the same
# device choice that is used for the model.
criterion = nn.BCELoss()
criterion = criterion.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
def process_function(engine, batch):
    """Run one optimisation step and return the scalar training loss.

    Args:
        engine: the ignite engine invoking this step (unused).
        batch: ``(sentence, mask, one_hot_labels)`` tuple as yielded by
            ``SkorchBucketIterator``.

    Returns:
        The loss of this step as a Python float.
    """
    x, m, y = batch
    model.train()
    optimizer.zero_grad()
    # The model runs the encoder itself, so it receives token ids and mask directly.
    y_pred = model(x, m)
    # The targets are already one-hot; only make sure they sit next to the predictions.
    loss = criterion(y_pred, y.to(y_pred.device))
    loss.backward()
    optimizer.step()
    return loss.item()


def eval_function(engine, batch):
    """Score one batch without gradient tracking.

    Args:
        engine: the ignite engine invoking this step (unused).
        batch: ``(sentence, mask, one_hot_labels)`` tuple as yielded by
            ``SkorchBucketIterator``.

    Returns:
        ``(y_pred, y)`` with the targets moved to the device of the predictions,
        ready to be consumed by the attached metrics.
    """
    x, m, y = batch
    model.eval()
    with torch.no_grad():
        # The model runs the encoder itself, so it receives token ids and mask directly.
        y_pred = model(x, m)
    return y_pred, y.to(y_pred.device)

In [None]:
# Two engines score data with eval_function; one engine trains with process_function.
validation_evaluator = Engine(eval_function)
train_evaluator = Engine(eval_function)
trainer = Engine(process_function)

# Track a running average of the value returned by each training step as 'loss'.
RunningAverage(output_transform=lambda step_loss: step_loss).attach(trainer, 'loss')

In [None]:
def thresholded_output_transform(output):
    """Round the predictions of a ``(y_pred, y)`` pair; the targets pass through unchanged."""
    scores, targets = output
    return scores.round(), targets


# Attach the same two metrics to both evaluators: accuracy on the rounded
# predictions and the criterion's loss under the name 'bce'.
for _evaluator in (train_evaluator, validation_evaluator):
    Accuracy(output_transform=thresholded_output_transform).attach(_evaluator, 'accuracy')
    Loss(criterion).attach(_evaluator, 'bce')

# Progress bar for the trainer that also displays the running 'loss' metric.
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])

In [None]:

def score_function(engine):
    """Return the negated 'bce' metric of ``engine`` so that a lower loss scores higher."""
    return -engine.state.metrics['bce']

# Early stopping with a patience of 5, scored by score_function and checked
# each time the validation evaluator completes a run; it acts on `trainer`.
handler = EarlyStopping(trainer=trainer, score_function=score_function, patience=5)
validation_evaluator.add_event_handler(Events.COMPLETED, handler)


In [None]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    """After each epoch, score the training data and log accuracy and loss."""
    train_evaluator.run(train_iterator)
    results = train_evaluator.state.metrics
    message = "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
        engine.state.epoch, results['accuracy'], results['bce'])
    pbar.log_message(message)

def log_validation_results(engine):
    """Score the validation data, log accuracy and loss, then zero the bar counters."""
    validation_evaluator.run(valid_iterator)
    results = validation_evaluator.state.metrics
    message = "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
        engine.state.epoch, results['accuracy'], results['bce'])
    pbar.log_message(message)
    # Reset both counters on the progress bar object to zero.
    pbar.last_print_n = 0
    pbar.n = 0

In [None]:
# Run the validation pass at the end of every epoch. It is registered before the
# checkpoint handler below, so the order of these statements fixes the order of the handlers.
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

# Save the model under /tmp/models with the 'textcnn' prefix at the end of every
# epoch (n_saved=2; the directory is created when missing).
checkpointer = ModelCheckpoint('/tmp/models', 'textcnn', n_saved=2, create_dir=True, save_as_state_dict=True)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'textcnn': model})


In [None]:
# Train for at most 20 epochs; the early-stopping handler attached to the
# validation evaluator may end the run sooner.
trainer.run(train_iterator, max_epochs=20)