## Pytext Library Prototype: xlm_roberta_for_doc_classification

In [1]:
# Reload edited modules automatically before each cell executes, so changes to
# the library under development are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [38]:
%pdb
import pytext.fb  # monkey patch for internal. e.g. manifold handler

Automatic pdb calling has been turned ON


## Fine-tune an XLM-RoBERTa model with a task
* The task provides high-level APIs and builds the data, model, trainer, and metric for the user.

In [3]:
from pytext.contrib.pytext_lib.tasks import XLMRobertaForDocClassificationTask


# A single small TSV file is reused for the train, validation and test splits
# so the demo runs quickly; the resulting metrics are not meaningful.
tiny_data_path = "/mnt/vol/pytext/users/stevenliu/glue_data/CoLA/dev_10.tsv"

task = XLMRobertaForDocClassificationTask(
    train_data_path=tiny_data_path,
    valid_data_path=tiny_data_path,
    test_data_path=tiny_data_path,
    # Names assigned to the TSV columns. Presumably only "label" and "text"
    # are consumed and the "dummy" columns are ignored -- TODO confirm.
    column_names=["dummy1", "label", "dummy2", "text"],
    # Binary classification: the two label strings present in the data.
    label_vocab=["0", "1"],
    model_name="xlm_roberta_dummy",
    epoch=2
)

In [4]:
# Set the task up, then run training. The output below shows one train-loss
# line per epoch (two epochs, matching epoch=2 passed to the task).
# NOTE(review): presumably prepare() is what builds the data, model, trainer
# and metric -- confirm against the task implementation.
task.prepare()
task.train()

Epoch: 0, Train Loss: 1.5804810762405395
Epoch: 1, Train Loss: 1.6011629581451416


In [5]:
# Inputs are a list of dicts keyed by field name, one dict per example.
batch_inputs=[{"text": "hello world"}]
# predict() returns a pair: predicted labels and per-class scores
# (both are tensors in the output below).
prediction_labels, scores = task.predict(batch_inputs)
print(f"prediction_labels: {prediction_labels}, \tscores: {scores}")

prediction_labels: tensor([0]), 	scores: tensor([[-0.7977, -1.1323]], grad_fn=<LogSigmoidBackward>)


## Fine-tune an XLM-RoBERTa model from scratch
* Construct the transforms, which must be consistent with the ones used during pre-training. (We may load them from the Hub.)

In [6]:
from pytext.contrib.pytext_lib import transforms


# Discover the available transforms via auto-completion in an IDE/notebook:
# type "transforms." and press Tab.
# Step 1: raw text -> subword tokens.
tokenizer_transform = transforms.SpmTokenizerTransform()

# Step 2: subword tokens -> vocabulary ids, using the XLM-R vocabulary file.
vocab = transforms.build_vocab("manifold://nlp_technologies/tree/xlm/models/xlm_r/vocab")
vocab_transform = transforms.VocabTransform(vocab)

# Step 3: presumably wraps the ids with BOS/EOS and truncates to max_seq_len
# (the sample output starts with id 0 and ends with id 2) -- TODO confirm.
cap_transform = transforms.CapTransform(vocab.get_bos_index(), vocab.get_eos_index(), max_seq_len=256)

# The transforms are passed in pipeline order: tokenize, numberize, cap.
transform_list = [tokenizer_transform, vocab_transform, cap_transform]

# NOTE(review): the positional `1` is unexplained here. It may be the padding
# index (padded batches later in this notebook use token id 1) -- confirm and
# consider passing it by keyword. "text" is the input field to read.
input_transform = transforms.RobertaInputTransform(transform_list, 1, "text")

# Sanity check on one sentence: show the intermediate tokens, then the full
# dict of model inputs produced by the combined transform.
batch_inputs = [{"text": "hello world"}]
text_inputs = input_transform.extract_inputs(batch_inputs)
print(f"tokens: {tokenizer_transform(text_inputs)}\n")

model_inputs = input_transform(text_inputs)
print(f"model_inputs: {model_inputs}\n")

tokens: [Tokens(values=['▁hell', 'o', '▁world'], start_idxs=[0, 4, 6], end_idxs=[4, 5, 11])]

model_inputs: {'tokens': tensor([[    0, 33600,    31,  8999,     2]]), 'pad_mask': tensor([[1, 1, 1, 1, 1]]), 'segment_labels': tensor([[0, 0, 0, 0, 0]]), 'positions': tensor([[0, 1, 2, 3, 4]])}



In [7]:
from pytext.data.utils import Vocabulary

# Two-class label vocabulary. The transform reads the "label" field of each
# row and returns a dict holding a tensor of label indices (see output below).
label_vocab = Vocabulary(["0", "1"])
label_transform = transforms.LabelTransform(
    label_vocab, field_name="label", pad_idx=-1
)

batch_inputs = [{"text": "hello world", "label": "1"}]
# Bare last expression so the notebook displays the transformed labels.
label_transform(batch_inputs)

{'label': tensor([1])}

* load pre-trained model weights

In [8]:
from pytext.contrib.pytext_lib import models

# Discover the available models via auto-completion in an IDE/notebook:
# type "models." and press Tab.
# pretrained=True requests the pre-trained weights for this model.
model = models.xlm_roberta_dummy_binary_doc_classifier(pretrained=True)

* predict with a batch of inputs

In [None]:
# Manual inference, one step at a time: raw dicts -> extracted text ->
# model inputs -> logits -> predictions.
batch_inputs = [{"text": "hello world"}]
text_inputs = input_transform.extract_inputs(batch_inputs)

model_inputs = input_transform(text_inputs)
print(f"model_inputs: {model_inputs}\n")

# Forward pass; the model takes the whole dict of input tensors.
logits = model(model_inputs)
print(f"logits: {logits}\n")

# get_pred() turns logits into predicted labels and per-class scores.
prediction_labels, scores = model.get_pred(logits)
print(f"prediction_labels: {prediction_labels}, \tscores: {scores}\n")

model_inputs: {'tokens': tensor([[    0, 33600,    31,  8999,     2]]), 'pad_mask': tensor([[1, 1, 1, 1, 1]]), 'segment_labels': tensor([[0, 0, 0, 0, 0]]), 'positions': tensor([[0, 1, 2, 3, 4]])}

logits: tensor([[0.7910, 0.1671]], grad_fn=<AddmmBackward>)

prediction_labels: tensor([0]), 	scores: tensor([[-0.3739, -0.6131]], grad_fn=<LogSigmoidBackward>)



* get loss for training

In [None]:
# Same forward pass as for prediction, but the example also carries a label
# so a training loss can be computed.
batch_inputs = [{"text": "hello world", "label": "1"}]
text_inputs = input_transform.extract_inputs(batch_inputs)
model_inputs = input_transform(text_inputs)
logits = model(model_inputs)

# label_transform returns a dict; the loss takes the tensor under "label".
targets = label_transform(batch_inputs)
loss = model.get_loss(logits, targets["label"])
print(f"loss: {loss}, logits: {logits}, targets: {targets}")

loss: 1.486728549003601, logits: tensor([[0.6509, 0.6617]], grad_fn=<AddmmBackward>), targets: {'label': tensor([1])}


* setup datasets

In [None]:
from torch.utils.data import DataLoader
from pytext.contrib.pytext_lib.datasets import TsvDataset


# The same tiny TSV file backs all three splits so the demo stays fast.
tiny_data_path = "/mnt/vol/pytext/users/stevenliu/glue_data/CoLA/dev_10.tsv"

# Each dataset applies the input and label transforms and yields ready-made
# batches of up to two rows (see the sample batch printed below).
train_dataset = TsvDataset(
    file_path=tiny_data_path,
    batch_size=2,
    field_names=["dummy1", "label", "dummy2", "text"],
    transform=input_transform,
    label_transform=label_transform,
)
valid_dataset = TsvDataset(
    file_path=tiny_data_path,
    batch_size=2,
    field_names=["dummy1", "label", "dummy2", "text"],
    transform=input_transform,
    label_transform=label_transform,
)
test_dataset = TsvDataset(
    file_path=tiny_data_path,
    batch_size=2,
    field_names=["dummy1", "label", "dummy2", "text"],
    transform=input_transform,
    label_transform=label_transform,
)

# batch_size=None disables the DataLoader's automatic batching, because the
# datasets already yield whole batches. Each loader wraps its own split's
# dataset, so validation and test do not silently iterate the training data.
train_dataloader = DataLoader(train_dataset, batch_size=None)
valid_dataloader = DataLoader(valid_dataset, batch_size=None)
test_dataloader = DataLoader(test_dataset, batch_size=None)

# Peek at the first training batch only.
for batch in train_dataloader:
    print(batch)
    break

{'tokens': tensor([[     0,    581,     57,   3632,      7,  66397,     70,  12562,   6659,
          34735,    111,     70,  13950,      7,      5,      2],
        [     0,    581,  57888,      7,   7228,     70,      6, 131803, 115851,
            645,     70,   5452,   4293,      5,      2,      1]]), 'pad_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'segment_labels': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'positions': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0]]), 'label': tensor([1, 1])}


* Write your own training loop (e.g. `SimpleTrainer`)

In [None]:
from pytext.contrib.pytext_lib.trainers import SimpleTrainer
from pytext.fb.optimizer import FairSeqAdam


# Adam-style optimizer over all model parameters, with a small learning rate
# and no weight decay.
optimizer = FairSeqAdam(
    model.parameters(),
    lr=0.00001,
    betas=[0.9, 0.999],
    eps=1e-8,
    weight_decay=0,
    amsgrad=False,
)

trainer = SimpleTrainer()

# Train for two epochs over the training dataloader; the output below prints
# one train-loss line per epoch.
trainer.train(
    dataloader=train_dataloader,
    model=model,
    optimizer=optimizer,
    epoch=2,
)

Epoch: 0, Train Loss: 0.7989357113838196
Epoch: 1, Train Loss: 0.7629225134849549


## Support for custom batchers (e.g. shuffling, sorting by sequence length)

In [37]:
from pytext.contrib.pytext_lib.datasets import (
    NestedDataset,
    PoolingBatcher,
    TsvDataset,
)


tiny_data_path = "/mnt/vol/pytext/users/stevenliu/glue_data/CoLA/dev_10.tsv"

# Stage 1: run the transforms on every row. A batch size of 1 means no real
# batching happens at this level.
tokenized_dataset = TsvDataset(
    file_path=tiny_data_path,
    batch_size=1,
    field_names=["dummy1", "label", "dummy2", "text"],
    transform=input_transform,
    label_transform=label_transform,
)

batches = [item for item in tokenized_dataset]
print(f"tokenized: {batches[0]['tokens']}")


# Stage 2: batch the already-transformed rows with a custom batcher; no
# transforms are applied here.
def sort_key(row):
    """Return the number of tokens in a row; used to order rows by length."""
    return len(row['tokens'])


batched_dataset = NestedDataset(
    dataset=tokenized_dataset,
    batcher=PoolingBatcher(batch_size=6, pool_num_batches=5, sort_key=sort_key),
)

batches = [item for item in batched_dataset]
print(f"batched: {repr(batches[0]['tokens'])}")

# Per-row token counts of the first batch, to show the effect of the sort.
sequence_length_in_a_batch = list(map(len, batches[0]['tokens']))
print(f"length of tokens for each row in a batch: {sequence_length_in_a_batch}")

tokenized: tensor([[    0,   581,    57,  3632,     7, 66397,    70, 12562,  6659, 34735,
           111,    70, 13950,     7,     5,     2]])
batched: [tensor([    0, 23213, 44632,     7,    47,    70,  6524,    67,  7844, 51952,
            4,  2412, 62163,     8, 11856,   297,     5,     2]), tensor([    0,   581,    57,  3632,     7, 66397,    70, 12562,  6659, 34735,
          111,    70, 13950,     7,     5,     2]), tensor([     0,    581,  57888,      7,   7228,     70,      6, 131803, 115851,
           645,     70,   5452,   4293,      5,      2]), tensor([    0,    87, 35968,   450,    70,  1286,  4939, 73203,     4,    70,
         1286,   764, 19859,     5,     2]), tensor([    0,   581,   348, 25388, 23213,  4163,     4,    70,  1286,  2412,
        54811,    99, 75073,     5,     2]), tensor([     0,    581, 135969,    289,     54,   1181,    148,  10384,  32502,
         68034,   5078,    184,      5,      2])]
length of tokens for each row in a batch: [18, 16, 15, 15, 