## Pytext Library Prototype: xlm_roberta_for_doc_classification

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pytext.fb  # monkey patch for internal. e.g. manifold handler

I0503 105748.051 cross_entropy.py:26] using fused cross entropy


## Fine-tune an XLM-RoBERTa model with a task
* The task provides a high-level API: it builds the data pipeline, model, trainer, and metrics for the user.

In [72]:
from pytext.contrib.pytext_lib.tasks import XLMRobertaForDocClassificationTask


# NOTE(review): judging by the file name this is a 10-row sample of the CoLA dev set — confirm.
# The same file is used for the train, validation and test splits.
tiny_data_path = "/mnt/vol/pytext/users/stevenliu/glue_data/CoLA/dev_10.tsv"

# Gather the task settings in one place so they are easy to read and tweak.
task_kwargs = dict(
    train_data_path=tiny_data_path,
    valid_data_path=tiny_data_path,
    test_data_path=tiny_data_path,
    # Names for the TSV columns, in file order; only "label" and "text" are
    # referenced elsewhere in this notebook.
    column_names=["dummy1", "label", "dummy2", "text"],
    label_vocab=["0", "1"],
    model_name="xlm_roberta_dummy",
    epoch=1,
)

task = XLMRobertaForDocClassificationTask(**task_kwargs)

In [6]:
# Two-step API: prepare() first, then train().
# NOTE(review): prepare() presumably builds the data, model, trainer and metric
# for the task — confirm against the task implementation.
task.prepare()
# train() reports the training loss per epoch (see the cell output).
task.train()

Epoch: 0, Train Loss: 1.5845496833324433


In [7]:
# A batch is a list of examples; each example is a dict keyed by field name.
batch_inputs = [{"text": "hello world"}]

# predict() returns a pair: the predicted label indices and the per-class scores.
prediction_labels, scores = task.predict(batch_inputs)
print(f"prediction_labels: {prediction_labels}, \tscores: {scores}")

prediction_labels: tensor([0]), 	scores: tensor([[-0.6255, -1.0933]], grad_fn=<LogSigmoidBackward>)


## Fine-tune an XLM-RoBERTa model from scratch (assembling each component by hand)
* Construct the transforms, which must be consistent with the ones used during pre-training. (We may load them from the Hub.)

In [64]:
from pytext.contrib.pytext_lib import transforms


# Stage 1: split raw text into sub-word tokens.
tokenizer_transform = transforms.SpmTokenizerTransform()

# Stage 2: map tokens to vocabulary ids.
vocab = transforms.build_vocab(
    "manifold://nlp_technologies/tree/xlm/models/xlm_r/vocab"
)
vocab_transform = transforms.VocabTransform(vocab)

# Stage 3: cap the sequence at 256 positions.
# NOTE(review): presumably this also wraps the ids with BOS/EOS (the printed
# model inputs start with 0 and end with 2) — confirm against CapTransform.
cap_transform = transforms.CapTransform(
    vocab.get_bos_index(),
    vocab.get_eos_index(),
    max_seq_len=256,
)

# The stages are listed in the order tokenize -> vocab lookup -> cap.
transform_list = [tokenizer_transform, vocab_transform, cap_transform]

# NOTE(review): the positional `1` is presumably the padding index (padded
# positions in the batched output hold token id 1) — confirm against the
# RobertaInputTransform signature. "text" names the input field to read.
input_transform = transforms.RobertaInputTransform(transform_list, 1, "text")

# Sanity check on a single example: show the tokens, then the full model inputs.
batch_inputs = [{"text": "hello world"}]
text_inputs = input_transform.extract_inputs(batch_inputs)
print(f"tokens: {tokenizer_transform(text_inputs)}\n")

model_inputs = input_transform(text_inputs)
print(f"model_inputs: {model_inputs}\n")

tokens: [Tokens(values=['▁hell', 'o', '▁world'], start_idxs=[0, 4, 6], end_idxs=[4, 5, 11])]

model_inputs: {'tokens': tensor([[    0, 33600,    31,  8999,     2]]), 'pad_mask': tensor([[1, 1, 1, 1, 1]]), 'segment_labels': tensor([[0, 0, 0, 0, 0]]), 'positions': tensor([[0, 1, 2, 3, 4]])}



In [61]:
from pytext.data.utils import Vocabulary

# Two-class label vocabulary; the label strings match the values in the "label" field.
label_vocab = Vocabulary(["0", "1"])
# Converts the "label" field of each example into a tensor of label indices.
# NOTE(review): pad_idx=-1 is presumably the index used for padded/missing labels — confirm.
label_transform = transforms.LabelTransform(
    label_vocab, field_name="label", pad_idx=-1
)

# Quick check: label "1" comes out as index 1 (see the cell output).
batch_inputs = [{"text": "hello world", "label": "1"}]
label_transform(batch_inputs)

{'label': tensor([1])}

* load pre-trained model weights

In [62]:
from pytext.contrib.pytext_lib import models

# Build the dummy binary document classifier, requesting pre-trained weights via pretrained=True.
model = models.xlm_roberta_dummy_binary_doc_classifier(pretrained=True)

* predict with a batch of inputs

In [65]:
# Run one example through the pipeline by hand: raw dict -> text -> tensors -> logits -> prediction.
batch_inputs = [{"text": "hello world"}]
text_inputs = input_transform.extract_inputs(batch_inputs)

model_inputs = input_transform(text_inputs)
print(f"model_inputs: {model_inputs}\n")

# Forward pass; the model takes the dict of input tensors produced by input_transform.
logits = model(model_inputs)
print(f"logits: {logits}\n")

# get_pred() turns logits into predicted label indices plus scores.
# The printed scores carry grad_fn=<LogSigmoidBackward>, i.e. they are
# log-sigmoid values rather than probabilities.
prediction_labels, scores = model.get_pred(logits)
print(f"prediction_labels: {prediction_labels}, \tscores: {scores}\n")

model_inputs: {'tokens': tensor([[    0, 33600,    31,  8999,     2]]), 'pad_mask': tensor([[1, 1, 1, 1, 1]]), 'segment_labels': tensor([[0, 0, 0, 0, 0]]), 'positions': tensor([[0, 1, 2, 3, 4]])}

logits: tensor([[0.0674, 0.2159]], grad_fn=<AddmmBackward>)

prediction_labels: tensor([1]), 	scores: tensor([[-0.6600, -0.5910]], grad_fn=<LogSigmoidBackward>)



* get loss for training

In [71]:
# Same forward pass as for prediction, but the example now carries a "label" field for the loss.
batch_inputs = [{"text": "hello world", "label": "1"}]
text_inputs = input_transform.extract_inputs(batch_inputs)
model_inputs = input_transform(text_inputs)
logits = model(model_inputs)

# label_transform returns a dict; the target tensor sits under the "label" key.
targets = label_transform(batch_inputs)
loss = model.get_loss(logits, targets["label"])
print(f"loss: {loss}, logits: {logits}, targets: {targets}")

loss: 1.2587757110595703, logits: tensor([[-0.9213, -0.4178]], grad_fn=<AddmmBackward>), targets: {'label': tensor([1])}


* setup datasets

In [78]:
from torch.utils.data import DataLoader
from pytext.contrib.pytext_lib.datasets import TsvDataset


tiny_data_path = "/mnt/vol/pytext/users/stevenliu/glue_data/CoLA/dev_10.tsv"


def build_tsv_dataset(file_path):
    """Create a TsvDataset over ``file_path`` using this notebook's transforms.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A TsvDataset configured with batch_size=2, the four TSV column names,
        and the ``input_transform`` / ``label_transform`` built in earlier cells.
    """
    return TsvDataset(
        file_path=file_path,
        batch_size=2,
        field_names=["dummy1", "label", "dummy2", "text"],
        transform=input_transform,
        label_transform=label_transform,
    )


# The same tiny file backs all three splits in this demo.
# (The original cell passed an undefined name, `tiny_dataset`, which raises
# NameError on a fresh kernel.)
train_dataset = build_tsv_dataset(tiny_data_path)
valid_dataset = build_tsv_dataset(tiny_data_path)
test_dataset = build_tsv_dataset(tiny_data_path)

# batch_size=None turns off DataLoader's automatic batching: the dataset is
# already configured with batch_size=2 and the printed sample is a full batch.
# Each loader wraps its own split (previously all three wrapped train_dataset).
train_dataloader = DataLoader(train_dataset, batch_size=None)
valid_dataloader = DataLoader(valid_dataset, batch_size=None)
test_dataloader = DataLoader(test_dataset, batch_size=None)

# Peek at the first training batch only.
for batch in train_dataloader:
    print(batch)
    break

{'tokens': tensor([[     0,    581,     57,   3632,      7,  66397,     70,  12562,   6659,
          34735,    111,     70,  13950,      7,      5,      2],
        [     0,    581,  57888,      7,   7228,     70,      6, 131803, 115851,
            645,     70,   5452,   4293,      5,      2,      1]]), 'pad_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'segment_labels': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'positions': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,  0]]), 'label': tensor([1, 1])}


* Write your own training loop, e.g. with `SimpleTrainer`.

In [79]:
from pytext.contrib.pytext_lib.trainers import SimpleTrainer
from pytext.fb.optimizer import FairSeqAdam


# Adam-style optimizer over all model parameters, with a small fine-tuning learning rate.
optimizer = FairSeqAdam(
    model.parameters(),
    lr=1e-5,
    betas=[0.9, 0.999],
    eps=1e-8,
    weight_decay=0,
    amsgrad=False,
)

# Train for two epochs on the training dataloader; the trainer prints the
# training loss after each epoch (see the cell output).
trainer = SimpleTrainer()
trainer.train(dataloader=train_dataloader, model=model, optimizer=optimizer, epoch=2)

Epoch: 0, Train Loss: 0.7677797436714172
Epoch: 1, Train Loss: 0.7366044402122498
