# Using RoBERTa with Fastai - ReCoRD Tutorial 

This notebook adapts the tutorial at https://medium.com/@devkosal/superglue-roberta-with-fastai-for-rte-task-c362961be957 (written for the RTE task) to the ReCoRD task.

## This notebook is under construction

In [1]:
from typing import Tuple

from fastai.text import *
from fastai.metrics import *
from pytorch_transformers import RobertaTokenizer
import jsonlines

In [2]:
# Creating a config object to store task specific information
class Config(dict):
    """Dictionary of settings whose entries can also be read as attributes.

    Every keyword passed to the constructor is stored both as a dict item
    (``config["bs"]``) and as an instance attribute (``config.bs``).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each dict entry as an attribute so dotted access works.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as both an attribute and a dict item."""
        setattr(self, key, val)
        self[key] = val
        
# Task-specific settings read by the cells below.
config = Config(
    task = "ReCoRD",
    testing=False, # when True, the splits are cut down to a small slice after loading
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchanged with roberta-large 
    max_lr=1e-5,
    epochs=10,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 2, # the parsed `label` column is boolean: the entity is / is not an answer
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
    mark_fields=True, # the tokenizer expects fastai's "xxfld N" field markers when this is set
)

In [3]:
# Working directory of the notebook and the folder the task files are read from.
path = Path(".")
data_path = path.joinpath("data")

In [4]:
#load json object
def record_parser(path_to_json, test=False):
    """Flatten a ReCoRD jsonlines file into one row per (query, entity span) pair.

    Every entity span of a passage is paired with every query of that passage,
    and the ``@placeholder`` marker in the query is replaced by ``<mask>``.

    Args:
        path_to_json: location of the jsonlines file to read.
        test: when True the answers are not read and every label is ``None``.

    Returns:
        A DataFrame with the columns ``idx`` (1-based position of the record
        in the file), ``passage``, ``question``, ``entity`` and ``label``
        (whether the entity text equals one of the answer texts).
    """
    columns = {"idx": [], "passage": [], "question": [], "entity": [], "label": []}
    with jsonlines.open(path_to_json) as reader:
        for record_no, record in enumerate(reader, start=1):
            passage_text = record["passage"]["text"]
            for qa in record['qas']:
                # Answers are only looked up for labelled splits.
                gold = None if test else [ans["text"] for ans in qa["answers"]]
                for span in record["passage"]["entities"]:
                    masked_query = qa["query"].replace("@placeholder","<mask>")
                    # The slice adds 1 to `end`, i.e. the offset is treated as inclusive.
                    candidate = passage_text[span["start"]:span["end"]+1]
                    columns["idx"].append(record_no)
                    columns["passage"].append(passage_text)
                    columns["question"].append(masked_query)
                    columns["entity"].append(candidate)
                    columns["label"].append(None if test else candidate in gold)
    return pd.DataFrame(columns)


In [5]:
# Parse the three splits of the configured task; the test split is parsed without labels.
task_dir = data_path/config.task
train = record_parser(task_dir/"train.jsonl")
val = record_parser(task_dir/"val.jsonl")
test = record_parser(task_dir/"test.jsonl", test=True)


In [6]:
# Preview the first rows of the flattened training frame.
train.head()

Unnamed: 0,idx,passage,question,entity,label
0,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Afghanistan,False
1,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Mariam,False
2,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Badam Bagh,False
3,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Nuria,True
4,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Nuria,True


In [7]:
       
# In testing mode keep only the leading rows of each split for a quick run.
if config.testing:
    train, val, test = train.head(1000), val.head(300), test.head(100)

print(train.shape)

(1796856, 5)


In [8]:
# Show the head again, now after the optional subsampling step.
train.head()

Unnamed: 0,idx,passage,question,entity,label
0,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Afghanistan,False
1,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Mariam,False
2,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Badam Bagh,False
3,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Nuria,True
4,1,The harrowing stories of women and children lo...,The baby she gave birth to is her husbands and...,Nuria,True


In [9]:
# Class balance of the training labels.
train["label"].value_counts()

False    1537301
True      259555
Name: label, dtype: int64

In [10]:
# Text columns passed as `cols` when the text lists are built from the data frames.
feat_cols = ["question","passage","entity"]
# Column passed as `cols` to `label_from_df`.
label_cols = "label"

## Setting Up the Tokenizer

In [11]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len
    def __call__(self, *args, **kwargs):
        # Calling the instance hands back the instance itself, so an already
        # configured object can be passed where a tokenizer factory is expected.
        return self
    def tokenizer(self, t:str) -> List[str]:
        """Adds Roberta bos and eos tokens and limits the maximum sequence length

        With `config.mark_fields` set, `t` must contain fastai's "xxfld N"
        field markers; the fields are tokenized separately and joined with two
        `config.end_tok` tokens. Otherwise `t` is tokenized as a single
        sequence.
        """
        # Tokens reserved for the start/end markers added in the return below.
        # Defined up front so the non-`mark_fields` path can use it as well
        # (it previously hit an unbound local there).
        sub = 2
        if config.mark_fields:
            assert "xxfld" in t
            # Remove the first field marker only; the lookahead keeps
            # "xxfld 10".."xxfld 19" intact instead of leaving a stray digit.
            t = re.sub(r'xxfld 1(?!\d)', '', t)
            # converting fastai field sep token to Roberta
            fields = re.split(r'xxfld \d+', t)
            res = []
            for field in fields[:-1]: # every field except the last gets the Roberta separator pair
                res += self._pretrained_tokenizer.tokenize(field) + [config.end_tok, config.end_tok]
                # NOTE(review): the separators are already part of `res`, so also
                # growing `sub` makes the truncation stricter than `max_seq_len`
                # requires; kept as in the original.
                sub += 2
            res += self._pretrained_tokenizer.tokenize(fields[-1]) # add the last sequence
        else:
            res = self._pretrained_tokenizer.tokenize(t)
        return [config.start_tok] + res[:self.max_seq_len - sub] + [config.end_tok]

In [12]:
# create fastai tokenizer for roberta
# Load the tokenizer of the checkpoint named in the config so that it always
# matches the model built from `config.roberta_model_name`.
roberta_tok = RobertaTokenizer.from_pretrained(config.roberta_model_name)

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len),
                             pre_rules=[], post_rules=[])

In [13]:
# create fastai vocabulary for roberta
# Write the tokenizer's vocabulary into `path` and read it back from that same
# directory (the file was previously opened relative to the current directory).
roberta_tok.save_vocabulary(path)

# Explicit encoding so the result does not depend on the platform default.
with open(path/'vocab.json', 'r', encoding='utf-8') as f:
    roberta_vocab_dict = json.load(f)

# Order the tokens by their mapped value (taken to be the token id) instead of
# relying on the key order of the JSON file, so that position i of the fastai
# vocab holds the token with id i.
fastai_roberta_vocab = Vocab(sorted(roberta_vocab_dict, key=roberta_vocab_dict.get))

In [14]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` preset that adds no bos/eos tokens of its own and
    takes its `mark_fields` setting from `config`."""
    def __init__(self, tokenizer):
        preset = dict(include_bos=False, include_eos=False, mark_fields=config.mark_fields)
        super().__init__(tokenizer=tokenizer, **preset)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    """`NumericalizeProcessor` that uses `fastai_roberta_vocab` unless told otherwise."""
    def __init__(self, *args, **kwargs):
        # Fall back to the Roberta vocab only when the caller gave no `vocab`
        # keyword; forwarding both raised "got multiple values for keyword
        # argument 'vocab'".
        kwargs.setdefault("vocab", fastai_roberta_vocab)
        super().__init__(*args, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the list of pre-processors used for the Roberta text lists.
    The tokenize step adds no sos/eos tokens because the tokenizer inserts them itself.
    The numericalize step uses the given vocabulary so that ids match the original Roberta model.
    """
    tokenize_step = RobertaTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

## Setting up the DataBunch

In [15]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "A `TextDataBunch` whose data loaders are assembled for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Turn the given datasets into a `DataBunch` for classification; `**dl_kwargs` are forwarded to `DataLoader()`"
        all_ds = cls._init_ds(train_ds, valid_ds, test_ds)
        train_split = all_ds[0]
        val_bs = ifnone(val_bs, bs)

        def _train_item_len(i):
            # Length of the i-th training input.
            return len(train_split[i][0].data)

        # Training loader: SortishSampler keyed on item length, incomplete last batch dropped.
        loaders = [DataLoader(train_split, batch_size=bs,
                              sampler=SortishSampler(train_split.x, key=_train_item_len, bs=bs),
                              drop_last=True, **dl_kwargs)]
        # Every other split: SortSampler keyed on item length, batch size `val_bs`.
        for other in all_ds[1:]:
            item_lens = [len(item) for item in other.x.items]
            loaders.append(DataLoader(other, batch_size=val_bs,
                                      sampler=SortSampler(other.x, key=item_lens.__getitem__),
                                      **dl_kwargs))
        padder = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        return cls(*loaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=padder, no_check=no_check)

In [16]:
class RobertaTextList(TextList):
    """`TextList` variant wired to `RobertaDataBunch`."""
    # presumably the class fastai instantiates in `.databunch()` — TODO confirm
    _bunch = RobertaDataBunch
    # presumably the default label class; the databunch cell overrides it with `label_cls=CategoryList`
    _label_cls = TextList

In [17]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

def _as_text_list(df):
    "Wrap a data frame in a `RobertaTextList` over the feature columns."
    return RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor)

# creating our databunch 
splits = ItemLists(".", _as_text_list(train), _as_text_list(val))
labelled = splits.label_from_df(cols=label_cols, label_cls=CategoryList)
data = labelled.add_test(_as_text_list(test)).databunch(bs=config.bs, pad_first=False)

OSError: [Errno 12] Cannot allocate memory

## Building the Model

In [None]:
import torch
import torch.nn as nn
from pytorch_transformers import RobertaForSequenceClassification

# defining our model architecture 
class RobertaForSequenceClassificationModel(nn.Module):
    """Thin wrapper that makes the pretrained Roberta classifier return only its logits.

    Args:
        num_labels: size of the classification head.
        pad_idx: token id treated as padding when no attention mask is given
            (1 matches the `pad_idx` default used by `RobertaDataBunch.create`).
    """
    def __init__(self, num_labels=config.num_labels, pad_idx=1):
        super().__init__()
        self.num_labels = num_labels
        self.pad_idx = pad_idx
        self.roberta = RobertaForSequenceClassification.from_pretrained(config.roberta_model_name, num_labels=self.num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return the classification logits for a batch of token ids.

        `labels` is accepted for interface compatibility but not used.
        """
        if attention_mask is None:
            # Only `input_ids` may be supplied by the caller; without a mask
            # the padding positions would be attended to like real tokens.
            attention_mask = (input_ids != self.pad_idx).long()
        # Keyword arguments: the positional order of `token_type_ids` and
        # `attention_mask` is not the same in every library release.
        outputs = self.roberta(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        logits = outputs[0]
        return logits

In [None]:
# Instantiate the wrapper with its default `num_labels` (taken from `config`).
roberta_model = RobertaForSequenceClassificationModel() 

# Bundle the databunch and the model into a fastai Learner that reports accuracy.
learn = Learner(data, roberta_model, metrics=[accuracy])

In [None]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
# Train for `config.epochs` epochs with `config.max_lr` as the maximum learning rate.
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

## Getting Predictions

In [None]:
def get_preds_as_nparray(ds_type) -> Tuple[np.ndarray, np.ndarray]:
    """Predict on one split and return the results in dataset order.

    Uses the notebook-level `learn` and `data` objects. The predictions come
    back in the order in which the loader's sampler visited the items, so they
    are re-ordered with the inverse of that permutation.

    Args:
        ds_type: the split to predict on (e.g. `DatasetType.Valid`).

    Returns:
        `(ordered_preds, pred_values)`: the per-class outputs, one row per
        item, and the index of the highest-scoring class for each row.
    """
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    # Indices in the order the sampler yields them; argsort gives the inverse permutation.
    sampler = list(data.dl(ds_type).sampler)
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [None]:
# val preds
# `preds` holds the per-class outputs, `pred_values` the argmax class index per row.
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

In [None]:
# accuracy on the validation set: share of predicted class indices equal to the stored labels
(pred_values == data.valid_ds.y.items).mean()

In [None]:
# test preds
# Only the predicted class indices are kept; the per-class outputs are discarded.
_, test_pred_values = get_preds_as_nparray(DatasetType.Test)