In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim

from tqdm.notebook import trange, tqdm

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import RobertaForTokenClassification

In [2]:
import fastai
import transformers
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

fastai version : 1.0.59
transformers version : 2.2.0


# Data

### To pandas

In [3]:
#export
def readdf(filename):
    ''' read file to dataframe '''
    f = open(filename)
    data, sentence, label = [], [], []
    sentence_idx = 0
    for line in f:
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                for word, tag in zip(sentence, label):
                    data.append( (word, tag, sentence_idx) )
                sentence_idx += 1
                sentence, label = [], []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
    return pd.DataFrame(data, columns=['word', 'tag', 'sentence_idx'])

In [4]:
#export
def readdfsentences(filename):
    df = readdf(filename)
    
    agg_func = lambda s: ' '.join(s["word"].values)
    sentences = df.groupby("sentence_idx").apply(agg_func)
    agg_func = lambda s: ' '.join(s["tag"].values)
    labels = df.groupby("sentence_idx").apply(agg_func)
    
    df = pd.concat([sentences, labels], axis=1)
    df.columns = ['sentences', 'labels']
    return df

In [5]:
d = readdfsentences('NER_datasets/CONLL2003/train.txt')
d.head()

Unnamed: 0_level_0,sentences,labels
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O
1.0,Peter Blackburn,B-PER I-PER
2.0,BRUSSELS 1996-08-22,B-LOC O
3.0,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...
4.0,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...


In [6]:
DATA_ROOT = 'NER_datasets/CONLL2003/'
train_df = readdfsentences(DATA_ROOT + 'train.txt')
valid_df = readdfsentences(DATA_ROOT + 'valid.txt')
test_df = readdfsentences(DATA_ROOT + 'test.txt')
print(train_df.shape, valid_df.shape, test_df.shape)
test_df.head()

(14040, 2) (3249, 2) (3452, 2)


Unnamed: 0_level_0,sentences,labels
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...",O O B-LOC O O O O B-PER O O O O
1.0,Nadim Ladki,B-PER I-PER
2.0,"AL-AIN , United Arab Emirates 1996-12-06",B-LOC O B-LOC I-LOC I-LOC O
3.0,Japan began the defence of their Asian Cup tit...,B-LOC O O O O O B-MISC I-MISC O O O O O O O B-...
4.0,But China saw their luck desert them in the se...,O B-LOC O O O O O O O O O O O O O O O O O O O ...


In [7]:
# join train and valid dfsb
train_df['valid'] = False
valid_df['valid'] = True
train_df = pd.concat([train_df, valid_df])
train_df.head()

Unnamed: 0_level_0,sentences,labels,valid
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O,False
1.0,Peter Blackburn,B-PER I-PER,False
2.0,BRUSSELS 1996-08-22,B-LOC O,False
3.0,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...,False
4.0,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...,False


### Config

In [8]:
MODEL_CLASSES = {
    'roberta': (RobertaForTokenClassification, RobertaTokenizer, RobertaConfig)
}

In [9]:
# Parameters
seed = 42
use_fp16 = False
bs = 16

model_type = 'roberta'
pretrained_model_name = 'roberta-base' # 'roberta-base-openai-detector'

In [10]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [11]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

In [12]:
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [13]:
seed_all(seed)

### Tokenizer

In [14]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        return [CLS] + tokens + [SEP]

In [15]:
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [16]:
tokenizer_class.pretrained_vocab_files_map

{'vocab_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json',
  'distilroberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json',
  'roberta-base-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json'},
 'merges_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt',
  'distilroberta-base':

### Vocab

In [17]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)

In [18]:
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

### Label config

In [19]:
PAD_label = '<x>'
BOS_label = '<s>'
EOS_label = '</s>'

labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", BOS_label, EOS_label, PAD_label]

### Fix labels (to work for byte encoding)

In [20]:
tokenize_processor.process_one('Goodmorning')

['<s>', 'ĠGood', 'morning', '</s>']

In [21]:
def pad_labels(df, PAD='<x>', test_length=False):

    new_labels = []
    for str_sentence, labels in tqdm(zip(df['sentences'], df['labels']), total=len(df['labels'])):
        labels = labels.split(' ')
        sentence = str_sentence.split(' ')
        assert len(sentence) == len(labels)

        padded_labels = []
        for word, label in zip(sentence, labels):

            tokens = tokenize_processor.process_one(word)
            tokens = tokens[1:-1] # remove <s> and </s>

            padded_labels.extend( [label] + [PAD] * int(len(tokens)-1) )

        new_sentence_labels = ' '.join(padded_labels)
        new_labels.append(new_sentence_labels)

        if test_length:
            tokens = tokenize_processor.process_one(str_sentence)
            assert len(tokens)-2 == len(new_sentence_labels.split(' '))

    if not test_length:
        df['labels'] = new_labels

In [22]:
# test
# pad_labels(train_df, PAD_label, test_length=True)
# pad_labels(test_df, PAD_label, test_length=True)

In [23]:
pad_labels(train_df, PAD_label)
pad_labels(test_df, PAD_label)

HBox(children=(IntProgress(value=0, max=17289), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3452), HTML(value='')))




In [24]:
test_df.head()

Unnamed: 0_level_0,sentences,labels
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...",O <x> <x> O B-LOC <x> <x> O O <x> <x> O O B-PE...
1.0,Nadim Ladki,B-PER <x> I-PER <x>
2.0,"AL-AIN , United Arab Emirates 1996-12-06",B-LOC <x> <x> O B-LOC I-LOC I-LOC O <x> <x> <x...
3.0,Japan began the defence of their Asian Cup tit...,B-LOC O O O O O B-MISC I-MISC O O O O O <x> <x...
4.0,But China saw their luck desert them in the se...,O B-LOC O O O O O O O O O O O O O O O O O O <x...


### Labels

In [25]:
class LabelTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, max_seq_len=999, **kwargs):
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        return [BOS_label] + t.split(' ') + [EOS_label]

In [26]:
label_tokenizer = Tokenizer(tok_func = LabelTokenizer(), pre_rules=[], post_rules=[])

In [27]:
class LabelVocab(Vocab):
    def __init__(self, labels):
        super().__init__(itos = [])
        self.int2str = labels
        self.str2int = { s:i for i,s in enumerate(labels) }
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return list(map(lambda i: self.str2int[i], t))
    
    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(list(map(lambda i: self.int2str[i], nums)))

In [28]:
def LabelNumericalizeProcessor(ds, **kwargs):
    return NumericalizeProcessor(ds, vocab=LabelVocab(labels))

In [29]:
def LabelTokenizeProcessor(ds, **kwargs):
    return TokenizeProcessor(ds, tokenizer=label_tokenizer, include_bos=False, include_eos=False)

In [30]:
class TextLabelList(TextList):
    _processor = [LabelTokenizeProcessor, LabelNumericalizeProcessor]

### Databunch

In [31]:
def seq2seq_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    "Function that collect samples and adds padding. Flips token order if needed"
    samples = to_data(samples)
    max_len_x,max_len_y = max([len(s[0]) for s in samples]),max([len(s[1]) for s in samples])
    res_x = torch.zeros(len(samples), max_len_x).long() + pad_idx
    res_y = torch.zeros(len(samples), max_len_y).long() + pad_idx
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        if pad_first: 
            res_x[i,-len(s[0]):],res_y[i,-len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
        else:         
            res_x[i,:len(s[0]):],res_y[i,:len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
    if backwards: res_x,res_y = res_x.flip(1),res_y.flip(1)
    return res_x,res_y

In [32]:
class Seq2SeqDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training an RNN classifier."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,
               dl_tfms=None, pad_first=False, device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(seq2seq_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, collate_fn=collate_fn, no_check=no_check)

In [33]:
class Seq2SeqTextList(TextList):
    _bunch = Seq2SeqDataBunch

In [34]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

In [35]:
data = (Seq2SeqTextList.from_df(train_df, cols='sentences', processor=transformer_processor)
       .split_from_df(col='valid')
       .label_from_df(cols='labels', label_cls=TextLabelList)
       .add_test(test_df)
       .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [36]:
i = 2
print((len(data.train_ds[i][0].data), len(data.train_ds[i][1].data)))
data.train_ds[i][0].text, data.train_ds[i][1].text

(10, 10)


('<s> ĠBR USS ELS Ġ1996 - 08 - 22 </s>',
 '<s> B-LOC <x> <x> O <x> <x> <x> <x> </s>')

In [37]:
print((data.train_ds[i][0].data, data.train_ds[i][1].data))

(array([    0,  6823, 16551, 16416,  8008,    12,  3669,    12,  2036,     2]), array([ 9,  7, 11, 11,  0, 11, 11, 11, 11, 10]))


In [38]:
print('[CLS] id :', transformer_tokenizer.cls_token_id)
print('[SEP] id :', transformer_tokenizer.sep_token_id)
print('[PAD] id :', pad_idx)
test_one_batch = data.one_batch()[0]
print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)

[CLS] id : 0
[SEP] id : 2
[PAD] id : 1
Batch shape :  torch.Size([16, 156])
tensor([[    0, 15231,  3935,  ...,    92,  1666,     2],
        [    0, 14624,   324,  ...,     1,     1,     1],
        [    0,    20,  3781,  ...,     1,     1,     1],
        ...,
        [    0, 30143,  2456,  ...,     1,     1,     1],
        [    0,  4307, 11839,  ...,     1,     1,     1],
        [    0,   248,  4581,  ...,     1,     1,     1]])


In [39]:
a = test_one_batch
a[1]

tensor([    0, 14624,   324,  2156,     5,  1002,     9,    10,  3089, 29549,
            9,  1030,  2163,    81,    39,   122,    12, 42742,   196,   265,
        15167,     8,     5, 23294,  4061,  4191,   165,    37,   683,  2075,
         2156,    34,    10,  8996,   774,    11,   226,   523,  1438,  2871,
          128,    29,    22, 11858,  1794,  2156, 24295, 12579,  4832,  5745,
          385,   108, 33037, 12467,    22,    36,  4011,  2156,   390,  4832,
         9223,    13,   304,  4839,   479,     2,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

### Model & Learner

In [40]:
# defining our model architecture 
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        
    def forward(self, input_ids, attention_mask=None):
        # input_ids: [b,s] s - words in sentence
        
#         attention_mask = (input_ids!=1).type(input_ids.type()) # Test attention_mask for RoBERTa
        
        out = self.transformer(input_ids, attention_mask = attention_mask)
        logits = out[0] # [b,s,C]
        return logits

In [41]:
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels = len(labels) - 3 # <s>, </s>, <x>
config.use_bfloat16 = use_fp16
print(config)

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 9,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_size": 50265
}



In [42]:
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)

custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)

In [43]:
loss = CrossEntropyFlat()
def Loss(y_pred, y_true):
    return loss(y_pred, y_true)

In [44]:
from fastai.callbacks import *
from transformers import AdamW

learner = Learner(data, custom_transformer_model, loss_func=Loss,
                  opt_func = lambda input: AdamW(input,correct_bias=False), 
                  metrics=[])

# Show graph of learner stats and metrics after each epoch.
learner.callbacks.append(ShowGraph(learner))

# Put learn in FP16 precision mode. --> Seems to not working
if use_fp16: learner = learner.to_fp16()

In [45]:
# print(learner.model)

In [46]:
# list_layers = [learner.model.transformer.roberta.embeddings,
#               learner.model.transformer.roberta.encoder.layer[0],
#               learner.model.transformer.roberta.encoder.layer[1],
#               learner.model.transformer.roberta.encoder.layer[2],
#               learner.model.transformer.roberta.encoder.layer[3],
#               learner.model.transformer.roberta.encoder.layer[4],
#               learner.model.transformer.roberta.encoder.layer[5],
#               learner.model.transformer.roberta.encoder.layer[6],
#               learner.model.transformer.roberta.encoder.layer[7],
#               learner.model.transformer.roberta.encoder.layer[8],
#               learner.model.transformer.roberta.encoder.layer[9],
#               learner.model.transformer.roberta.encoder.layer[10],
#               learner.model.transformer.roberta.encoder.layer[11],
#               learner.model.transformer.roberta.pooler]

# learner.split(list_layers)
# num_groups = len(learner.layer_groups)
# print('Learner split in',num_groups,'groups')
# print(learner.layer_groups)

In [47]:
# learner.freeze_to(-1)
# learner.summary()

In [48]:
learner.lr_find()

RuntimeError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 7.77 GiB total capacity; 2.09 GiB already allocated; 10.56 MiB free; 87.53 MiB cached)

In [None]:
# learner.recorder.plot(skip_end=7,suggestion=True)