In [39]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import RobertaForTokenClassification

In [36]:
import fastai
import transformers
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

fastai version : 1.0.59
transformers version : 2.2.0


# Data

### To pandas

In [31]:
#export
def readdf(filename):
    ''' read file to dataframe '''
    f = open(filename)
    data, sentence, label = [], [], []
    sentence_idx = 0
    for line in f:
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                for word, tag in zip(sentence, label):
                    data.append( (word, tag, sentence_idx) )
                sentence_idx += 1
                sentence, label = [], []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
    return pd.DataFrame(data, columns=['word', 'tag', 'sentence_idx'])

In [59]:
#export
def readdfsentences(filename):
    df = readdf(filename)
    
    agg_func = lambda s: ' '.join(s["word"].values)
    sentences = df.groupby("sentence_idx").apply(agg_func)
    agg_func = lambda s: ' '.join(s["tag"].values)
    labels = df.groupby("sentence_idx").apply(agg_func)
    
    df = pd.concat([sentences, labels], axis=1)
    df.columns = ['sentences', 'labels']
    return df

In [62]:
d = readdfsentences('NER_datasets/CONLL2003/train.txt')
d.head()

Unnamed: 0_level_0,sentences,labels
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O
1.0,Peter Blackburn,B-PER I-PER
2.0,BRUSSELS 1996-08-22,B-LOC O
3.0,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...
4.0,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...


In [102]:
DATA_ROOT = 'NER_datasets/CONLL2003/'
train_df = readdfsentences(DATA_ROOT + 'train.txt')
valid_df = readdfsentences(DATA_ROOT + 'valid.txt')
test_df = readdfsentences(DATA_ROOT + 'test.txt')
print(train_df.shape, valid_df.shape, test_df.shape)
test_df.head()

(14040, 2) (3249, 2) (3452, 2)


Unnamed: 0_level_0,sentences,labels
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...",O O B-LOC O O O O B-PER O O O O
1.0,Nadim Ladki,B-PER I-PER
2.0,"AL-AIN , United Arab Emirates 1996-12-06",B-LOC O B-LOC I-LOC I-LOC O
3.0,Japan began the defence of their Asian Cup tit...,B-LOC O O O O O B-MISC I-MISC O O O O O O O B-...
4.0,But China saw their luck desert them in the se...,O B-LOC O O O O O O O O O O O O O O O O O O O ...


In [103]:
# join train and valid dfsb
train_df['valid'] = False
valid_df['valid'] = True
train_df = pd.concat([train_df, valid_df])
train_df.head()

Unnamed: 0_level_0,sentences,labels,valid
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O,False
1.0,Peter Blackburn,B-PER I-PER,False
2.0,BRUSSELS 1996-08-22,B-LOC O,False
3.0,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...,False
4.0,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...,False


### Config

In [265]:
MODEL_CLASSES = {
    'roberta': (RobertaForTokenClassification, RobertaTokenizer, RobertaConfig)
}

In [266]:
# Parameters
seed = 42
use_fp16 = False
bs = 16

model_type = 'roberta'
pretrained_model_name = 'roberta-base' # 'roberta-base-openai-detector'

In [267]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [268]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

In [269]:
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [270]:
seed_all(seed)

### Tokenizer

In [271]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        return [CLS] + tokens + [SEP]

In [272]:
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [273]:
tokenizer_class.pretrained_vocab_files_map

{'vocab_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json',
  'distilroberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json',
  'roberta-base-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json'},
 'merges_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt',
  'distilroberta-base':

### Vocab

In [274]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)

In [275]:
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

In [276]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

### Labels

In [259]:
class LabelTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, labels, max_seq_len=999, **kwargs):
        self.labels = labels
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        return self.labels

In [260]:
labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]", "X"]

label_base_tokenizer = LabelTokenizer(labels)
label_tokenizer = Tokenizer(tok_func = label_base_tokenizer, pre_rules=[], post_rules=[])

In [261]:
class LabelVocab(Vocab):
    def __init__(self, labels):
        super().__init__(itos = [])
        self.int2str = labels
        self.str2int = { s:i for i,s in enumerate(labels) }
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return list(map(lambda i: self.str2int[i], t))
    
    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(list(map(lambda i: self.int2str[i], nums)))

In [262]:
def LabelNumericalizeProcessor(ds, **kwargs):
    return NumericalizeProcessor(ds, vocab=LabelVocab(labels))

In [263]:
def LabelTokenizeProcessor(ds, **kwargs):
    return TokenizeProcessor(ds, tokenizer=label_tokenizer, include_bos=False, include_eos=False)

In [264]:
class TextLabelList(TextList):
    _processor = [LabelTokenizeProcessor, LabelNumericalizeProcessor]

### Databunch

In [248]:
def seq2seq_collate(samples, pad_idx=1, pad_first=True, backwards=False):
    "Function that collect samples and adds padding. Flips token order if needed"
    samples = to_data(samples)
    max_len_x,max_len_y = max([len(s[0]) for s in samples]),max([len(s[1]) for s in samples])
    res_x = torch.zeros(len(samples), max_len_x).long() + pad_idx
    res_y = torch.zeros(len(samples), max_len_y).long() + pad_idx
    if backwards: pad_first = not pad_first
    for i,s in enumerate(samples):
        if pad_first: 
            res_x[i,-len(s[0]):],res_y[i,-len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
        else:         
            res_x[i,:len(s[0]):],res_y[i,:len(s[1]):] = LongTensor(s[0]),LongTensor(s[1])
    if backwards: res_x,res_y = res_x.flip(1),res_y.flip(1)
    return res_x,res_y

In [249]:
class Seq2SeqDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training an RNN classifier."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,
               dl_tfms=None, pad_first=False, device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(seq2seq_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs//2)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, collate_fn=collate_fn, no_check=no_check)

In [250]:
class Seq2SeqTextList(TextList):
    _bunch = Seq2SeqDataBunch

In [277]:
src = (Seq2SeqTextList.from_df(train_df, cols='sentences', processor=transformer_processor)
       .split_from_df(col='valid')
       .label_from_df(cols='labels', label_cls=TextLabelList)
       .add_test(test_df))
src

LabelLists;

Train: LabelList (14040 items)
x: Seq2SeqTextList
<s> ĠEU Ġrejects ĠGerman Ġcall Ġto Ġboycott ĠBritish Ġlamb Ġ. </s>,<s> ĠPeter ĠBlackburn </s>,<s> ĠBR USS ELS Ġ1996 - 08 - 22 </s>,<s> ĠThe ĠEuropean ĠCommission Ġsaid Ġon ĠThursday Ġit Ġdisagreed Ġwith ĠGerman Ġadvice Ġto Ġconsumers Ġto Ġshun ĠBritish Ġlamb Ġuntil Ġscientists Ġdetermine Ġwhether Ġmad Ġcow Ġdisease Ġcan Ġbe Ġtransmitted Ġto Ġsheep Ġ. </s>,<s> ĠGermany Ġ' s Ġrepresentative Ġto Ġthe ĠEuropean ĠUnion Ġ' s Ġveterinary Ġcommittee ĠWerner ĠZ wing mann Ġsaid Ġon ĠWednesday Ġconsumers Ġshould Ġbuy Ġsheep meat Ġfrom Ġcountries Ġother Ġthan ĠBritain Ġuntil Ġthe Ġscientific Ġadvice Ġwas Ġclearer Ġ. </s>
y: TextLabelList
O B-MISC I-MISC B-PER I-PER B-ORG I-ORG B-LOC I-LOC [CLS] [SEP] X,O B-MISC I-MISC B-PER I-PER B-ORG I-ORG B-LOC I-LOC [CLS] [SEP] X,O B-MISC I-MISC B-PER I-PER B-ORG I-ORG B-LOC I-LOC [CLS] [SEP] X,O B-MISC I-MISC B-PER I-PER B-ORG I-ORG B-LOC I-LOC [CLS] [SEP] X,O B-MISC I-MISC B-PER I-PER B-ORG I-ORG