<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Tokenizador_para_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install magic_timer
!pip install transformers

In [17]:
import torch
import numpy as np
from magic_timer import MagicTimer

from transformers import BertTokenizerFast, BertModel, BertPreTrainedModel
from transformers import logging
logging.set_verbosity_error()

def pickle_file(path, data=None):
    import pickle
    if data is None:
        with open(path, 'rb') as f:
            return pickle.load(f)
    if data is not None:
        with open(path, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Fake data

In [10]:
X = [
        ['quais','sao','as','providencias','tomadas','em','relacao','a','pandemia','?','tem','alguma','negociacao','para','a','fatura','?'],
        ['oi', 'quero', 'cancelar', 'carta', 'pre', 'pago'],
        ['nao', 'pude', 'pagar', 'a', 'parcela', 'do', 'acordo'],
        ['parcelar', 'fatura', 'do', 'cartao', 'de', 'credito'],
        ['qual', 'o', 'numero', 'da', 'central', 'de', 'atendimento'],
]

In [12]:
# BILOU Schema B - 'beginning' I - 'inside' L - 'last' O - 'outside' U - 'unit' 
Y = [
        ['O','O','O','O','O','O','O','O','U-outros','O','O','O','U-produto','O','O','U-documento','O'],
        ['O', 'O', 'O', 'B-produto', 'I-produto', 'L-produto'],
        ['O', 'O', 'O', 'O', 'U-servico', 'O', 'U-produto'],
        ['O', 'U-documento', 'O', 'B-produto', 'I-produto', 'L-produto'],
        ['O', 'O', 'U-dados', 'O', 'B-canal', 'I-canal', 'L-canal'],
]

In [13]:
def get_unique_labels(labels, verbose=True):
    labels_flat = []
    for label in labels:
        labels_flat.extend(label)

    unique_labels = set()
    for ulabel in labels_flat:
        unique_labels.add(ulabel)

    label_map = {}

    for (i, label) in enumerate(unique_labels):
        label_map[label] = i

    if verbose:
        print(f'Number of sentences: {len(labels):,}') 
        print(f'unique-labels len: {len(unique_labels)}')
    
    return unique_labels, label_map

unique_labels, label_map  = get_unique_labels(labels=Y, verbose=True)

Number of sentences: 5
unique-labels len: 12


In [14]:
label_map

{'U-dados': 0,
 'B-canal': 1,
 'U-servico': 2,
 'U-documento': 3,
 'U-outros': 4,
 'L-produto': 5,
 'I-canal': 6,
 'O': 7,
 'U-produto': 8,
 'I-produto': 9,
 'B-produto': 10,
 'L-canal': 11}

In [15]:
BERT_path = 'neuralmind/bert-base-portuguese-cased'

tokenizer = BertTokenizerFast.from_pretrained(BERT_path, lowercase=True)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [18]:
def get_data_stats(data):
    lengths = []
    for sen in data:
        sen = ' '.join(sen)
        encoded_sent = tokenizer.encode(sen, add_special_tokens=True)
        lengths.append(len(encoded_sent))
    print(f'   Min length: {min(lengths):,} tokens')
    print(f'   Max length: {max(lengths):,} tokens')
    print(f'Median length: {int(np.median(lengths)):,} tokens\n')
    return int(np.median(lengths))

avg_len = get_data_stats(x)

   Min length: 3 tokens
   Max length: 88 tokens
Median length: 12 tokens



In [27]:
tokenizer.convert_ids_to_tokens(22280)

'##o'

In [28]:
MAX_LEN = avg_len

def prepare_inputs(data, labels, tokenizer, max_seq_len=MAX_LEN, train=True):
    input_ids, attention_masks = tokenize_data(tokenizer, data, max_seq_len)
    new_labels = []
    null_label_id = -100

    for i, (sen, orig_labels) in enumerate(zip(input_ids, labels)):
        padded_labels = []
        orig_labels_i = 0 
        for token_id in sen:
            token_id = token_id.numpy().item()
            if (token_id == tokenizer.pad_token_id) or \
                (token_id == tokenizer.cls_token_id) or \
                (token_id == tokenizer.sep_token_id):
                
                padded_labels.append(null_label_id)

            elif tokenizer.convert_ids_to_tokens(token_id)[0:2] == '##':
                padded_labels.append(null_label_id)
            else:
                label_str = orig_labels[orig_labels_i]
                padded_labels.append(label_map[label_str])
                orig_labels_i += 1

        assert(len(sen) == len(padded_labels))    
        new_labels.append(padded_labels)

    return new_labels
new_labels = prepare_inputs(X, Y, tokenizer, max_seq_len=MAX_LEN, train=True)
new_labels

[[-100, 7, 7, -100, 7, 7, -100, -100, 7, 7, 7, -100],
 [-100, 7, -100, 7, 7, -100, 10, 9, 5, -100, -100, -100],
 [-100, 7, -100, 7, -100, 7, 7, 2, 7, 8, -100, -100],
 [-100, 7, -100, 3, 7, 10, -100, 9, 5, -100, -100, -100],
 [-100, 7, 7, 0, -100, 7, 1, 6, 11, -100, -100, -100]]

![](https://drive.google.com/uc?id=1TnGVa13ufIq3Yu0v-eNuYH1BZaqcym-z)

![](https://drive.google.com/uc?id=1YIMz23PbdzhYrBmzsICvLE-deuzgFRdK)


In [29]:
def tokenize_data(data, max_seq_len=MAX_LEN):
    input_ids, attention_masks = [], []

    for sent in data:
        sent_str = ' '.join(sent)
        encoded_dict = tokenizer.encode_plus(
                            sent_str,
                            max_length=max_seq_len,
                            padding='max_length',        
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt',
                            )
        
        input_ids.append(encoded_dict['input_ids'][0])
        attention_masks.append(encoded_dict['attention_mask'][0])

    return input_ids, attention_masks

input_ids, attention_masks = tokenize_data(X)    
input_ids, attention_masks

([tensor([  101,  1647,   629, 22280,   260, 12950,  3292, 22281, 13842,   173,
            689,   102]),
  tensor([  101,   146, 22283, 18691,  7450,   159,  3743,   466, 12659,   102,
              0,     0]),
  tensor([  101,   229, 22280, 10340, 22279,  7198,   123, 16672,   171,  1365,
            102,     0]),
  tensor([  101, 16672, 22282, 19971,   171,  3743, 22280,   125,  6884,   373,
            102,     0]),
  tensor([  101,   615,   146,  5492, 22280,   180,  2692,   125, 11289,   102,
              0,     0])],
 [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0])])

In [30]:
def convert_to_tensor(input_ids, attention_masks, new_labels):
    pt_input_ids = torch.stack(input_ids, dim=0)
    pt_attention_masks = torch.stack(attention_masks, dim=0)
    pt_labels = torch.tensor(new_labels, dtype=torch.long)
    
    return pt_input_ids, pt_attention_masks, pt_labels    

pt_input_ids, pt_attention_masks, pt_labels = convert_to_tensor(input_ids, attention_masks, new_labels)

In [31]:
print('\nSentence:    ', X[2])
print('\nLabels:      ', Y[2])
print('\nBERT Tokens: ', tokenizer.tokenize(' '.join(X[2])))
print('\nToken IDs:   ', input_ids[2])
print('\nNew Labels:  ', new_labels[2])
print('\nMask:        ', attention_masks[2])


Sentence:     ['nao', 'pude', 'pagar', 'a', 'parcela', 'do', 'acordo']

Labels:       ['O', 'O', 'O', 'O', 'U-servico', 'O', 'U-produto']

BERT Tokens:  ['na', '##o', 'pud', '##e', 'pagar', 'a', 'parcela', 'do', 'acordo']

Token IDs:    tensor([  101,   229, 22280, 10340, 22279,  7198,   123, 16672,   171,  1365,
          102,     0])

New Labels:   [-100, 7, -100, 7, -100, 7, 7, 2, 7, 8, -100, -100]

Mask:         tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])
