# Bert named entity detection

In [24]:
from zipfile import ZipFile
from tools.dumps import wrap_open
import pandas as pd
from tqdm import tqdm, trange

!cd data && unzip ner.csv.zip

Archive:  ner.csv.zip
  inflating: ner.csv                 


In [47]:
# The columns are a bit irregular.
names = []
with wrap_open("ner.csv", "r", encoding="latin1") as f:
    names = f.readline().strip().split(",")[1:]
    names = names + list(range(34 - len(names)))

print(names)

with wrap_open("ner.csv", "rb") as f:
    data = pd.read_csv(f, encoding="latin1", names=names).fillna(method="ffill")
data.tail(10)

['Sentence #', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos', 'next-next-shape', 'next-next-word', 'next-pos', 'next-shape', 'next-word', 'pos', 'prev-iob', 'prev-lemma', 'prev-pos', 'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape', 'prev-prev-word', 'prev-shape', 'prev-word', 'sentence_idx', 'shape', 'word', 'tag', 0, 1, 2, 3, 4, 5, 6, 7, 8]


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Sentence #,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,tag,0,1,2,3,4,5,6,7,8
1050787,1048565.0,impact,.,__end1__,__END1__,wildcard,__END1__,.,punct,.,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050788,1048566.0,.,__end1__,__end2__,__END2__,wildcard,__END2__,__END1__,wildcard,__END1__,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050789,1048567.0,indian,forc,said,VBD,lowercase,said,NNS,lowercase,forces,...,B-gpe,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050790,1048568.0,forc,said,they,PRP,lowercase,they,VBD,lowercase,said,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050791,1048569.0,said,they,respond,VBD,lowercase,responded,PRP,lowercase,they,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050792,1048570.0,they,respond,to,TO,lowercase,to,VBD,lowercase,responded,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050793,1048571.0,respond,to,the,DT,lowercase,the,TO,lowercase,to,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050794,1048572.0,to,the,attack,NN,lowercase,attack,DT,lowercase,the,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050795,1048573.0,the,attack,with,IN,lowercase,with,NN,lowercase,attack,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
1050796,1048574.0,attack,with,machine-gun,JJ,contains-hyphen,machine-gun,IN,lowercase,with,...,O,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag


In [52]:
def aggregate(s):
    return [(w, p, t) for w, p, t in zip(s["word"].values.tolist(),
                                               s["pos"].values.tolist(),
                                               s["tag"].values.tolist())]

sentences = [s for s in data.groupby("sentence_idx").apply(aggregate)]

In [53]:
sentences[2]

[('The', 'DT', 'O'),
 ('African', 'JJ', 'B-gpe'),
 ('United', 'NNP', 'B-org'),
 ('Democratic', 'NNP', 'I-org'),
 ('Party', 'NNP', 'I-org'),
 ('tried', 'VBD', 'O'),
 ('unsuccessfully', 'RB', 'O'),
 ('to', 'TO', 'O'),
 ('register', 'VB', 'O'),
 ('as', 'IN', 'O'),
 ('an', 'DT', 'O'),
 ('official', 'JJ', 'O'),
 ('political', 'JJ', 'O'),
 ('party', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('mid', 'JJ', 'O'),
 ('2006', 'CD', 'B-tim'),
 ('.', '.', 'O')]

In [75]:
utterances = [[w[0] for w in s] for s in sentences]
tags = [[w[1] for w in s] for s in sentences]
labels = [[w[2] for w in s] for s in sentences]

tag_values = list(set(data["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [77]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [79]:
MAX_LEN = 75 ## can replace with 512 as per the original paper
bs = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 2080 Ti'

In [80]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

def tokenize_preserve_labels(sentence, text_labels):
    """
    Tokenize the given sentence. Extend the corresponding label
    for all the tokens the word is made of.
    
    Assumption: len(sentence) == len(text_labels)
    """
    
    tokenized_sentence = []
    labels = []
    
    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)
    
    return tokenized_sentence, labels

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [86]:
tokenized_texts_labels = [
    tokenize_preserve_labels(sent, labs) for sent, labs in zip(utterances, labels)
]

In [87]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_labels]
tokenized_labels = [token_label_pair[1] for token_label_pair in tokenized_texts_labels]

In [90]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                                maxlen=MAX_LEN, dtype="long", value=0.0, truncating="post", padding="post")

In [95]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                         dtype="long", truncating="post")

In [96]:
# apply a mask to the padding only.
# Normally, BERT masks are used for cloze-style questions, but this is a translation problem
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

## Setting up a dataloader

In [101]:
# Prepare train and validation sets

train_inputs, validation_inputs, train_tags, validation_tags = train_test_split(input_ids, tags, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_tags = torch.tensor(train_tags)
validation_tags = torch.tensor(validation_tags)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [102]:
train_data = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_tags)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=bs)

## Load and finetune the model

In [106]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

'2.11.0'

In [139]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)                                                  

In [140]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [141]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
    
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [142]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [143]:
from seqeval.metrics import f1_score, accuracy_score

In [164]:
import numpy as np

loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        
        outputs = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask, labels=b_labels)
    
        loss = outputs[0]

        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                              max_norm=max_grad_norm)
        
        # update parameters, and choose next learning rate
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    loss_values.append(avg_train_loss)
    
    # Now evaluate on the validation set
    
    model.eval()
    eval_loss, eval_accuracy = 0.0, 0.0
    number_eval_steps, number_eval_examples = 0, 0
    predictions, true_labels = [], []
    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask, labels=b_labels)
        
        # Move logits and labels to cpu
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch
        eval_loss += outputs[0].mean().item()
        
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
    
    
    eval_loss = eval_loss / len(validation_dataloader)
    validation_loss_values.append(eval_loss)
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                          for p_i, l_i in zip(p, l)]
    true_tags = [tag_values[l_i] for l in true_labels
                                      for l_i in l]
    
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, true_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, true_tags)))
    print()

Epoch:   0%|          | 0/3 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [167]:
pred_tags = [[tag_values[p_i] for p_i, l_i in zip(p, l)]
                 for p, l in zip(predictions, true_labels)]
    
true_tags = [[tag_values[l_i] for l_i in l]
                 for l in true_labels]

print(pred_tags[0])
print(true_tags[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'PAD', 'PAD', 'PAD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'PAD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'PAD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [166]:
print("Validation Accuracy: {}".format(accuracy_score(pred_tags, true_tags)))
print("Validation F1-Score: {}".format(f1_score(pred_tags, true_tags)))

Validation Accuracy: 0.5321949845026768




Validation F1-Score: 0.31665105270943245


In [154]:
pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                          for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
true_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]

print("Validation Accuracy: {}".format(accuracy_score(pred_tags, true_tags)))
print("Validation F1-Score: {}".format(f1_score(pred_tags, true_tags)))
print()

Validation Accuracy: 0.920246335990834


TypeError: Found input variables without list of list.

In [155]:
help(f1_score)

Help on function f1_score in module seqeval.metrics.sequence_labeling:

f1_score(y_true:List[List[str]], y_pred:List[List[str]], *, average:Union[str, NoneType]='micro', suffix:bool=False, mode:Union[str, NoneType]=None, sample_weight:Union[List[int], NoneType]=None, zero_division:str='warn', scheme:Union[Type[seqeval.scheme.Token], NoneType]=None)
    Compute the F1 score.
    
    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score are
    equal. The formula for the F1 score is::
    
        F1 = 2 * (precision * recall) / (precision + recall)
    
    Args:
        y_true : 2d array. Ground truth (correct) target values.
    
        y_pred : 2d array. Estimated targets as returned by a tagger.
    
        average : string, [None, 'micro' (default), 'macro', 'weighted']
            If ``None``, the scores for e