In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Data

In [2]:
data = pd.read_csv('joint_corpus_with_pos_gazet.csv')

In [3]:
data.head()

Unnamed: 0,sent,WORD,orig_form,POS_CONC,NER,corpus_form,is_in_gazet_loc,is_in_gazet_org,is_in_gazet_per,is_in_gazet_loc_fuzzy,is_in_gazet_org_fuzzy,is_in_gazet_per_fuzzy
0,1,1,נראה,VB,O,נראה,False,False,False,False,False,False
1,1,2,שאביטל,REL|NNP,I_PERS,שאביטל,False,False,False,False,False,True
2,1,3,אברג'יל,NNP,I_PERS,אברג'יל,False,False,False,False,False,True
3,1,4,(,yyLRB,O,(,False,False,False,False,False,False
4,1,5,לשעבר,RB,O,לשעבר,False,False,False,False,False,False


Now, as bert expect sequences, let's create a sentence getter:

In [4]:
class SentenceGetter(object):

    def __init__(self, data, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = data["corpus_form"]
        self.labels = data["NER"]

    def sentences(self):
        sent = []
        counter = 0

        for token, label in zip(self.tokens, self.labels):
            if token == "DOCSTART":
                continue
            sent.append((token, label))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

    def get_next(self):
        try:
            while True:
                sent = []
                next_token = self.tokens[self.index]
                if next_token == "DOCSTART":
                    continue
                next_label = self.labels[self.index]
                sent.append((next_token, next_label))
                self.index += 1
                if next_token.strip() == ".":
                    return sent
        except:
            return None

getter = SentenceGetter(data)

Let's check our deep learning libraries are working properley:

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 4
Name of gpu: GeForce GTX 1080 Ti


We'll also add some constants that will determine the maximum sequence length and maximum batch sizes that we will feed the gpu:

In [6]:
MAX_LEN = 75
bs = 32

Next, let's get all of our sentences and labels:

In [7]:
all_sentences = [[token for token, label in sent] for sent in getter.sentences()]
all_orig_labels = [[label for token, label in sent] for sent in getter.sentences()]

print(all_sentences[0])
print(all_orig_labels[0])

['נראה', 'שאביטל', "אברג'יל", '(', 'לשעבר', 'אוז', ')', ',', 'אוהבת', 'לא', 'רק', 'לשחק', 'אצל', 'דן', "תורג'מן", '(', 'בסרט', '"', 'משהו', 'מתוק', '"', ')', ',', 'אלא', 'גם', 'איתו', '.']
['O', 'I_PERS', 'I_PERS', 'O', 'O', 'I_PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I_PERS', 'I_PERS', 'O', 'O', 'O', 'I_MISC__ENT', 'I_MISC__ENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Moving forward, we'll want to split our dataset into train and test:

In [8]:
train_sentences, test_sentences, train_orig_labels, test_orig_labels = train_test_split(all_sentences, all_orig_labels, random_state=42, test_size=0.25)

As bert expects a tokenized sentence, we'll need to use the BertTokenizer with multilingual support. We'll create a function to achieve this. It's important to note that bert tend to split words, or as they call it, split into word pieces. Therefore, we'll need to update our labels arrays and expend them.

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize(sentences, orig_labels):
    tokenized_texts = []
    labels = []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)

        assert len(bert_tokens) == len(bert_labels)

    return tokenized_texts, labels


train_tokenized_texts, train_labels = tokenize(train_sentences, train_orig_labels)
print(train_tokenized_texts[0])
print(train_labels[0])

['ל', '##ר', '##פ', '##פורט', 'לא', 'היה', 'מ', '##עולם', 'קשר', 'לי', '##הו', '##דים', 'או', 'לישראל', '.']
['I_PERS', 'I_PERS', 'I_PERS', 'I_PERS', 'O', 'O', 'O', 'O', 'O', 'I_MISC__AFF', 'I_MISC__AFF', 'I_MISC__AFF', 'O', 'I_LOC', 'O']


Next we need to create sequences with padding to give to bert. We'll add first some utilties to convert labels into numbers:

In [10]:
tags_vals = list(set(data["NER"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i: t for i, t in enumerate(tags_vals)}

Now, we can convert our sentences and labels into sequences with paddings

In [11]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                         dtype="long", truncating="post")

    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    
    return input_ids, tags, attention_masks
  

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_labels)

We're almost done. All that is left is to make tensors and  data loaders:

In [12]:
tr_inputs = torch.tensor(input_ids, dtype=torch.long)
tr_tags = torch.tensor(tags, dtype=torch.long)
tr_masks = torch.tensor(attention_masks, dtype=torch.long)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

Now we're ready to create our bert model and train it:

In [13]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-uncased", num_labels=len(tag2idx))

model.cuda()

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


epochs = 5
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:  20%|██        | 1/5 [00:23<01:32, 23.25s/it]

Train loss: 0.37259750548554094


Epoch:  40%|████      | 2/5 [00:46<01:09, 23.26s/it]

Train loss: 0.10909128620436317


Epoch:  60%|██████    | 3/5 [01:10<00:46, 23.36s/it]

Train loss: 0.06594536041742877


Epoch:  80%|████████  | 4/5 [01:34<00:23, 23.56s/it]

Train loss: 0.04130892623460999


Epoch: 100%|██████████| 5/5 [01:58<00:00, 23.69s/it]

Train loss: 0.026234773520723377





Great, we now have a trained model. Let's test it:

In [14]:
def test_model():
  classes_without_O = [x.replace('_', '-') for x in data["NER"].tolist() if x!='O']
      
  test_tokenized_texts, test_labels = tokenize(test_sentences, test_orig_labels)
  input_ids, tags, attention_masks = pad_sentences_and_labels(test_tokenized_texts, test_labels)

  val_inputs = torch.tensor(input_ids, dtype=torch.long)
  val_tags = torch.tensor(tags, dtype=torch.long)
  val_masks = torch.tensor(attention_masks, dtype=torch.long)

  test_data = TensorDataset(val_inputs, val_masks, val_tags)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  predictions, true_labels = [], []
  counter = 0
  for batch in test_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      with torch.no_grad():
          tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
          logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
      true_labels.append(label_ids)

      tmp_eval_accuracy = flat_accuracy(logits, label_ids)

      eval_loss += tmp_eval_loss.mean().item()
      eval_accuracy += tmp_eval_accuracy

      nb_eval_examples += b_input_ids.size(0)
      nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  print("Validation loss: {}".format(eval_loss))
  print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
  pred_tags = [tags_vals[p_i].replace('_', '-') for p in predictions for p_i in p]
  test_tags = [tags_vals[l_ii].replace('_', '-') for l in true_labels for l_i in l for l_ii in l_i]
  print("F1-Score: {}".format(f1_score(pred_tags, test_tags)))

  y_true = pd.Series(test_tags)
  y_pred = pd.Series(pred_tags)
  cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
  report = classification_report(y_true, y_pred, labels=classes_without_O, target_names=classes_without_O)
  report_with_O = classification_report(y_true, y_pred)

  return cross_tab, report, report_with_O, y_true, y_pred

#     print(test_tokenized_texts[0])
#     print([idx2tag.get(i) for i in predictions[0]])
#     print([idx2tag.get(i) for i in true_labels[0][0]])
    

cross_tab, report, report_with_O, y_true, y_pred = test_model()

Validation loss: 0.08596838836092502
Validation Accuracy: 0.9820673076923079
F1-Score: 0.7622478386167146


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [15]:
epochs = 8

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:  12%|█▎        | 1/8 [00:23<02:45, 23.60s/it]

Train loss: 0.020349489990621805


Epoch:  25%|██▌       | 2/8 [00:47<02:21, 23.62s/it]

Train loss: 0.014265910281162513


Epoch:  38%|███▊      | 3/8 [01:10<01:58, 23.63s/it]

Train loss: 0.012433759858954306


Epoch:  50%|█████     | 4/8 [01:34<01:34, 23.64s/it]

Train loss: 0.010185604933404216


Epoch:  62%|██████▎   | 5/8 [01:58<01:10, 23.64s/it]

Train loss: 0.008413136302567037


Epoch:  75%|███████▌  | 6/8 [02:21<00:47, 23.58s/it]

Train loss: 0.007044485827607691


Epoch:  88%|████████▊ | 7/8 [02:45<00:23, 23.63s/it]

Train loss: 0.005511880198977643


Epoch: 100%|██████████| 8/8 [03:09<00:00, 23.63s/it]

Train loss: 0.005124702012644296





In [16]:
cross_tab, report, report_with_O, y_true, y_pred = test_model()

Validation loss: 0.08489784598350525
Validation Accuracy: 0.9842735042735045
F1-Score: 0.8152327221438646


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [17]:
from conlleval import evaluate
evaluate(y_true, y_pred)

processed 60225 tokens with 1415 phrases; found: 1421 phrases; correct: 1156.
accuracy:  87.88%; (non-O)
accuracy:  98.45%; precision:  81.35%; recall:  81.70%; FB1:  81.52
             DATE: precision:  66.99%; recall:  64.49%; FB1:  65.71  103
              LOC: precision:  86.02%; recall:  85.50%; FB1:  85.76  329
        MISC--AFF: precision:  90.96%; recall:  91.94%; FB1:  91.44  188
        MISC--ENT: precision:  70.59%; recall:  54.55%; FB1:  61.54  17
       MISC-EVENT: precision:  72.22%; recall:  81.25%; FB1:  76.47  18
            MONEY: precision: 100.00%; recall:  95.56%; FB1:  97.73  43
              ORG: precision:  64.89%; recall:  69.06%; FB1:  66.91  282
          PERCENT: precision:  93.55%; recall:  95.08%; FB1:  94.31  62
             PERS: precision:  85.95%; recall:  85.48%; FB1:  85.71  370
             TIME: precision:  66.67%; recall:  60.00%; FB1:  63.16  9


(81.35116115411682, 81.69611307420494, 81.52327221438645)