In [231]:
import numpy as np
import json

In [232]:
# Notebook-wide configuration values
# NOTE(review): MAX_LEN and bs look like a maximum sequence length and a batch
# size, but neither is referenced in the cells visible here — confirm.
MAX_LEN = 75
bs      = 32

# Tag vocabulary: one 'B-' tag and one 'I-' tag per entity type, then 'O'.
# A tag's position in this ordering is its integer index, so the order of
# the entries below must not change.
_entity_types = ('art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim')
_tag_names    = [prefix + '-' + ent for prefix in ('B', 'I') for ent in _entity_types]
_tag_names.append('O')
tag2idx       = {tag: position for position, tag in enumerate(_tag_names)}

In [233]:
# Reverse lookup table: integer index -> tag name (inverse of tag2idx)
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [234]:
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Build the tokenizer and the token-classification model from the same
# pretrained identifier; the model gets one output label per tag2idx entry.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForTokenClassification.from_pretrained(
    pretrained_name,
    num_labels=len(tag2idx),
)

# Replace the model's weights with the saved checkpoint, mapped onto the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model.load_state_dict(
    torch.load("ner.dataset.4.pth", map_location=torch.device('cpu'))
)

# Put the model in evaluation mode before it is used for predictions.
model.eval()
print("Ready to use")

Ready to use


In [239]:
# Example inputs for the tagger. Previously every candidate was commented out,
# so `sentence` was never assigned and the next cell failed with a NameError
# on a fresh kernel. Keep the examples as data and pick one by index.
SAMPLE_SENTENCES = [
    "He said last week's tsunami and the massive underwater earthquake that triggered it has affected millions in Asia and Africa.",
    "In Beirut, a string of officials voiced their anger, while at the United Nations summit in New York, Prime Minister Fouad Siniora said the Lebanese people are resolute in preventing such attempts from destroying their spirit.",
    'Lebanon has suffered a series of bombings since the massive explosion in February that killed former Prime Minister Rafik Hariri and 20 other people.',
    "The attacks came as Britain 's Foreign Secretary Jack Straw met in Baghdad with President Jalal Talabani about the slow progress in forming a new Iraqi government .",
    "In another development, an Italian court has scheduled an August 17 extradition hearing for one of the suspects in the July 21 failed London bombings.",
]

# Index 0 is the sentence shown in the recorded output of the prediction cell;
# change the index to try one of the other examples.
sentence = SAMPLE_SENTENCES[0]


In [240]:
# Tag `sentence` with the loaded model and print one predicted label per
# tokenizer token as JSON.
#
# NOTE: labels are reported per tokenizer token, not per whitespace-separated
# word (e.g. "week's" comes out as the tokens "week", "'", "s"), so they do
# not map 1:1 back onto the words of the original sentence.
tokenized_text = tokenizer.encode(sentence, add_special_tokens=False)
tok_senl       = [tokenizer.decode([tok_id]) for tok_id in tokenized_text]
input_ids      = torch.tensor([tokenized_text])  # Batch size 1

# Placeholder labels (every position set to class 1), passed along with the
# inputs — presumably so the model output starts with a loss term. The loss
# computed against these dummy targets carries no meaning.
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1

# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    outputs = model(input_ids, labels=labels)

loss, scores = outputs[:2]

# Highest-scoring class for each token of the single batch item. The score
# tensor is converted to numpy once and reduced in one vectorised call
# (no padding is involved: the batch holds exactly one unpadded sentence).
pred_labels = np.argmax(scores[0].detach().numpy(), axis=-1)

# Sanity check: exactly one prediction per input token
assert len(pred_labels) == len(tokenized_text), "prediction/token count mismatch"

# Convert the label indices back to their text representation
txt_labels = [idx2tag[int(label_idx)] for label_idx in pred_labels]

# Create an output JSON for the tokenized text
odict = {
    'sentence': sentence,
    'predictions': [
        {'token': token, 'label': label}
        for token, label in zip(tok_senl, txt_labels)
    ],
}
print(json.dumps(odict, indent=1))

{
 "sentence": "He said last week's tsunami and the massive underwater earthquake that triggered it has affected millions in Asia and Africa.",
 "predictions": [
  {
   "token": "he",
   "label": "O"
  },
  {
   "token": "said",
   "label": "O"
  },
  {
   "token": "last",
   "label": "O"
  },
  {
   "token": "week",
   "label": "O"
  },
  {
   "token": "'",
   "label": "O"
  },
  {
   "token": "s",
   "label": "O"
  },
  {
   "token": "tsunami",
   "label": "O"
  },
  {
   "token": "and",
   "label": "O"
  },
  {
   "token": "the",
   "label": "O"
  },
  {
   "token": "massive",
   "label": "O"
  },
  {
   "token": "underwater",
   "label": "O"
  },
  {
   "token": "earthquake",
   "label": "O"
  },
  {
   "token": "that",
   "label": "O"
  },
  {
   "token": "triggered",
   "label": "O"
  },
  {
   "token": "it",
   "label": "O"
  },
  {
   "token": "has",
   "label": "O"
  },
  {
   "token": "affected",
   "label": "O"
  },
  {
   "token": "millions",
   "label": "O"
  },
  {
   "to