In [2]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report
import json

In [4]:
import pandas as pd
import numpy as np
# from tqdm import tqdm, trange

data = pd.read_csv("ner_dataset.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)

In [7]:
tags_vals = sorted(list(set(data["Tag"].values)))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [8]:
tag2idx

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16}

In [9]:
idx2tag = {}
for key in list(tag2idx.keys()) :
    idx2tag[tag2idx[key]] = key

In [10]:
idx2tag

{0: 'B-art',
 1: 'B-eve',
 2: 'B-geo',
 3: 'B-gpe',
 4: 'B-nat',
 5: 'B-org',
 6: 'B-per',
 7: 'B-tim',
 8: 'I-art',
 9: 'I-eve',
 10: 'I-geo',
 11: 'I-gpe',
 12: 'I-nat',
 13: 'I-org',
 14: 'I-per',
 15: 'I-tim',
 16: 'O'}

In [11]:
MAX_LEN = 75
bs      = 32

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

**<font color='red'>Need to change the input ID / tag creation to reflect the fact that BERT's tokenizer $\neq$ splitting by spaces. Some words will be tokenized into multiple segments which need to be assigned either O's or I- tags as necessary. </font>**

**Then we can pad sequences once it's guaranteed the IDs and labels have equal length.**

**NB: This means I will have to retain the model on Colab using this new splitting algorithm.**

In [14]:
tokenized_texts = []
labels          = []
for sentence in tqdm(getter.sentences) :
    # Split into tokens by spaces
    # Now split each token into sub-tokens using the tokenizer
    # such that any new sub-tokens receive either a "O" or "I-"
    # label as necessary
    words  = []
    lls    = []
    for i in range(len(sentence)) :
        # Not sure why there's some whitespaces but OK
        word  = tokenizer.tokenize(sentence[i][0])
        if len(word) == 0 :
            continue
        label = [sentence[i][2]]
        if len(word)>1 :
            label.extend([label[0].replace("B-","I-")]*(len(word)-1))
#             print(word,label)
#             input()
        try : 
            assert(len(word)==len(label))
        except : 
            print("+"+sentence[i][0]+"+")
            print(word,label)
            raise Exception()
        words.extend(word)
        lls.extend(label)
    tokenized_texts.append(words)
    labels.append(lls)

HBox(children=(IntProgress(value=0, max=47959), HTML(value='')))




In [15]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [16]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [17]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [18]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

# val_inputs = input_ids
# val_tags   = tags
# val_masks  = attention_masks

In [19]:
tr_inputs = torch.tensor(tr_inputs)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)

val_inputs = torch.tensor(val_inputs)
val_tags = torch.tensor(val_tags)
val_masks = torch.tensor(val_masks)

In [20]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [21]:
# Reload the model
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.load_state_dict(torch.load("ner.dataset.4.pth",map_location=torch.device('cpu')))
model.eval()
if str(device) != 'cpu' : 
    model.cuda()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [23]:
y_true = []
y_pred = []

isent=0
for i in tqdm(range(500)) : # tqdm(np.random.randint(0,len(val_inputs),500)) : # range(len(val_inputs))) :
    input_ids       = torch.tensor([val_inputs[i].cpu().numpy()]).to(device)
    tags            = torch.tensor([val_tags[i].cpu().numpy()]).to(device)
    attention_masks = torch.tensor([val_masks[i].cpu().numpy()]).to(device)
    
    outputs   = model(input_ids, token_type_ids=None,
                      attention_mask=attention_masks) # , labels=tags)
    
    ovar = outputs.to('cpu').detach().numpy()
    
    pred_labels = [np.argmax(ovar[0][j]) for j in range(len(ovar[0]))]
    
#     if i==6 :
#         words   = tokenizer.convert_ids_to_tokens(val_inputs[6].cpu().numpy())
#         my_true = tags.numpy()[0].tolist()
#         my_pred = pred_labels
#         for zz in zip(words,my_true,my_pred) :
#             print(zz)
    
    y_true.extend(tags.numpy()[0].tolist())
    y_pred.extend(pred_labels)
    
    isent+=1 
    
#     print(tags.numpy()[0])
#     print(pred_labels)
#     input()
        
#     input()

  # loss, scores = outputs[:2]

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




In [24]:
# Convert back to label representations
y_true = [idx2tag[i] for i in y_true]
y_pred = [idx2tag[i] for i in y_pred]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       B-art       0.33      0.25      0.29         4
       B-eve       0.33      0.25      0.29         4
       B-geo       0.85      0.93      0.89       393
       B-gpe       0.96      0.93      0.95       158
       B-nat       1.00      0.33      0.50         3
       B-org       0.79      0.67      0.72       229
       B-per       0.83      0.89      0.86       175
       B-tim       0.94      0.92      0.93       238
       I-art       0.22      0.33      0.27         6
       I-eve       0.00      0.00      0.00         5
       I-geo       0.78      0.88      0.83       254
       I-gpe       1.00      0.56      0.71         9
       I-nat       0.33      1.00      0.50         1
       I-org       0.84      0.62      0.72       396
       I-per       0.85      0.94      0.89       441
       I-tim       0.80      0.64      0.71       113
           O       1.00      1.00      1.00     35071

    accuracy              

In [None]:
# Let's also get the results without "O"
y_true_filt = []
y_pred_filt = []
for i in range(len(y_true)) :
    if y_true[i] == "O" : 
        continue
    y_true_filt.append(y_true[i])
    y_pred_filt.append(y_pred[i])
    
print(classification_report(y_true_filt, y_pred_filt))

<hr>

Want to write a little routine that takes in a sentence and returns an array of the tokens along with the appropriate labels.

In [103]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model     = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.load_state_dict(torch.load("ner.dataset.4.pth",map_location=torch.device('cpu')))
model.eval()
print("Model ready")

Model ready


In [104]:
MAX_LEN = 75
bs      = 32

In [117]:
def split_str_token(s) : 
    istart=0
    iend=0
    ovec=[]
    for i in range(len(s)) : 
        if str(s[i]).lower() in list('abcdefghijklmnopqrstuvwxyz1234567890') :
            continue
        elif s[i] in list('-/.?,!%*') :
            
            # Capture the term before it
            iend   = i
            ovec.append([s[istart:iend],istart,iend])
            istart = iend+1
            
            # Now capture the punctuation
            istart = i
            iend   = i+1
            ovec.append([s[i],istart,iend])
            istart = i+1
            continue
        else : 
            iend   = i
            ovec.append([s[istart:iend],istart,iend])
            istart = iend+1
    return ovec

In [153]:
def my_converter(tt) :
    ts = []
    for itt in tt :
        tok = 0
        try : 
            tok = tokenizer.convert_tokens_to_ids([itt])[0]
        except :
            pass
        ts.append(tok)
    return ts

In [360]:
# Break up the sentence into tokens
sentence       = sentences[43000]

# Ideally it would be good to have a pre-tokenzied sentence provided

tokenized_text = tokenizer.tokenize(sentence)
input_id       = pad_sequences([tokenizer.convert_tokens_to_ids(tokenized_text)],maxlen=MAX_LEN, 
                               dtype="long", truncating="post", padding="post")
attention_mask = [[float(i>0) for i in ii] for ii in input_id]
input_id       = torch.tensor(input_id)
attention_mask = torch.tensor(attention_mask)
outputs        = model(input_id,token_type_ids=None,attention_mask=attention_mask)
ovar           = outputs.to('cpu').detach().numpy()

# Have to filter out the padding
pred_labels = np.array([np.argmax(ovar[0][j]) for j in range(len(ovar[0]))])

# pred_labels = pred_labels[(input_id.numpy()[0]>0)]
pred_labels = pred_labels[0:len(tokenized_text)]

# Sanity check
assert(len(pred_labels)==len(tokenized_text))

# Convert the labels back to text representation
txt_labels = [idx2tag[i] for i in pred_labels]

# Create an output JSON for the tokenized text
odict = {'sentence':sentence,'predictions':[]}
for tt in zip(tokenized_text,txt_labels) :
    odict['predictions'].append({
        'token' : tt[0],
        'label' : tt[1]
    })
print(json.dumps(odict,indent=1))

{
 "sentence": "The U.S. Environmental Protection Agency celebrated Earth Day this week in Washington ( 22 April ) by showcasing environmentally friendly new designs that could be the wave of the future .",
 "predictions": [
  {
   "token": "the",
   "label": "O"
  },
  {
   "token": "u",
   "label": "B-org"
  },
  {
   "token": ".",
   "label": "I-org"
  },
  {
   "token": "s",
   "label": "I-org"
  },
  {
   "token": ".",
   "label": "I-org"
  },
  {
   "token": "environmental",
   "label": "I-org"
  },
  {
   "token": "protection",
   "label": "I-org"
  },
  {
   "token": "agency",
   "label": "I-org"
  },
  {
   "token": "celebrated",
   "label": "O"
  },
  {
   "token": "earth",
   "label": "B-eve"
  },
  {
   "token": "day",
   "label": "I-eve"
  },
  {
   "token": "this",
   "label": "O"
  },
  {
   "token": "week",
   "label": "O"
  },
  {
   "token": "in",
   "label": "O"
  },
  {
   "token": "washington",
   "label": "B-geo"
  },
  {
   "token": "(",
   "label": "O"
  },
  {
