In [1]:
import pandas as pd
import numpy as np
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [3]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [4]:
getter = SentenceGetter(data)

In [5]:
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [41]:
sentences[6]

"He said last week 's tsunami and the massive underwater earthquake that triggered it has affected millions in Asia and Africa ."

In [6]:
tags_vals = sorted(list(set(data["Tag"].values)))
tags_vals.append('[PAD]') 
tag2idx   = {t: i for i, t in enumerate(tags_vals)}

In [7]:
tag2idx

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16,
 '[PAD]': 17}

In [8]:
idx2tag = {}
for key in list(tag2idx.keys()) :
    idx2tag[tag2idx[key]] = key

In [9]:
# !pip install pytorch-pretrained-bert==0.4.0
# !pip install transformers

In [10]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

Using TensorFlow backend.


In [11]:
MAX_LEN = 75
bs      = 32

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [13]:
device

device(type='cpu')

In [15]:
if "cuda" in str(device) : 
    print(torch.cuda.get_device_name(0))

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

BERT does some weird tokenization of out-of-vocabulary words into pieces, e.g. `"gunships"` becomes `[ "guns", "##hips" ]`, so we need to account for that in our labelling.

In [17]:
tokenized_texts = []
labels          = []
for sentence in tqdm(getter.sentences) :
    # Split into tokens by spaces
    # Now split each token into sub-tokens using the tokenizer
    # such that any new sub-tokens receive either a "O" or "I-"
    # label as necessary
    words  = []
    lls    = []
    for i in range(len(sentence)) :
        # Not sure why there's some whitespaces but OK
        word  = tokenizer.tokenize(sentence[i][0])
        if len(word) == 0 :
            continue
        label = [sentence[i][2]]
        if len(word)>1 :
            label.extend([label[0].replace("B-","I-")]*(len(word)-1))
#             print(word,label)
#             input()
        try : 
            assert(len(word)==len(label))
        except : 
            print("+"+sentence[i][0]+"+")
            print(word,label)
            raise Exception()
        words.extend(word)
        lls.extend(label)
    tokenized_texts.append(words)
    labels.append(lls)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=47959), HTML(value='')))




In [18]:
assert(len(tokenized_texts[6])==len(labels[6]))

In [19]:
# Pad with zeroes (0 = attention mask index in BERT)
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [20]:
# Not sure what the best option is for padding here
# attention masks get ignored during loss calculation anyway - maybe not an issue?
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["[PAD]"], padding="post",
                     dtype="long", truncating="post")

In [21]:
# Used to flag which terms are padding and which are real data
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [22]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [23]:
tr_inputs  = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags    = torch.tensor(tr_tags)
val_tags   = torch.tensor(val_tags)
tr_masks   = torch.tensor(tr_masks)
val_masks  = torch.tensor(val_masks)

In [24]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [0]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

In [0]:
if "cuda" in str(device) : 
  model.cuda();

In [0]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 
                )

from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
!pip install seqeval



In [0]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels, pad_index):
    pred_flat   = np.array(np.argmax(preds, axis=2).flatten())
    labels_flat = np.array(labels.flatten())
    return np.sum(pred_flat[labels_flat!=pad_index] == labels_flat[labels_flat!=pad_index]) / len(labels_flat[labels_flat!=pad_index])

In [0]:
from datetime import datetime
from google.colab import output

In [0]:
# Number of batches per epoch
niter = len(list(enumerate(train_dataloader)))

# Training the model

In [0]:
max_grad_norm  = 1.0
dstart=datetime.now()
for iepoch in range(epochs) : 
    print("Working on epoch %i"%(iepoch+1))
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    ientry=0
    tepoch_start = datetime.now()
    for step, batch in enumerate(train_dataloader) :
        ientry+=1
        if ientry%50==0 :
          tnow    = datetime.now()
          remtime = ((tnow - tepoch_start)/ientry)*(niter-ientry)
          print("Epoch %i/%i : Processed %i/%i batches, elapsed time: %s, remaining time in epoch: %s"%(iepoch+1,epochs,
                                                                                                        ientry,niter,
                                                                                                        datetime.now()-dstart,remtime))

        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss    = outputs[0]
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        scheduler.step()
        
        model.zero_grad()

    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # See: https://pytorch.org/tutorials/beginner/saving_loading_models.html
    print("Saving checkpoint to: ner.dataset.transformers.%i.pth"%(iepoch))
    torch.save(model.state_dict(), "/content/drive/My Drive/ner.dataset.transformers.%i.pth"%(iepoch))

    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
          outputs = model(b_input_ids, token_type_ids=None,
                          attention_mask=b_input_mask, labels=b_labels)
          tmp_eval_loss, logits = outputs[:2]

        logits    = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # NOTE: Need to ignore padding when evaluating accuracy
        # (NB: transformers does this already in loss function, but not
        # pytorch-pretrained-bert==0.4.0 - this is a known bug)

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids, pad_index=tag2idx['[PAD]'])
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags  = np.array([tags_vals[p_i] for p in predictions for p_i in p])
    valid_tags = np.array([tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i])

    # Need to explicitly factor padding out of F1 score calculation
    print("F1-Score: {}".format(f1_score(pred_tags[valid_tags!=['[PAD]']].tolist(), valid_tags[valid_tags!=['[PAD]']].tolist())))

Working on epoch 1
Epoch 1/3 : Processed 50/1349 batches, elapsed time: 0:00:23.603158, remaining time in epoch: 0:10:13.160475
Epoch 1/3 : Processed 100/1349 batches, elapsed time: 0:00:47.869139, remaining time in epoch: 0:09:57.861328
Epoch 1/3 : Processed 150/1349 batches, elapsed time: 0:01:12.214282, remaining time in epoch: 0:09:37.217784
Epoch 1/3 : Processed 200/1349 batches, elapsed time: 0:01:36.662213, remaining time in epoch: 0:09:15.312849
Epoch 1/3 : Processed 250/1349 batches, elapsed time: 0:02:01.307265, remaining time in epoch: 0:08:53.257879
Epoch 1/3 : Processed 300/1349 batches, elapsed time: 0:02:26.013936, remaining time in epoch: 0:08:30.555643
Epoch 1/3 : Processed 350/1349 batches, elapsed time: 0:02:50.777435, remaining time in epoch: 0:08:07.442070
Epoch 1/3 : Processed 400/1349 batches, elapsed time: 0:03:15.628844, remaining time in epoch: 0:07:44.124583
Epoch 1/3 : Processed 450/1349 batches, elapsed time: 0:03:40.599404, remaining time in epoch: 0:07:20


<hr>

# Reload the model and evaluate accuracy

In [0]:
# Reload the model
# model = TheModelClass(*args, **kwargs)
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.load_state_dict(torch.load("/content/drive/My Drive/ner.dataset.transformers.1.pth"))
model.eval()
model.cuda()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
y_true = []
y_pred = []

isent=0
for i in tqdm(range(500)) : # tqdm(np.random.randint(0,len(val_inputs),500)) : # range(len(val_inputs))) :
    input_ids       = torch.tensor([val_inputs[i].cpu().numpy()]).to(device)
    tags            = torch.tensor([val_tags[i].cpu().numpy()]).to(device)
    attention_masks = torch.tensor([val_masks[i].cpu().numpy()]).to(device)
    
    outputs   = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=tags)
    loss, prediction_scores = outputs[:2]

    ovar        = prediction_scores.cpu().detach().numpy()
    pred_labels = np.array([np.argmax(ovar[0][j]) for j in range(len(ovar[0]))])  
    noid_masks  = (input_ids[0]>0).detach().cpu().numpy()

    y_true.extend(tags.cpu().numpy()[0][noid_masks].tolist())
    y_pred.extend(pred_labels[noid_masks])
    
    isent+=1

# Convert back to label representations
y_true = [idx2tag[i] for i in y_true]
y_pred = [idx2tag[i] for i in y_pred]
print(classification_report(y_true, y_pred))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         4
       B-eve       0.00      0.00      0.00         4
       B-geo       0.84      0.92      0.88       393
       B-gpe       0.95      0.92      0.94       158
       B-nat       0.33      0.33      0.33         3
       B-org       0.80      0.62      0.70       229
       B-per       0.79      0.86      0.83       175
       B-tim       0.95      0.91      0.93       238
       I-art       0.00      0.00      0.00         6
       I-eve       0.50      0.20      0.29         5
       I-geo       0.79      0.82      0.81       254
       I-gpe       0.83      0.56      0.67         9
       I-nat       1.00      1.00      1.00         1
       I-org       0.79      0.60      0.68       396
       I-per       0.82      0.96      0.88       441
       I-tim       0.84      0.71      0.77       113
           O       0.98      0.99      0.99      9999

    accuracy             

  _warn_prf(average, modifier, msg_start, len(result))


# Reload analogous model from `pytorch-pretrained-bert`

In [28]:
# Some config variables
# NOTE: Back when I trained this model I didn't have an explicit [PAD] label
tag2idx = {'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16}

# Reload the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.load_state_dict(torch.load("ner.dataset.4.pth",map_location=torch.device('cpu')))
model.eval()
# model.cuda()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

y_true = []
y_pred = []

isent=0
for i in tqdm(range(500)) : # tqdm(np.random.randint(0,len(val_inputs),500)) : # range(len(val_inputs))) :
    input_ids       = torch.tensor([val_inputs[i].cpu().numpy()]).to(device)
    tags            = torch.tensor([val_tags[i].cpu().numpy()]).to(device)
    attention_masks = torch.tensor([val_masks[i].cpu().numpy()]).to(device)
    
    outputs   = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=tags)
    loss, prediction_scores = outputs[:2]

    ovar        = prediction_scores.cpu().detach().numpy()
    pred_labels = np.array([np.argmax(ovar[0][j]) for j in range(len(ovar[0]))])  
    noid_masks  = (input_ids[0]>0).detach().cpu().numpy()

    y_true.extend(tags.cpu().numpy()[0][noid_masks].tolist())
    y_pred.extend(pred_labels[noid_masks])
    
    isent+=1

# Convert back to label representations
y_true = [idx2tag[i] for i in y_true]
y_pred = [idx2tag[i] for i in y_pred]
print(classification_report(y_true, y_pred))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

(tensor([[[-2.2406, -3.0015, -1.4142,  ..., -2.1016, -2.4216, 12.0613],
         [-2.4263, -3.1180, -1.7738,  ..., -1.9558, -2.2086, 12.1448],
         [-2.3871, -3.0961, -1.6323,  ..., -1.9416, -2.3062, 12.1193],
         ...,
         [-2.3336, -2.9671, -1.7897,  ..., -1.8481, -2.0898, 12.2888],
         [-2.3318, -2.9709, -1.7947,  ..., -1.8535, -2.0813, 12.2904],
         [-2.3335, -2.9719, -1.7941,  ..., -1.8620, -2.0559, 12.2922]]],
       grad_fn=<AddBackward0>),)



ValueError: max() arg is an empty sequence

**Note:** Results appear to be within a few percent of the `tranformers` model, so the two seem to be in agreement.

<hr>

In [0]:
# Let's also get the results without "O"
y_true_filt = []
y_pred_filt = []
for i in range(len(y_true)) :
    if y_true[i] == "O" : 
        continue
    y_true_filt.append(y_true[i])
    y_pred_filt.append(y_pred[i])
    
print(classification_report(y_true_filt, y_pred_filt))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00         4
       B-eve       0.50      0.25      0.33         4
       B-geo       0.87      0.92      0.90       393
       B-gpe       0.98      0.92      0.95       158
       B-nat       0.50      0.67      0.57         3
       B-org       0.86      0.69      0.76       229
       B-per       0.87      0.88      0.87       175
       B-tim       0.99      0.92      0.96       238
       I-art       0.00      0.00      0.00         6
       I-eve       0.25      0.20      0.22         5
       I-geo       0.86      0.84      0.85       254
       I-gpe       1.00      0.56      0.71         9
       I-nat       1.00      1.00      1.00         1
       I-org       0.89      0.71      0.79       396
       I-per       0.88      0.96      0.92       441
       I-tim       0.97      0.73      0.83       113
           O       0.00      0.00      0.00         0

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
