<a href="https://colab.research.google.com/github/ericburdett/named-entity-recognition/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition

This notebook is for the purpose of understanding Named Entity Recognition and if it would be a viable option for use with the French Death Records.

## Imports and Dataset Creation

In [1]:
!pip install pytorch_pretrained_bert



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
import os
import gzip
import tarfile
import pandas as pd
from PIL import Image, ImageOps
import gc
import pdb
from google.colab import drive
drive.mount('/content/drive')
from IPython.core.ultratb import AutoFormattedTB
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

assert torch.cuda.is_available(), "Request a GPU from Runtime > Change Runtime"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!cp -r "drive/My Drive/datasets/twitter-ner/" "/content"

In [0]:
class TwitterDataset(Dataset):
  def __init__(self, dataset_type='train'):
    if not os.path.exists('/content/twitter-ner/'):
      raise Exception('Twitter-ner dataset does not exist!')
    
    if dataset_type == 'train':
      dataset = self.load_csv('a.conll.txt')
      dataset = pd.concat((dataset, self.load_csv('b.conll.txt')), ignore_index=True)
      dataset = pd.concat((dataset, self.load_csv('e.conll.txt')), ignore_index=True)
      dataset = pd.concat((dataset, self.load_csv('f.conll.txt')), ignore_index=True)
      self.input, self.target, self.classes = self.split_input_target(dataset)

    else:
      dataset = self.load_csv('g.conll.txt')
      dataset = pd.concat((dataset, self.load_csv('h.conll.txt')), ignore_index=True)
      self.input, self.target, self.classes = self.split_input_target(dataset)

  def __getitem__(self, index):
    return self.input[index], self.target[index]

  def __len__(self):
    return len(self.target)

  def num_classes(self):
    return len(self.classes)

  def class_list(self):
    return self.classes

  def load_csv(self, filename):
    return pd.read_csv('/content/twitter-ner/' + filename, header=None, names=['input'], error_bad_lines=False, quoting=3)

  def split_input_target(self, df):
    input = []
    target = []
    classes = []
    
    for _, row in df.iterrows():
      words = str(row['input']).split('~`~`')

      tweet_input = ''
      tweet_target = []

      for word in words:
        if len(word) == 0 or word.find('\t') == -1:
          continue
        splits = word.split('\t')
        tweet_input += ' ' + splits[0]
        tweet_target.append(splits[1])

        if splits[1] not in classes:
          classes.append(splits[1])
      # tweet_input += '</s>'
      # tweet_target.append('O')

      input.append(tweet_input)
      target.append(tweet_target)
    
    return input, target, classes

In [69]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = TwitterDataset()
tokenizer.tokenize(dataset[0][0])
# len(dataset[0][1])

['happy',
 'new',
 'year',
 '!',
 '!',
 '!',
 'may',
 'this',
 'year',
 'bring',
 'with',
 'it',
 'all',
 'the',
 'best',
 'from',
 'years',
 'past',
 '&',
 'new',
 'moments',
 'to',
 'remember',
 'for',
 'a',
 'lifetime',
 '.',
 '#',
 'new',
 '##year',
 '2012']

In [52]:

dataset = TwitterDataset()
print(dataset[2][0])
tokenizer.convert_tokens_to_ids(dataset[2][0])

s


[1055]

## Model Implementation

In [70]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased")


TypeError: ignored

In [0]:
NUM_EPOCHS = 5
BATCH_SIZE = 50

train_dataset = TwitterDataset(dataset_type='train')
val_dataset = TwitterDataset(dataset_type='val')

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=train_dataset.num_classes())
model.cuda()

optimizer = BertAdam(model.classifier.parameters(), lr=3e-5)

# Train for NUM_EPOCHS
for epoch in range(NUM_EPOCHS):
  # Training Loop
  train_loop = tqdm(total=len(train_loader), position=0, leave=False)
  for batch, (x, y_truth) in enumerate(train_loader):
    optimizer.zero_grad()

    pdb.set_trace()

    # x, y_truth = x.cuda(async=True), y_truth.cuda(async=True)

    loss = model(x, labels=y_truth)

    loss.backward()

    train_loop.set_description('epoch:{}, loss:{}'.format(epoch, loss.item()))
    train_loop.update()

    optimizer.step()
    model.zero_grad()

  train_loop.close()

  # Validation Loop
  val_loop = tqdm(total=len(val_loader), position=0, leave=False)
  for batch, (x, y_truth) in enumerate(val_loader):
    
    x, y_truth = x.cuda(async=True), y_truth.cuda(async=True)
    loss = model(x, labels=y_truth)

    val_loop.set_description('epoch:{}, loss:{}'.format(epoch, loss.item()))
    val_loop.update(1)
  
  val_loop.close()

## BERT Tutorial

In [0]:
!cp -r "drive/My Drive/datasets/ner_dataset.csv" "/content"

In [72]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

data = pd.read_csv("ner_dataset.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
getter = SentenceGetter(data)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
labels = [[s[2] for s in sent] for sent in getter.sentences]
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [75]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [76]:
MAX_LEN = 75
bs = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [0]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [0]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [0]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [0]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [0]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [0]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [0]:
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
model.cuda();

In [0]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [88]:
!pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=3728c1554fa4248172c2deeee9f7e65d0e8dbb21f6a6ce830719b8e7a1e06483
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [0]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
epochs = 5
max_grad_norm = 1.0

for epoch in range(epochs):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    loop = tqdm(total=len(train_dataloader), position=0, leave=False)
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        
        loop.set_description('epoch:{}, loss:{}'.format(epoch, loss.item()))
        loop.update(1)

        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    
    loop.close()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))