# Install Dependencies

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Project: Build your own NER Tagger

__Named Entity Recognition (NER)__ , also known as entity chunking/extraction , is a popular technique used in information extraction to identify and segment the named entities and classify or categorize them under various predefined classes.

There are various off the shelf solutions which offer capabilites to perform named entity extraction (some of which we discussed in the previous units). Yet there are times when the requirements are beyond the capabilities of off-the-shelf classifiers.

In this notebook, we will go through an exercise to build our own NER using Transformers

## Load Dataset

Named Entity Recognition is a sequence modeling problem at it's core. It is more related to classification class of problems where in we need a labeled dataset to train a classifier. 

There are various labeled datasets for NER class of problems. We would be utilizing a pre-processed version of __GMB(Groningen Meaning Bank) corpus__ for this notebook. The preprocessed version is availble at the following link : [kaggle/ner](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus)

We have provided the dataset in the code repository itself using some intelligent compression and you can access it directly from `pandas` as follows.

In [None]:
import pandas as pd

df = pd.read_csv('https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2008%20-%20Project%206%20-%20Build%20your%20NER%20Tagger/ner_dataset.csv.gz', compression='gzip', encoding='ISO-8859-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [None]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1048535,1048536,1048537,1048538,1048539,1048540,1048541,1048542,1048543,1048544,1048545,1048546,1048547,1048548,1048549,1048550,1048551,1048552,1048553,1048554,1048555,1048556,1048557,1048558,1048559,1048560,1048561,1048562,1048563,1048564,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,,,,,,,,,,,,,,,,,,,,,,,,Sentence: 2,,,,,,,,,,,,,,,,...,,,,,,,,,,,Sentence: 47957,,,,,,,,,,,Sentence: 47958,,,,,,,,,,,Sentence: 47959,,,,,,,
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,war,in,Iraq,and,demand,the,withdrawal,of,British,troops,from,that,country,.,Families,of,soldiers,killed,in,the,conflict,joined,the,protesters,who,carried,banners,with,such,slogans,...,of,the,rockets,landed,near,a,border,security,outpost,.,Two,more,landed,in,fields,belonging,to,a,nearby,village,.,They,say,not,all,of,the,rockets,exploded,upon,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,NN,IN,NNP,CC,VB,DT,NN,IN,JJ,NNS,IN,DT,NN,.,NNS,IN,NNS,VBN,IN,DT,NN,VBD,DT,NNS,WP,VBD,NNS,IN,JJ,NNS,...,IN,DT,NNS,VBD,IN,DT,NN,NN,NN,.,CD,JJR,VBD,IN,NNS,VBG,TO,DT,JJ,NN,.,PRP,VBP,RB,DT,IN,DT,NNS,VBD,IN,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-gpe,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-gpe,O,O,O,O,O,O,O


## Basic Data Formatting

In [None]:
df = df.fillna(method='ffill')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  1048575 non-null  object
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [None]:
df['sentence_id'] = [item.split(':')[1].strip() for item in df['Sentence #'].values]
df['words'] = df['Word']
df['pos'] = df['POS']
df['labels'] = df['Tag']
df = df[['sentence_id', 'words', 'pos', 'labels']]

In [None]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1048535,1048536,1048537,1048538,1048539,1048540,1048541,1048542,1048543,1048544,1048545,1048546,1048547,1048548,1048549,1048550,1048551,1048552,1048553,1048554,1048555,1048556,1048557,1048558,1048559,1048560,1048561,1048562,1048563,1048564,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
sentence_id,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,47956,47956,47956,47956,47956,47956,47956,47956,47956,47956,47957,47957,47957,47957,47957,47957,47957,47957,47957,47957,47957,47958,47958,47958,47958,47958,47958,47958,47958,47958,47958,47958,47959,47959,47959,47959,47959,47959,47959,47959
words,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,war,in,Iraq,and,demand,the,withdrawal,of,British,troops,from,that,country,.,Families,of,soldiers,killed,in,the,conflict,joined,the,protesters,who,carried,banners,with,such,slogans,...,of,the,rockets,landed,near,a,border,security,outpost,.,Two,more,landed,in,fields,belonging,to,a,nearby,village,.,They,say,not,all,of,the,rockets,exploded,upon,impact,.,Indian,forces,said,they,responded,to,the,attack
pos,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,NN,IN,NNP,CC,VB,DT,NN,IN,JJ,NNS,IN,DT,NN,.,NNS,IN,NNS,VBN,IN,DT,NN,VBD,DT,NNS,WP,VBD,NNS,IN,JJ,NNS,...,IN,DT,NNS,VBD,IN,DT,NN,NN,NN,.,CD,JJR,VBD,IN,NNS,VBG,TO,DT,JJ,NN,.,PRP,VBP,RB,DT,IN,DT,NNS,VBD,IN,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
labels,O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-gpe,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-gpe,O,O,O,O,O,O,O


In [None]:
df.sentence_id.nunique(), df.words.nunique(), df.pos.nunique(), df.labels.nunique()

(47959, 35178, 42, 17)

We have 47959 sentences that contain 35178 unique words.

These sentences have a total of 42 unique POS tags and 17 unique NER tags in total.

## Tag Distribution

The GMB dataset utilizes IOB tagging or _Inside, Outside Beginning_. IOB is a common tagging format for tagging tokens which we have discussed earlier. To refresh your memory:

+ __I- prefix__ before a tag indicates that the tag is inside a chunk.
+ __B- prefix__ before a tag indicates that the tag is the beginning of a chunk.
+ __O-  tag__ indicates that a token belongs to no chunk (outside).

The tags in this dataset are explained as follows:

+ __geo__ = Geographical Entity
+ __org__ = Organization
+ __per__ = Person
+ __gpe__ = Geopolitical Entity
+ __tim__ = Time indicator
+ __art__ = Artifact
+ __eve__ = Event
+ __nat__ = Natural Phenomenon

Anything outside these classes is termed as other, denoted as __O__. 

The following output shows the unbalanced distribution of different tags in the dataset

In [None]:
df.labels.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: labels, dtype: int64

In [None]:
custom_labels = df.labels.unique().tolist()
custom_labels

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

## Prepare Train and Test Datasets

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

dataset = df[['sentence_id', 'words', 'labels']]

X_train, X_test = train_test_split(dataset, test_size=0.25, random_state=42, shuffle=False)
X_train.shape, X_test.shape

((786431, 3), (262144, 3))

In [None]:
X_train.head()

Unnamed: 0,sentence_id,words,labels
0,1,Thousands,O
1,1,of,O
2,1,demonstrators,O
3,1,have,O
4,1,marched,O


# NER Training with Transformers

Here we will use the excellent __[SimpleTransformers](https://simpletransformers.ai/)__ framework which is a wrapper on top of the already popular `transformers` framework from HuggingFace.

SimpleTransformers enables us to focus on the core workflow and task at hand, taking out the heavy lifting of data formatting and writing unnecessary boilerplate code

In [None]:
!pip install simpletransformers



# Load Dependencies and Setup NER Model Configs

In [None]:
import logging
from simpletransformers.ner import NERModel, NERArgs

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)



In [None]:
# Configure the model
model_args = NERArgs()
model_args.train_batch_size = 16
model_args.num_train_epochs = 5
model_args.evaluate_during_training = True

In [None]:
model_args

NERArgs(adam_epsilon=1e-08, best_model_dir='outputs/best_model', cache_dir='cache_dir/', config={}, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=2, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=8, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=False, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, manual_seed=None, max_grad_norm=1.0, max_seq_length=128, model_name=None, model_type=None, multiprocessing_chunksize=500, n_gpu=1, no_cache=False, no_save=False, num_train_epochs=5, output_dir='outputs/', overwrite_output_dir=False, process_count=2, quantized_model=False, reprocess_input_data=True, save_best_model=True, save_eval_chec

# Load Pretrained RoBERTa Model

The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Liu et. al. It is based on Google’s BERT model released in 2018.

It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates.

__The abstract from the paper is the following:__

_Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. 
Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. 
We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. 
We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. 
These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code._

__Tips:__

- This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained models.
- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a different pre-training scheme.
- RoBERTa doesn’t have token_type_ids, you don’t need to indicate which token belongs to which segment. Just separate your segments with the separation token tokenizer.sep_token (or </s>)

[Source](https://huggingface.co/transformers/model_doc/roberta.html)

In [None]:
model = NERModel(
    "roberta", "roberta-base", args=model_args, labels=custom_labels
)

INFO:filelock:Lock 140406857714600 acquired on /root/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140406857714600 released on /root/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690.lock





INFO:filelock:Lock 140406857714600 acquired on /root/.cache/torch/transformers/80b4a484eddeb259bec2f06a6f2f05d90934111628e0e1c09a33bd4a121358e1.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…

INFO:filelock:Lock 140406857714600 released on /root/.cache/torch/transformers/80b4a484eddeb259bec2f06a6f2f05d90934111628e0e1c09a33bd4a121358e1.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e.lock





Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140404697037176 released on /root/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b.lock





INFO:filelock:Lock 140404706527104 acquired on /root/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140404706527104 released on /root/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock





# Create Training and Validation Datasets

In [None]:
X_training, X_eval = train_test_split(X_train, test_size=0.1, random_state=42, shuffle=False)
X_training.shape, X_eval.shape

((707787, 3), (78644, 3))

# Train the NER Model by Finetuning the Pre-Trained Transformer Model

In [None]:
model.train_model(X_training, eval_data=X_eval)

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=32345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=2022.0, style=ProgressStyle(de…

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=2022.0, style=ProgressStyle(de…

Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fb1e41d0ef0>>Exception ignored in: 
Traceback (most recent call last):
<bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fb1e41d0ef0>>  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    
Traceback (most recent call last):
self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1075, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1101, in __del__
    
  File "/usr/lib/python3.6/multiprocessing/process.py", line 122, in join
self._shutdown_workers()    
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 1075, in _s

HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=2022.0, style=ProgressStyle(de…

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=2022.0, style=ProgressStyle(de…

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=2022.0, style=ProgressStyle(de…

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…





INFO:simpletransformers.ner.ner_model: Training of roberta model complete. Saved to outputs/.


# Evaluate Model Performance

In [None]:
# Evaluate the model
result, model_outputs, preds_list = model.eval_model(X_eval)

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=3606.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=451.0, style=ProgressStyle(descr…




INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.14793327322469543, 'precision': 0.8236533957845433, 'recall': 0.8302644003777149, 'f1_score': 0.8269456853985422}


In [None]:
result

{'eval_loss': 0.14793327322469543,
 'f1_score': 0.8269456853985422,
 'precision': 0.8236533957845433,
 'recall': 0.8302644003777149}

In [None]:
result, model_outputs, preds_list = model.eval_model(X_test)
result

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=12009.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=1502.0, style=ProgressStyle(desc…




INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13404305497032512, 'precision': 0.8370960914350218, 'recall': 0.8415490460316335, 'f1_score': 0.8393166625402327}


{'eval_loss': 0.13404305497032512,
 'f1_score': 0.8393166625402327,
 'precision': 0.8370960914350218,
 'recall': 0.8415490460316335}

# Prepare Test Data for Inference

In [None]:
X_test_data = X_test.groupby('sentence_id').apply(lambda x: x['words'].tolist()).values
y_test = X_test.groupby('sentence_id').apply(lambda x: x['labels'].tolist()).values

In [None]:
X_test_data

array([list(['questions', 'about', 'President', 'Bush', "'s", 'upcoming', 'trip', 'through', 'Europe', 'and', 'Russia', ',', 'in', 'honor', 'of', 'the', 'historic', 'moment', '.']),
       list(['U.S.', 'Assistant', 'Secretary', 'of', 'State', 'Daniel', 'Fried', 'and', 'Special', 'Assistant', 'to', 'the', 'President', 'Thomas', 'Graham', 'answered', 'questions', 'Wednesday', 'on', 'the', 'White', 'House', 'interactive', 'website', 'about', 'President', 'Bush', "'s", 'VE', 'day', 'visits', 'to', 'the', 'Netherlands', ',', 'Russia', 'and', 'Georgia', '.']),
       list(['Questions', 'were', 'taken', 'from', 'around', 'the', 'world', '.']),
       ...,
       list(['Two', 'more', 'landed', 'in', 'fields', 'belonging', 'to', 'a', 'nearby', 'village', '.']),
       list(['They', 'say', 'not', 'all', 'of', 'the', 'rockets', 'exploded', 'upon', 'impact', '.']),
       list(['Indian', 'forces', 'said', 'they', 'responded', 'to', 'the', 'attack'])],
      dtype=object)

In [None]:
y_test

array([list(['O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['B-org', 'O', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'B-per', 'I-per', 'I-per', 'O', 'O', 'B-tim', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'B-geo', 'O', 'B-geo', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), ...,
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])], dtype=object)

# Predict on Test Data

In [None]:
predictions = model.predict(X_test_data, split_on_space=False)

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=12009.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1502.0, style=ProgressStyle(desc…




In [None]:
pred_labels = predictions[0]

In [None]:
pred_labels[:2]

[[{'questions': 'O'},
  {'about': 'O'},
  {'President': 'B-per'},
  {'Bush': 'I-per'},
  {"'s": 'O'},
  {'upcoming': 'O'},
  {'trip': 'O'},
  {'through': 'O'},
  {'Europe': 'B-geo'},
  {'and': 'O'},
  {'Russia': 'B-geo'},
  {',': 'O'},
  {'in': 'O'},
  {'honor': 'O'},
  {'of': 'O'},
  {'the': 'O'},
  {'historic': 'O'},
  {'moment': 'O'},
  {'.': 'O'}],
 [{'U.S.': 'B-org'},
  {'Assistant': 'O'},
  {'Secretary': 'O'},
  {'of': 'O'},
  {'State': 'B-org'},
  {'Daniel': 'B-per'},
  {'Fried': 'I-per'},
  {'and': 'O'},
  {'Special': 'O'},
  {'Assistant': 'O'},
  {'to': 'O'},
  {'the': 'O'},
  {'President': 'B-per'},
  {'Thomas': 'I-per'},
  {'Graham': 'I-per'},
  {'answered': 'O'},
  {'questions': 'O'},
  {'Wednesday': 'B-tim'},
  {'on': 'O'},
  {'the': 'O'},
  {'White': 'B-org'},
  {'House': 'I-org'},
  {'interactive': 'O'},
  {'website': 'O'},
  {'about': 'O'},
  {'President': 'B-per'},
  {'Bush': 'I-per'},
  {"'s": 'O'},
  {'VE': 'O'},
  {'day': 'O'},
  {'visits': 'O'},
  {'to': 'O'},
  {'

# Get Predicted NER Tag Labels

In [None]:
pred_labels_flat = [[list(d.values())[0] for d in item] 
                      for item in pred_labels]
pred_labels_flat[:2]      

[['O',
  'O',
  'B-per',
  'I-per',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-org',
  'O',
  'O',
  'O',
  'B-org',
  'B-per',
  'I-per',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-per',
  'I-per',
  'I-per',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'B-org',
  'I-org',
  'O',
  'O',
  'O',
  'B-per',
  'I-per',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'I-geo',
  'O',
  'B-geo',
  'O',
  'B-geo',
  'O']]

In [None]:
labels_of_interest = ['B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art',
                      'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat']
labels_of_interest                      

['B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [None]:
X_test_data.shape

(12009,)

# Check Model Performance on Test Data

In [None]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.1MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [None]:
from sklearn_crfsuite import metrics as crf_metrics

print(crf_metrics.flat_classification_report(y_test, pred_labels_flat, labels=labels_of_interest))

              precision    recall  f1-score   support

       B-geo       0.88      0.89      0.89      9498
       B-gpe       0.96      0.96      0.96      4084
       B-per       0.83      0.84      0.84      4243
       I-geo       0.81      0.82      0.81      1838
       B-org       0.78      0.74      0.76      5077
       I-org       0.84      0.78      0.81      4243
       B-tim       0.93      0.89      0.91      5095
       B-art       0.21      0.18      0.19        90
       I-art       0.13      0.13      0.13        53
       I-per       0.86      0.90      0.88      4278
       I-gpe       0.86      0.80      0.83        55
       I-tim       0.87      0.77      0.82      1589
       B-nat       0.63      0.53      0.57        51
       B-eve       0.41      0.30      0.35        60
       I-eve       0.31      0.26      0.28        62
       I-nat       0.44      0.67      0.53        12

   micro avg       0.86      0.85      0.86     40328
   macro avg       0.67   

We have intentially left out the ___Others___ tag to understand the performance of model on the remaining tags. The above evaluation statistics showcase a model which seems to have learnt the transitions quite well giving us an overall F1-score of 86%!

We can achieve even better results by fine tuning the model with hyper-parameter tuning.

## Your Turn: End-to-End NER Tagger with trained NER Model

There is no fun (or value!) if we cannot use our model to tag new sentences in the future assuming we would want to put this model in production. Let's try and build an end-to-end workflow to perform NER Tagging on our sample document from earlier. First we perform NER tagging with SpaCy to remind you how it looks like.

### Prepare Sample Document

In [None]:
import re

text = """Three more countries have joined an “international grand committee” of parliaments, adding to calls for 
Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore 
bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 
November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief 
has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. 
Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. 
He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook 
to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly 
dealt with within Facebook.”
"""

text = re.sub(r'\n', '', text)
text

'Three more countries have joined an “international grand committee” of parliaments, adding to calls for Facebook’s boss, Mark Zuckerberg, to give evidence on misinformation to the coalition. Brazil, Latvia and Singapore bring the total to eight different parliaments across the world, with plans to send representatives to London on 27 November with the intention of hearing from Zuckerberg. Since the Cambridge Analytica scandal broke, the Facebook chief has only appeared in front of two legislatures: the American Senate and House of Representatives, and the European parliament. Facebook has consistently rebuffed attempts from others, including the UK and Canadian parliaments, to hear from Zuckerberg. He added that an article in the New York Times on Thursday, in which the paper alleged a pattern of behaviour from Facebook to “delay, deny and deflect” negative news stories, “raises further questions about how recent data breaches were allegedly dealt with within Facebook.”'

### NER Tagging with SpaCy

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load('en')
text_nlp = nlp(text)
displacy.render(text_nlp, style='ent', jupyter=True)

In [None]:
predictions = model.predict([text])

INFO:simpletransformers.ner.ner_model: Converting to features started.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Prediction', max=1.0, style=ProgressStyle(descrip…




In [None]:
tagged_tokens = [list(item.items())[0] for item in predictions[0][0]]
tagged_tokens

[('Three', 'O'),
 ('more', 'O'),
 ('countries', 'O'),
 ('have', 'O'),
 ('joined', 'O'),
 ('an', 'O'),
 ('“international', 'O'),
 ('grand', 'O'),
 ('committee”', 'O'),
 ('of', 'O'),
 ('parliaments,', 'O'),
 ('adding', 'O'),
 ('to', 'O'),
 ('calls', 'O'),
 ('for', 'O'),
 ('Facebook’s', 'B-art'),
 ('boss,', 'O'),
 ('Mark', 'B-per'),
 ('Zuckerberg,', 'I-per'),
 ('to', 'O'),
 ('give', 'O'),
 ('evidence', 'O'),
 ('on', 'O'),
 ('misinformation', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('coalition.', 'O'),
 ('Brazil,', 'B-geo'),
 ('Latvia', 'B-geo'),
 ('and', 'O'),
 ('Singapore', 'B-org'),
 ('bring', 'O'),
 ('the', 'O'),
 ('total', 'O'),
 ('to', 'O'),
 ('eight', 'O'),
 ('different', 'O'),
 ('parliaments', 'O'),
 ('across', 'O'),
 ('the', 'O'),
 ('world,', 'O'),
 ('with', 'O'),
 ('plans', 'O'),
 ('to', 'O'),
 ('send', 'O'),
 ('representatives', 'O'),
 ('to', 'O'),
 ('London', 'B-geo'),
 ('on', 'O'),
 ('27', 'B-tim'),
 ('November', 'I-tim'),
 ('with', 'O'),
 ('the', 'O'),
 ('intention', 'O'),
 ('of'

In [None]:
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for idx, (term, tag) in enumerate(tagged_tokens):
    if tag != 'O':
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    if tag == 'O' or idx == len(tagged_tokens)-1:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = ''
            temp_named_entity = None

In [None]:
import pandas as pd

pd.DataFrame(named_entities, columns=['Entity', 'Tag'])

Unnamed: 0,Entity,Tag
0,Facebook’s,B-art
1,"Mark Zuckerberg,",I-per
2,"Brazil, Latvia",B-geo
3,Singapore,B-org
4,London,B-geo
5,27 November,I-tim
6,Zuckerberg.,B-per
7,Cambridge Analytica,I-org
8,Facebook,B-org
9,"American Senate and House of Representatives,",I-org
