**Building Custom Named Entity Recognition Model Using Spacy**

In [None]:
# Mount Google Drive so the dataset, config and model files referenced below
# under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import spacy

# Installing the required transformer model

In [None]:
# Download the pretrained English transformer pipeline that is loaded below.
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.3/236.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl (25 kB)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (731 kB)
[2K  

# Loading the spaCy model as `ner_model`

In [None]:
# Load the downloaded pipeline; the bare name on the last line displays the object.
ner_model = spacy.load("en_core_web_trf")
ner_model

<spacy.lang.en.English at 0x7eb144562200>

In [None]:
# Run the pretrained pipeline on a sample sentence.
# NOTE(review): "Donad" looks like a typo for "Donald"; left unchanged because
# the recorded outputs below were produced from this exact string.
doc = ner_model("Donad Trump was President of USA")

In [None]:
# Display the processed document; it renders as its original text.
doc

Donad Trump was President of USA

In [None]:
# Check the type of object the pipeline returns.
type(doc)

spacy.tokens.doc.Doc

In [None]:
# Named entities the pretrained model found in the sentence.
doc.ents

(Donad Trump, USA)

In [None]:
# Look at the first entity together with its type (a spaCy Span).
doc.ents[0], type(doc.ents[0])

(Donad Trump, spacy.tokens.span.Span)

In [None]:
# Render the document with its entities highlighted inline in the notebook.
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

# Loading custom data
Here we load a custom annotated dataset that we created for the finance domain.

In [None]:
import json

# Read the annotated finance examples from Drive into `data`.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform's default text encoding.
with open('/content/drive/MyDrive/PII_tag/financial_data8.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Inspect one example: the raw text plus its character-offset annotations.
data['examples'][100]

{'id': 'b75999c4-0f24-4af6-ace1-319d01dc878b',
 'content': 'Company 0.22% reported 6.31% in Schmidt Group.',
 'metadata': {},
 'annotations': [{'id': 'ce2d5773-c6e8-44e5-bfbc-1b02ab7040b3',
   'start': 8,
   'end': 13,
   'label': 'Percentage',
   'text': '0.22%'},
  {'id': '3d5c841c-9be2-418e-8e37-d29a24fe9ed6',
   'start': 23,
   'end': 28,
   'label': 'Percentage',
   'text': '6.31%'},
  {'id': '32f2d728-152b-46a6-b3a3-7f5d971b8874',
   'start': 32,
   'end': 45,
   'label': 'Organization',
   'text': 'Schmidt Group'}]}

In [None]:
# Fields available on each example.
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations'])

In [None]:
# Raw sentence of the first example.
data['examples'][0]['content']

'Company 3.47% reported 2023-02-18 in 2022-08-21.'

In [None]:
# Annotations of the first example: start/end offsets, label and covered text.
data['examples'][0]['annotations']

[{'id': 'd935770c-3460-4555-b547-3f2d2dd147c1',
  'start': 8,
  'end': 13,
  'label': 'Percentage',
  'text': '3.47%'},
 {'id': '25414d11-bb25-4da8-836d-279bd41977cb',
  'start': 23,
  'end': 33,
  'label': 'Date',
  'text': '2023-02-18'},
 {'id': 'a218f443-7800-4859-a95c-d6a433123355',
  'start': 37,
  'end': 47,
  'label': 'Date',
  'text': '2022-08-21'}]

# Preprocessing the data
Create the training data from the custom dataset.
Each training example contains:
*   the text (context)
*   the start and end character indices of each entity
*   the corresponding entity label (upper-cased)



In [None]:
def _to_training_example(example):
    """Convert one raw example into a dict with 'text' and 'entities'.

    'entities' is a list of (start, end, LABEL) tuples, one per annotation,
    with the label upper-cased.
    """
    return {
        'text': example['content'],
        'entities': [
            (ann['start'], ann['end'], ann['label'].upper())
            for ann in example['annotations']
        ],
    }


# One converted record per raw example, in the original order.
training_data = [_to_training_example(example) for example in data['examples']]

print(training_data[0])

{'text': 'Company 3.47% reported 2023-02-18 in 2022-08-21.', 'entities': [(8, 13, 'PERCENTAGE'), (23, 33, 'DATE'), (37, 47, 'DATE')]}


In [None]:
# Spot-check another converted example.
print(training_data[2])

{'text': 'New regulations like Palmer-Baker impact financial markets.', 'entities': [(21, 33, 'ORGANIZATION')]}


In [None]:
# Text of the first converted example.
training_data[0]['text']

'Company 3.47% reported 2023-02-18 in 2022-08-21.'

In [None]:
# (start, end, label) tuples of the first converted example.
training_data[0]['entities']

[(8, 13, 'PERCENTAGE'), (23, 33, 'DATE'), (37, 47, 'DATE')]

In [None]:
# Slice the text by character offsets. The end bound (53) is past the end of
# this string, so Python clamps it and returns everything from index 29 on.
training_data[0]['text'][29:53]

'2-18 in 2022-08-21.'

# Converting the training data into spaCy format
Save it to disk as `train_2k.spacy`.

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en")  # blank English pipeline; used below via nlp.make_doc
doc_bin = DocBin()  # container the annotated docs are added to before saving



In [None]:
from spacy.util import filter_spans

# Start from an empty DocBin each time this cell runs, so re-running it cannot
# append the same documents a second time to the corpus written below.
doc_bin = DocBin()
skipped_entities = 0

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be mapped
        # onto token boundaries; "contract" shrinks the span to the tokens that
        # lie fully inside [start, end) before giving up.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            skipped_entities += 1
            # Say exactly what was dropped so bad annotations can be traced.
            print(f"Skipping entity {label} [{start}:{end}] {text[start:end]!r} in {text!r}")
        else:
            ents.append(span)
    # doc.ents must not contain overlapping spans; filter_spans removes overlaps.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

if skipped_entities:
    print(f"Skipped {skipped_entities} entities in total")

doc_bin.to_disk("/content/drive/MyDrive/PII_tag/train_2k.spacy")

100%|██████████| 2000/2000 [00:00<00:00, 5131.26it/s]


# Creating the config file
The config file contains the hyperparameters
that need to be tuned to fine-tune the model.

In [None]:
# https://spacy.io/usage/training#quickstart

In [None]:
#!pip install spacy-transformers

# Fill the base configuration file out into a complete config file, from which training reads all the required parameters and hyperparameters

In [None]:
# Expand base_config.cfg into config.cfg with all remaining values auto-filled.
!python -m spacy init fill-config /content/drive/MyDrive/PII_tag/base_config.cfg /content/drive/MyDrive/PII_tag/config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/PII_tag/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Sanity-check the config and the corpora it points to before training.
!python -m spacy debug data /content/drive/MyDrive/PII_tag/config.cfg

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, ner
Components from other pipelines: ner, transformer
2000 training docs
2000 evaluation docs
[38;5;3m⚠ 1819 training examples also in evaluation data[0m
[1m
[38;5;4mℹ 23554 total word(s) in the data (1124 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 22 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;3m⚠ Some model labels are not present in the train data. The model
performance may be degraded for these labels after training: 'PRODUCT',
'PERSON', 'ORDINAL', 'FAC', 'TIME', 'PERCENT', 'ORG', 'EVENT', 'CARDINAL',
'WORK_OF_ART', 'LOC', 'NORP', 'LANGUAGE'.[0m
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence 

# Finetune the model with our custom dataset

In [None]:
!python -m spacy train /content/drive/MyDrive/PII_tag/config.cfg \
--output /content/drive/MyDrive/PII_tag \
    --paths.train /content/drive/MyDrive/PII_tag/train_2k.spacy \
    --paths.dev //content/drive/MyDrive/PII_tag/train_2k.spacy \
    --gpu-id 0
# --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

In [None]:
import spacy

# Load the best model saved by training (`model-best`); note that it was scored on the training dataset, which was also used as the dev set

In [None]:
# Load the `model-best` pipeline from the training output directory.
nlp_ner = spacy.load("/content/drive/MyDrive/PII_tag/model-best")

In [None]:
import pickle
from google.colab import files

In [None]:
# Serialize the fine-tuned pipeline object to a single pickle file on Drive.
# NOTE(review): a pickle can only be loaded safely from a trusted source and
# presumably needs matching library versions at load time; spaCy's own
# nlp.to_disk / spacy.load round-trip may be a sturdier option (confirm).
output_path = "/content/drive/MyDrive/PII_tag/nlp_ner_model.pkl"
with open(output_path, "wb") as f:
    pickle.dump(nlp_ner, f)

In [None]:
# Alternative sample sentences, kept for quick manual experiments:
#doc = nlp_ner(" The Williams Inc increased by $438,533.70 due to 2023-09-19")
#doc=nlp_ner("After securing a lucrative contract with Acme Corporation, the leading tech giant, they reported record profits of $1.5 million in the third quarter of 2023")
#doc_1=nlp_ner("On 2023-05-12, the European Union implemented a new regulation under the GDPR, affecting the trading of 1,000 shares of Bonds, as the Revenue from these investments increased by 8% to $5,000,000.")

# Run the fine-tuned pipeline on a sample sentence.
sample_text = "On 2023-05-12, the European Union implemented a new regulation under the GDPR. Earnings per Share (EPS)"
doc_2 = nlp_ner(sample_text)

# One highlight colour per custom entity label, passed to displaCy as options.
colors = dict(
    Organization="#F67DE3",
    Money="#7DF6D9",
    Date="#a6e22d",
    FinancialIndicator="#8A2BE2",
    FinancialProduct="#FFA07A",
    Percentage="#FF4500",
    GPE="#FF6347",
    Law="#DAA520",
    Quantity="#4682B4",
)
options = dict(colors=colors)

spacy.displacy.render(doc_2, style="ent", options=options, jupyter=True)