# Build a custom NER model using the TASTEset.json data. The data are available on Google Drive.


In [1]:
import json
import spacy
from tqdm import tqdm
from spacy.tokens import DocBin
from spacy.util import filter_spans

In [3]:
# Load the annotated TASTEset records into `data`.
# The encoding is given explicitly: JSON is UTF-8 by specification, whereas
# open() without `encoding` falls back to the platform's locale encoding.
with open('/content/TASTEset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Inspect the first record to see the annotation layout: a two-item list of
# [text, {'entities': [[start_char, end_char, label], ...]}], where the
# offsets index into the text and the end offset is exclusive.
data[0]

['5 ounces rum\n4 ounces triple sec\n3 ounces Tia Maria\n20 ounces orange juice\n',
 {'entities': [[0, 1, 'QUANTITY'],
   [2, 8, 'UNIT'],
   [9, 12, 'FOOD'],
   [13, 14, 'QUANTITY'],
   [15, 21, 'UNIT'],
   [22, 32, 'FOOD'],
   [33, 34, 'QUANTITY'],
   [35, 41, 'UNIT'],
   [42, 51, 'FOOD'],
   [52, 54, 'QUANTITY'],
   [55, 61, 'UNIT'],
   [62, 74, 'FOOD']]}]

In [5]:
# Blank English pipeline; it is used below only to tokenize the raw text.
nlp = spacy.blank("en")

# Container that collects the annotated Doc objects for serialization.
doc_bin = DocBin()

In [6]:
# Convert every TASTEset record into a spaCy Doc carrying its gold entities,
# then serialize all of them to a single .spacy file for `spacy train`.

# Start from an empty DocBin inside this cell so that re-running the cell does
# not append the same documents a second time and duplicate the training data.
doc_bin = DocBin()
skipped = 0

for training in tqdm(data):
    text = training[0]
    labels = training[1]['entities']

    # Tokenize only; no pipeline components are run.
    doc = nlp.make_doc(text)

    ents = []
    for start, end, label in labels:
        # "contract" shrinks the character range to the tokens lying fully
        # inside it; char_span yields None when no span can be built.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")

        if span is None:
            # Report which annotation was dropped. tqdm.write keeps the
            # progress bar intact, unlike a bare print().
            skipped += 1
            tqdm.write(
                f"Skipping entity {label!r} at [{start}:{end}] "
                f"({text[start:end]!r}): offsets do not align with tokens"
            )
        else:
            ents.append(span)

    # doc.ents cannot hold overlapping spans, so overlaps are filtered out
    # before assignment.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents

    doc_bin.add(doc)

if skipped:
    print(f"Skipped {skipped} entity annotation(s) in total")

# Save the DocBin with the processed documents to disk
doc_bin.to_disk("customNER.spacy")

100%|██████████| 700/700 [00:00<00:00, 1505.36it/s]


In [7]:
# Expand base_config.cfg into a complete training config (config.cfg) with all
# remaining values auto-filled.
# base_config.cfg is read from the current working directory; no visible cell
# creates it, so it has to be supplied before this cell is run.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train the pipeline described by config.cfg and write the results to the
# current directory (--output ./).
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# ENTS_F / ENTS_P / ENTS_R values in the log are measured on the training data
# and do not show how well the model generalizes; a held-out dev split is
# needed for that.
!python -m spacy train config.cfg --output ./ --paths.train ./customNER.spacy --paths.dev ./customNER.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     61.87   19.04   15.13   25.65    0.19
  0     200        289.12   4656.80   90.25   90.74   89.76    0.90
  1     400        336.98   2266.59   94.11   94.61   93.61    0.94
  2     600        496.90   1995.32   96.04   95.80   96.29    0.96
  3     800        467.18   1865.83   97.35   97.42   97.28    0.97
  5    1000        523.26   1692.77   98.09   98.06   98.11    0.98
  7    1200        642.34   1601.69   98.87   98.91   98.83    0.99
 10    1400        661.42   1392.95   99.18   99.18   99.17    0.99
 13    1600        769.89   1321.08   99.52   99.48   99.55    1.00
 17    1800        876.47   1217.37   99.63   99.60

In [31]:
# Load the trained pipeline from the "model-best" directory, resolved relative
# to the working directory (the training command above used --output ./).
model_ner = spacy.load("model-best")

In [10]:
# Highlight colour for each entity label, handed to displaCy through `options`.
# NOTE(review): PART and TASTE are mapped to the same colour (#f92672).
colors = {
    "UNIT": "#F67DE3",
    "FOOD": "#7DF6D9",
    "QUANTITY": "#a6e22d",
    "COLOR": "#e6db74",
    "PART": "#f92672",
    "PHYSICAL_QUALITY": "#fd971f",
    "PROCESS": "#ae81ff",
    "TASTE": "#f92672",
}
options = {"colors": colors}

In [36]:
import pandas as pd

# Read the recipe spreadsheet, keep the rows at positions 1-4 and every column
# except the first, then give the two auto-named columns readable headers.
df = (
    pd.read_excel('/content/cooking.xlsx')
    .iloc[1:5, 1:]
    .rename(columns={"Unnamed: 1": "Tiltle", "Unnamed: 2": "Instructions"})
)

In [47]:
# Run the trained NER model over each recipe's instructions and render the
# recognised entities inline, printing a blank gap after every recipe.
for text in df["Instructions"]:
    doc = model_ner(text)
    spacy.displacy.render(doc, style="ent", options=options, jupyter=True)
    print('\n')

















In [39]:
# Preview the selected recipe rows (title and instructions columns).
df.head()

Unnamed: 0,Tiltle,Instructions
1,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ..."
2,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...
3,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...
4,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...
