In [1]:
# import sys
# !{sys.executable} -m pip install spacy

## Reading text file

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Tab-separated BIO file: column 0 is the tag, column 1 is the token.
# Blank lines are kept (skip_blank_lines=False) because the preprocessing
# step below uses them as sentence separators.
filename = './restauranttrain.bio.txt'
df = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['label', 'text'],
    skip_blank_lines=False,
)
df.head(20)

Unnamed: 0,label,text
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside
5,I-Amenity,dining
6,,
7,O,34
8,,
9,B-Rating,5


## Preprocessing Data

In [5]:
def bio_rows_to_examples(rows):
    """Convert BIO-tagged (label, token) rows into spaCy-style training tuples.

    Args:
        rows: iterable of ``(label, text)`` pairs in file order. A pair in
            which neither value is a string (how the blank separator lines of
            the input show up in the DataFrame) marks the end of a sentence.

    Returns:
        list of ``(sentence, {"entities": [(start, end, label), ...]})``.
        ``sentence`` is the tokens joined by single spaces; ``start`` and
        ``end`` are character offsets into it, ``end`` being exclusive.
        Sentences with no tokens (e.g. two separators in a row) are skipped.

    Raises:
        ValueError: if exactly one of label/text is a string on a row, which
            means the row could not be parsed as a tag/token pair.
    """
    examples = []
    tokens = []        # tokens of the sentence currently being assembled
    spans = []         # (start, end, label) entities of that sentence
    offset = 0         # character offset at which the next token will start
    open_label = None  # label of the entity currently being extended, if any
    open_start = 0     # character offset where that entity began

    def close_entity():
        # `offset` already includes the space after the last token, so
        # offset - 1 is the exclusive end of the entity text.
        nonlocal open_label
        if open_label is not None:
            spans.append((open_start, offset - 1, open_label))
            open_label = None

    def flush_sentence():
        nonlocal tokens, spans, offset
        close_entity()
        if tokens:
            examples.append((' '.join(tokens), {"entities": spans}))
        tokens, spans, offset = [], [], 0

    for row_number, (label, text) in enumerate(rows):
        has_label = isinstance(label, str)
        has_text = isinstance(text, str)
        if not has_label and not has_text:
            # Sentence separator.
            flush_sentence()
            continue
        if not (has_label and has_text):
            raise ValueError(
                f"row {row_number}: expected a tag and a token, "
                f"got label={label!r}, text={text!r}"
            )

        tag = label[:1]
        if tag in ('O', 'B'):
            # Both an outside tag and the start of a new entity end the
            # entity that was open; 'I' tags simply extend it.
            close_entity()
        if tag == 'B':
            open_label = label[2:] if label.startswith('B-') else label
            open_start = offset

        tokens.append(text)
        offset += len(text) + 1

    # The file may not end with a separator line; without this final flush
    # the last sentence would be silently dropped.
    flush_sentence()
    return examples


train = bio_rows_to_examples(zip(df['label'], df['text']))
train[:10]
    

[('2 start restaurants with inside dining',
  {'entities': [(0, 7, 'Rating'), (25, 38, 'Amenity')]}),
 ('34', {'entities': []}),
 ('5 star resturants in my town',
  {'entities': [(0, 6, 'Rating'), (18, 28, 'Location')]}),
 ('98 hong kong restaurant reasonable prices',
  {'entities': [(3, 12, 'Restaurant_Name'), (24, 34, 'Price')]}),
 ('a great lunch spot but open till 2 a m passims kitchen',
  {'entities': [(23, 38, 'Hours'), (39, 54, 'Restaurant_Name')]}),
 ('a place that serves soft serve ice cream',
  {'entities': [(20, 40, 'Dish')]}),
 ('a restaurant that is good for groups',
  {'entities': [(21, 25, 'Rating'), (26, 36, 'Amenity')]}),
 ('a salad would make my day', {'entities': [(2, 7, 'Dish')]}),
 ('a smoothie would hit the spot', {'entities': [(2, 10, 'Cuisine')]}),
 ('a steak would be nice', {'entities': [(2, 7, 'Dish')]})]

## Model Training

In [6]:
import spacy
# spacy.cli.download("en_core_web_sm")

In [7]:
from spacy.tokens import DocBin

In [8]:
# Container that accumulates the annotated Doc objects (via db.add) so they
# can later be written to disk in one file (via db.to_disk).
db = DocBin() # create a DocBin object

In [11]:
# Only the tokenizer of this pipeline is used here (nlp.make_doc).
nlp = spacy.load("en_core_web_sm")

# Start from an empty DocBin each time this cell runs; otherwise re-running
# the cell would add every training document to `db` a second time.
db = DocBin()

for i, (text, annot) in enumerate(train):  # data in previous format
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:  # character offsets into text
        # "contract" shrinks the span to the tokens that lie fully inside
        # [start, end); char_span returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    if i < 10:
        # Preview the first few documents and their entity spans.
        print(doc, doc.ents)
    db.add(doc)

2 start restaurants with inside dining (2 start, inside dining)
34 ()
5 star resturants in my town (5 star, in my town)
98 hong kong restaurant reasonable prices (hong kong, reasonable)
a great lunch spot but open till 2 a m passims kitchen (open till 2 a m, passims kitchen)
a place that serves soft serve ice cream (soft serve ice cream,)
a restaurant that is good for groups (good, for groups)
a salad would make my day (salad,)
a smoothie would hit the spot (smoothie,)
a steak would be nice (steak,)


In [8]:
# Write every Doc collected in `db` to ./train.spacy, the file later passed
# to `spacy train` as --paths.train.
db.to_disk("./train.spacy")

In [11]:
# Expand base_config.cfg into a complete training config, saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Train the pipeline described by config.cfg and write the result under ./output.
# NOTE(review): --paths.dev points at the same file as --paths.train, so any
# dev scores reported during training are measured on the training data itself.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

^C


In [13]:
# Load the pipeline stored in ./output/model-last (the --output directory of
# the training command above).
ner = spacy.load(r"./output/model-last") #load the final model

In [30]:
# Run the trained pipeline on a sample sentence and collect the predictions
# as {entity text: entity label}.
doc = ner("This is an excellent 5 star fish and chips restaurant for fine dining and it is near to my house.")
entities = {ent.text: ent.label_ for ent in doc.ents}
entities

{'5 star': 'Rating',
 'fish and chips': 'Cuisine',
 'fine dining': 'Amenity',
 'near to my house.': 'Location'}

## Usage

In [32]:
# Run the standalone ner_predict.py script on the same sentence, passed as a
# single command-line argument.
!python ner_predict.py "This is an excellent 5 star fish and chips restaurant for fine dining and it is near to my house."

{   '5 star': 'Rating',
    'fine dining': 'Amenity',
    'fish and chips': 'Cuisine',
    'near to my house': 'Location'}
