In [1]:
# Load pre-existing spacy model
import spacy
nlp=spacy.load('en_core_web_sm')
ner=nlp.get_pipe("ner")
import warnings
warnings.filterwarnings('ignore')
from spacy.training.example import Example
# Import requirements
import random
from spacy.util import minibatch, compounding
import pandas as pd

In [2]:
# training data
TRAIN_DATA = [
              ("Hello Please reply with number 1 or 2 1. English 2. Hindi", {"entities": [(41, 48, "LANGUAGE")]}),
              ("Hello What is your name?", {"entities": [(19, 23, "NAME")]}),
              ("Drishti from CARE. What's your PIN?", {"entities": [(31,34, "PINCODE")]}),
              ("Which of the following items do you provide? Please reply with number 1,2,3 or 4 1. Oxygen Cylinder 2. Oxygen Refill 3. Medicines 4. If something else, please type", {"entities": [(36,43, "MATERIAL")]}),
              ("Hello, Drishti from CARE. Do you have Food available? Please reply with 1 or 2 1. Yes 2. No", {"entities": [(43,52, "AVAILABILITY")]}),
              ("By when will it be available? Please select 1,2,3 or 4 1. 3 hours 2. 6 hours 3. 12 hours 4. 24 hours 5. Don't know ", {"entities": [(3,12, "DURATION")]}),
              ]

In [3]:
# Adding labels to the `ner`
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [4]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [5]:
# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        for text, annotations in batch:
            # create Example
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            # Update the model
            nlp.update([example], losses=losses, drop=0.5)
            print("Losses", losses)

Losses {'ner': 4.820422617982642}
Losses {'ner': 13.128085206350988}
Losses {'ner': 15.128588331571109}
Losses {'ner': 18.41263308745338}
Losses {'ner': 20.53066187990704}
Losses {'ner': 24.699039681521896}
Losses {'ner': 4.244859072569852}
Losses {'ner': 9.436308432006413}
Losses {'ner': 12.079879939771988}
Losses {'ner': 14.031474892722231}
Losses {'ner': 16.014684215450096}
Losses {'ner': 19.526277906481326}
Losses {'ner': 5.454789087089646}
Losses {'ner': 8.447409395643078}
Losses {'ner': 11.047538102377453}
Losses {'ner': 13.01563707180001}
Losses {'ner': 15.370264769563244}
Losses {'ner': 17.15217788262388}
Losses {'ner': 2.0731761569633305}
Losses {'ner': 4.048235515979701}
Losses {'ner': 6.099339998432527}
Losses {'ner': 8.632360835194103}
Losses {'ner': 10.333331482400464}
Losses {'ner': 12.53401676758794}
Losses {'ner': 2.3020067573705774}
Losses {'ner': 3.9351470961962036}
Losses {'ner': 5.7478614317682855}
Losses {'ner': 7.088362689228347}
Losses {'ner': 10.69403581819114}


In [6]:
# Testing the model
doc = nlp("Hello What is your name?")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('name', 'NAME')]


In [7]:
# Testing the model
doc = nlp("Hello Please reply with number 1 or 2 1. English 2. Hindi")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('English', 'LANGUAGE')]


In [8]:
# Testing the model
doc = nlp("Drishti from CARE. What's your PIN?")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('PIN', 'PINCODE')]


In [9]:

doc = nlp("Which of the following items do you provide? Please reply with number 1,2,3 or 4 1. Oxygen Cylinder 2. Oxygen Refill 3. Medicines 4. If something else, please type")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('provide', 'MATERIAL')]


In [10]:
df = pd.read_csv('Data.csv')

In [11]:
lst_doc = []
for i in df['msg_txt']:
    lst_doc.append(i)

In [12]:
labels = []
for i in lst_doc:
    doc = nlp(i)
    for ent in doc.ents:
    # Print the entity text and its label
       labels.append(ent.label_)

In [13]:
data = [df["msg_txt"], pd.Series(labels)]

headers = ["msg_txt", "entity"]

df_new = pd.concat(data, axis=1, keys=headers)

In [14]:
df_new

Unnamed: 0,msg_txt,entity
0,Hello\nPlease reply with number 1 or 2\n1. Eng...,LANGUAGE
1,Hello\nWhat is your name?,NAME
2,Drishti from CARE. What's your PIN?,PINCODE
3,Which of the following items do you provide?\...,MATERIAL
4,"Hello, Drishti from CARE.\nDo you have Food av...",AVAILABILITY
5,"Hello, Drishti from CARE.\nDo you have Food av...",AVAILABILITY
6,By when will it be available?\nPlease select 1...,DURATION


In [15]:
df_new.to_csv('Data_new.csv')