***The resume data comes from: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset. Only 25 records were retrieved from it. The labels, which can be seen in the code below, are: Skill, University, Degree, Job Description, Duration, Position, Experience.***

In [1]:
import spacy
from spacy import displacy
import random
import json
import logging

### 1. Load the resume document annotated with doccano

In [38]:
# Path of the doccano export that holds the annotated resume.
filename = "resume.jsonl"

In [39]:
# Read with an explicit encoding: the annotations are character offsets into
# the text, so the file must decode the same way on every platform instead of
# depending on the locale's default encoding.
# NOTE(review): json.load parses the whole file as ONE JSON document, so this
# only works while the .jsonl export contains a single record -- confirm.
with open(filename, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [6]:
# Echo the parsed record so its annotations can be inspected.
data

{'id': 7,
 'label': [[58, 160, 'Experience'],
  [992, 1144, 'Experience'],
  [1345, 1525, 'Skill'],
  [1546, 1585, 'Skill'],
  [1655, 1681, 'Skill'],
  [1709, 1819, 'Skill'],
  [2119, 2142, 'Duration'],
  [2146, 2159, 'Position'],
  [2205, 2614, 'Job Description'],
  [2624, 2646, 'Duration'],
  [2651, 2661, 'Position'],
  [2706, 3580, 'Job Description'],
  [3589, 3611, 'Duration'],
  [3616, 3652, 'Position'],
  [3698, 4223, 'Job Description'],
  [4231, 4255, 'Duration'],
  [4258, 4306, 'Position'],
  [4352, 4868, 'Job Description'],
  [4876, 4898, 'Duration'],
  [4903, 4920, 'Position'],
  [4966, 5671, 'Job Description'],
  [5682, 5704, 'Duration'],
  [5709, 5726, 'Position'],
  [5772, 6040, 'Job Description'],
  [6052, 6074, 'Duration'],
  [6079, 6118, 'Position'],
  [6160, 6604, 'Job Description'],
  [6650, 6693, 'Degree'],
  [6696, 6715, 'University'],
  [6805, 6851, 'Experience'],
  [6926, 6974, 'Experience'],
  [7142, 7186, 'Experience'],
  [7284, 7367, 'Skill'],
  [7497, 7544, 'S

### 2. Convert the annotated JSON file to the format required by spaCy

In [8]:
# Drop the doccano record id. pop() with a default keeps this cell safe to
# re-run: `del data['id']` raises KeyError once the key is already gone.
data.pop('id', None)
data

 'label': [[58, 160, 'Experience'],
  [992, 1144, 'Experience'],
  [1345, 1525, 'Skill'],
  [1546, 1585, 'Skill'],
  [1655, 1681, 'Skill'],
  [1709, 1819, 'Skill'],
  [2119, 2142, 'Duration'],
  [2146, 2159, 'Position'],
  [2205, 2614, 'Job Description'],
  [2624, 2646, 'Duration'],
  [2651, 2661, 'Position'],
  [2706, 3580, 'Job Description'],
  [3589, 3611, 'Duration'],
  [3616, 3652, 'Position'],
  [3698, 4223, 'Job Description'],
  [4231, 4255, 'Duration'],
  [4258, 4306, 'Position'],
  [4352, 4868, 'Job Description'],
  [4876, 4898, 'Duration'],
  [4903, 4920, 'Position'],
  [4966, 5671, 'Job Description'],
  [5682, 5704, 'Duration'],
  [5709, 5726, 'Position'],
  [5772, 6040, 'Job Description'],
  [6052, 6074, 'Duration'],
  [6079, 6118, 'Position'],
  [6160, 6604, 'Job Description'],
  [6650, 6693, 'Degree'],
  [6696, 6715, 'University'],
  [6805, 6851, 'Experience'],
  [6926, 6974, 'Experience'],
  [7142, 7186, 'Experience'],
  [7284, 7367, 'Skill'],
  [7497, 7544, 'Skill'],
  

In [70]:
def convert_doccano_to_spacy(data, end_shift=1):
    """Convert one doccano sequence-labelling record to spaCy training format.

    Parameters
    ----------
    data : dict
        Record with a ``'text'`` entry and a ``'label'`` list whose items are
        ``[start, end, label]``. ``label`` may be a single name or a list of
        names; a list yields one entity tuple per name.
    end_shift : int, optional
        Value added to every end offset. Defaults to 1, which reproduces what
        this function has always emitted.
        NOTE(review): doccano exports appear to use end-exclusive offsets,
        the same convention spaCy expects; if that holds, pass ``end_shift=0``
        to avoid stretching every span by one character -- confirm.

    Returns
    -------
    list or None
        ``[(text, {"entities": [(start, end + end_shift, label), ...]})]``,
        or ``None`` when the record cannot be processed (the error is logged).
    """
    try:
        text = data['text']
        entities = []
        for each in data['label']:
            start, end = each[:2]
            label = each[2]
            # Always rebind `labels`: previously a list-valued label left it
            # unbound (first span) or stale from the preceding span.
            labels = label if isinstance(label, list) else [label]
            for name in labels:
                entities.append((start, end + end_shift, name))

        return [(text, {"entities": entities})]
    except Exception as e:
        logging.exception("Unable to process " + "\n" + "error = " + str(e))
        return None

In [71]:
# Build the training set from the loaded record. convert_doccano_to_spacy
# returns None when the conversion fails, which would break the cells below.
TRAIN_DATA = convert_doccano_to_spacy(data)
# Echo the converted data for inspection.
TRAIN_DATA

  {'entities': [(58, 161, 'Experience'),
    (992, 1145, 'Experience'),
    (1345, 1526, 'Skill'),
    (1546, 1586, 'Skill'),
    (1655, 1682, 'Skill'),
    (1709, 1820, 'Skill'),
    (2119, 2143, 'Duration'),
    (2146, 2160, 'Position'),
    (2205, 2615, 'Job Description'),
    (2624, 2647, 'Duration'),
    (2651, 2662, 'Position'),
    (2706, 3581, 'Job Description'),
    (3589, 3612, 'Duration'),
    (3616, 3653, 'Position'),
    (3698, 4224, 'Job Description'),
    (4231, 4256, 'Duration'),
    (4258, 4307, 'Position'),
    (4352, 4869, 'Job Description'),
    (4876, 4899, 'Duration'),
    (4903, 4921, 'Position'),
    (4966, 5672, 'Job Description'),
    (5682, 5705, 'Duration'),
    (5709, 5727, 'Position'),
    (5772, 6041, 'Job Description'),
    (6052, 6075, 'Duration'),
    (6079, 6119, 'Position'),
    (6160, 6605, 'Job Description'),
    (6650, 6694, 'Degree'),
    (6696, 6716, 'University'),
    (6805, 6852, 'Experience'),
    (6926, 6975, 'Experience'),
    (7142, 7187, 

### 3. NER model

##### *blank Language model*

In [65]:
# Create a blank 'en' pipeline to train from scratch.
nlp=spacy.blank('en')

##### *NER pipeline*

In [66]:
# Make sure the pipeline has an NER component and that `ner` refers to it.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    # The component already exists: fetch it so that `ner` is always bound
    # for the label-adding cell, instead of depending on leftover kernel state.
    ner = nlp.get_pipe('ner')

##### *Adding labels*

In [72]:
# Register every annotated label with the NER component, in annotation order.
entity_labels = (
    span[2]
    for _text, meta in TRAIN_DATA
    for span in meta.get('entities')
)
for entity_label in entity_labels:
    ner.add_label(entity_label)

#####  *Get names of other pipes to disable them during training*

In [73]:
# Every pipeline component except NER; these get disabled while training.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

##### *Train only NER*

In [74]:
# Sentences used to spot-check the model after each pass over the data.
probe_texts = (
    "02/2015   to   Current     HR Employee Relations Specialist.",
    "Dedicated and self-motivated professional with experience in providing outstanding support to business partners. Skillful in tracking details, office management, and following-up with internal and external partners to ensure ontime completion of projects.",
)

# Train with every pipe except NER switched off.
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(100):
        print("Statring iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            # One example per update: a batch of one text and one annotation
            # dict, with dropout to make memorising the data harder.
            nlp.update([text], [annotations], drop=0.1, sgd=optimizer, losses=losses)
        print(losses)

        # Show what the model currently predicts on the probe sentences.
        for probe in probe_texts:
            doc = nlp(probe)
            found = [str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]
            print("Entities= " + str(found))

Statring iteration 0
{'ner': 16648.330108642578}
Entities= []
Entities= []
Statring iteration 1
{'ner': 16464.135345458984}
Entities= []
Entities= []
Statring iteration 2
{'ner': 16316.135833740234}
Entities= []
Entities= []
Statring iteration 3
{'ner': 15739.951950073242}
Entities= []
Entities= []
Statring iteration 4
{'ner': 14372.240753173828}
Entities= []
Entities= []
Statring iteration 5
{'ner': 11048.793273925781}
Entities= []
Entities= []
Statring iteration 6
{'ner': 7682.323257446289}
Entities= []
Entities= []
Statring iteration 7
{'ner': 4871.496940612793}
Entities= []
Entities= []
Statring iteration 8
{'ner': 4606.156135559082}
Entities= []
Entities= []
Statring iteration 9
{'ner': 4988.917922973633}
Entities= []
Entities= []
Statring iteration 10
{'ner': 5246.429626464844}
Entities= []
Entities= []
Statring iteration 11
{'ner': 5113.501792907715}
Entities= []
Entities= []
Statring iteration 12
{'ner': 4952.172225952148}
Entities= []
Entities= []
Statring iteration 13
{'ner':

In [82]:
#do prediction
# Run the trained pipeline on two sample snippets and print "text_label"
# for every entity it finds.
sample_texts = (
    "02/2015   to   Current     HR Employee Relations Specialist.",
    "Dedicated and self-motivated professional with experience in providing outstanding support to business partners. Skillful in tracking details, office management, and following-up with internal and external partners to ensure ontime completion of projects.",
)
for sample_text in sample_texts:
    doc = nlp(sample_text)
    found = [str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]
    print("Entities= " + str(found))

Entities= ['HR Employee Relations Specialist_Job Description']
Entities= ['experience in providing outstanding support to business partners. Skillful in tracking details, office management, and following-up with internal and external partners to ensure ontime completion of projects._Experience']


### 4. Test

In [101]:
# Parse one more snippet; `doc` is kept for the visualisation cell below.
doc = nlp("Versatile  media professional with background in Communications, Marketing, Human Resources and Technology")
predicted = [str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]
print("Entities= " + str(predicted))

Entities= ['background in Communications, Marketing, Human Resources and Technology_Experience']


In [102]:
# Render the entities of the most recently parsed `doc` inline in the notebook.
displacy.render(doc, jupyter=True, style='ent')