# NER Modeling for Resume Parsing with spaCy

In [1]:
import json
import json5
import re

**Load dataset**

In [50]:
DATA_PATH = r'/kaggle/input/resume-entities-for-ner/Entity Recognition in Resumes.json'

# Each line of the file is parsed on its own as a separate JSON document.
# The handle is streamed line by line instead of materialising every line
# with readlines(), and blank lines (e.g. a trailing newline at the end of
# the file) are skipped because json5.loads rejects empty input.
file_data = []
with open(DATA_PATH,'r',encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        file_data.append(json5.loads(line))

In [51]:
# Sanity check on the load: how many records were parsed, then the first raw one.
record_count = len(file_data)
print("Number of records: ", record_count)
file_data[0]

Number of records:  220


{'content': "Abhishek Jha\nApplication Development Associate - Accenture\n\nBengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a\n\n• To work for an organization which provides me the opportunity to improve my skills\nand knowledge for my individual and company's growth in best possible ways.\n\nWilling to relocate to: Bangalore, Karnataka\n\nWORK EXPERIENCE\n\nApplication Development Associate\n\nAccenture -\n\nNovember 2017 to Present\n\nRole: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries\nfor the Bot which will be triggered based on given input. Also, Training the bot for different possible\nutterances (Both positive and negative), which will be given as\ninput by the user.\n\nEDUCATION\n\nB.E in Information science and engineering\n\nB.v.b college of engineering and technology -  Hubli, Karnataka\n\nAugust 2013 to June 2017\n\n12th in Mathematics\n\nWoodbine modern school\n\nApril 2011 to March 2013\n\n10th\n\nKendriya 

**Convert the raw annotations into the format expected by spaCy**

In [52]:
# Build one {'text': ..., 'entities': [(start, end, label), ...]} dict per record.
#
# The raw annotation dicts are only read, never modified, so this cell can be
# re-run safely. (Rewriting annot['label'] in place would make a second run
# index into the already-replaced string and keep only its first character.)
#
# NOTE(review): the 'end' offset is passed through unchanged. The conversion
# step later feeds it to doc.char_span, which treats the end as exclusive. If
# this annotation export stores *inclusive* end offsets, every span would be one
# character short -- confirm against the dataset before training.
final_data = []
for data in file_data:
    entities = []
    for annot in data['annotation']:
        labels = annot['label']
        # Only the first point of each annotation is used.
        points = annot['points'][0]
        point_text = points['text']

        if len(labels) != 0:
            # Several labels may be attached; keep the first one.
            label = labels[0]
        elif point_text == 'Oracle':
            # Unlabelled annotation whose text is 'Oracle' is treated as a skill.
            label = 'Skills'
        else:
            # Every other unlabelled annotation falls back to 'Designation'.
            label = 'Designation'

        # Shrink the span by however much whitespace pads the annotated text.
        leading_ws = len(point_text) - len(point_text.lstrip())
        trailing_ws = len(point_text) - len(point_text.rstrip())
        start_point = points['start'] + leading_ws
        end_point = points['end'] - trailing_ws

        entities.append((int(start_point), int(end_point), label))

    final_data.append({
        # Newlines are swapped for single spaces, so the text length (and with
        # it every character offset) is unchanged.
        'text': data['content'].replace('\n',' '),
        'entities': entities,
    })

In [53]:
# Inspect the first converted record: its text plus the (start, end, label) tuples.
final_data[0]

{'text': "Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILL

In [54]:
# Remove leading and trailing whitespace from entity spans

def trim_entity_spans(data):
    """Strip leading and trailing whitespace from every entity span.

    Spans are treated as ``(start, end, label)`` with an *exclusive* ``end``,
    i.e. the entity text is ``text[start:end]``.

    Args:
        data: iterable of dicts with a ``'text'`` string and an ``'entities'``
            list of ``(start, end, label)`` tuples.

    Returns:
        A new list of ``{'text': ..., 'entities': [...]}`` dicts with the span
        boundaries moved inwards past any whitespace. The input is not
        modified. A span that consists only of whitespace collapses to an
        empty span (``start == end``) rather than inverting.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for item in data:
        text = item['text']
        trimmed_entities = []
        for start, end, label in item['entities']:
            valid_start = start
            # Clamp so an end offset at or beyond the text length cannot
            # index out of range below.
            valid_end = min(end, len(text))
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # The last character *inside* an end-exclusive span is at
            # valid_end - 1. Testing text[valid_end] would look at the
            # character after the entity and chop a real character off
            # whenever the entity is followed by a space.
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            trimmed_entities.append((valid_start, valid_end, label))
        cleaned_data.append({'text': text, 'entities': trimmed_entities})
    return cleaned_data

In [55]:
# Apply the whitespace trimming to every converted record and show the first one.
# NOTE(review): the train/test split further down slices final_data, not
# cleaned_data, so this trimmed copy is currently unused -- confirm which one
# is meant to be written to the .spacy files.
cleaned_data = trim_entity_spans(final_data)
cleaned_data[0]

{'text': "Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILL

In [56]:
# Split data into train and test
# The first TRAIN_SIZE records are used for training and everything after them
# is held out. The test slice is open-ended so no record is silently dropped
# if the dataset grows beyond the size it had when this was written.
TRAIN_SIZE = 200

print("Number of records: ",len(final_data))
train = final_data[:TRAIN_SIZE]
test = final_data[TRAIN_SIZE:]

# Every record must land in exactly one of the two splits.
assert len(train) + len(test) == len(final_data)

print("\nTrain data length: ",len(train))
print("Test data length: ",len(test))

Number of records:  220

Train data length:  200
Test data length:  20


**Convert the data into spaCy binary (`.spacy`) files**

In [22]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
# Blank English pipeline; convert_data below only calls nlp.make_doc on it.
nlp = spacy.blank("en")

In [36]:
def convert_data(data,output_path):
    """Serialise annotated records to a spaCy DocBin file.

    Args:
        data: iterable of dicts with a 'text' string and an 'entities' list
            of (start, end, label) character-offset tuples.
        output_path: where the DocBin is written.

    Each entity is mapped to a token span with alignment_mode="contract".
    Offsets for which char_span returns None are skipped, and the remaining
    spans are passed through filter_spans before being set as doc.ents.
    """
    doc_bin = DocBin()
    for record in tqdm(data):
        doc = nlp.make_doc(record['text'])
        candidate_spans = [
            doc.char_span(begin, stop, label=tag, alignment_mode="contract")
            for begin, stop, tag in record['entities']
        ]
        # char_span yields None when no token span fits the offsets.
        usable_spans = [span for span in candidate_spans if span is not None]
        doc.ents = filter_spans(usable_spans)
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)

In [57]:
# Write the training split first, then the held-out split as the dev set.
for split_records, split_path in (
    (train, '/kaggle/working/train.spacy'),
    (test, '/kaggle/working/dev.spacy'),
):
    convert_data(split_records, split_path)

100%|██████████| 200/200 [00:01<00:00, 136.45it/s]
100%|██████████| 20/20 [00:00<00:00, 175.93it/s]
