# NER Resume Parser

## Import all the Dependencies

In [1]:
import json
import random
import spacy

## Split of data into train and test
A total of 94 resumes are used as data and split into training and testing sets; the data is stored in JSON Lines format.
1. train_data = 80 resumes (data used for training the model)
2. test_data = 14 resumes (data used for testing the accuracy of the trained model)

In [2]:
# Read the JSON-lines dataset: each line is one resume with its text under
# "data" and its annotated entities under "label" (three-element entries).
data = []
with open("data.jsonl", encoding="utf8") as jsonl_file:
    for raw_line in jsonl_file:
        record = json.loads(raw_line)
        resume_text = record["data"]
        # Keep only the first three fields of every annotation, as a tuple.
        spans = [(span[0], span[1], span[2]) for span in record["label"]]
        data.append((resume_text, {"entities": spans}))

# The first 80 records are used for training, the remainder for testing.
train_data, test_data = data[:80], data[80:]

## Creating Spacy Model
A blank spaCy model is created and its pipeline is defined; the labels in the `LABELS` list are then added to the NER component of the pipeline.

In [3]:
# Entity labels that are registered on the NER component below.
LABELS = ["name", "phone_no", "email", "linkedin", "github", "designation",
          "company", "job-duration", "Experience", "degree", "academic-institute",
          "databases", "tools", "core-skills", "soft-skills", "cloud-platforms",
          "Front End", "Back End", "Mobile App", "Libraries"]


# Start from a blank English pipeline and make sure it has an NER component.
model = spacy.blank('en')
if 'ner' not in model.pipe_names:
    ner = model.create_pipe('ner')
    model.add_pipe(ner, last=True)
else:
    # Without this branch `ner` would be unbound (NameError in the loop below)
    # whenever the pipeline already contains an NER component.
    ner = model.get_pipe('ner')


# Register every label with the NER component.
for label in LABELS:
    ner.add_label(label)

## Training the Spacy Model
The model created above is trained on the 80 training resumes; during training, all pipeline components other than the NER component created above are disabled.
Also, train_data is shuffled on every iteration so that the order of the resumes doesn't create an issue, although in most cases it will be the other way around.

In [52]:
# Every pipeline component except the NER is disabled for the duration of
# training, so only the NER is updated.
other_pipes = [name for name in model.pipe_names if name != 'ner']
with model.disable_pipes(*other_pipes):
    optimizer = model.begin_training()
    for itn in range(200):
        print("Statring iteration " + str(itn))
        # Visit the training resumes in a new random order on each pass.
        random.shuffle(train_data)
        losses = {}
        # One update call per resume; losses accumulate over the whole pass.
        for example in train_data:
            text, annotations = example
            model.update([text], [annotations], drop=0.3, sgd=optimizer, losses=losses)
        print(losses)

Statring iteration 0
{'ner': 9086.016344472797}
Statring iteration 1
{'ner': 5775.54421436147}
Statring iteration 2
{'ner': 5790.462567481561}
Statring iteration 3
{'ner': 5649.707963589194}
Statring iteration 4
{'ner': 4530.505186055407}
Statring iteration 5
{'ner': 4118.0038339086495}
Statring iteration 6
{'ner': 4319.213940221822}
Statring iteration 7
{'ner': 4377.778997759579}
Statring iteration 8
{'ner': 3738.5067388601824}
Statring iteration 9
{'ner': 3707.0684635037087}
Statring iteration 10
{'ner': 3529.437793201774}
Statring iteration 11
{'ner': 3399.4632947736563}
Statring iteration 12
{'ner': 3174.375069948829}
Statring iteration 13
{'ner': 3091.4008657450927}
Statring iteration 14
{'ner': 2946.735754441812}
Statring iteration 15
{'ner': 2808.616162615468}
Statring iteration 16
{'ner': 2671.855197548338}
Statring iteration 17
{'ner': 2740.0388042511067}
Statring iteration 18
{'ner': 2474.338745852204}
Statring iteration 19
{'ner': 2457.9215802250037}
Statring iteration 20
{'

{'ner': 405.05396767102155}
Statring iteration 167
{'ner': 361.86459688912413}
Statring iteration 168
{'ner': 340.7971179377868}
Statring iteration 169
{'ner': 348.37238743079007}
Statring iteration 170
{'ner': 313.46543275349825}
Statring iteration 171
{'ner': 368.9364160980823}
Statring iteration 172
{'ner': 298.50807290967737}
Statring iteration 173
{'ner': 347.40076645291754}
Statring iteration 174
{'ner': 372.67491239407843}
Statring iteration 175
{'ner': 291.54123632431987}
Statring iteration 176
{'ner': 335.36426562878364}
Statring iteration 177
{'ner': 381.93293677266803}
Statring iteration 178
{'ner': 312.7087802256321}
Statring iteration 179
{'ner': 285.12354443455854}
Statring iteration 180
{'ner': 389.3947435995446}
Statring iteration 181
{'ner': 389.08302324264076}
Statring iteration 182
{'ner': 348.7697888022164}
Statring iteration 183
{'ner': 344.5794787355446}
Statring iteration 184
{'ner': 354.331606467093}
Statring iteration 185
{'ner': 281.84807877029584}
Statring it

## Testing the Spacy Model
After training is complete, the model predicts the labels and their text for each resume in test_data and writes them to txt files.
In this case 14 txt files are produced, showing what the model predicted for each test resume.

In [53]:
# Run the trained model over every test resume and write its predictions,
# grouped by label, to resume<N>.txt (one file per resume, N starting at 0).
for resume_idx, (text, annot) in enumerate(test_data):
    doc_to_test = model(text)

    # label -> distinct entity texts predicted for that label
    predicted = {}
    for ent in doc_to_test.ents:
        predicted.setdefault(ent.label_, set()).add(ent.text)

    # `with` closes (and flushes) the file even if a write fails; the original
    # never closed the handles it opened.
    with open("resume" + str(resume_idx) + ".txt", "w+", encoding="utf8") as f:
        # Sorted so the file content is reproducible between runs
        # (iteration order of a set of strings is not).
        for label in sorted(predicted):
            f.write("\n\n")
            f.write(label + ":" + "\n")
            for value in sorted(predicted[label]):
                f.write(value.replace('\n', '') + "\n")

# Persist the trained pipeline to the "my_model" directory.
model.to_disk("my_model")

## Accuracy of Model
The code below lets you see the accuracy on each resume and on each label, which in turn helps to show where improvement is needed.

In [65]:
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit    

def check(ref:str, pred:str):
    """Tell whether the shorter of the two strings is contained in the longer.

    Args:
        ref: reference (annotated) entity text.
        pred: predicted entity text.

    Returns:
        True if ``pred`` is a substring of ``ref`` when ``ref`` is strictly
        longer, otherwise True if ``ref`` is a substring of ``pred``;
        False in every other case.
    """
    # When the lengths are equal, `ref` is the one searched for inside `pred`.
    needle, haystack = (pred, ref) if len(ref) > len(pred) else (ref, pred)
    return needle in haystack

# Reload the pipeline from the "my_model" directory written by
# model.to_disk("my_model"); a relative path keeps the notebook runnable on
# machines other than the one it was authored on.
nlp = spacy.load("my_model")

# Labels whose entities are compared by exact text equality instead of the
# substring test implemented by check().
EXACT_MATCH_LABELS = ("Front End", "Back End")

whole_true = 0  # annotated entities seen over the whole test set
whole_pred = 0  # annotated entities that were matched by a prediction
for text, annot in test_data:
    true_labels = 0
    pred_labels = 0
    doc_to_test = nlp(text)

    # Annotated entities as [text, label] pairs, de-duplicated by entity text
    # (when the same text occurs twice, the later label wins).
    true = [[k, v] for k, v in dict(
        (text[ent[0]:ent[1]], ent[2]) for ent in annot["entities"]).items()]
    print(true)

    # Predicted entities in the same shape, de-duplicated the same way.
    pred = [[k, v] for k, v in dict(
        (str(ent).strip("\n"), ent.label_) for ent in doc_to_test.ents).items()]
    print(pred)

    for i in LABELS:
        c_t = 0  # annotated entities carrying label i in this resume
        c_p = 0  # how many of those were matched by a prediction
        for true_text, true_label in true:
            if true_label != i:
                continue
            c_t += 1
            whole_true += 1
            true_labels += 1
            for pred_text, pred_label in pred:
                if i in EXACT_MATCH_LABELS:
                    # NOTE(review): the predicted label is not compared in this
                    # branch (unchanged from the original) - confirm intended.
                    matched = pred_text == true_text
                else:
                    matched = pred_label == i and check(true_text, pred_text)
                if matched:
                    c_p += 1
                    whole_pred += 1
                    pred_labels += 1
                    # Credit each annotated entity at most once; otherwise
                    # several overlapping predictions push the score past 100%.
                    break
        if c_t == 0:
            continue
        print(f"{i} :  {c_t}, pred: {c_p}")
        print(f"{i} : {(c_p/c_t)*100}\n")

    # Guard against resumes without any annotated entity (division by zero).
    if true_labels:
        print(f"Accuracy per resume: {(pred_labels/true_labels)*100}\n")
    else:
        print("Accuracy per resume: n/a (no annotated entities)\n")

# Same guard for an empty test set.
if whole_true:
    print(f"\nAccuracy on test set: {(whole_pred/whole_true)*100}\n")
else:
    print("\nAccuracy on test set: n/a (no annotated entities)\n")


[['problem solving', 'soft-skills'], ['leadership skill', 'soft-skills'], ['team management', 'soft-skills'], ['javascript', 'Front End'], ['angular', 'Front End'], ['html', 'Front End'], ['css', 'Front End'], ['bootstarp', 'Front End'], ['bachelor engineering', 'degree'], ['shri labhubhai trivedi\ninstitute engineering technology', 'academic-institute'], ['sql', 'Back End'], ['marketing assistant', 'designation'], ['intuitive problem solving', 'soft-skills'], ['creative thinking', 'soft-skills'], ['communication', 'soft-skills'], ['excellent team player', 'soft-skills'], ['shreyash\nzinzuvadia', 'name'], ['project management', 'core-skills'], ['google web desiner', 'core-skills'], ['google digital marketing', 'core-skills'], ['9408152104', 'phone_no'], ['8320361177', 'phone_no'], ['shreyashsoni95@gmail .com', 'email'], ['linkedin.com/in/shreyash-\nzinzuvadia', 'linkedin']]
[['red blue', 'name'], ['problem solving', 'soft-skills'], ['creative thinking', 'soft-skills'], ['team player', 

[['keval padsumbiya', 'name'], ['kppadsumbiya@gmail.com', 'email'], ['9825497726', 'phone_no'], ['github.com/kevalpadsumbiya', 'github'], ['data structures\n', 'core-skills'], ['python', 'Back End'], ['java', 'Back End'], ['c++', 'Back End'], ['c', 'Back End'], ['django', 'Libraries'], ['php', 'Back End'], ['learn new', 'soft-skills'], ['junior php developer(4 weeks', 'designation'], ['silverwing technologies pvt ltd', 'company'], ['web development', 'core-skills'], ['linkedin.com/in/keval-\npadsumbiya', 'linkedin'], ['data\nstructures\nalgorithms\n', 'core-skills'], ['btech information technology', 'degree'], ['image service', 'tools'], ['birla vishvakarma mahavidhyalaya engineering college', 'academic-institute'], ['software engineering', 'core-skills'], ['https://github.com/kevalpadsumbiya/problem-finder', 'github'], ['https://github.com/kevalpadsumbiya/share-images', 'github']]
[['keval padsumbiya', 'name'], ['kppadsumbiya@gmail.com', 'email'], ['9825497726', 'phone_no'], ['linkedi

[['vishal kumar', 'name'], ['vishalyo990@gmail.com', 'email'], ['python', 'Back End'], ['angular developer', 'Front End'], ['good interpersonal skills', 'soft-skills'], ['python developer', 'designation'], ['angular', 'Front End'], ['rest api', 'Back End'], ['flask', 'Libraries'], ['microservices architecture', 'tools'], ['azure cloud', 'cloud-platforms'], ['dronacharya college engineering gurgoan', 'academic-institute'], ['bachelor technology computer science engineering', 'degree'], ['docker', 'tools'], ['leadership team player', 'soft-skills'], ['6352282184', 'phone_no'], ['2 + years experience', 'Experience'], ['communication skills', 'soft-skills'], ['core competencies', 'core-skills'], ['backend development', 'core-skills'], ['automation tasks', 'core-skills'], ['frontend development', 'core-skills'], ['cloud deployment', 'core-skills'], ['stemmons business services pvt ltd', 'company'], ['jun 2019 present', 'job-duration'], ['selenium web testing', 'tools'], ['transpipe integrit