<a target="_blank" href="https://colab.research.google.com/github/daviszars/nlp_cv_parser/blob/main/nlp_cv_parsing.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Darbam ar PDF dokumentiem tiek ieinstalēta speciāla bibliotēka.

CV NLP Model

In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5


In [None]:
import spacy
import random
import json
import re
import pandas as pd

from spacy.training import Example, offsets_to_biluo_tags
from spacy.tokens import DocBin, span
from spacy.util import minibatch, compounding

In [None]:
!python -m spacy download en_core_web_sm

Download our pre-trained models and data

In [None]:
!git clone https://github.com/daviszars/nlp_cv_parser.git

Cloning into 'nlp_cv_parser'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 32 (delta 4), reused 32 (delta 4), pack-reused 0[K
Receiving objects: 100% (32/32), 7.42 MiB | 18.10 MiB/s, done.
Resolving deltas: 100% (4/4), done.


NER model training / preprocessing data

In [None]:
def align_entities_with_tokens(nlp, text, entities):
    """Re-express character-offset entities in terms of token-aligned spans.

    Args:
        nlp: object exposing ``make_doc(text)``; the returned doc must offer
            ``char_span(start, end, alignment_mode=...)``.
        text: the raw text the offsets refer to.
        entities: iterable of ``(start, end, label)`` triples.

    Returns:
        A list of ``(start_char, end_char, label)`` tuples taken from the
        spans that ``char_span`` produced. Entities for which ``char_span``
        returned ``None`` are left out.
    """
    tokenized = nlp.make_doc(text)
    aligned = []
    for begin, finish, tag in entities:
        # "contract" is passed through to spaCy's Doc.char_span.
        token_span = tokenized.char_span(begin, finish, alignment_mode="contract")
        if token_span is None:
            continue
        aligned.append((token_span.start_char, token_span.end_char, tag))
    return aligned

def preprocess_training_data(train_data, nlp):
    """Keep only the training items that still have entities after alignment.

    Args:
        train_data: iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` holds the entity triples.
        nlp: pipeline handed on to ``align_entities_with_tokens``.

    Returns:
        A list of ``(text, {'entities': aligned_entities})`` tuples. An item
        whose aligned entity list comes back empty is reported with a printed
        message and omitted from the result.
    """
    kept = []
    for text, annotations in train_data:
        entities = annotations['entities']
        aligned_entities = align_entities_with_tokens(nlp, text, entities)
        if not aligned_entities:
            # Nothing survived alignment for this text: report it and move on.
            print(f"Skipping misaligned entity: {entities} in text: {text[:50]}")
            continue
        kept.append((text, {'entities': aligned_entities}))
    return kept

def train_model(train_data, iterations=10):
    """Train a blank English spaCy pipeline that contains an NER component.

    Args:
        train_data: re-iterable collection of ``(text, annotations)`` pairs
            where ``annotations['entities']`` is a list of
            ``(start, end, label)`` triples. It is walked once to register
            the labels and once more by ``preprocess_training_data``.
        iterations: number of passes over the preprocessed training data.

    Returns:
        The trained ``nlp`` pipeline object.

    Notes:
        A batch whose update raises is reported with ``print`` and skipped,
        so a single bad batch does not abort the run.
    """
    nlp = spacy.blank("en")  # blank English model

    # Bind `ner` on both paths so the label loop below can never hit an
    # unbound name, even if the pipeline already had the component.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the annotations with the NER pipe.
    for _, annotations in train_data:
        for ent in annotations['entities']:
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    train_data = preprocess_training_data(train_data, nlp)

    # select_pipes / initialize are the spaCy v3 names; disable_pipes and
    # begin_training are deprecated aliases that forward to them.
    with nlp.select_pipes(disable=other_pipes):  # only train the NER pipeline
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print(f"Iteration {itn + 1}/{iterations}")
            random.shuffle(train_data)
            losses = {}
            # A fresh compounding schedule is created for every pass.
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), ann)
                    for text, ann in batch
                ]
                try:
                    nlp.update(examples, drop=0.5, sgd=optimizer, losses=losses)
                except Exception as e:
                    # Best effort: report the failing batch and keep training.
                    print(f"Error updating with batch: {batch}")
                    print(e)
            print("Losses", losses)

    return nlp

Testing / Validating misalignments

In [None]:
'''file_path = "nlp_cv_parser/train_data.json"
with open(file_path, "r", encoding="utf-8") as file:
    aligned_train_data = json.load(file)

# Inspect the first few entries
for i, entry in enumerate(aligned_train_data[:5]):  # Display first 5 entries
    text, annotations = entry
    print(f"Entry {i + 1}:")
    print(f"Text: {text[:100]}...")  # Display only first 100 characters
    print(f"Annotations: {annotations}")
    print("\n")

def debug_alignment(data):
    nlp = spacy.blank("en")
    misaligned_entries = []
    for text, annotations in data:
        doc = nlp.make_doc(text)
        entities = annotations['entities']
        biluo_tags = offsets_to_biluo_tags(doc, entities)

        if '-' in biluo_tags:
            misaligned_entries.append({
                'text': text,
                'entities': entities,
                'tags': biluo_tags
            })

    return misaligned_entries

misaligned_entries = debug_alignment(aligned_train_data)
for entry in misaligned_entries:
    print(f"Text: {entry['text']}")
    print(f"Entities: {entry['entities']}")
    print(f"BILUO Tags: {entry['tags']}") # "-" is misaligned
    print("\n")'''

Training / saving model

In [None]:
# Load the cleaned data
file_path = "nlp_cv_parser/train_data.json" # for now the file has to be put into Colab manually; eventually it can be published on GitHub so that there is a download link
with open(file_path, 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Train the model
# (100 passes over the training data; the resulting pipeline is kept in `nlp`,
# which later cells use.)
nlp = train_model(train_data, iterations=100)

# Save the model
nlp.to_disk("nlp_cv_parser/ner_model_spacy")

Iteration 1/100
Losses {'ner': 21999.84887876154}
Iteration 2/100
Losses {'ner': 3730.1003680919875}
Iteration 3/100
Losses {'ner': 3546.363207352472}
Iteration 4/100
Losses {'ner': 3647.454556387455}
Iteration 5/100
Losses {'ner': 3504.937798037518}
Iteration 6/100
Losses {'ner': 3696.9023374014246}
Iteration 7/100
Losses {'ner': 2945.993383268496}
Iteration 8/100
Losses {'ner': 2792.722916814341}
Iteration 9/100
Losses {'ner': 2907.0753568093514}
Iteration 10/100
Losses {'ner': 2563.8826700847103}
Iteration 11/100
Losses {'ner': 2723.5998962630447}
Iteration 12/100
Losses {'ner': 3042.909618486358}
Iteration 13/100
Losses {'ner': 2560.241305627979}
Iteration 14/100
Losses {'ner': 2594.31771974024}
Iteration 15/100
Losses {'ner': 2526.9499360683217}
Iteration 16/100
Losses {'ner': 2445.042784533822}
Iteration 17/100
Losses {'ner': 2294.8190113108258}
Iteration 18/100
Losses {'ner': 2308.2591326358606}
Iteration 19/100
Losses {'ner': 2401.826136999996}
Iteration 20/100
Losses {'ner': 2

Convert data from PDFs

In [None]:
# Write the pipeline currently held in `nlp` to a local directory.
model_name = "./trained_nlp_41min"
nlp.to_disk(model_name)

To extract the trained model

In [None]:
!zip -r /content/file.zip /content/trained_nlp_41min

  adding: content/trained_nlp_41min/ (stored 0%)
  adding: content/trained_nlp_41min/config.cfg (deflated 59%)
  adding: content/trained_nlp_41min/ner/ (stored 0%)
  adding: content/trained_nlp_41min/ner/model (deflated 8%)
  adding: content/trained_nlp_41min/ner/moves (deflated 75%)
  adding: content/trained_nlp_41min/ner/cfg (deflated 33%)
  adding: content/trained_nlp_41min/vocab/ (stored 0%)
  adding: content/trained_nlp_41min/vocab/vectors.cfg (stored 0%)
  adding: content/trained_nlp_41min/vocab/key2row (stored 0%)
  adding: content/trained_nlp_41min/vocab/strings.json (deflated 75%)
  adding: content/trained_nlp_41min/vocab/lookups.bin (stored 0%)
  adding: content/trained_nlp_41min/vocab/vectors (deflated 45%)
  adding: content/trained_nlp_41min/meta.json (deflated 50%)
  adding: content/trained_nlp_41min/tokenizer (deflated 81%)


Usable both for creating more training data and for final testing.

In [None]:
import sys, fitz

fname = 'nlp_cv_parser/Berlin-Simple-Resume-Template.pdf'

# Open the PDF in a context manager so the document is closed once its text
# has been read, and join the per-page strings in one go instead of growing
# `text` with repeated concatenation.
with fitz.open(fname) as doc:
    text = "".join(
        doc.load_page(page_number).get_text("text")  # plain-text extraction
        for page_number in range(doc.page_count)
    )

print(text)

SEAN
PRICE
IT Consultant
DETAILS
ADDRESS
1515 Pacific Ave
Los Angeles, CA 90291
United States
PHONE
3868683442
EMAIL
email@email.com
PLACE OF BIRTH
San Antonio
DRIVING LICENSE
Full
LINKS
LinkedIn
Pinterest
Resume Templates
Build this template
HOBBIES
Angling, Sailing, Fly Fishing
LANGUAGES
English
French
PROFILE
Personable IT Consultant with 5+ years of experience in a global 
technology firm. CompTIA A+ Certification. Scored the region leading 
QST rating based on internal reviews (97.86%). I am seeking to leverage 
solid technical skills and abilities to advance my career as the next IT 
consultant for Linsang Group.
EMPLOYMENT HISTORY
IT Consultant , Amazon
Jacksonville
Jan 2020 — Jun 2021
Administered first-level MHE and PKMS support and under-provided 
SOPs to make appropriate corrections when necessary.
•
Researched and documented existing and new processes for IT 
Support Teams and interacted with business users and other IT 
groups to ascertain business requirements and design 

Testing NER model on test input data

In [None]:
nlp = spacy.load("nlp_cv_parser/ner_model_cv_spacy")

'''
test_input = """
Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year), Database Management System (Less than 1 year), Java (Less than 1 year)  ADDITIONAL INFORMATION  Technical Skills  https://www.indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a?isid=rex-download&ikw=download-top&co=IN   • Programming language: C, C++, Java • Oracle PeopleSoft • Internet Of Things • Machine Learning • Database Management System • Computer Networks • Operating System worked on: Linux, Windows, Mac  Non - Technical Skills  • Honest and Hard-Working • Tolerant and Flexible to Different Situations • Polite and Calm • Team-Player
"""
'''
# test_input = """
# Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, browsing with  Microsoft Edge and making purchases at the Xbox Store, the Windows Store and the Microsoft  Store. Plus, user can pick up bonus points for taking daily quizzes and tours on the Microsoft  rewards website. Rewards live dashboards gives a live picture of usage world-wide and by  markets like US, Canada, Australia, new user registration count, top/bottom performing rewards  offers, orders stats and weekly trends of user activities, orders and new user registrations. the  PBI tiles gets refreshed in different frequencies starting from 5 seconds to 30 minutes.  
Technology/Tools used    EDUCATION  Indian Institute of Technology – Mumbai  2001    SKILLS  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised junior developers throughout project lifecycle and provided technical assistance
# """
test_input = text
# Process the input string using the model
doc = nlp(test_input)

print("Entities in the test input:\n")
# Write with an explicit UTF-8 encoding, like the other file accesses in this
# notebook: the extracted CV text contains non-ASCII characters (e.g. "—",
# "•"), which a non-UTF-8 platform default could fail to encode.
with open("nlp_cv_parser/extracted_ner.txt", 'w', encoding='utf-8') as file:
    for ent in doc.ents:
        # Same line goes to the notebook output and to the file.
        line = f"{ent.text} ({ent.start_char}, {ent.end_char}): {ent.label_}"
        print(line)
        file.write(f"{line}\n")

Entities in the test input:

SEAN (0, 4): NAME
IT Consultant (11, 24): DESIGNATION
1515 Pacific Ave
Los Angeles, CA 90291
United States (41, 93): LOCATION
email@email.com (117, 132): EMAIL ADDRESS
English (291, 298): SKILLS
IT Consultant (325, 338): DESIGNATION
5+ years (344, 352): YEARS OF EXPERIENCE
CompTIA A+ Certification. (397, 422): DEGREE
technical skills (529, 545): SKILLS
IT Consultant (646, 659): DESIGNATION
Amazon (662, 668): COMPANIES WORKED AT
MHE (727, 730): SKILLS
PKMS (735, 739): SKILLS
SOPs (768, 772): SKILLS
IT Consultant (1402, 1415): DESIGNATION
PWC (1417, 1420): COMPANIES WORKED AT
program management (1561, 1579): SKILLS
Bachelor of Science in Information (2172, 2206): DEGREE
Miami University (2228, 2244): COLLEGE NAME
CCNA Routing and Switching (2530, 2556): SKILLS


In [None]:
# Walk the pipeline's vocabulary and keep the text of every entry whose
# `is_alpha` flag is set.
vocab = nlp.vocab
tokens = [lexeme.text for lexeme in vocab if lexeme.is_alpha]

print(tokens)

['nuthin', 'Kan', 'Mar', 'Development', 'it', 'e', 'is', 'college', 'Might', 'Nov', 'provides', 'Honest', 'Bengaluru', 'where', 'X', 'Non', 'Del', 'must', 'growth', 'Team', 'Ariz', 'had', 'Calif', 'does', 'Cos', 'Would', 'do', 'b', 'science', 'Indeed', 'might', 'Mac', 'Has', 'pm', 'ought', 'Dec', 'ä', 'these', 'Role', 'Tenn', 'Miss', 'Ga', 'Must', 'somethin', 'What', 'by', 'ü', 'r', 'an', 'When', 'Have', 'Computer', 'You', 'Polite', 'Database', 'language', 'O', 'co', 'doin', 'Networks', 'Mt', 'To', 'Less', 'to', 'user', 'Let', 'xDD', 'Prof', 'Also', 'Ai', 'Does', 'Minn', 'g', 'Jha', 'La', 'of', 'Mich', 'Not', 'Okla', 'w', 'cos', 'year', 'Va', 'would', 'as', 'Kendriya', 'Rep', 'Player', 'Was', 'Management', 'k', 'Email', 'XD', 'Things', 'Sha', 'C', 'Ought', 'Sep', 'd', 'will', 'Who', 'November', 'f', 'need', 'ways', 'c', 'June', 'm', 'cause', 'EDUCATION', 'you', 'was', 'F', 'in', 'y', 'Developing', 'Jr', 'Mont', 'Bangalore', 'input', 'Where', 'nt', 'Oracle', 'Working', 'Gen', 'given', '

In [None]:
# Read the `labels` attribute of the pipeline's "ner" component.
ner_labels = nlp.get_pipe("ner").labels

# Heading and labels on separate lines, as before.
print("NER Labels:", ner_labels, sep="\n")

NER Labels:
('COLLEGE NAME', 'COMPANIES WORKED AT', 'DEGREE', 'DESIGNATION', 'EMAIL ADDRESS', 'GRADUATION YEAR', 'LOCATION', 'NAME', 'SKILLS', 'UNKNOWN', 'YEARS OF EXPERIENCE')


In [None]:
def print_word_locations(text):
    """Print every whitespace-separated word of ``text`` with its position.

    One line is printed per word in the form ``<word> <start>-<end>``, where
    ``start`` is the index of the word's first character in ``text`` and
    ``end`` is the index of its last character (inclusive).
    """
    cursor = 0  # search resumes here so repeated words map to later matches
    for token in text.split():
        first = text.find(token, cursor)
        cursor = first + len(token)
        print(f"{token} {first}-{cursor - 1}")

print_word_locations(text)

SEAN 0-3
PRICE 5-9
IT 11-12
Consultant 14-23
DETAILS 25-31
ADDRESS 33-39
1515 41-44
Pacific 46-52
Ave 54-56
Los 58-60
Angeles, 62-69
CA 71-72
90291 74-78
United 80-85
States 87-92
PHONE 94-98
3868683442 100-109
EMAIL 111-115
email@email.com 117-131
PLACE 133-137
OF 139-140
BIRTH 142-146
San 148-150
Antonio 152-158
DRIVING 160-166
LICENSE 168-174
Full 176-179
LINKS 181-185
LinkedIn 187-194
Pinterest 196-204
Resume 206-211
Templates 213-221
Build 223-227
this 229-232
template 234-241
HOBBIES 243-249
Angling, 251-258
Sailing, 260-267
Fly 269-271
Fishing 273-279
LANGUAGES 281-289
English 291-297
French 299-304
PROFILE 306-312
Personable 314-323
IT 325-326
Consultant 328-337
with 339-342
5+ 344-345
years 347-351
of 353-354
experience 356-365
in 367-368
a 370-370
global 372-377
technology 380-389
firm. 391-395
CompTIA 397-403
A+ 405-406
Certification. 408-421
Scored 423-428
the 430-432
region 434-439
leading 441-447
QST 450-452
rating 454-459
based 461-465
on 467-468
internal 470-477
reviews

Convert the entity labels in the sample data set to uppercase so that the NER Annotator can be used.

Extra samples taken from: https://www.resumeviking.com/templates/
Tagged with: https://tecoholic.github.io/ner-annotator/

In [None]:
import json

def convert_tags_in_json(file_path):
    """Upper-case every entity label in a JSON annotation file, in place.

    The top-level JSON value must be a list. Each item is indexed as
    ``item[1]['entities']`` and element 2 of every entity (its label) is
    replaced by its upper-cased form. The result is written back to the same
    path as UTF-8 with 4-space indentation and non-ASCII characters kept.

    Any exception raised along the way is caught and reported with ``print``;
    nothing is re-raised and nothing is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            records = json.load(source)

        # Only the top-level container type is validated here.
        if not isinstance(records, list):
            raise ValueError("JSON file should contain a list of dictionaries")

        # The label sits at index 2 of each entity entry.
        for record in records:
            for entity in record[1]['entities']:
                entity[2] = entity[2].upper()

        # Overwrite the input file with the updated annotations.
        with open(file_path, 'w', encoding='utf-8') as target:
            json.dump(records, target, ensure_ascii=False, indent=4)

        print("Tags converted to uppercase successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")

file_path = 'nlp_cv_parser/train_data.json'
convert_tags_in_json(file_path)

Tags converted to uppercase successfully.


Job listing NLP

In [None]:
# Load the annotated job-listing training data.
file_path = "nlp_cv_parser/train_data_jobs.json"
with open(file_path, 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Train the model
nlp_job = train_model(train_data, iterations=100)

# Save the model under a job-specific directory. The CV training cell already
# writes to "nlp_cv_parser/ner_model_spacy", so saving there as well would
# overwrite the CV model; "ner_model_jobs_spacy" is the directory the jobs
# test cell loads from.
# NOTE(review): this replaces any model already stored in that directory
# (presumably the pre-trained one from the cloned repository) - confirm that
# this is wanted.
nlp_job.to_disk("nlp_cv_parser/ner_model_jobs_spacy")

Jobs NER model test

In [None]:
from sklearn.metrics import classification_report

# Load the job-listing NER pipeline that is evaluated below.
nlp = spacy.load("nlp_cv_parser/ner_model_jobs_spacy")

# Read the test set with an explicit UTF-8 encoding, matching how the
# training JSON files are opened elsewhere in this notebook.
with open("nlp_cv_parser/test_data_jobs.json", encoding="utf-8") as f:
    test_data = json.load(f)

def create_examples(data, nlp):
    """Turn annotated texts into a list of spaCy ``Example`` objects.

    Args:
        data: iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` is a sequence of
            ``(start, end, label)`` triples.
        nlp: pipeline whose ``make_doc`` builds the doc for each text.

    Returns:
        One ``Example`` per input pair, in input order.
    """
    def _build(text, annotations):
        doc = nlp.make_doc(text)
        # Normalise each entity to a 3-tuple before handing it to spaCy.
        gold_spans = [(start, end, label) for start, end, label in annotations['entities']]
        return Example.from_dict(doc, {"entities": gold_spans})

    return [_build(text, annotations) for text, annotations in data]

test_examples = create_examples(test_data, nlp)

def evaluate_model(nlp, examples):
    """Return the result of ``nlp.evaluate(examples)`` unchanged."""
    return nlp.evaluate(examples)

scorer = evaluate_model(nlp, test_examples)

# Overall entity-level scores as returned by the evaluation call.
for metric_name, score_key in (("Precision", "ents_p"), ("Recall", "ents_r"), ("F1-score", "ents_f")):
    print(f"{metric_name}: {scorer[score_key]}")

y_true = []
y_pred = []

# Pair gold and predicted labels by exact (start_char, end_char) position.
# A position present on only one side is paired with the placeholder 'O'.
for example in test_examples:
    gold_map = {(ent.start_char, ent.end_char): ent.label_ for ent in example.reference.ents}
    pred_map = {(ent.start_char, ent.end_char): ent.label_ for ent in nlp(example.text).ents}

    for pos in set(gold_map) | set(pred_map):
        y_true.append(gold_map.get(pos, 'O'))
        y_pred.append(pred_map.get(pos, 'O'))

print(classification_report(y_true, y_pred, zero_division=0))

Precision: 0.3425925925925926
Recall: 0.2813688212927757
F1-score: 0.30897703549060546
                  precision    recall  f1-score   support

      EXPERIENCE       0.37      0.39      0.38        36
               O       0.00      0.00      0.00       124
   QUALIFICATION       0.21      0.18      0.19        34
    REQUIREMENTS       1.00      0.02      0.04        46
RESPONSIBILITIES       0.42      0.38      0.40        97
          SKILLS       0.27      0.32      0.29        50

        accuracy                           0.19       387
       macro avg       0.38      0.21      0.22       387
    weighted avg       0.31      0.19      0.19       387



CV NER model test

In [None]:
# Load the CV NER pipeline that is evaluated below.
nlp = spacy.load("nlp_cv_parser/ner_model_cv_spacy")

# Read the test set with an explicit UTF-8 encoding, matching how the
# training JSON files are opened elsewhere in this notebook.
with open("nlp_cv_parser/test_data.json", encoding="utf-8") as f:
    test_data = json.load(f)

def create_examples(data, nlp):
    """Turn annotated texts into a list of spaCy ``Example`` objects.

    Args:
        data: iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` is a sequence of
            ``(start, end, label)`` triples.
        nlp: pipeline whose ``make_doc`` builds the doc for each text.

    Returns:
        One ``Example`` per input pair, in input order.
    """
    def _build(text, annotations):
        doc = nlp.make_doc(text)
        # Normalise each entity to a 3-tuple before handing it to spaCy.
        gold_spans = [(start, end, label) for start, end, label in annotations['entities']]
        return Example.from_dict(doc, {"entities": gold_spans})

    return [_build(text, annotations) for text, annotations in data]

test_examples = create_examples(test_data, nlp)

def evaluate_model(nlp, examples):
    """Return the result of ``nlp.evaluate(examples)`` unchanged."""
    return nlp.evaluate(examples)

scorer = evaluate_model(nlp, test_examples)

# Overall entity-level scores as returned by the evaluation call.
for metric_name, score_key in (("Precision", "ents_p"), ("Recall", "ents_r"), ("F1-score", "ents_f")):
    print(f"{metric_name}: {scorer[score_key]}")

y_true = []
y_pred = []

# Pair gold and predicted labels by exact (start_char, end_char) position.
# A position present on only one side is paired with the placeholder 'O'.
for example in test_examples:
    gold_map = {(ent.start_char, ent.end_char): ent.label_ for ent in example.reference.ents}
    pred_map = {(ent.start_char, ent.end_char): ent.label_ for ent in nlp(example.text).ents}

    for pos in set(gold_map) | set(pred_map):
        y_true.append(gold_map.get(pos, 'O'))
        y_pred.append(pred_map.get(pos, 'O'))

print(classification_report(y_true, y_pred, zero_division=0))



Precision: 0.9565217391304348
Recall: 0.8825214899713467
F1-score: 0.9180327868852459
                     precision    recall  f1-score   support

       COLLEGE NAME       0.45      0.88      0.60        26
COMPANIES WORKED AT       0.70      0.54      0.61        35
             DEGREE       0.67      0.90      0.77        31
        DESIGNATION       0.60      0.89      0.72        53
      EMAIL ADDRESS       0.75      1.00      0.86         6
    GRADUATION YEAR       0.94      0.89      0.91        18
           LOCATION       0.86      1.00      0.92        12
               NAME       0.40      1.00      0.57        12
                  O       0.00      0.00      0.00       118
             SKILLS       0.92      0.93      0.92       144
YEARS OF EXPERIENCE       0.85      0.92      0.88        12

           accuracy                           0.66       467
          macro avg       0.65      0.81      0.71       467
       weighted avg       0.57      0.66      0.60       4