In [26]:
import json
import os

def load_training_data_from_file(json_file):
    """Load a single JSON annotated resume and return spaCy format"""
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    text = data["text"]
    entities = []
    for start, end, label in data["annotations"]:
        entities.append((start, end, "SKILL"))  # Keep only SKILL
    return [(text, {"entities": entities})]

# Folder containing all JSON annotated resumes
folder_path = "ResumesJsonAnnotated"

TRAIN_DATA = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        TRAIN_DATA.extend(load_training_data_from_file(file_path))



In [2]:
TRAIN_DATA[0]

('One97 Communications Limited \nData Scientist Jan 2019 to Till Date \nDetect important information from images and redact\nrequired fields. YOLO CNN Object-detection, OCR\nInsights, find anomaly or performance drop in all\npossible sub-space. \nPredict the Insurance claim probability. Estimate the\npremium amount to be charged\nB.Tech(Computer Science) from SGBAU university in\n2017. \nM.Tech (Computer Science Engineering) from Indian\nInstitute of Technology (IIT), Kanpur in 2019WORK EXPERIENCE\nEDUCATIONMACY WILLIAMS\nDATA SCIENTIST\nData Scientist working  on problems related to market research and customer analysis. I want to expand my arsenal of\napplication building and work on different kinds of problems. Looking for a role where I can work with a coordinative team\nand exchange knowledge during the process.\nJava, C++, Python, Machine Learning, Algorithms, Natural Language Processing, Deep Learning, Computer Vision, Pattern\nRecognition, Data Science, Data Analysis, Software 

In [3]:
import spacy
from spacy.training.example import Example
import random


In [4]:
# Start with a blank English pipeline
nlp = spacy.blank("en")

# Add NER pipe
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add the SKILL label
ner.add_label("SKILL")


1

In [5]:
def clean_annotations(train_data):
    cleaned = []
    for text, ann in train_data:
        entities = ann["entities"]
        # Sort by start index
        entities = sorted(entities, key=lambda x: (x[0], x[1]))
        non_overlapping = []
        prev_end = -1
        for start, end, label in entities:
            if start >= prev_end:  # only keep if it doesn't overlap
                non_overlapping.append((start, end, label))
                prev_end = end
        cleaned.append((text, {"entities": non_overlapping}))
    return cleaned

TRAIN_DATA = clean_annotations(TRAIN_DATA)


In [6]:
TRAIN_DATA = TRAIN_DATA[1:200]


In [7]:
# import re
# import unicodedata

# def clean_text(text):
#     # Remove surrogate characters (emojis, weird symbols)
#     text = re.sub(r'[\ud800-\udfff]', '', text)
#     # Normalize Unicode characters
#     text = unicodedata.normalize('NFKC', text)
#     # Optional: remove other non-printable characters
#     text = ''.join(c for c in text if c.isprintable())
#     return text

# # Apply to all training data
# TRAIN_DATA = [(clean_text(text), annotations) for text, annotations in TRAIN_DATA]


In [11]:
# Initialize weights and transitions properly
nlp.initialize(
    lambda: [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in TRAIN_DATA]
)

Mobi..." with entities "[(22, 28, 'SKILL'), (103, 106, 'SKILL'), (142, 153...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
 
Mob :+91 9745004628         Em..." with entities "[(13, 14, 'SKILL'), (48, 53, 'SKILL'), (65, 67, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
KottalilThoppil (H)
Chumathra P athul..." with entities "[(61, 66, 'SKILL'), (67, 70, 'SKILL'), (83, 85, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
External advisor
PhD Civil ..." with entities "[(19, 22, 'SKILL'), (32, 39, 'SKILL'), (50, 58, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will 

<thinc.optimizers.Optimizer at 0x17782edca40>

In [12]:
for epoch in range(50):
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print(f"Epoch {epoch} Losses: {losses}")


Mobi..." with entities "[(22, 28, 'SKILL'), (103, 106, 'SKILL'), (142, 153...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
 
Mob :+91 9745004628         Em..." with entities "[(13, 14, 'SKILL'), (48, 53, 'SKILL'), (65, 67, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
KottalilThoppil (H)
Chumathra P athul..." with entities "[(61, 66, 'SKILL'), (67, 70, 'SKILL'), (83, 85, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
External advisor
PhD Civil ..." with entities "[(19, 22, 'SKILL'), (32, 39, 'SKILL'), (50, 58, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will 

Epoch 0 Losses: {'ner': np.float32(29979.775)}
Epoch 1 Losses: {'ner': np.float32(21800.951)}
Epoch 2 Losses: {'ner': np.float32(17716.223)}
Epoch 3 Losses: {'ner': np.float32(15452.748)}
Epoch 4 Losses: {'ner': np.float32(14299.281)}
Epoch 5 Losses: {'ner': np.float32(12661.549)}
Epoch 6 Losses: {'ner': np.float32(10938.656)}
Epoch 7 Losses: {'ner': np.float32(9293.594)}
Epoch 8 Losses: {'ner': np.float32(8878.378)}
Epoch 9 Losses: {'ner': np.float32(9317.594)}
Epoch 10 Losses: {'ner': np.float32(8212.956)}
Epoch 11 Losses: {'ner': np.float32(6846.665)}
Epoch 12 Losses: {'ner': np.float32(6175.6597)}
Epoch 13 Losses: {'ner': np.float32(6014.437)}
Epoch 14 Losses: {'ner': np.float32(6052.439)}
Epoch 15 Losses: {'ner': np.float32(5245.294)}
Epoch 16 Losses: {'ner': np.float32(4867.8164)}
Epoch 17 Losses: {'ner': np.float32(4518.8335)}
Epoch 18 Losses: {'ner': np.float32(4631.9946)}
Epoch 19 Losses: {'ner': np.float32(4378.6606)}
Epoch 20 Losses: {'ner': np.float32(4203.624)}
Epoch 21 Lo

In [18]:
from spacy.training.example import Example
from spacy.scorer import Scorer
import random

# Split data
random.shuffle(TRAIN_DATA)
split = int(len(TRAIN_DATA) * 0.8)
train_data = TRAIN_DATA[:split]
valid_data = TRAIN_DATA[split:]

In [28]:
from spacy.training import offsets_to_biluo_tags, Example
misaligned = []
for text, entities in TRAIN_DATA:
    doc = nlp.make_doc(text)
    try:
        offsets_to_biluo_tags(doc, entities)
    except ValueError:
        misaligned.append((text, entities))
print(f"Number of misaligned entries: {len(misaligned)}")


UnicodeEncodeError: 'utf-8' codec can't encode character '\ud83d' in position 0: surrogates not allowed

In [52]:
from spacy.training.example import Example
from spacy.scorer import Scorer

# Create validation examples
valid_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in valid_data]

# Score all examples at once
scorer = Scorer()
scores = scorer.score(valid_examples)

print(f"Epoch {epoch} Validation scores:", scores)


Epoch 0 Validation scores: {'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}


In [22]:
for epoch in range(50):
    random.shuffle(train_data)
    losses = {}
    
    # Training
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.3, losses=losses)
    
    print(f"Epoch {epoch} Training Losses: {losses}")
    
    # Validation
    scorer = Scorer()
    for text, annotations in valid_data:
        doc = nlp(text)
        example = Example.from_dict(doc, annotations)
        scorer.score(example)
    
    print(f"Epoch {epoch} Validation scores:", scorer.scores)

Dinesh Madhavan Nair, 402, Family Pharmacy ..." with entities "[(0, 6, 'SKILL'), (41, 49, 'SKILL'), (50, 58, 'SKI...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
KULDEEP CHANDRAKANT  BHAGWAT
Geeta Narayan ..." with entities "[(0, 6, 'SKILL'), (82, 86, 'SKILL'), (108, 114, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
 D-Wing/301, Green Gagan Buildi..." with entities "[(32, 37, 'SKILL'), (44, 52, 'SKILL'), (120, 126, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Maven
<dependency>
    <..." with entities "[(26, 31, 'SKILL'), (33, 43, 'SKILL'), (226, 234, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the a

Epoch 0 Training Losses: {'ner': np.float32(14795.417)}


Practi..." with entities "[(23, 25, 'SKILL'), (30, 31, 'SKILL'), (44, 52, 'S...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


TypeError: [E978] The Tokenizer.score method takes a list of Example objects, but got: <class 'spacy.training.example.Example'>