<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train NER Models

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!mkdir spacy_ner_data

## Spacy NER

In [3]:
# load json and convert into spacy format

import json
import random
from sklearn.model_selection import train_test_split
import spacy
import numpy as np
np.float_ = np.float64
from spacy.tokens import DocBin

# load JSON data from Drive
with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/1100_resumes_annotated.json', "r") as f:
    data = json.load(f)

# remove overlapped entities (one word has >1 entitiy)
def remove_overlapping_entities(entities):
    """Remove overlapping entities from the list."""
    # sort entities by start position
    entities = sorted(entities, key=lambda x: x[0])
    non_overlapping = []
    last_end = -1
    for start, end, label in entities:
        # only add to list if there's no overlap with the previous entity
        if start >= last_end:
            non_overlapping.append((start, end, label))
            last_end = end
    return non_overlapping

# convert JSON data to Spacy's DocBin format
def convert_to_spacy_format(data):
    # load a blank Spacy model
    nlp = spacy.blank("en")
    # container for our docs
    doc_bin = DocBin()

    for item in data:
        # full document text
        text = item['data']['Text']
        entities = []

        for annotation in item['annotations'][0]['result']:
            start = annotation['value']['start']
            end = annotation['value']['end']
            label = annotation['value']['labels'][0]
            entities.append((start, end, label))

        # remove overlapping entities
        entities = remove_overlapping_entities(entities)
        # create a Spacy doc and add entities to it
        doc = nlp.make_doc(text)
        spans = [doc.char_span(start, end, label=label) for start, end, label in entities]
        # filter out None spans if Spacy can't align the character indices with tokens
        spans = [span for span in spans if span is not None]
        # assign entities to the doc
        doc.ents = spans
        doc_bin.add(doc)

    return doc_bin

# split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# convert train and test sets to Spacy format
train_doc_bin = convert_to_spacy_format(train_data)
test_doc_bin = convert_to_spacy_format(test_data)

# save the train and test data to .spacy files in current runtime
train_doc_bin.to_disk("spacy_ner_data/train_data.spacy")
test_doc_bin.to_disk("spacy_ner_data/test_data.spacy")

In [4]:
# check the distribution of entitiy labels

import spacy
from spacy.lang.en import English
from spacy.tokens import DocBin

# count number of label entities
def count_entity_labels(file_path):
    # load data from file_path and create DocBin
    doc_bin = DocBin().from_disk(file_path)
    label_counts = {}
    for doc in doc_bin.get_docs(English().vocab):
        # count occurence of label
        for ent in doc.ents:
            label = ent.label_
            label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts

# calculate and print label distribution in train and test data
# sorted from largest to smallest
train_label_counts = count_entity_labels("spacy_ner_data/train_data.spacy")
sorted_train_label_counts = sorted(train_label_counts.items(), key=lambda x: x[1], reverse=True)
print("Train Data Entity Label Distribution:")
for label, count in sorted_train_label_counts:
    print(f"{label}: {count}")

test_label_counts = count_entity_labels("spacy_ner_data/test_data.spacy")
sorted_test_label_counts = sorted(test_label_counts.items(), key=lambda x: x[1], reverse=True)
print("\nTest Data Entity Label Distribution:")
for label, count in sorted_test_label_counts:
    print(f"{label}: {count}")

Train Data Entity Label Distribution:
SKILL: 15539
JOB: 4275
LOC: 2963
WORK PER: 2795
COMPANY: 2606
UNI: 1249
DEG: 1091
NAME: 883
STUDY PER: 852
PHONE: 816
EMAIL: 775

Test Data Entity Label Distribution:
SKILL: 3812
JOB: 1063
LOC: 778
WORK PER: 687
COMPANY: 640
UNI: 325
DEG: 282
NAME: 225
STUDY PER: 212
PHONE: 205
EMAIL: 189


In [5]:
# create base_config.cfg and paste the config generated from spacy widget
# need to update train and test file path
!touch base_config.cfg

In [6]:
# install transformer
!pip install git+https://github.com/explosion/spacy-transformers

Collecting git+https://github.com/explosion/spacy-transformers
  Cloning https://github.com/explosion/spacy-transformers to /tmp/pip-req-build-qhcpto2d
  Running command git clone --filter=blob:none --quiet https://github.com/explosion/spacy-transformers /tmp/pip-req-build-qhcpto2d
  Resolved https://github.com/explosion/spacy-transformers to commit 40ee09d9b2b2b18f77fc3715329ce080691b6af9
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers<4.42.0,>=3.4.0 (from spacy-transformers==1.3.6)
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers==1.3.6)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [7]:
!pip install -U spacy
!pip install "numpy<2"

Collecting spacy
  Downloading spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.2.0,>=1.1.0 (from thinc<8.4.0,>=8.3.0->spacy)
  Downloading blis-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.1/29.1 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading thinc-8.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading blis-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



In [8]:
# generate config.cfg from base_config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# vector used for spacy ner
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
# debugging and profiling data.
!python -m spacy debug data config.cfg

[1m
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 150kB/s]
config.json: 100% 481/481 [00:00<00:00, 2.70MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 4.17MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 2.13MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 3.13MB/s]
model.safetensors: 100% 499M/499M [00:02<00:00, 232MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, ner
880 training docs
221 evaluation docs
[38;5;3m⚠ 3 training examples also in evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (880)[0m
[1m
[38;5;4mℹ 435225 total word(s) in the data (28696 unique)[0m
[3

In [11]:
# debugging and profiling configs and implementations.
!python -m spacy debug config config.cfg

[1m
[1m
[1m
[38;5;2m✔ Config is valid[0m


In [12]:
# train model using hyperparameters set in config.cfg
# save trained model in spacy_output/ dir

# using cpu
# !python -m spacy train config.cfg --output ./spacy_output

# using gpu
!python -m spacy train config.cfg --gpu-id 0 --output ./spacy_output

# save output dir into drive
!cp -r ./spacy_output /content/drive/MyDrive/FYP/Implementation/

[38;5;2m✔ Created output directory: spacy_output[0m
[38;5;4mℹ Saving to output directory: spacy_output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        6591.26   1430.35    3.50    1.91   20.65    0.03
  0     200      192137.65  63564.72   33.94   45.36   27.11    0.34
  1     400       20729.10  27951.65   61.40   64.22   58.81    0.61
  2     600       11688.74  22539.74   63.49   57.04   71.60    0.63
  2     800       10495.

In [13]:
# evaluate trained model performance
# store output and visualization into result/ dir

# use cpu
# !python -m spacy evaluate spacy_output/model-best spacy_ner_data/test_data.spacy -dp spacy_output

# use gpu
!python -m spacy evaluate spacy_output/model-best spacy_ner_data/test_data.spacy --gpu-id 0 -dp spacy_output

[38;5;4mℹ Using GPU: 0[0m
  self._model.load_state_dict(torch.load(filelike, map_location=device))
[1m

TOK     100.00
NER P   74.98 
NER R   62.20 
NER F   68.00 
SPEED   6366  

[1m

                P       R       F
LOC         82.68   79.18   80.89
PHONE       97.57   98.05   97.81
NAME        91.63   92.44   92.04
JOB         72.59   70.74   71.65
SKILL       62.94   41.66   50.13
COMPANY     76.86   71.09   73.86
WORK PER    96.56   89.81   93.06
UNI         68.24   71.38   69.77
STUDY PER   78.76   71.70   75.06
DEG         79.93   81.91   80.91
EMAIL       95.34   97.35   96.34

<IPython.core.display.HTML object>
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/l

In [14]:
# make prediction

import spacy_transformers
import spacy
import numpy as np
np.float_ = np.float64
import string
from spacy import displacy

resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''

# convert text into small letter then remove punctuation
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# load trained model
nlp = spacy.load("/content/drive/MyDrive/FYP/Implementation/spacy_output/model-best")

# create a Spacy doc and add text to it
doc = nlp(resume_text.lower())

# print predicted entities in text
# for ent in doc.ents:
#     print(f"{ent.text}: {ent.label_}")

# visualize predicted entities using displacy
colors = {
    "NAME": "lightblue",
    "LOC": "yellow",
    "PHONE": "pink",
    "EMAIL": "lightgreen",
    "JOB": "orange",
    "SKILL": "aqua",
    "COMPANY": "violet",
    "WORK PER": "salmon",
    "DEG": "lightcoral",
    "UNI": "lightgrey",
    "STUDY PER": "peachpuff",
}
options = {"ents": list(colors.keys()), "colors": colors}
displacy.render(doc, style="ent", jupyter=True, options=options)

  self._model.load_state_dict(torch.load(filelike, map_location=device))
  # NB: Previously this was torch.cuda.amp.autocast, passing a boolean


In [15]:
# make prediction

import spacy
import string
from spacy import displacy

resume_text = '''
Zi Qing Chew
chewziqing@gmail.com | 016-2892475 | Kuala Lumpur, Malaysia | linkedin.com/in/ziqingchew | github.com/chewzzz1014
EDUCATION

Universiti Putra Malaysia					                                                   Oct 2021 - Current
Bachelor in Computer Science with Honours
Expected to graduate in July 2025. CGPA: 3.99

WORK EXPERIENCE

Ant International 									          	July 2024 – Oct 2024
Java Engineer Intern							                               Kuala Lumpur, Malaysia
Collaborated in developing an audit logging feature for Ant Group’s internal Foreign Exchange (FX) trade strategy system that records changes made by business users to trade strategies.
Conducted comprehensive system analysis and project planning, delivering presentations to project stakeholders and QA teams prior to the development phase.
Utilised Ant Group’s internal frameworks, middleware, and tools to implement the audit logging feature.
Skills: Java, Spring, Sofaboot, Ant Group internal middlewares (ZDAL, DRM, Ant Scheduler, Msg Broker)
Howuku  									          	             Feb 2023 – Sep 2023
Software Developer Intern							                    Kuala Lumpur, Malaysia
Developed and optimized A/B testing features, including code editor and previewer for CSS and JavaScript modifications for experiment variations.
Expanded A/B testing targeting rule by incorporating website visitor's OS, device, and browser rules.
Automated experiment-stopping criteria and email notifications based on user-defined experiment termination conditions.
Collaborated with cross-functional teams to debug, troubleshoot, and enhance Howuku platform features based on user feedback and performance data.
Skills: JavaScript, Bootstrap, Vue.js, Express.js, MySQL

PROJECTS

Personal Portfolio Website (chewzzz1014.github.io/portfolio-website)
Designed, developed and deployed personalised portfolio website featuring skills, selected projects, and downloadable resume.
Skills: JavaScript, React.js, CSS, Bootstrap
Depression Level Detection Chatbot (https://github.com/chewzzz1014/health-ease-project)
Developed machine learning application that evaluates a message's depression level and provided tailored mental health advice and information based on the depression severity.
Skills: Python, pandas, scikit-learn, Keras, FastAPI, Gradio
Clothing Store Website (https://github.com/chewzzz1014/CSC3402-MVC-Project)
Worked in team to build a CRUD Spring Boot application with attractive interfaces, data persistence, authentication and authorisation.
Developed the backend of the application that involves querying the database, building REST endpoints and implementing Thymeleaf in HTML for dynamic contents.
Skills: Spring Boot, Spring MVC, Thymeleaf, Hibernate, Bootstrap

SKILLS
Programming Languages: Java, Python, HTML, CSS, JavaScript, MySQL, OracleSQL
Frameworks and Libraries: Spring, Spring Boot, TypeScript, Node.js, Express.js, React.js, Vue.js, Bootstrap, Tailwind CSS
Tools: Git, Github, Jira, Tableau, Excel, Jupyter Notebook, Google Colab, VSCode, IntelliJ
'''

# convert text into small letter then remove punctuation
# resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# create a Spacy doc and add text to it
doc = nlp(resume_text)

# load trained model
nlp = spacy.load("/content/drive/MyDrive/FYP/Implementation/spacy_output/model-best")

colors = {
    "NAME": "lightblue",
    "LOC": "yellow",
    "PHONE": "pink",
    "EMAIL": "lightgreen",
    "JOB": "orange",
    "SKILL": "aqua",
    "COMPANY": "violet",
    "WORK PER": "salmon",
    "DEG": "lightcoral",
    "UNI": "lightgrey",
    "STUDY PER": "peachpuff",
}
options = {"ents": list(colors.keys()), "colors": colors}
displacy.render(doc, style="ent", jupyter=True, options=options)

  # NB: Previously this was torch.cuda.amp.autocast, passing a boolean
  self._model.load_state_dict(torch.load(filelike, map_location=device))


## Flair NER

In [None]:
# install flair library
!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.79-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [None]:
# convert json into flair data

import json
import random
from typing import List, Dict, Tuple
import spacy
from collections import defaultdict

class NERConverter:
    def __init__(self):
        # load pretrained model from Spacy library
        # to create Spacy Doc object
        self.nlp = spacy.load("en_core_web_sm")

    # get BIOES label based on location of word
    def get_bioes_label(self, token_index: int, entity_length: int, current_position: int, label: str) -> str:
        """
        Convert to BIOES format
        - S-: Single token entity
        - B-: Beginning of multi-token entity
        - I-: Inside of multi-token entity
        - E-: End of multi-token entity
        - O: Outside
        """
        if entity_length == 1:
            return f'S-{label}'
        if current_position == 0:
            return f'B-{label}'
        if current_position == entity_length - 1:
            return f'E-{label}'
        return f'I-{label}'

    # convert Label Studio's exported annotations in json format intto BIOES format
    def convert_to_bioes_format(self, json_data: List[dict]) -> List[List[Tuple[str, str]]]:
        """Convert JSON annotations to BIOES format."""
        all_sentences = []

        # process all annotation in json file
        for item in json_data:
            text = item['data']['Text']
            doc = self.nlp(text)

            # initialize character-level labels
            char_labels = ['O'] * len(text)

            # first pass: identify entity boundaries and lengths
            entity_spans = []
            if item['annotations'] and len(item['annotations']) > 0:
                for ann in item['annotations'][0]['result']:
                    if 'value' in ann:
                        start = ann['value']['start']
                        end = ann['value']['end']
                        label = ann['value']['labels'][0]
                        entity_spans.append((start, end, label))

            # sort spans by start position
            entity_spans.sort(key=lambda x: x[0])

            # second pass: apply BIOES labels
            for start, end, label in entity_spans:
                # get tokens that are part of this entity
                entity_text = text[start:end]
                entity_doc = self.nlp(entity_text)
                entity_length = len([token for token in entity_doc if not token.is_space])

                # set labels for the entire span
                current_token_idx = 0
                for i in range(start, end):
                    if i == start or text[i-1].isspace():
                        char_labels[i] = self.get_bioes_label(i, entity_length, current_token_idx, label)
                        current_token_idx += 1
                    else:
                        char_labels[i] = char_labels[i-1]

            # convert to token-level labels
            current_sentence = []
            for sent in doc.sents:
                for token in sent:
                    # get the most common label for the token's characters
                    token_chars_labels = char_labels[token.idx:token.idx + len(token.text)]
                    label_counts = defaultdict(int)
                    for char_label in token_chars_labels:
                        label_counts[char_label] += 1

                    token_label = max(label_counts.items(), key=lambda x: x[1])[0]
                    current_sentence.append((token.text, token_label))

                if current_sentence:
                    all_sentences.append(current_sentence)
                    current_sentence = []

        return all_sentences

    # write data in BIOES format into txt file
    def write_flair_file(self, sentences: List[List[Tuple[str, str]]], filename: str):
        """Write sentences in BIOES format to file."""
        with open(filename, 'w', encoding='utf-8') as f:
            for sentence in sentences:
                for token, label in sentence:
                    f.write(f'{token} {label}\n')
                f.write('\n')

    # convert json data into BIOES data
    # split BIOES data into train and test
    def convert_and_split(self, json_data: List[dict], train_file: str, test_file: str, test_ratio: float = 0.2):
        """Convert JSON to BIOES format and split into train/test sets."""
        all_sentences = self.convert_to_bioes_format(json_data)

        # shuffle and split based on test_ratio
        random.shuffle(all_sentences)
        split_idx = int(len(all_sentences) * (1 - test_ratio))

        # use list slicing to split
        train_sentences = all_sentences[:split_idx]
        test_sentences = all_sentences[split_idx:]

        # write to txt files
        self.write_flair_file(train_sentences, train_file)
        self.write_flair_file(test_sentences, test_file)

        return len(train_sentences), len(test_sentences)


# load JSON data
with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/1100_resumes_annotated.json', 'r', encoding='utf-8') as f:
  json_data = json.load(f)
# with open('/content/1100_resumes_annotated.json', 'r', encoding='utf-8') as f:
#   json_data = json.load(f)

# load self-defined convert class
converter = NERConverter()

# convert json data into BIOES data and split into train and test
train_count, test_count = converter.convert_and_split(
    json_data,
    train_file='flair_train.txt',
    test_file='flair_test.txt',
    test_ratio=0.2
)
print(f'Created {train_count} training sentences and {test_count} test sentences')

Created 1585 training sentences and 397 test sentences


In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns for CoNLL (0: word, 1: label)
columns = {0: 'text', 1: 'ner'}

# set data folder and train and test path
data_folder = './'
train_file = 'flair_train.txt'
test_file = 'flair_test.txt'

# load the corpus
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file=train_file,
                              test_file=test_file,
                              dev_file=None)

2024-12-12 08:49:56,886 Reading data from .
2024-12-12 08:49:56,887 Train: flair_train.txt
2024-12-12 08:49:56,888 Dev: None
2024-12-12 08:49:56,889 Test: flair_test.txt
2024-12-12 08:50:03,698 No dev split found. Using 10% (i.e. 158 samples) of the train split as dev data


In [None]:
# generate a dictionary of unique labels from the NER corpus.
# this dictionary maps each named entity label in the dataset to an integer ID.
tag_dictionary = corpus.make_label_dictionary(label_type='ner')
print("Labels:", tag_dictionary.get_items())

2024-12-12 08:50:07,914 Computing label dictionary. Progress:


0it [00:00, ?it/s]
1427it [00:00, 15331.24it/s]

2024-12-12 08:50:08,055 Dictionary created for label 'ner' with 11 values: SKILL (seen 13598 times), JOB (seen 4002 times), LOC (seen 2659 times), WORK (seen 2596 times), COMPANY (seen 2355 times), UNI (seen 1175 times), DEG (seen 993 times), NAME (seen 788 times), STUDY (seen 778 times), PHONE (seen 752 times), EMAIL (seen 693 times)
Labels: ['SKILL', 'JOB', 'LOC', 'WORK', 'COMPANY', 'UNI', 'DEG', 'NAME', 'STUDY', 'PHONE', 'EMAIL']





In [None]:
from collections import Counter

# count frequency of each entity label
def count_labels(file_path):
    with open(file_path, 'r') as file:
        labels = [line.split()[-1] for line in file if line.strip()]
    return Counter(labels)

# number of
print("Train label distribution:", count_labels('flair_train.txt'))
print("Test label distribution:", count_labels('flair_test.txt'))

Train label distribution: Counter({'O': 376460, 'S-SKILL': 7652, 'B-SKILL': 7475, 'E-SKILL': 7475, 'PER': 6842, 'E-JOB': 3911, 'B-JOB': 3857, 'I-JOB': 2108, 'B-COMPANY': 2064, 'E-COMPANY': 2063, 'S-LOC': 1937, 'I-DEG': 1852, 'I-SKILL': 1473, 'I-COMPANY': 1419, 'E-UNI': 1250, 'B-UNI': 1247, 'I-UNI': 1148, 'E-DEG': 1071, 'B-DEG': 1070, 'E-LOC': 964, 'B-LOC': 962, 'E-NAME': 870, 'B-NAME': 868, 'S-EMAIL': 754, 'E-PHONE': 674, 'B-PHONE': 665, 'S-COMPANY': 529, 'S-JOB': 510, 'I-PHONE': 472, 'S-PHONE': 139, 'I-NAME': 33, 'I-LOC': 32, 'S-UNI': 31, 'S-DEG': 21, 'B-EMAIL': 3, 'E-EMAIL': 3, 'S-NAME': 1})
Test label distribution: Counter({'O': 87769, 'S-SKILL': 2029, 'B-SKILL': 1883, 'E-SKILL': 1878, 'PER': 1612, 'E-JOB': 920, 'B-JOB': 914, 'I-JOB': 508, 'E-COMPANY': 480, 'B-COMPANY': 477, 'S-LOC': 456, 'I-DEG': 434, 'I-SKILL': 374, 'I-COMPANY': 349, 'E-UNI': 283, 'B-UNI': 282, 'B-DEG': 262, 'E-DEG': 261, 'I-UNI': 246, 'E-LOC': 241, 'B-LOC': 240, 'B-NAME': 235, 'E-NAME': 235, 'S-EMAIL': 191, 'B-PH

In [None]:
# create NER tagger
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
import torch.nn as nn

# 1. using LSTM-CRF on top of frozen embeddings
# combine flair and glove embeddings
embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),  # GloVe word embeddings
    FlairEmbeddings('news-forward'),  # Slightly heavier version of the forward Flair embeddings
    FlairEmbeddings('news-backward')  # Slightly heavier version of the backward Flair embeddings
])

# 2. Configure tagger with memory and performance optimizations
tagger = SequenceTagger(
    hidden_size=128,  # Increased hidden size for more capacity
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=True,
    tag_format='BIOES',
    dropout=0.2,  # Reduced dropout for better capacity retention
    rnn_layers=2,  # Increased layers to enhance representation learning
)

2024-12-12 08:50:23,733 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmptcelcvdt


100%|██████████| 153M/153M [00:09<00:00, 16.4MB/s]

2024-12-12 08:50:34,014 copying /tmp/tmptcelcvdt to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2024-12-12 08:50:34,205 removing temp file /tmp/tmptcelcvdt
2024-12-12 08:50:34,751 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmptg1z30xv


100%|██████████| 20.5M/20.5M [00:02<00:00, 9.23MB/s]

2024-12-12 08:50:37,580 copying /tmp/tmptg1z30xv to cache at /root/.flair/embeddings/glove.gensim
2024-12-12 08:50:37,611 removing temp file /tmp/tmptg1z30xv





2024-12-12 08:50:43,804 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmp0s0e0krx


100%|██████████| 69.7M/69.7M [00:08<00:00, 9.05MB/s]

2024-12-12 08:50:52,381 copying /tmp/tmp0s0e0krx to cache at /root/.flair/embeddings/news-forward-0.4.1.pt
2024-12-12 08:50:52,462 removing temp file /tmp/tmp0s0e0krx





2024-12-12 08:50:54,551 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmp1ki_xkkt


100%|██████████| 69.7M/69.7M [00:06<00:00, 11.3MB/s]

2024-12-12 08:51:01,539 copying /tmp/tmp1ki_xkkt to cache at /root/.flair/embeddings/news-backward-0.4.1.pt
2024-12-12 08:51:01,617 removing temp file /tmp/tmp1ki_xkkt





2024-12-12 08:51:01,912 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-LOC, B-LOC, E-LOC, I-LOC, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL


In [None]:
# train flair ner model

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
import torch

# define ModelTrained based on tagger and corpus
trainer = ModelTrainer(tagger, corpus)

# train Flair NER Model
trainer.train(
    base_path='flair_output/',
    learning_rate=0.05,  # Lower learning rate for more stable training
    mini_batch_size=16,  # Increased batch size (if memory permits)
    max_epochs=50,  # More epochs to allow better model convergence
    patience=5,  # Increased patience for early stopping
    train_with_dev=True,
    save_final_model=True,
    use_amp=True,  # Mixed precision training for faster training
)

# save trained model to drive
!cp -r ./flair_output /content/drive/MyDrive/FYP/Implementation/

2024-12-04 12:06:51,350 ----------------------------------------------------------------------------------------------------
2024-12-04 12:06:51,352 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-12-04 12:07:32,760 epoch 1 - iter 7/73 - loss 4.19854115 - time (sec): 41.38 - samples/sec: 769.52 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:08:18,910 epoch 1 - iter 14/73 - loss 4.20430475 - time (sec): 87.52 - samples/sec: 742.59 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:09:19,855 epoch 1 - iter 21/73 - loss 3.58259553 - time (sec): 148.47 - samples/sec: 653.92 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:10:06,312 epoch 1 - iter 28/73 - loss 2.94078037 - time (sec): 194.93 - samples/sec: 654.54 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:10:51,804 epoch 1 - iter 35/73 - loss 2.53528460 - time (sec): 240.42 - samples/sec: 655.64 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:11:31,874 epoch 1 - iter 42/73 - loss 2.27277629 - time (sec): 280.49 - samples/sec: 666.33 - lr: 0.050000 - momentum: 0.000000
2024-12-04 12:12:15,286 epoch 1 - iter 49/73 - loss 2.06623933 - time (sec): 323.90 - samples/sec: 669.77 - lr: 0.050000 - momentum: 0.000000
2024-12-0

  0%|          | 0/5 [00:17<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.38 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.19 GiB is free. Process 3542 has 13.55 GiB memory in use. Of the allocated memory 11.10 GiB is allocated by PyTorch, and 2.31 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# evaluate model

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# load the trained model
tagger = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/final-model.pt')
# tagger = SequenceTagger.load('/content/final-model.pt')

# evaluate the model on the test set
result = tagger.evaluate(corpus.test, gold_label_type='ner', mini_batch_size=32)

# print the results
# print("Evaluation Loss:", eval_loss)
print(result.detailed_results)  # print the precision, recall, and F1-score per entity type

2024-12-12 08:51:38,310 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-LOC, B-LOC, E-LOC, I-LOC, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>


100%|██████████| 13/13 [02:12<00:00, 10.18s/it]


Results:
- F-score (micro) 0.648
- F-score (macro) 0.789
- Accuracy 0.4826

By class:
              precision    recall  f1-score   support

       SKILL     0.4602    0.5290    0.4922      3913
         JOB     0.5969    0.7389    0.6604      1038
         LOC     0.7701    0.9082    0.8334       697
        WORK     0.8596    0.9259    0.8915       688
     COMPANY     0.6742    0.7855    0.7256       606
         UNI     0.6715    0.7938    0.7276       291
         DEG     0.7381    0.8127    0.7736       267
        NAME     0.9425    0.8950    0.9181       238
       PHONE     0.9518    0.9559    0.9538       227
       STUDY     0.6933    0.8418    0.7604       196
       EMAIL     0.9078    0.9791    0.9421       191

   micro avg     0.6063    0.6960    0.6480      8352
   macro avg     0.7514    0.8333    0.7890      8352
weighted avg     0.6105    0.6960    0.6499      8352






In [None]:
# make prediction

from flair.models import SequenceTagger
from flair.data import Sentence
import spacy, string
from spacy import displacy

# load trained Flair NER model
# tagger = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt')
# tagger = SequenceTagger.load('/content/final-model.pt')

resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''

# make into all small letter and remove punctuations
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# step 1: predict entities using Flair trained model
sentence = Sentence(resume_text)
tagger.predict(sentence)

# step 2: convert Flair predictions to spaCy doc format
# initialize a blank spaCy NLP pipeline
nlp = spacy.blank("en")
doc = nlp(resume_text)

# extract entities from Flair prediction and convert to spaCy format
ents = []
for entity in sentence.get_spans('ner'):
    start, end = entity.start_position, entity.end_position
    label = entity.tag
    span = doc.char_span(start, end, label=label)
    if span is not None:
        ents.append(span)

# set the entities in the spaCy doc
doc.ents = ents

# step 3: visualization of prediction using displacy
displacy.render(doc, style="ent", jupyter=True)




In [None]:
resume_text_1 = '''
Zi Qing Chew
chewziqing@gmail.com | 016-2892475 | Kuala Lumpur, Malaysia | linkedin.com/in/ziqingchew | github.com/chewzzz1014
EDUCATION

Universiti Putra Malaysia					                                                   Oct 2021 - Current
Bachelor in Computer Science with Honours
Expected to graduate in July 2025. CGPA: 3.99

WORK EXPERIENCE

Ant International 									          	July 2024 – Oct 2024
Java Engineer Intern							                               Kuala Lumpur, Malaysia
Collaborated in developing an audit logging feature for Ant Group’s internal Foreign Exchange (FX) trade strategy system that records changes made by business users to trade strategies.
Conducted comprehensive system analysis and project planning, delivering presentations to project stakeholders and QA teams prior to the development phase.
Utilised Ant Group’s internal frameworks, middleware, and tools to implement the audit logging feature.
Skills: Java, Spring, Sofaboot, Ant Group internal middlewares (ZDAL, DRM, Ant Scheduler, Msg Broker)
Howuku  									          	             Feb 2023 – Sep 2023
Software Developer Intern							                    Kuala Lumpur, Malaysia
Developed and optimized A/B testing features, including code editor and previewer for CSS and JavaScript modifications for experiment variations.
Expanded A/B testing targeting rule by incorporating website visitor's OS, device, and browser rules.
Automated experiment-stopping criteria and email notifications based on user-defined experiment termination conditions.
Collaborated with cross-functional teams to debug, troubleshoot, and enhance Howuku platform features based on user feedback and performance data.
Skills: JavaScript, Bootstrap, Vue.js, Express.js, MySQL

PROJECTS

Personal Portfolio Website (chewzzz1014.github.io/portfolio-website)
Designed, developed and deployed personalised portfolio website featuring skills, selected projects, and downloadable resume.
Skills: JavaScript, React.js, CSS, Bootstrap
Depression Level Detection Chatbot (https://github.com/chewzzz1014/health-ease-project)
Developed machine learning application that evaluates a message's depression level and provided tailored mental health advice and information based on the depression severity.
Skills: Python, pandas, scikit-learn, Keras, FastAPI, Gradio
Clothing Store Website (https://github.com/chewzzz1014/CSC3402-MVC-Project)
Worked in team to build a CRUD Spring Boot application with attractive interfaces, data persistence, authentication and authorisation.
Developed the backend of the application that involves querying the database, building REST endpoints and implementing Thymeleaf in HTML for dynamic contents.
Skills: Spring Boot, Spring MVC, Thymeleaf, Hibernate, Bootstrap

SKILLS
Programming Languages: Java, Python, HTML, CSS, JavaScript, MySQL, OracleSQL
Frameworks and Libraries: Spring, Spring Boot, TypeScript, Node.js, Express.js, React.js, Vue.js, Bootstrap, Tailwind CSS
Tools: Git, Github, Jira, Tableau, Excel, Jupyter Notebook, Google Colab, VSCode, IntelliJ
'''

In [None]:
# make prediction
from flair.models import SequenceTagger
from flair.data import Sentence
import spacy, string
from spacy import displacy


# load trained Flair NER model
# tagger = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt')
# tagger = SequenceTagger.load('/content/final-model.pt')

resume_text_1 = resume_text_1.lower()
resume_text_1 = resume_text_1.translate(str.maketrans('', '', string.punctuation))

# step 1: predict entities using Flair trained model
sentence = Sentence(resume_text_1)
tagger.predict(sentence)


# step 2: convert Flair predictions to spaCy doc format
# initialize a blank spaCy NLP pipeline
nlp = spacy.blank("en")
doc = nlp(resume_text_1)


# extract entities from Flair prediction and convert to spaCy format
ents = []
for entity in sentence.get_spans('ner'):
    start, end = entity.start_position, entity.end_position
    label = entity.tag
    span = doc.char_span(start, end, label=label)
    if span is not None:
        ents.append(span)



# set the entities in the spaCy doc
doc.ents = ents



# step 3: visualization of prediction using displacy
displacy.render(doc, style="ent", jupyter=True)