<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train NER Models

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!mkdir spacy_ner_data

## Spacy NER

In [3]:
# load json and convert into spacy format

import json
import random
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin

# load JSON data from Drive
with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/445_resumes_annotated.json', "r") as f:
    data = json.load(f)

# remove overlapped entities (one word has >1 entitiy)
def remove_overlapping_entities(entities):
    """Remove overlapping entities from the list."""
    # sort entities by start position
    entities = sorted(entities, key=lambda x: x[0])
    non_overlapping = []
    last_end = -1
    for start, end, label in entities:
        # only add to list if there's no overlap with the previous entity
        if start >= last_end:
            non_overlapping.append((start, end, label))
            last_end = end
    return non_overlapping

# convert JSON data to Spacy's DocBin format
def convert_to_spacy_format(data):
    # load a blank Spacy model
    nlp = spacy.blank("en")
    # container for our docs
    doc_bin = DocBin()

    for item in data:
        # full document text
        text = item['data']['Text']
        entities = []

        for annotation in item['annotations'][0]['result']:
            start = annotation['value']['start']
            end = annotation['value']['end']
            label = annotation['value']['labels'][0]
            entities.append((start, end, label))

        # remove overlapping entities
        entities = remove_overlapping_entities(entities)
        # create a Spacy doc and add entities to it
        doc = nlp.make_doc(text)
        spans = [doc.char_span(start, end, label=label) for start, end, label in entities]
        # filter out None spans if Spacy can't align the character indices with tokens
        spans = [span for span in spans if span is not None]
        # assign entities to the doc
        doc.ents = spans
        doc_bin.add(doc)

    return doc_bin

# split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# convert train and test sets to Spacy format
train_doc_bin = convert_to_spacy_format(train_data)
test_doc_bin = convert_to_spacy_format(test_data)

# save the train and test data to .spacy files in current runtime
train_doc_bin.to_disk("spacy_ner_data/train_data.spacy")
test_doc_bin.to_disk("spacy_ner_data/test_data.spacy")

In [4]:
# check the distribution of entitiy labels

import spacy
from spacy.lang.en import English
from spacy.tokens import DocBin

# count number of label entities
def count_entity_labels(file_path):
    # load data from file_path and create DocBin
    doc_bin = DocBin().from_disk(file_path)
    label_counts = {}
    for doc in doc_bin.get_docs(English().vocab):
        # count occurence of label
        for ent in doc.ents:
            label = ent.label_
            label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts

# calculate and print label distribution in train and test data
# sorted from largest to smallest
train_label_counts = count_entity_labels("spacy_ner_data/train_data.spacy")
sorted_train_label_counts = sorted(train_label_counts.items(), key=lambda x: x[1], reverse=True)
print("Train Data Entity Label Distribution:")
for label, count in sorted_train_label_counts:
    print(f"{label}: {count}")

test_label_counts = count_entity_labels("spacy_ner_data/test_data.spacy")
sorted_test_label_counts = sorted(test_label_counts.items(), key=lambda x: x[1], reverse=True)
print("\nTest Data Entity Label Distribution:")
for label, count in sorted_test_label_counts:
    print(f"{label}: {count}")

Train Data Entity Label Distribution:
SKILL: 4969
JOB: 1475
WORK PER: 1108
COMPANY: 990
LOC: 943
UNI: 422
DEG: 406
NAME: 363
PHONE: 341
STUDY PER: 337
EMAIL: 297

Test Data Entity Label Distribution:
SKILL: 1295
JOB: 371
WORK PER: 272
LOC: 223
COMPANY: 212
UNI: 109
DEG: 103
STUDY PER: 90
NAME: 84
PHONE: 82
EMAIL: 67


In [5]:
# create base_config.cfg and paste the config generated from spacy widget
# need to update train and test file path
!touch base_config.cfg

In [6]:
# generate config.cfg from base_config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
# vector used for spacy ner
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
# train model using hyperparameters set in config.cfg
# save trained model in spacy-output/ dir

# using cpu
# !python -m spacy train config.cfg --output ./spacy_output

# using gpu
!python -m spacy train config.cfg --gpu-id 0 --output ./spacy_output

# save output dir into drive
!cp -r ./spacy_output /content/drive/MyDrive/FYP/Implementation/

[38;5;2m✔ Created output directory: spacy_output[0m
[38;5;4mℹ Saving to output directory: spacy_output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    313.35    0.00    0.00    0.00    0.00
  1     500        370.12  41099.79   18.66   41.10   12.07    0.19
  2    1000        144.29  25436.52   43.54   52.06   37.41    0.44
  4    1500        110.11  21728.42   46.32   63.42   36.49    0.46
  5    2000        117.39  20147.01   53.74   61.14   47.94    0.54
  7    2500        129.03  19823.69   53.41   68.57   43.74    0.53
  8    3000        134.53  18193.32   56.98   62.37   52.44    0.57
  9    3500        143.55  17491.75   58.14   65.76   52.10    0.58
 11    4000        154.15  17226.72   59.56   64.3

In [9]:
# evaluate trained model performance
# store output and visualization into result/ dir
!python -m spacy evaluate spacy_output/model-best spacy_ner_data/test_data.spacy -dp spacy_output

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   62.46 
NER R   58.87 
NER F   60.61 
SPEED   1053  

[1m

                P       R       F
JOB         60.52   62.80   61.64
LOC         60.95   74.89   67.20
PHONE       95.77   82.93   88.89
EMAIL       86.36   85.07   85.71
WORK PER    83.68   88.60   86.07
COMPANY     56.56   65.09   60.53
STUDY PER   72.83   74.44   73.63
UNI         60.31   72.48   65.83
DEG         76.42   78.64   77.51
SKILL       50.25   39.00   43.91
NAME        96.20   90.48   93.25

<IPython.core.display.HTML object>
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.10/

In [9]:
# make prediction

import spacy
import string

resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''

# convert text into small letter then remove punctuation
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# load trained model
nlp = spacy.load("/content/drive/MyDrive/FYP/Implementation/spacy_output/model-best")

# create a Spacy doc and add text to it
doc = nlp(resume_text.lower())

# print predicted entities in text
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

# visualize predicted entities using displacy
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

john doe: NAME
los angeles: LOC
1 555 1234567: PHONE
johndoeexamplecom: EMAIL
software engineer: JOB
python: SKILL
as a software engineer: JOB
at google llc: COMPANY
san francisco: LOC
as a junior developer: JOB
at tech innovators inc: COMPANY
in austin: LOC
july 2017 to july 2019: WORK PER
master of science: DEG
in computer science: DEG
from the university: UNI
berkeley: LOC
bachelor of science: DEG
from the university: UNI
at: LOC
may 2015: STUDY PER
python: SKILL
javascript: SKILL
git: SKILL
docker: SKILL
kubernetes: SKILL
solutions architect: JOB
cloud architect: JOB


In [3]:
# make prediction

import spacy
import string

resume_text = '''
Zi Qing Chew
chewziqing@gmail.com | 016-2892475 | Kuala Lumpur, Malaysia | linkedin.com/in/ziqingchew | github.com/chewzzz1014
EDUCATION

Universiti Putra Malaysia					                                                   Oct 2021 - Current
Bachelor in Computer Science with Honours
Expected to graduate in July 2025. CGPA: 3.99

WORK EXPERIENCE

Ant International 									          	July 2024 – Oct 2024
Java Engineer Intern							                               Kuala Lumpur, Malaysia
Collaborated in developing an audit logging feature for Ant Group’s internal Foreign Exchange (FX) trade strategy system that records changes made by business users to trade strategies.
Conducted comprehensive system analysis and project planning, delivering presentations to project stakeholders and QA teams prior to the development phase.
Utilised Ant Group’s internal frameworks, middleware, and tools to implement the audit logging feature.
Skills: Java, Spring, Sofaboot, Ant Group internal middlewares (ZDAL, DRM, Ant Scheduler, Msg Broker)
Howuku  									          	             Feb 2023 – Sep 2023
Software Developer Intern							                    Kuala Lumpur, Malaysia
Developed and optimized A/B testing features, including code editor and previewer for CSS and JavaScript modifications for experiment variations.
Expanded A/B testing targeting rule by incorporating website visitor's OS, device, and browser rules.
Automated experiment-stopping criteria and email notifications based on user-defined experiment termination conditions.
Collaborated with cross-functional teams to debug, troubleshoot, and enhance Howuku platform features based on user feedback and performance data.
Skills: JavaScript, Bootstrap, Vue.js, Express.js, MySQL

PROJECTS

Personal Portfolio Website (chewzzz1014.github.io/portfolio-website)
Designed, developed and deployed personalised portfolio website featuring skills, selected projects, and downloadable resume.
Skills: JavaScript, React.js, CSS, Bootstrap
Depression Level Detection Chatbot (https://github.com/chewzzz1014/health-ease-project)
Developed machine learning application that evaluates a message's depression level and provided tailored mental health advice and information based on the depression severity.
Skills: Python, pandas, scikit-learn, Keras, FastAPI, Gradio
Clothing Store Website (https://github.com/chewzzz1014/CSC3402-MVC-Project)
Worked in team to build a CRUD Spring Boot application with attractive interfaces, data persistence, authentication and authorisation.
Developed the backend of the application that involves querying the database, building REST endpoints and implementing Thymeleaf in HTML for dynamic contents.
Skills: Spring Boot, Spring MVC, Thymeleaf, Hibernate, Bootstrap

SKILLS
Programming Languages: Java, Python, HTML, CSS, JavaScript, MySQL, OracleSQL
Frameworks and Libraries: Spring, Spring Boot, TypeScript, Node.js, Express.js, React.js, Vue.js, Bootstrap, Tailwind CSS
Tools: Git, Github, Jira, Tableau, Excel, Jupyter Notebook, Google Colab, VSCode, IntelliJ
'''

# convert text into small letter then remove punctuation
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# load trained model
nlp = spacy.load("/content/drive/MyDrive/FYP/Implementation/spacy_output/model-best")

# create a Spacy doc and add text to it
doc = nlp(resume_text.lower())

from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

## Flair NER

In [None]:
# install flair library
!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.57-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [None]:
# convert json into flair data

import json
import random
from typing import List, Dict, Tuple
import spacy
from collections import defaultdict

class NERConverter:
    def __init__(self):
        # load pretrained model from Spacy library
        # to create Spacy Doc object
        self.nlp = spacy.load("en_core_web_sm")

    # get BIOES label based on location of word
    def get_bioes_label(self, token_index: int, entity_length: int, current_position: int, label: str) -> str:
        """
        Convert to BIOES format
        - S-: Single token entity
        - B-: Beginning of multi-token entity
        - I-: Inside of multi-token entity
        - E-: End of multi-token entity
        - O: Outside
        """
        if entity_length == 1:
            return f'S-{label}'
        if current_position == 0:
            return f'B-{label}'
        if current_position == entity_length - 1:
            return f'E-{label}'
        return f'I-{label}'

    # convert Label Studio's exported annotations in json format intto BIOES format
    def convert_to_bioes_format(self, json_data: List[dict]) -> List[List[Tuple[str, str]]]:
        """Convert JSON annotations to BIOES format."""
        all_sentences = []

        # process all annotation in json file
        for item in json_data:
            text = item['data']['Text']
            doc = self.nlp(text)

            # initialize character-level labels
            char_labels = ['O'] * len(text)

            # first pass: identify entity boundaries and lengths
            entity_spans = []
            if item['annotations'] and len(item['annotations']) > 0:
                for ann in item['annotations'][0]['result']:
                    if 'value' in ann:
                        start = ann['value']['start']
                        end = ann['value']['end']
                        label = ann['value']['labels'][0]
                        entity_spans.append((start, end, label))

            # sort spans by start position
            entity_spans.sort(key=lambda x: x[0])

            # second pass: apply BIOES labels
            for start, end, label in entity_spans:
                # get tokens that are part of this entity
                entity_text = text[start:end]
                entity_doc = self.nlp(entity_text)
                entity_length = len([token for token in entity_doc if not token.is_space])

                # set labels for the entire span
                current_token_idx = 0
                for i in range(start, end):
                    if i == start or text[i-1].isspace():
                        char_labels[i] = self.get_bioes_label(i, entity_length, current_token_idx, label)
                        current_token_idx += 1
                    else:
                        char_labels[i] = char_labels[i-1]

            # convert to token-level labels
            current_sentence = []
            for sent in doc.sents:
                for token in sent:
                    # get the most common label for the token's characters
                    token_chars_labels = char_labels[token.idx:token.idx + len(token.text)]
                    label_counts = defaultdict(int)
                    for char_label in token_chars_labels:
                        label_counts[char_label] += 1

                    token_label = max(label_counts.items(), key=lambda x: x[1])[0]
                    current_sentence.append((token.text, token_label))

                if current_sentence:
                    all_sentences.append(current_sentence)
                    current_sentence = []

        return all_sentences

    # write data in BIOES format into txt file
    def write_flair_file(self, sentences: List[List[Tuple[str, str]]], filename: str):
        """Write sentences in BIOES format to file."""
        with open(filename, 'w', encoding='utf-8') as f:
            for sentence in sentences:
                for token, label in sentence:
                    f.write(f'{token} {label}\n')
                f.write('\n')

    # convert json data into BIOES data
    # split BIOES data into train and test
    def convert_and_split(self, json_data: List[dict], train_file: str, test_file: str, test_ratio: float = 0.2):
        """Convert JSON to BIOES format and split into train/test sets."""
        all_sentences = self.convert_to_bioes_format(json_data)

        # shuffle and split based on test_ratio
        random.shuffle(all_sentences)
        split_idx = int(len(all_sentences) * (1 - test_ratio))

        # use list slicing to split
        train_sentences = all_sentences[:split_idx]
        test_sentences = all_sentences[split_idx:]

        # write to txt files
        self.write_flair_file(train_sentences, train_file)
        self.write_flair_file(test_sentences, test_file)

        return len(train_sentences), len(test_sentences)


# load JSON data
with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/342_resumes_annotated.json', 'r', encoding='utf-8') as f:
  json_data = json.load(f)

# load self-defined convert class
converter = NERConverter()

# convert json data into BIOES data and split into train and test
train_count, test_count = converter.convert_and_split(
    json_data,
    train_file='flair_train.txt',
    test_file='flair_test.txt',
    test_ratio=0.2
)
print(f'Created {train_count} training sentences and {test_count} test sentences')

Created 298 training sentences and 75 test sentences


In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns for CoNLL (0: word, 1: label)
columns = {0: 'text', 1: 'ner'}

# set data folder and train and test path
data_folder = './'
train_file = 'flair_train.txt'
test_file = 'flair_test.txt'

# load the corpus
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file=train_file,
                              test_file=test_file,
                              dev_file=None)

2024-11-10 06:40:19,078 Reading data from .
2024-11-10 06:40:19,079 Train: flair_train.txt
2024-11-10 06:40:19,080 Dev: None
2024-11-10 06:40:19,083 Test: flair_test.txt
2024-11-10 06:40:20,302 No dev split found. Using 10% (i.e. 30 samples) of the train split as dev data


In [None]:
# generate a dictionary of unique labels from the NER corpus.
# this dictionary maps each named entity label in the dataset to an integer ID.
tag_dictionary = corpus.make_label_dictionary(label_type='ner')
print("Labels:", tag_dictionary.get_items())

2024-11-10 06:40:23,085 Computing label dictionary. Progress:


0it [00:00, ?it/s]
268it [00:00, 15504.25it/s]

2024-11-10 06:40:23,122 Dictionary created for label 'ner' with 11 values: SKILL (seen 2062 times), JOB (seen 533 times), WORK (seen 455 times), COMPANY (seen 372 times), LOC (seen 252 times), UNI (seen 158 times), DEG (seen 149 times), NAME (seen 138 times), STUDY (seen 138 times), PHONE (seen 135 times), EMAIL (seen 108 times)
Labels: ['SKILL', 'JOB', 'WORK', 'COMPANY', 'LOC', 'UNI', 'DEG', 'NAME', 'STUDY', 'PHONE', 'EMAIL']





In [None]:
from collections import Counter

# count frequency of each entity label
def count_labels(file_path):
    with open(file_path, 'r') as file:
        labels = [line.split()[-1] for line in file if line.strip()]
    return Counter(labels)

# number of
print("Train label distribution:", count_labels('flair_train.txt'))
print("Test label distribution:", count_labels('flair_test.txt'))

Train label distribution: Counter({'O': 72298, 'S-SKILL': 1392, 'PER': 1203, 'B-SKILL': 991, 'E-SKILL': 988, 'E-JOB': 524, 'B-JOB': 517, 'E-COMPANY': 341, 'B-COMPANY': 336, 'I-JOB': 295, 'I-DEG': 256, 'I-COMPANY': 239, 'I-SKILL': 198, 'E-UNI': 170, 'B-UNI': 169, 'S-LOC': 165, 'B-NAME': 156, 'E-NAME': 156, 'E-DEG': 155, 'B-DEG': 154, 'I-UNI': 138, 'E-PHONE': 129, 'B-PHONE': 126, 'S-EMAIL': 118, 'B-LOC': 108, 'E-LOC': 108, 'I-PHONE': 89, 'S-JOB': 64, 'S-COMPANY': 62, 'S-PHONE': 22, 'I-NAME': 6, 'S-DEG': 6, 'I-LOC': 5, 'B-EMAIL': 1, 'E-EMAIL': 1})
Test label distribution: Counter({'O': 16795, 'S-SKILL': 399, 'PER': 308, 'B-SKILL': 222, 'E-SKILL': 221, 'E-JOB': 135, 'B-JOB': 130, 'B-COMPANY': 86, 'E-COMPANY': 86, 'I-DEG': 78, 'I-COMPANY': 76, 'I-JOB': 62, 'B-DEG': 47, 'E-DEG': 47, 'B-UNI': 45, 'E-UNI': 45, 'S-LOC': 40, 'I-SKILL': 38, 'B-NAME': 35, 'E-NAME': 35, 'I-UNI': 33, 'S-EMAIL': 30, 'B-PHONE': 29, 'E-PHONE': 28, 'B-LOC': 20, 'E-LOC': 20, 'I-PHONE': 17, 'S-COMPANY': 17, 'S-JOB': 11, '

In [None]:
# create NER tagger
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger

# 1. using LSTM-CRF on top of frozen embeddings
# combine flair and glove embeddings
# embeddings = StackedEmbeddings([
#                 WordEmbeddings('glove'),
#                 FlairEmbeddings('news-forward'),
#                 FlairEmbeddings('news-backward'),
#             ])
# tagger = SequenceTagger(hidden_size=256,
#                          embeddings=embeddings,
#                          tag_dictionary=tag_dictionary,
#                          tag_type='ner',
#                          use_crf=True,
#                          tag_format = 'BIOES')

# 2. using transformer embedding
# transformer option 1
# embeddings = TransformerWordEmbeddings('bert-base-uncased',
#                                         fine_tune=True,
#                                         layers='-1',
#                                         subtoken_pooling='first'
#                                       )
# transformer option 2
# embeddings = TransformerWordEmbeddings(
#     'roberta-base',  # or 'bert-base-uncased'
#     fine_tune=True,
#     layers='-1,-2,-3,-4',  # Use last 4 layers
#     subtoken_pooling='first',
#     allow_long_sentences=True
# )
# test running using Kaggle CPU
embeddings = TransformerWordEmbeddings(
    'roberta-base',
    fine_tune=True,
    layers='-1,',  # Use last layer
    subtoken_pooling='first',
    allow_long_sentences=True
)

use_crf = False
tagger = SequenceTagger(hidden_size=256,
                         embeddings=embeddings,
                         tag_dictionary=tag_dictionary,
                         tag_type='ner',
                         use_crf=use_crf,
                         use_rnn=False,
                         reproject_embeddings=False,
                         tag_format = 'BIOES')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


2024-11-10 06:40:38,569 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL


In [None]:
# train flair ner model

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
import torch

# define ModelTrained based on tagger and corpus
trainer = ModelTrainer(tagger, corpus)

# train Flair NER Model
trainer.train(
    base_path='flair_output/',
    # learning_rate=5.0e-5,
    learning_rate=0.01,
    mini_batch_size=4,
    max_epochs=50,
    train_with_dev=False
)

# test running using Kaggle GPU
# trainer.train(
#     base_path='flair_output/',
#     learning_rate=0.01,
#     mini_batch_size=8,
#     max_epochs=100,
#     patience=8,
#     train_with_dev=False
# )

# save trained model to drive
!cp -r ./flair_output /content/drive/MyDrive/FYP/Implementation/

2024-11-10 06:40:45,644 ----------------------------------------------------------------------------------------------------
2024-11-10 06:40:45,648 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50266, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-11-10 06:40:48,883 epoch 1 - iter 3/34 - loss 4.49894183 - time (sec): 3.19 - samples/sec: 1695.27 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:40:52,200 epoch 1 - iter 6/34 - loss 4.15237677 - time (sec): 6.51 - samples/sec: 2130.30 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:40:55,951 epoch 1 - iter 9/34 - loss 3.99204313 - time (sec): 10.26 - samples/sec: 2001.13 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:40:58,560 epoch 1 - iter 12/34 - loss 3.86195917 - time (sec): 12.87 - samples/sec: 2045.47 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:01,525 epoch 1 - iter 15/34 - loss 3.74573814 - time (sec): 15.84 - samples/sec: 2062.92 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:04,752 epoch 1 - iter 18/34 - loss 3.62881443 - time (sec): 19.06 - samples/sec: 2062.98 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:07,560 epoch 1 - iter 21/34 - loss 3.52894054 - time (sec): 21.87 - samples/sec: 2068.05 - lr: 0.001000 - momentum: 0.000000
2024-11-10 

100%|██████████| 1/1 [00:02<00:00,  2.38s/it]

2024-11-10 06:41:23,341 DEV : loss 1.2984716892242432 - f1-score (micro avg)  0.0
2024-11-10 06:41:23,354  - 0 epochs without improvement
2024-11-10 06:41:23,355 ----------------------------------------------------------------------------------------------------





2024-11-10 06:41:25,712 epoch 2 - iter 3/34 - loss 1.37025713 - time (sec): 2.35 - samples/sec: 2791.39 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:29,301 epoch 2 - iter 6/34 - loss 1.25098791 - time (sec): 5.94 - samples/sec: 2428.96 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:32,516 epoch 2 - iter 9/34 - loss 1.20654881 - time (sec): 9.16 - samples/sec: 2329.78 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:36,025 epoch 2 - iter 12/34 - loss 1.12661611 - time (sec): 12.67 - samples/sec: 2227.84 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:39,965 epoch 2 - iter 15/34 - loss 1.06402043 - time (sec): 16.61 - samples/sec: 2160.80 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:43,028 epoch 2 - iter 18/34 - loss 1.02149114 - time (sec): 19.67 - samples/sec: 2181.38 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:41:45,868 epoch 2 - iter 21/34 - loss 1.00697889 - time (sec): 22.51 - samples/sec: 2204.22 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.02s/it]

2024-11-10 06:42:00,299 DEV : loss 1.0739030838012695 - f1-score (micro avg)  0.0
2024-11-10 06:42:00,312  - 0 epochs without improvement
2024-11-10 06:42:00,313 ----------------------------------------------------------------------------------------------------





2024-11-10 06:42:03,190 epoch 3 - iter 3/34 - loss 0.74925599 - time (sec): 2.87 - samples/sec: 2930.39 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:07,009 epoch 3 - iter 6/34 - loss 0.71241576 - time (sec): 6.69 - samples/sec: 2513.78 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:10,416 epoch 3 - iter 9/34 - loss 0.77681343 - time (sec): 10.10 - samples/sec: 2379.53 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:13,645 epoch 3 - iter 12/34 - loss 0.78950655 - time (sec): 13.33 - samples/sec: 2277.65 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:16,801 epoch 3 - iter 15/34 - loss 0.76509655 - time (sec): 16.48 - samples/sec: 2327.05 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:19,504 epoch 3 - iter 18/34 - loss 0.75695488 - time (sec): 19.19 - samples/sec: 2281.46 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:22,254 epoch 3 - iter 21/34 - loss 0.75741118 - time (sec): 21.94 - samples/sec: 2245.28 - lr: 0.001000 - momentum: 0.000000
2024-11-10 

100%|██████████| 1/1 [00:02<00:00,  2.26s/it]

2024-11-10 06:42:36,300 DEV : loss 1.010551929473877 - f1-score (micro avg)  0.0
2024-11-10 06:42:36,314  - 0 epochs without improvement
2024-11-10 06:42:36,315 ----------------------------------------------------------------------------------------------------





2024-11-10 06:42:39,534 epoch 4 - iter 3/34 - loss 0.68801425 - time (sec): 3.22 - samples/sec: 2067.73 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:42,227 epoch 4 - iter 6/34 - loss 0.76204660 - time (sec): 5.91 - samples/sec: 2068.17 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:45,321 epoch 4 - iter 9/34 - loss 0.73750612 - time (sec): 9.00 - samples/sec: 2186.27 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:48,283 epoch 4 - iter 12/34 - loss 0.75587837 - time (sec): 11.97 - samples/sec: 2150.77 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:51,069 epoch 4 - iter 15/34 - loss 0.76613526 - time (sec): 14.75 - samples/sec: 2144.55 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:53,793 epoch 4 - iter 18/34 - loss 0.73983848 - time (sec): 17.48 - samples/sec: 2198.62 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:42:57,228 epoch 4 - iter 21/34 - loss 0.73852708 - time (sec): 20.91 - samples/sec: 2162.36 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:03<00:00,  3.85s/it]

2024-11-10 06:43:15,255 DEV : loss 0.9807278513908386 - f1-score (micro avg)  0.0
2024-11-10 06:43:15,278  - 0 epochs without improvement
2024-11-10 06:43:15,281 ----------------------------------------------------------------------------------------------------





2024-11-10 06:43:18,037 epoch 5 - iter 3/34 - loss 0.68770524 - time (sec): 2.75 - samples/sec: 2900.13 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:21,172 epoch 5 - iter 6/34 - loss 0.70787357 - time (sec): 5.89 - samples/sec: 2451.76 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:24,040 epoch 5 - iter 9/34 - loss 0.69117964 - time (sec): 8.75 - samples/sec: 2314.06 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:27,308 epoch 5 - iter 12/34 - loss 0.68813092 - time (sec): 12.02 - samples/sec: 2233.82 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:30,232 epoch 5 - iter 15/34 - loss 0.68943450 - time (sec): 14.95 - samples/sec: 2303.73 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:33,365 epoch 5 - iter 18/34 - loss 0.68649157 - time (sec): 18.08 - samples/sec: 2244.21 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:36,221 epoch 5 - iter 21/34 - loss 0.70725457 - time (sec): 20.94 - samples/sec: 2219.77 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.16s/it]

2024-11-10 06:43:51,719 DEV : loss 0.9546282887458801 - f1-score (micro avg)  0.0
2024-11-10 06:43:51,733  - 0 epochs without improvement
2024-11-10 06:43:51,734 ----------------------------------------------------------------------------------------------------





2024-11-10 06:43:53,885 epoch 6 - iter 3/34 - loss 0.81728206 - time (sec): 2.15 - samples/sec: 2809.29 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:43:56,919 epoch 6 - iter 6/34 - loss 0.74839004 - time (sec): 5.18 - samples/sec: 2658.71 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:00,221 epoch 6 - iter 9/34 - loss 0.73816897 - time (sec): 8.48 - samples/sec: 2412.85 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:03,488 epoch 6 - iter 12/34 - loss 0.72221877 - time (sec): 11.75 - samples/sec: 2281.04 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:06,915 epoch 6 - iter 15/34 - loss 0.72434670 - time (sec): 15.18 - samples/sec: 2259.04 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:09,896 epoch 6 - iter 18/34 - loss 0.70545198 - time (sec): 18.16 - samples/sec: 2272.04 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:12,734 epoch 6 - iter 21/34 - loss 0.71637657 - time (sec): 20.99 - samples/sec: 2181.61 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.50s/it]

2024-11-10 06:44:28,960 DEV : loss 0.9327938556671143 - f1-score (micro avg)  0.0
2024-11-10 06:44:28,975  - 0 epochs without improvement
2024-11-10 06:44:28,977 ----------------------------------------------------------------------------------------------------





2024-11-10 06:44:31,561 epoch 7 - iter 3/34 - loss 0.74686199 - time (sec): 2.58 - samples/sec: 2483.31 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:34,827 epoch 7 - iter 6/34 - loss 0.71537512 - time (sec): 5.85 - samples/sec: 2404.71 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:37,103 epoch 7 - iter 9/34 - loss 0.72893245 - time (sec): 8.12 - samples/sec: 2258.50 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:39,753 epoch 7 - iter 12/34 - loss 0.74824469 - time (sec): 10.77 - samples/sec: 2129.64 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:42,721 epoch 7 - iter 15/34 - loss 0.70783328 - time (sec): 13.74 - samples/sec: 2121.43 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:46,055 epoch 7 - iter 18/34 - loss 0.68970131 - time (sec): 17.07 - samples/sec: 2211.22 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:44:49,441 epoch 7 - iter 21/34 - loss 0.68404185 - time (sec): 20.46 - samples/sec: 2182.82 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.23s/it]

2024-11-10 06:45:05,644 DEV : loss 0.9227402210235596 - f1-score (micro avg)  0.0
2024-11-10 06:45:05,668  - 0 epochs without improvement
2024-11-10 06:45:05,670 ----------------------------------------------------------------------------------------------------





2024-11-10 06:45:08,167 epoch 8 - iter 3/34 - loss 0.62215633 - time (sec): 2.49 - samples/sec: 2860.01 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:11,789 epoch 8 - iter 6/34 - loss 0.60526036 - time (sec): 6.11 - samples/sec: 2313.97 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:15,381 epoch 8 - iter 9/34 - loss 0.56508174 - time (sec): 9.71 - samples/sec: 2259.41 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:18,038 epoch 8 - iter 12/34 - loss 0.59217656 - time (sec): 12.36 - samples/sec: 2375.21 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:21,168 epoch 8 - iter 15/34 - loss 0.61928765 - time (sec): 15.49 - samples/sec: 2255.14 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:24,175 epoch 8 - iter 18/34 - loss 0.63696541 - time (sec): 18.50 - samples/sec: 2290.48 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:26,885 epoch 8 - iter 21/34 - loss 0.63918150 - time (sec): 21.21 - samples/sec: 2272.87 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.08s/it]

2024-11-10 06:45:42,046 DEV : loss 0.911533534526825 - f1-score (micro avg)  0.0
2024-11-10 06:45:42,062  - 0 epochs without improvement
2024-11-10 06:45:42,063 ----------------------------------------------------------------------------------------------------





2024-11-10 06:45:44,668 epoch 9 - iter 3/34 - loss 0.68589084 - time (sec): 2.60 - samples/sec: 2477.72 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:47,563 epoch 9 - iter 6/34 - loss 0.68286957 - time (sec): 5.49 - samples/sec: 2352.45 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:50,345 epoch 9 - iter 9/34 - loss 0.66230968 - time (sec): 8.28 - samples/sec: 2176.18 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:53,439 epoch 9 - iter 12/34 - loss 0.67526850 - time (sec): 11.37 - samples/sec: 2121.90 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:56,304 epoch 9 - iter 15/34 - loss 0.68118573 - time (sec): 14.24 - samples/sec: 2194.91 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:45:59,065 epoch 9 - iter 18/34 - loss 0.68341011 - time (sec): 17.00 - samples/sec: 2227.43 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:01,983 epoch 9 - iter 21/34 - loss 0.68044528 - time (sec): 19.92 - samples/sec: 2216.12 - lr: 0.001000 - momentum: 0.000000
2024-11-10 0

100%|██████████| 1/1 [00:02<00:00,  2.65s/it]

2024-11-10 06:46:19,474 DEV : loss 0.9013434052467346 - f1-score (micro avg)  0.0
2024-11-10 06:46:19,499  - 0 epochs without improvement
2024-11-10 06:46:19,502 ----------------------------------------------------------------------------------------------------





2024-11-10 06:46:22,390 epoch 10 - iter 3/34 - loss 0.62659941 - time (sec): 2.88 - samples/sec: 2245.41 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:26,833 epoch 10 - iter 6/34 - loss 0.54192426 - time (sec): 7.33 - samples/sec: 2063.90 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:29,721 epoch 10 - iter 9/34 - loss 0.56179132 - time (sec): 10.22 - samples/sec: 2215.05 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:33,117 epoch 10 - iter 12/34 - loss 0.59192443 - time (sec): 13.61 - samples/sec: 2194.58 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:36,227 epoch 10 - iter 15/34 - loss 0.61155791 - time (sec): 16.72 - samples/sec: 2171.33 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:38,603 epoch 10 - iter 18/34 - loss 0.61891236 - time (sec): 19.10 - samples/sec: 2199.25 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:46:41,025 epoch 10 - iter 21/34 - loss 0.64942655 - time (sec): 21.52 - samples/sec: 2209.17 - lr: 0.001000 - momentum: 0.000000
2024

100%|██████████| 1/1 [00:01<00:00,  1.94s/it]

2024-11-10 06:46:56,425 DEV : loss 0.8974453806877136 - f1-score (micro avg)  0.0
2024-11-10 06:46:56,439  - 0 epochs without improvement
2024-11-10 06:46:56,441 ----------------------------------------------------------------------------------------------------





2024-11-10 06:46:58,901 epoch 11 - iter 3/34 - loss 0.73512399 - time (sec): 2.46 - samples/sec: 2533.07 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:02,144 epoch 11 - iter 6/34 - loss 0.68054520 - time (sec): 5.70 - samples/sec: 2574.06 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:06,848 epoch 11 - iter 9/34 - loss 0.61468447 - time (sec): 10.40 - samples/sec: 2303.15 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:09,877 epoch 11 - iter 12/34 - loss 0.62451034 - time (sec): 13.43 - samples/sec: 2242.27 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:13,376 epoch 11 - iter 15/34 - loss 0.60893554 - time (sec): 16.93 - samples/sec: 2273.95 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:16,162 epoch 11 - iter 18/34 - loss 0.61823904 - time (sec): 19.72 - samples/sec: 2278.53 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:19,564 epoch 11 - iter 21/34 - loss 0.61462373 - time (sec): 23.12 - samples/sec: 2292.35 - lr: 0.001000 - momentum: 0.000000
2024

100%|██████████| 1/1 [00:03<00:00,  3.74s/it]

2024-11-10 06:47:34,734 DEV : loss 0.8923962712287903 - f1-score (micro avg)  0.0
2024-11-10 06:47:34,757  - 0 epochs without improvement
2024-11-10 06:47:34,761 ----------------------------------------------------------------------------------------------------





2024-11-10 06:47:36,910 epoch 12 - iter 3/34 - loss 0.58433455 - time (sec): 2.14 - samples/sec: 2653.29 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:40,309 epoch 12 - iter 6/34 - loss 0.59768744 - time (sec): 5.54 - samples/sec: 2405.22 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:43,324 epoch 12 - iter 9/34 - loss 0.60776412 - time (sec): 8.56 - samples/sec: 2276.82 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:45,821 epoch 12 - iter 12/34 - loss 0.61844495 - time (sec): 11.05 - samples/sec: 2407.18 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:48,696 epoch 12 - iter 15/34 - loss 0.64994421 - time (sec): 13.93 - samples/sec: 2271.01 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:51,932 epoch 12 - iter 18/34 - loss 0.64896197 - time (sec): 17.16 - samples/sec: 2269.73 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:47:55,393 epoch 12 - iter 21/34 - loss 0.63578663 - time (sec): 20.63 - samples/sec: 2202.30 - lr: 0.001000 - momentum: 0.000000
2024-

100%|██████████| 1/1 [00:02<00:00,  2.19s/it]

2024-11-10 06:48:12,007 DEV : loss 0.8852460384368896 - f1-score (micro avg)  0.0
2024-11-10 06:48:12,022  - 0 epochs without improvement
2024-11-10 06:48:12,024 ----------------------------------------------------------------------------------------------------





2024-11-10 06:48:14,825 epoch 13 - iter 3/34 - loss 0.61267796 - time (sec): 2.80 - samples/sec: 2643.90 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:48:17,592 epoch 13 - iter 6/34 - loss 0.61072444 - time (sec): 5.56 - samples/sec: 2460.22 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:48:20,826 epoch 13 - iter 9/34 - loss 0.61868894 - time (sec): 8.80 - samples/sec: 2462.43 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:48:23,540 epoch 13 - iter 12/34 - loss 0.65424512 - time (sec): 11.51 - samples/sec: 2365.94 - lr: 0.001000 - momentum: 0.000000
2024-11-10 06:48:26,198 epoch 13 - iter 15/34 - loss 0.66167307 - time (sec): 14.17 - samples/sec: 2327.97 - lr: 0.001000 - momentum: 0.000000


OutOfMemoryError: CUDA out of memory. Tried to allocate 410.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 333.06 MiB is free. Process 135795 has 14.42 GiB memory in use. Of the allocated memory 13.30 GiB is allocated by PyTorch, and 1015.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# evaluate model

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# load the trained model
model = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/final-model.pt')

# evaluate the model on the test set
result = model.evaluate(corpus.test, gold_label_type='ner', mini_batch_size=32)

# print the results
# print("Evaluation Loss:", eval_loss)
print(result.detailed_results)  # print the precision, recall, and F1-score per entity type

2024-11-10 06:34:15,115 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL


100%|██████████| 3/3 [00:03<00:00,  1.33s/it]


Results:
- F-score (micro) 0.5075
- F-score (macro) 0.6301
- Accuracy 0.3429

By class:
              precision    recall  f1-score   support

       SKILL     0.3338    0.4100    0.3680       561
         JOB     0.4715    0.6190    0.5353       147
        WORK     0.7453    0.9160    0.8219       131
     COMPANY     0.4531    0.5273    0.4874       110
         LOC     0.5641    0.2973    0.3894        74
         UNI     0.3729    0.4681    0.4151        47
         DEG     0.5849    0.6200    0.6019        50
       STUDY     0.6667    0.6190    0.6420        42
       PHONE     1.0000    1.0000    1.0000        36
        NAME     0.9412    0.8649    0.9014        37
       EMAIL     0.6410    0.9615    0.7692        26

   micro avg     0.4714    0.5496    0.5075      1261
   macro avg     0.6159    0.6639    0.6301      1261
weighted avg     0.4822    0.5496    0.5085      1261






In [None]:
# make prediction

from flair.models import SequenceTagger
from flair.data import Sentence
import spacy, string
from spacy import displacy

# load trained Flair NER model
tagger = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt')

resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''

# make into all small letter and remove punctuations
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))

# step 1: predict entities using Flair trained model
sentence = Sentence(resume_text)
tagger.predict(sentence)

# step 2: convert Flair predictions to spaCy doc format
# initialize a blank spaCy NLP pipeline
nlp = spacy.blank("en")
doc = nlp(resume_text)

# extract entities from Flair prediction and convert to spaCy format
ents = []
for entity in sentence.get_spans('ner'):
    start, end = entity.start_position, entity.end_position
    label = entity.tag
    span = doc.char_span(start, end, label=label)
    if span is not None:
        ents.append(span)

# set the entities in the spaCy doc
doc.ents = ents

# step 3: visualization of prediction using displacy
displacy.render(doc, style="ent", jupyter=True)


2024-11-10 06:35:04,393 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL




In [None]:
# confidence of each predicted entity label

from flair.models import SequenceTagger
from flair.data import Sentence

# load the pretrained NER model
tagger = SequenceTagger.load("/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt")

# create a Sentence object
sentence = Sentence(resume_text)

# predict entities
tagger.predict(sentence)

# print the detected entities
for entity in sentence.get_spans("ner"):
    print(f"Entity: {entity.text}, Type: {entity.get_label('ner').value}, Confidence: {entity.score}")

2024-11-10 06:35:28,375 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL
Entity: john doe, Type: NAME, Confidence: 0.35042084753513336
Entity: los angeles, Type: LOC, Confidence: 0.2777288407087326
Entity: 1, Type: PHONE, Confidence: 0.44708436727523804
Entity: 555 1234567, Type: PHONE, Confidence: 0.3490697592496872
Entity: johndoeexamplecom, Type: EMAIL, Confidence: 0.4608330726623535
Entity: engineer, Type: JOB, Confidence: 0.6058100461959839
Entity: web development, Type: SKILL, Confidence: 0.5158057510852814
Entity: cloud infrastructure, Type: SKILL, Confidence: 0.5084673464298248
Entity: javascript, Type: SKILL, Confide