<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train NER Models

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir spacy_ner_data

In [4]:
import json
import random
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin

# Load JSON data
with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/169_resumes_annotated.json', "r") as f:
    data = json.load(f)

def remove_overlapping_entities(entities):
    """Remove overlapping entities from the list."""
    entities = sorted(entities, key=lambda x: x[0])  # Sort by start position
    non_overlapping = []
    last_end = -1
    for start, end, label in entities:
        if start >= last_end:  # Only add if there's no overlap with the previous entity
            non_overlapping.append((start, end, label))
            last_end = end
    return non_overlapping

# Function to convert JSON data to Spacy's DocBin format
def convert_to_spacy_format(data):
    nlp = spacy.blank("en")  # Load a blank Spacy model
    doc_bin = DocBin()  # Container for our docs

    for item in data:
        text = item['data']['Text']  # Full document text
        entities = []

        for annotation in item['annotations'][0]['result']:
            start = annotation['value']['start']
            end = annotation['value']['end']
            label = annotation['value']['labels'][0]  # Entity label
            entities.append((start, end, label))

        entities = remove_overlapping_entities(entities)  # Remove overlapping entities
        # Create a Spacy doc and add entities to it
        doc = nlp.make_doc(text)
        spans = [doc.char_span(start, end, label=label) for start, end, label in entities]
        # Filter out None spans if Spacy can't align the character indices with tokens
        spans = [span for span in spans if span is not None]
        doc.ents = spans  # Assign entities to the doc
        doc_bin.add(doc)

    return doc_bin

# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Convert train and test sets to Spacy format
train_doc_bin = convert_to_spacy_format(train_data)
test_doc_bin = convert_to_spacy_format(test_data)

# Save the train and test data to .spacy files
train_doc_bin.to_disk("spacy_ner_data/train_data.spacy")
test_doc_bin.to_disk("spacy_ner_data/test_data.spacy")

## Spacy NER

In [None]:
# create base_config.cfg and paste the config generated from spacy widget
# update train and test file path
!touch base_config.cfg

In [None]:
# generate config.cfg from base_config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# train model using hyperparameters set in config.cfg
# save trained model in spacy-output/ dir
!python -m spacy train config.cfg --output ./spacy_output
!cp -r ./spacy_output /content/drive/MyDrive/FYP/Implementation/

In [None]:
# evaluate trained model performance
# store output and visualization into result/ dir
!python -m spacy evaluate spacy_output/model-best spacy_ner_data/test_data.spacy -dp spacy_output

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   51.12 
NER R   41.26 
NER F   45.66 
SPEED   2395  

[1m

                P       R       F
NAME        89.66   78.79   83.87
JOB         72.00   32.43   44.72
DEG         62.16   63.89   63.01
UNI         38.89   34.15   36.36
EMAIL       63.33   95.00   76.00
LOC         39.39   31.71   35.14
WORK PER    75.45   83.00   79.05
COMPANY     28.42   36.49   31.95
SKILL       40.96   28.96   33.93
PHONE       89.66   83.87   86.67
STUDY PER   65.62   58.33   61.76

<IPython.core.display.HTML object>
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.10/dist-packages/spacy/cli/_util.py", line 87, in setup_cli
    c

In [None]:
# make prediction
import spacy
resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''
nlp = spacy.load("spacy-output/model-best")
doc = nlp(resume_text.lower())

print(doc.ents)

for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

(john doe, in los, (555) 123-4567, john.doe@example.com, john is, aws, restful apis, master of science, bachelor of science in information technology, python, aws, azure, git, docker)
john doe: NAME
in los: LOC
(555) 123-4567: PHONE
john.doe@example.com: EMAIL
john is: NAME
aws: SKILL
restful apis: SKILL
master of science: DEG
bachelor of science in information technology: DEG
python: SKILL
aws: SKILL
azure: SKILL
git: SKILL
docker: SKILL


In [None]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

## Flair NER

In [5]:
!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.35.54-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [6]:
import spacy
from spacy.tokens import DocBin
import os

def convert_spacy_to_flair(input_file, output_file):
    """
    Convert SpaCy binary format to Flair's CoNLL format.

    Args:
        input_file (str): Path to SpaCy binary file (.spacy)
        output_file (str): Path to output file for Flair format
    """
    # Load spaCy model
    nlp = spacy.blank("en")

    # Load the DocBin
    doc_bin = DocBin().from_disk(input_file)
    docs = list(doc_bin.get_docs(nlp.vocab))

    with open(output_file, 'w', encoding='utf-8') as f:
        for doc in docs:
            tokens = [(t.text, t.ent_iob_, t.ent_type_) for t in doc]

            # Write tokens in CoNLL format
            for token in tokens:
                text, iob, ent_type = token

                # Convert spaCy IOB to CoNLL format
                if iob == 'O':
                    tag = 'O'
                else:
                    tag = f'{iob}-{ent_type}' if ent_type else 'O'

                # Write line: token and NER tag
                f.write(f'{text} {tag}\n')

            # Empty line between sentences
            f.write('\n')

def convert_spacy_json_to_flair(input_file, output_file):
    """
    Convert SpaCy JSON format to Flair's CoNLL format.

    Args:
        input_file (str): Path to JSON file with SpaCy annotations
        output_file (str): Path to output file for Flair format
    """
    import json

    nlp = spacy.blank("en")

    with open(input_file, 'r', encoding='utf-8') as f:
        training_data = json.load(f)

    with open(output_file, 'w', encoding='utf-8') as f:
        for example in training_data:
            text = example['text']
            ents = example.get('entities', [])

            # Create a spaCy doc
            doc = nlp(text)

            # Add entities to doc
            spans = []
            for start, end, label in ents:
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    spans.append(span)
            doc.ents = spans

            # Convert to CoNLL format
            tokens = [(t.text, t.ent_iob_, t.ent_type_) for t in doc]

            for token in tokens:
                text, iob, ent_type = token
                if iob == 'O':
                    tag = 'O'
                else:
                    tag = f'{iob}-{ent_type}' if ent_type else 'O'
                f.write(f'{text} {tag}\n')

            f.write('\n')

# Example usage for JSON format
flair_train_json = "flair_train.txt"
flair_test_json = "flair_test.txt"

convert_spacy_to_flair('/content/spacy_ner_data/train_data.spacy', flair_train_json)
convert_spacy_to_flair('/content/spacy_ner_data/test_data.spacy', flair_test_json)

In [None]:
import spacy
from spacy.training import Corpus

!python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
corpus = Corpus("/content/spacy_ner_data/test_data.spacy")

data = corpus(nlp)

# Flair supports BIO and BIOES, see https://github.com/flairNLP/flair/issues/875
def rename_biluo_to_bioes(old_tag):
    new_tag = ""
    try:
        if old_tag.startswith("L"):
            new_tag = "E" + old_tag[1:]
        elif old_tag.startswith("U"):
            new_tag = "S" + old_tag[1:]
        else:
            new_tag = old_tag
    except:
        pass
    return new_tag


def generate_corpus():
    corpus = []
    n_ex = 0
    for example in data:
        n_ex += 1
        text = example.text
        doc = nlp(text)
        tags = example.get_aligned_ner()
        # Check if it's an empty list of NER tags.
        if None in tags:
            pass
        else:
            new_tags = [rename_biluo_to_bioes(tag) for tag in tags]
            for token, tag in zip(doc,new_tags):
                row = token.text +' '+ token.pos_ +' ' +tag + '\n'
                corpus.append(row)
            corpus.append('\n')
    return corpus

def write_file(filepath):
    with open(filepath, 'w', encoding='utf-8') as f:
        corpus = generate_corpus()
        f.writelines(corpus)

def main():
    write_file('flair_test.txt')

main()

Collecting de-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Define columns for CoNLL (0: word, 1: label)
columns = {0: 'text', 1: 'ner'}

# Set data folder and file names
data_folder = './'
train_file = 'flair_train.txt'
test_file = 'flair_test.txt'

# Load the corpus
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file=train_file,
                              test_file=test_file)

2024-11-04 03:42:52,212 Reading data from .
2024-11-04 03:42:52,213 Train: flair_train.txt
2024-11-04 03:42:52,214 Dev: None
2024-11-04 03:42:52,216 Test: flair_test.txt
2024-11-04 03:42:53,465 No dev split found. Using 10% (i.e. 14 samples) of the train split as dev data


In [8]:
tag_dictionary= corpus.make_label_dictionary(label_type = 'ner')
tag_dictionary

2024-11-04 03:43:05,133 Computing label dictionary. Progress:


0it [00:00, ?it/s]
121it [00:00, 9738.10it/s]

2024-11-04 03:43:05,204 Dictionary created for label 'ner' with 11 values: SKILL (seen 1857 times), JOB (seen 400 times), WORK (seen 366 times), COMPANY (seen 297 times), LOC (seen 167 times), UNI (seen 120 times), DEG (seen 117 times), NAME (seen 116 times), STUDY (seen 111 times), PHONE (seen 109 times), EMAIL (seen 89 times)





<flair.data.Dictionary at 0x791a6cab2e90>

In [13]:
from collections import Counter

def count_labels(file_path):
    with open(file_path, 'r') as file:
        labels = [line.split()[-1] for line in file if line.strip()]
    return Counter(labels)

print("Train label distribution:", count_labels('flair_train.txt'))
print("Test label distribution:", count_labels('flair_test.txt'))

Train label distribution: Counter({'O': 59880, 'B-SKILL': 2061, 'I-SKILL': 1028, 'PER': 1021, 'I-JOB': 642, 'I-COMPANY': 496, 'B-JOB': 452, 'I-DEG': 341, 'B-COMPANY': 337, 'I-UNI': 238, 'B-LOC': 190, 'I-PHONE': 165, 'B-UNI': 137, 'I-NAME': 136, 'B-DEG': 130, 'B-NAME': 129, 'B-PHONE': 122, 'B-EMAIL': 99, 'I-LOC': 91, 'I-EMAIL': 1})
Test label distribution: Counter({'O': 16372, 'B-SKILL': 587, 'I-SKILL': 265, 'PER': 250, 'I-JOB': 156, 'I-COMPANY': 116, 'B-JOB': 111, 'I-DEG': 94, 'B-COMPANY': 74, 'I-UNI': 72, 'I-PHONE': 50, 'B-UNI': 41, 'B-LOC': 41, 'B-DEG': 36, 'I-NAME': 34, 'B-NAME': 33, 'B-PHONE': 31, 'B-EMAIL': 20, 'I-LOC': 15})


In [9]:
# create NER tagger
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger

embeddings = StackedEmbeddings([
                WordEmbeddings('glove'),
                WordEmbeddings('en-crawl')
            ])

tagger = SequenceTagger(hidden_size=256,
                         embeddings=embeddings,
                         tag_dictionary=tag_dictionary,
                         tag_type='ner',
                         use_crf=True)

2024-11-04 03:43:10,048 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpwqgl4wuu


100%|██████████| 153M/153M [00:06<00:00, 23.6MB/s]

2024-11-04 03:43:16,920 copying /tmp/tmpwqgl4wuu to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2024-11-04 03:43:17,424 removing temp file /tmp/tmpwqgl4wuu
2024-11-04 03:43:17,573 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpjmdpn149


100%|██████████| 20.5M/20.5M [00:01<00:00, 20.8MB/s]

2024-11-04 03:43:18,690 copying /tmp/tmpjmdpn149 to cache at /root/.flair/embeddings/glove.gensim
2024-11-04 03:43:18,711 removing temp file /tmp/tmpjmdpn149





2024-11-04 03:43:23,614 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpnlztex0_


100%|██████████| 1.12G/1.12G [00:40<00:00, 29.7MB/s]

2024-11-04 03:44:04,205 copying /tmp/tmpnlztex0_ to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M.vectors.npy





2024-11-04 03:44:11,663 removing temp file /tmp/tmpnlztex0_
2024-11-04 03:44:12,177 https://flair.informatik.hu-berlin.de/resources/embeddings/token/en-fasttext-crawl-300d-1M not found in cache, downloading to /tmp/tmpw72e7loh


100%|██████████| 37.5M/37.5M [00:01<00:00, 28.2MB/s]

2024-11-04 03:44:13,843 copying /tmp/tmpw72e7loh to cache at /root/.flair/embeddings/en-fasttext-crawl-300d-1M





2024-11-04 03:44:13,943 removing temp file /tmp/tmpw72e7loh
2024-11-04 03:44:30,170 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL


In [10]:
# train flair ner model
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

trainer = ModelTrainer(tagger, corpus)

trainer.train(
    base_path='flair_output/',
    learning_rate=0.01,
    mini_batch_size=8,
    max_epochs=10,
    patience=3,
    embeddings_storage_mode='gpu',
    use_amp=True,  # Use mixed precision training
    train_with_dev=False
)
!cp -r ./flair_output /content/drive/MyDrive/FYP/Implementation/

2024-11-04 03:44:34,955 ----------------------------------------------------------------------------------------------------
2024-11-04 03:44:34,958 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): WordEmbeddings(
      'en-crawl'
      (embedding): Embedding(1000001, 300)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=400, out_features=400, bias=True)
  (rnn): LSTM(400, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=47, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2024-11-04 03:44:34,960 ----------------------------------------------------------------------------------------------------
2024-11-04 03:44:34,963 Corpus: 121 train + 14 dev + 34 test sentences
2024-11-04 03:44:34,965 ---------------------------------

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-11-04 03:47:02,787 epoch 1 - iter 1/16 - loss 4.24561175 - time (sec): 147.79 - samples/sec: 23.13 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:48:49,058 epoch 1 - iter 2/16 - loss 4.20385731 - time (sec): 254.06 - samples/sec: 25.87 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:50:17,005 epoch 1 - iter 3/16 - loss 4.15725253 - time (sec): 342.01 - samples/sec: 31.12 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:53:45,371 epoch 1 - iter 4/16 - loss 4.11750967 - time (sec): 550.38 - samples/sec: 27.91 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:56:21,927 epoch 1 - iter 5/16 - loss 4.08077179 - time (sec): 706.93 - samples/sec: 28.71 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:58:18,277 epoch 1 - iter 6/16 - loss 4.05052744 - time (sec): 823.28 - samples/sec: 28.35 - lr: 0.010000 - momentum: 0.000000
2024-11-04 03:59:50,855 epoch 1 - iter 7/16 - loss 4.02122733 - time (sec): 915.86 - samples/sec: 29.17 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:01:21,

100%|██████████| 1/1 [00:08<00:00,  8.94s/it]

2024-11-04 04:17:19,816 DEV : loss 2.886380195617676 - f1-score (micro avg)  0.0037
2024-11-04 04:17:19,822  - 0 epochs without improvement
2024-11-04 04:17:19,824 saving best model





2024-11-04 04:17:42,758 ----------------------------------------------------------------------------------------------------
2024-11-04 04:21:02,429 epoch 2 - iter 1/16 - loss 2.89675857 - time (sec): 199.67 - samples/sec: 19.12 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:23:38,384 epoch 2 - iter 2/16 - loss 2.91259884 - time (sec): 355.62 - samples/sec: 21.97 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:25:46,286 epoch 2 - iter 3/16 - loss 2.86567103 - time (sec): 483.52 - samples/sec: 24.84 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:28:53,423 epoch 2 - iter 4/16 - loss 2.80089506 - time (sec): 670.66 - samples/sec: 24.35 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:30:27,773 epoch 2 - iter 5/16 - loss 2.77513204 - time (sec): 765.01 - samples/sec: 26.80 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:32:58,456 epoch 2 - iter 6/16 - loss 2.73163978 - time (sec): 915.69 - samples/sec: 27.65 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:33:40,963 epoch 2 - i

100%|██████████| 1/1 [00:15<00:00, 15.79s/it]

2024-11-04 04:47:37,237 DEV : loss 1.163938283920288 - f1-score (micro avg)  0.0078
2024-11-04 04:47:37,244  - 0 epochs without improvement
2024-11-04 04:47:37,245 saving best model





2024-11-04 04:47:47,020 ----------------------------------------------------------------------------------------------------
2024-11-04 04:50:53,721 epoch 3 - iter 1/16 - loss 1.50359719 - time (sec): 186.70 - samples/sec: 27.96 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:53:04,174 epoch 3 - iter 2/16 - loss 1.47712211 - time (sec): 317.15 - samples/sec: 28.49 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:54:09,718 epoch 3 - iter 3/16 - loss 1.46607596 - time (sec): 382.70 - samples/sec: 30.26 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:54:56,624 epoch 3 - iter 4/16 - loss 1.43397589 - time (sec): 429.60 - samples/sec: 34.06 - lr: 0.010000 - momentum: 0.000000
2024-11-04 04:57:36,041 epoch 3 - iter 5/16 - loss 1.31390673 - time (sec): 589.02 - samples/sec: 34.21 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:00:56,546 epoch 3 - iter 6/16 - loss 1.25346770 - time (sec): 789.52 - samples/sec: 31.64 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:04:33,712 epoch 3 - i

100%|██████████| 1/1 [00:15<00:00, 15.41s/it]

2024-11-04 05:17:54,272 DEV : loss 0.5864046812057495 - f1-score (micro avg)  0.0
2024-11-04 05:17:54,278  - 1 epochs without improvement
2024-11-04 05:17:54,281 ----------------------------------------------------------------------------------------------------





2024-11-04 05:19:29,413 epoch 4 - iter 1/16 - loss 0.98883055 - time (sec): 95.13 - samples/sec: 40.29 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:22:30,475 epoch 4 - iter 2/16 - loss 0.93724463 - time (sec): 276.19 - samples/sec: 27.17 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:24:51,202 epoch 4 - iter 3/16 - loss 0.91371509 - time (sec): 416.92 - samples/sec: 25.04 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:27:40,114 epoch 4 - iter 4/16 - loss 0.85623964 - time (sec): 585.83 - samples/sec: 25.61 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:32:04,553 epoch 4 - iter 5/16 - loss 0.79882023 - time (sec): 850.27 - samples/sec: 24.45 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:33:54,838 epoch 4 - iter 6/16 - loss 0.79121752 - time (sec): 960.55 - samples/sec: 25.55 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:35:20,831 epoch 4 - iter 7/16 - loss 0.80159090 - time (sec): 1046.55 - samples/sec: 26.83 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:38:37,

100%|██████████| 1/1 [00:14<00:00, 14.96s/it]

2024-11-04 05:50:25,645 DEV : loss 0.5455049872398376 - f1-score (micro avg)  0.0
2024-11-04 05:50:25,653  - 2 epochs without improvement
2024-11-04 05:50:25,656 ----------------------------------------------------------------------------------------------------





2024-11-04 05:54:39,341 epoch 5 - iter 1/16 - loss 0.71861263 - time (sec): 253.68 - samples/sec: 20.29 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:58:16,218 epoch 5 - iter 2/16 - loss 0.72913270 - time (sec): 470.56 - samples/sec: 21.54 - lr: 0.010000 - momentum: 0.000000
2024-11-04 05:59:40,410 epoch 5 - iter 3/16 - loss 0.76336280 - time (sec): 554.75 - samples/sec: 23.55 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:01:54,139 epoch 5 - iter 4/16 - loss 0.76852947 - time (sec): 688.48 - samples/sec: 24.81 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:02:50,836 epoch 5 - iter 5/16 - loss 0.77816590 - time (sec): 745.18 - samples/sec: 26.90 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:04:12,603 epoch 5 - iter 6/16 - loss 0.79308266 - time (sec): 826.94 - samples/sec: 28.15 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:06:16,135 epoch 5 - iter 7/16 - loss 0.77413488 - time (sec): 950.48 - samples/sec: 28.64 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:07:26,

100%|██████████| 1/1 [00:15<00:00, 15.35s/it]

2024-11-04 06:20:23,279 DEV : loss 0.5169975161552429 - f1-score (micro avg)  0.0
2024-11-04 06:20:23,286  - 3 epochs without improvement
2024-11-04 06:20:23,289 ----------------------------------------------------------------------------------------------------





2024-11-04 06:21:59,676 epoch 6 - iter 1/16 - loss 0.63143510 - time (sec): 96.38 - samples/sec: 42.06 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:24:10,297 epoch 6 - iter 2/16 - loss 0.76523626 - time (sec): 227.00 - samples/sec: 31.42 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:24:56,471 epoch 6 - iter 3/16 - loss 0.78406132 - time (sec): 273.18 - samples/sec: 37.09 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:28:53,591 epoch 6 - iter 4/16 - loss 0.70313423 - time (sec): 510.30 - samples/sec: 30.89 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:31:30,310 epoch 6 - iter 5/16 - loss 0.69418092 - time (sec): 667.02 - samples/sec: 30.84 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:33:05,435 epoch 6 - iter 6/16 - loss 0.67864614 - time (sec): 762.14 - samples/sec: 32.19 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:33:58,896 epoch 6 - iter 7/16 - loss 0.69829059 - time (sec): 815.60 - samples/sec: 33.79 - lr: 0.010000 - momentum: 0.000000
2024-11-04 06:36:02,3

100%|██████████| 1/1 [00:16<00:00, 16.29s/it]

2024-11-04 06:48:41,419 DEV : loss 0.49467983841896057 - f1-score (micro avg)  0.0
2024-11-04 06:48:41,427  - 4 epochs without improvement (above 'patience')-> annealing learning_rate to [0.005]
2024-11-04 06:48:41,429 ----------------------------------------------------------------------------------------------------





2024-11-04 06:50:45,131 epoch 7 - iter 1/16 - loss 0.75674191 - time (sec): 123.70 - samples/sec: 33.33 - lr: 0.005000 - momentum: 0.000000
2024-11-04 06:53:19,316 epoch 7 - iter 2/16 - loss 0.70952858 - time (sec): 277.88 - samples/sec: 31.51 - lr: 0.005000 - momentum: 0.000000
2024-11-04 06:54:23,159 epoch 7 - iter 3/16 - loss 0.67816765 - time (sec): 341.72 - samples/sec: 37.30 - lr: 0.005000 - momentum: 0.000000
2024-11-04 06:57:09,859 epoch 7 - iter 4/16 - loss 0.69375058 - time (sec): 508.42 - samples/sec: 31.89 - lr: 0.005000 - momentum: 0.000000
2024-11-04 06:58:59,828 epoch 7 - iter 5/16 - loss 0.69965707 - time (sec): 618.39 - samples/sec: 32.32 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:00:01,583 epoch 7 - iter 6/16 - loss 0.70990740 - time (sec): 680.15 - samples/sec: 33.15 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:01:19,336 epoch 7 - iter 7/16 - loss 0.70475105 - time (sec): 757.90 - samples/sec: 34.72 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:05:04,

100%|██████████| 1/1 [00:15<00:00, 15.98s/it]

2024-11-04 07:18:50,925 DEV : loss 0.48347175121307373 - f1-score (micro avg)  0.0
2024-11-04 07:18:50,933  - 1 epochs without improvement
2024-11-04 07:18:50,935 ----------------------------------------------------------------------------------------------------





2024-11-04 07:22:58,734 epoch 8 - iter 1/16 - loss 0.51902622 - time (sec): 247.80 - samples/sec: 21.05 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:25:30,332 epoch 8 - iter 2/16 - loss 0.59302905 - time (sec): 399.39 - samples/sec: 25.09 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:27:02,967 epoch 8 - iter 3/16 - loss 0.62644386 - time (sec): 492.03 - samples/sec: 28.47 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:28:05,828 epoch 8 - iter 4/16 - loss 0.66745544 - time (sec): 554.89 - samples/sec: 30.20 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:30:39,141 epoch 8 - iter 5/16 - loss 0.67267907 - time (sec): 708.20 - samples/sec: 28.98 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:31:20,276 epoch 8 - iter 6/16 - loss 0.70610005 - time (sec): 749.34 - samples/sec: 30.80 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:35:05,544 epoch 8 - iter 7/16 - loss 0.67071468 - time (sec): 974.60 - samples/sec: 30.20 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:36:37,

100%|██████████| 1/1 [00:16<00:00, 16.63s/it]

2024-11-04 07:48:32,122 DEV : loss 0.4754562973976135 - f1-score (micro avg)  0.0
2024-11-04 07:48:32,128  - 2 epochs without improvement
2024-11-04 07:48:32,130 ----------------------------------------------------------------------------------------------------





2024-11-04 07:50:54,733 epoch 9 - iter 1/16 - loss 0.78217304 - time (sec): 142.60 - samples/sec: 21.88 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:52:25,148 epoch 9 - iter 2/16 - loss 0.74886855 - time (sec): 233.01 - samples/sec: 28.30 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:55:22,652 epoch 9 - iter 3/16 - loss 0.68635746 - time (sec): 410.52 - samples/sec: 28.21 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:57:41,884 epoch 9 - iter 4/16 - loss 0.69577857 - time (sec): 549.75 - samples/sec: 28.19 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:58:27,495 epoch 9 - iter 5/16 - loss 0.70221662 - time (sec): 595.36 - samples/sec: 30.96 - lr: 0.005000 - momentum: 0.000000
2024-11-04 07:59:52,798 epoch 9 - iter 6/16 - loss 0.69123685 - time (sec): 680.66 - samples/sec: 31.51 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:00:45,360 epoch 9 - iter 7/16 - loss 0.69256631 - time (sec): 733.23 - samples/sec: 33.94 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:02:58,

100%|██████████| 1/1 [00:15<00:00, 15.86s/it]

2024-11-04 08:20:39,491 DEV : loss 0.4680645763874054 - f1-score (micro avg)  0.0
2024-11-04 08:20:39,501  - 3 epochs without improvement
2024-11-04 08:20:39,507 ----------------------------------------------------------------------------------------------------





2024-11-04 08:21:33,273 epoch 10 - iter 1/16 - loss 0.72782728 - time (sec): 53.76 - samples/sec: 55.02 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:23:22,337 epoch 10 - iter 2/16 - loss 0.70125997 - time (sec): 162.83 - samples/sec: 42.01 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:25:01,589 epoch 10 - iter 3/16 - loss 0.65062039 - time (sec): 262.08 - samples/sec: 43.41 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:26:28,776 epoch 10 - iter 4/16 - loss 0.66973686 - time (sec): 349.27 - samples/sec: 42.63 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:28:34,221 epoch 10 - iter 5/16 - loss 0.66638529 - time (sec): 474.71 - samples/sec: 37.91 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:29:42,935 epoch 10 - iter 6/16 - loss 0.67775553 - time (sec): 543.42 - samples/sec: 39.16 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:32:43,317 epoch 10 - iter 7/16 - loss 0.68461235 - time (sec): 723.81 - samples/sec: 35.14 - lr: 0.005000 - momentum: 0.000000
2024-11-04 08:

100%|██████████| 1/1 [00:15<00:00, 15.69s/it]

2024-11-04 08:50:21,191 DEV : loss 0.4630123972892761 - f1-score (micro avg)  0.0
2024-11-04 08:50:21,199  - 4 epochs without improvement (above 'patience')-> annealing learning_rate to [0.0025]





2024-11-04 08:50:43,503 ----------------------------------------------------------------------------------------------------
2024-11-04 08:50:43,507 Loading model from best epoch ...
2024-11-04 08:51:02,154 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>


100%|██████████| 1/1 [00:06<00:00,  6.83s/it]

2024-11-04 08:51:09,844 
Results:
- F-score (micro) 0.0098
- F-score (macro) 0.002
- Accuracy 0.0052

By class:
              precision    recall  f1-score   support

         LOC     0.0114    0.2439    0.0217        41
       SKILL     0.0000    0.0000    0.0000       587
         JOB     0.0000    0.0000    0.0000       111
     COMPANY     0.0000    0.0000    0.0000        74
        WORK     0.0000    0.0000    0.0000       100
       PHONE     0.0000    0.0000    0.0000        31
         UNI     0.0000    0.0000    0.0000        41
       STUDY     0.0000    0.0000    0.0000        36
         DEG     0.0000    0.0000    0.0000        36
        NAME     0.0000    0.0000    0.0000        33
       EMAIL     0.0000    0.0000    0.0000        20

   micro avg     0.0107    0.0090    0.0098      1110
   macro avg     0.0010    0.0222    0.0020      1110
weighted avg     0.0004    0.0090    0.0008      1110

2024-11-04 08:51:09,845 ---------------------------------------------------




In [11]:
# evaluate model
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Load the trained model
model = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt')

# Evaluate the model on the test set
result = model.evaluate(corpus.test, gold_label_type='ner', mini_batch_size=32)

# Print the results
# print("Evaluation Loss:", eval_loss)
print(result.detailed_results)  # print the precision, recall, and F1-score per entity type

2024-11-04 08:53:14,438 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>


100%|██████████| 2/2 [00:09<00:00,  4.79s/it]


Results:
- F-score (micro) 0.0098
- F-score (macro) 0.002
- Accuracy 0.0052

By class:
              precision    recall  f1-score   support

         LOC     0.0114    0.2439    0.0217        41
       SKILL     0.0000    0.0000    0.0000       587
         JOB     0.0000    0.0000    0.0000       111
     COMPANY     0.0000    0.0000    0.0000        74
        WORK     0.0000    0.0000    0.0000       100
       PHONE     0.0000    0.0000    0.0000        31
         UNI     0.0000    0.0000    0.0000        41
       STUDY     0.0000    0.0000    0.0000        36
         DEG     0.0000    0.0000    0.0000        36
        NAME     0.0000    0.0000    0.0000        33
       EMAIL     0.0000    0.0000    0.0000        20

   micro avg     0.0107    0.0090    0.0098      1110
   macro avg     0.0010    0.0222    0.0020      1110
weighted avg     0.0004    0.0090    0.0008      1110






In [12]:
# make prediction
import flair
model = SequenceTagger.load('/content/drive/MyDrive/FYP/Implementation/flair_output/best-model.pt')
resume_text = '''
John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.

John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'
'''
sentence = flair.data.Sentence(resume_text.lower())

model.predict(sentence)

print(sentence.to_tagged_string())

2024-11-04 08:53:50,822 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>
Sentence[326]: " john doe lives at 1234 elm street in los angeles, ca 90001. he can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. john is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of javascript, python, and cloud technologies like aws and azure. currently, he works as a software engineer at google llc in san francisco, ca, where he has been employed since august 2019. in this role, he has developed scalable web applications