<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Train NER Models

In [1]:
# mount drive

from google.colab import drive

drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!mkdir spacy_ner_data

## Spacy NER

In [None]:
# load json and convert into spacy format



import json

import random

from sklearn.model_selection import train_test_split

import spacy

from spacy.tokens import DocBin



# load JSON data from Drive

with open('/content/drive/MyDrive/FYP/Implementation/Resume Dataset/200_resumes_annotated.json', "r") as f:

    data = json.load(f)



# remove overlapped entities (one word has >1 entitiy)

def remove_overlapping_entities(entities):

    """Remove overlapping entities from the list."""

    # sort entities by start position

    entities = sorted(entities, key=lambda x: x[0])

    non_overlapping = []

    last_end = -1

    for start, end, label in entities:

        # only add to list if there's no overlap with the previous entity

        if start >= last_end:

            non_overlapping.append((start, end, label))

            last_end = end

    return non_overlapping



# convert JSON data to Spacy's DocBin format

def convert_to_spacy_format(data):

    # load a blank Spacy model

    nlp = spacy.blank("en")

    # container for our docs

    doc_bin = DocBin()



    for item in data:

        # full document text

        text = item['data']['Text']

        entities = []



        for annotation in item['annotations'][0]['result']:

            start = annotation['value']['start']

            end = annotation['value']['end']

            label = annotation['value']['labels'][0]

            entities.append((start, end, label))



        # remove overlapping entities

        entities = remove_overlapping_entities(entities)

        # create a Spacy doc and add entities to it

        doc = nlp.make_doc(text)

        spans = [doc.char_span(start, end, label=label) for start, end, label in entities]

        # filter out None spans if Spacy can't align the character indices with tokens

        spans = [span for span in spans if span is not None]

        # assign entities to the doc

        doc.ents = spans

        doc_bin.add(doc)



    return doc_bin



# split data into train and test sets

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)



# convert train and test sets to Spacy format

train_doc_bin = convert_to_spacy_format(train_data)

test_doc_bin = convert_to_spacy_format(test_data)



# save the train and test data to .spacy files in current runtime

train_doc_bin.to_disk("spacy_ner_data/train_data.spacy")

test_doc_bin.to_disk("spacy_ner_data/test_data.spacy")

In [None]:
# check the distribution of entitiy labels



import spacy

from spacy.lang.en import English

from spacy.tokens import DocBin



# count number of label entities

def count_entity_labels(file_path):

    # load data from file_path and create DocBin

    doc_bin = DocBin().from_disk(file_path)

    label_counts = {}

    for doc in doc_bin.get_docs(English().vocab):

        # count occurence of label

        for ent in doc.ents:

            label = ent.label_

            label_counts[label] = label_counts.get(label, 0) + 1

    return label_counts



# calculate and print label distribution in train and test data

# sorted from largest to smallest

train_label_counts = count_entity_labels("spacy_ner_data/train_data.spacy")

sorted_train_label_counts = sorted(train_label_counts.items(), key=lambda x: x[1], reverse=True)

print("Train Data Entity Label Distribution:")

for label, count in sorted_train_label_counts:

    print(f"{label}: {count}")



test_label_counts = count_entity_labels("spacy_ner_data/test_data.spacy")

sorted_test_label_counts = sorted(test_label_counts.items(), key=lambda x: x[1], reverse=True)

print("\nTest Data Entity Label Distribution:")

for label, count in sorted_test_label_counts:

    print(f"{label}: {count}")

In [None]:
# create base_config.cfg and paste the config generated from spacy widget

# need to update train and test file path

!touch base_config.cfg

In [None]:
# generate config.cfg from base_config.cfg

!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# vector used for spacy ner

!python -m spacy download en_core_web_lg

In [None]:
# train model using hyperparameters set in config.cfg

# save trained model in spacy-output/ dir



# using cpu

# !python -m spacy train config.cfg --output ./spacy_output



# using gpu

!python -m spacy train config.cfg --gpu-id 0 --output ./spacy_output



# save output dir into drive

!cp -r ./spacy_output /content/drive/MyDrive/FYP/Implementation/

In [None]:
# evaluate trained model performance

# store output and visualization into result/ dir

!python -m spacy evaluate spacy_output/model-best spacy_ner_data/test_data.spacy -dp spacy_output

In [None]:
# make prediction



import spacy

import string



resume_text = '''

John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.



John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'

'''



# convert text into small letter then remove punctuation

resume_text = resume_text.lower()

resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))



# load trained model

nlp = spacy.load("spacy_output/model-best")



# create a Spacy doc and add text to it

doc = nlp(resume_text.lower())



# load Spacy doc's entities

print(doc.ents)



# print predicted entities in text

for ent in doc.ents:

    print(f"{ent.text}: {ent.label_}")

In [None]:
# visualize predicted entities using displacy



from spacy import displacy

displacy.render(doc, style="ent", jupyter=True)

## Flair NER

In [1]:
# install flair library

!pip install flair

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting gdown>=4.4.0 (from flair)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting segtok>=1.5.11 (from flair)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0

In [2]:
# convert json into flair data



import json

import random

from typing import List, Dict, Tuple

import spacy

from collections import defaultdict



class NERConverter:

    def __init__(self):

        # load pretrained model from Spacy library

        # to create Spacy Doc object

        self.nlp = spacy.load("en_core_web_sm")



    # get BIOES label based on location of word

    def get_bioes_label(self, token_index: int, entity_length: int, current_position: int, label: str) -> str:

        """

        Convert to BIOES format

        - S-: Single token entity

        - B-: Beginning of multi-token entity

        - I-: Inside of multi-token entity

        - E-: End of multi-token entity

        - O: Outside

        """

        if entity_length == 1:

            return f'S-{label}'

        if current_position == 0:

            return f'B-{label}'

        if current_position == entity_length - 1:

            return f'E-{label}'

        return f'I-{label}'



    # convert Label Studio's exported annotations in json format intto BIOES format

    def convert_to_bioes_format(self, json_data: List[dict]) -> List[List[Tuple[str, str]]]:

        """Convert JSON annotations to BIOES format."""

        all_sentences = []



        # process all annotation in json file

        for item in json_data:

            text = item['data']['Text']

            doc = self.nlp(text)



            # initialize character-level labels

            char_labels = ['O'] * len(text)



            # first pass: identify entity boundaries and lengths

            entity_spans = []

            if item['annotations'] and len(item['annotations']) > 0:

                for ann in item['annotations'][0]['result']:

                    if 'value' in ann:

                        start = ann['value']['start']

                        end = ann['value']['end']

                        label = ann['value']['labels'][0]

                        entity_spans.append((start, end, label))



            # sort spans by start position

            entity_spans.sort(key=lambda x: x[0])



            # second pass: apply BIOES labels

            for start, end, label in entity_spans:

                # get tokens that are part of this entity

                entity_text = text[start:end]

                entity_doc = self.nlp(entity_text)

                entity_length = len([token for token in entity_doc if not token.is_space])



                # set labels for the entire span

                current_token_idx = 0

                for i in range(start, end):

                    if i == start or text[i-1].isspace():

                        char_labels[i] = self.get_bioes_label(i, entity_length, current_token_idx, label)

                        current_token_idx += 1

                    else:

                        char_labels[i] = char_labels[i-1]



            # convert to token-level labels

            current_sentence = []

            for sent in doc.sents:

                for token in sent:

                    # get the most common label for the token's characters

                    token_chars_labels = char_labels[token.idx:token.idx + len(token.text)]

                    label_counts = defaultdict(int)

                    for char_label in token_chars_labels:

                        label_counts[char_label] += 1



                    token_label = max(label_counts.items(), key=lambda x: x[1])[0]

                    current_sentence.append((token.text, token_label))



                if current_sentence:

                    all_sentences.append(current_sentence)

                    current_sentence = []



        return all_sentences



    # write data in BIOES format into txt file

    def write_flair_file(self, sentences: List[List[Tuple[str, str]]], filename: str):

        """Write sentences in BIOES format to file."""

        with open(filename, 'w', encoding='utf-8') as f:

            for sentence in sentences:

                for token, label in sentence:

                    f.write(f'{token} {label}\n')

                f.write('\n')



    # convert json data into BIOES data

    # split BIOES data into train and test

    def convert_and_split(self, json_data: List[dict], train_file: str, test_file: str, test_ratio: float = 0.2):

        """Convert JSON to BIOES format and split into train/test sets."""

        all_sentences = self.convert_to_bioes_format(json_data)



        # shuffle and split based on test_ratio

        random.shuffle(all_sentences)

        split_idx = int(len(all_sentences) * (1 - test_ratio))



        # use list slicing to split

        train_sentences = all_sentences[:split_idx]

        test_sentences = all_sentences[split_idx:]



        # write to txt files

        self.write_flair_file(train_sentences, train_file)

        self.write_flair_file(test_sentences, test_file)



        return len(train_sentences), len(test_sentences)





# load JSON data

with open('/kaggle/input/ner-dataset-507/756_resumes_annotated.json', 'r', encoding='utf-8') as f:

  json_data = json.load(f)



# load self-defined convert class

converter = NERConverter()


# convert json data into BIOES data and split into train and test

train_count, test_count = converter.convert_and_split(

    json_data,

    train_file='flair_train.txt',

    test_file='flair_test.txt',

    test_ratio=0.2

)

print(f'Created {train_count} training sentences and {test_count} test sentences')

Created 1086 training sentences and 272 test sentences


In [3]:
from flair.data import Corpus

from flair.datasets import ColumnCorpus



# define columns for CoNLL (0: word, 1: label)

columns = {0: 'text', 1: 'ner'}



# set data folder and train and test path

data_folder = './'

train_file = 'flair_train.txt'

test_file = 'flair_test.txt'



# load the corpus

corpus: Corpus = ColumnCorpus(data_folder, columns,

                              train_file=train_file,

                              test_file=test_file,

                              dev_file=None)

2024-12-03 11:33:39,200 Reading data from .
2024-12-03 11:33:39,201 Train: flair_train.txt
2024-12-03 11:33:39,201 Dev: None
2024-12-03 11:33:39,202 Test: flair_test.txt
2024-12-03 11:33:43,869 No dev split found. Using 10% (i.e. 109 samples) of the train split as dev data


In [4]:
# generate a dictionary of unique labels from the NER corpus.

# this dictionary maps each named entity label in the dataset to an integer ID.

tag_dictionary = corpus.make_label_dictionary(label_type='ner')

print("Labels:", tag_dictionary.get_items())

2024-12-03 11:34:07,724 Computing label dictionary. Progress:


0it [00:00, ?it/s]
977it [00:00, 16930.89it/s]

2024-12-03 11:34:07,813 Dictionary created for label 'ner' with 11 values: SKILL (seen 8458 times), JOB (seen 2509 times), WORK (seen 1738 times), LOC (seen 1704 times), COMPANY (seen 1533 times), UNI (seen 756 times), DEG (seen 675 times), NAME (seen 542 times), STUDY (seen 528 times), PHONE (seen 525 times), EMAIL (seen 455 times)
Labels: ['SKILL', 'JOB', 'WORK', 'LOC', 'COMPANY', 'UNI', 'DEG', 'NAME', 'STUDY', 'PHONE', 'EMAIL']





In [5]:
from collections import Counter



# count frequency of each entity label

def count_labels(file_path):

    with open(file_path, 'r') as file:

        labels = [line.split()[-1] for line in file if line.strip()]

    return Counter(labels)



# number of

print("Train label distribution:", count_labels('flair_train.txt'))

print("Test label distribution:", count_labels('flair_test.txt'))

Train label distribution: Counter({'O': 264175, 'S-SKILL': 5290, 'PER': 4581, 'B-SKILL': 4370, 'E-SKILL': 4370, 'E-JOB': 2477, 'B-JOB': 2435, 'E-COMPANY': 1355, 'I-JOB': 1353, 'B-COMPANY': 1350, 'S-LOC': 1272, 'I-DEG': 1210, 'I-COMPANY': 939, 'I-SKILL': 881, 'E-UNI': 809, 'B-UNI': 807, 'B-DEG': 734, 'E-DEG': 734, 'I-UNI': 697, 'B-LOC': 614, 'E-LOC': 614, 'B-NAME': 605, 'E-NAME': 605, 'S-EMAIL': 514, 'E-PHONE': 473, 'B-PHONE': 468, 'I-PHONE': 352, 'S-COMPANY': 341, 'S-JOB': 322, 'S-PHONE': 108, 'I-NAME': 18, 'I-LOC': 16, 'S-DEG': 14, 'S-UNI': 14, 'B-EMAIL': 2, 'E-EMAIL': 2, 'S-NAME': 2})
Test label distribution: Counter({'O': 57294, 'S-SKILL': 1245, 'PER': 1107, 'B-SKILL': 985, 'E-SKILL': 981, 'E-JOB': 627, 'B-JOB': 623, 'I-JOB': 354, 'B-COMPANY': 327, 'E-COMPANY': 324, 'S-LOC': 287, 'I-DEG': 249, 'I-COMPANY': 228, 'I-SKILL': 190, 'B-UNI': 156, 'E-UNI': 155, 'B-DEG': 152, 'E-DEG': 152, 'B-LOC': 151, 'E-LOC': 151, 'B-NAME': 142, 'E-NAME': 142, 'I-UNI': 122, 'S-EMAIL': 121, 'E-PHONE': 106

In [6]:
# create NER tagger
from flair.embeddings import WordEmbeddings, StackedEmbeddings, TransformerWordEmbeddings, FlairEmbeddings
from flair.models import SequenceTagger
import torch.nn as nn

# 1. using LSTM-CRF on top of frozen embeddings
# combine flair and glove embeddings
embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),  # Use faster, lighter versions
    FlairEmbeddings('news-backward-fast')
])
    
    # 2. Configure tagger with memory and performance optimizations
tagger = SequenceTagger(
    hidden_size=64,  # Reduced hidden size to save memory
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=True,
    tag_format='BIOES',
    dropout=0.3,  # Moderate dropout
    rnn_layers=1,  # Single layer to reduce memory usage
)

2024-12-03 11:34:14,564 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp91mu1yh1


100%|██████████| 153M/153M [00:14<00:00, 10.8MB/s] 

2024-12-03 11:34:29,791 copying /tmp/tmp91mu1yh1 to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2024-12-03 11:34:29,904 removing temp file /tmp/tmp91mu1yh1
2024-12-03 11:34:30,423 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmphz92a8ou


100%|██████████| 20.5M/20.5M [00:01<00:00, 11.2MB/s]

2024-12-03 11:34:32,960 copying /tmp/tmphz92a8ou to cache at /root/.flair/embeddings/glove.gensim
2024-12-03 11:34:32,977 removing temp file /tmp/tmphz92a8ou





2024-12-03 11:34:53,045 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp1vdwczie


100%|██████████| 18.8M/18.8M [00:01<00:00, 11.6MB/s]

2024-12-03 11:34:55,125 copying /tmp/tmp1vdwczie to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2024-12-03 11:34:55,141 removing temp file /tmp/tmp1vdwczie





2024-12-03 11:34:56,012 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp61y2fady


100%|██████████| 18.8M/18.8M [00:01<00:00, 11.6MB/s]

2024-12-03 11:34:58,087 copying /tmp/tmp61y2fady to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2024-12-03 11:34:58,102 removing temp file /tmp/tmp61y2fady





2024-12-03 11:34:58,187 SequenceTagger predicts: Dictionary with 45 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-LOC, B-LOC, E-LOC, I-LOC, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL


In [None]:
# train flair ner model

from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
import torch

# define ModelTrained based on tagger and corpus
trainer = ModelTrainer(tagger, corpus)

# train Flair NER Model
trainer.train(
    base_path='flair_output/',
    learning_rate=0.1,
    mini_batch_size=8,  # Slightly increased batch size
    max_epochs=20,      # Reduced epochs with early stopping
    patience=3,         # Early stopping
    train_with_dev=True,
    save_final_model=True,
    use_amp=True,       # Mixed precision training
)

# save trained model to drive
!cp -r ./flair_output /kaggle/working/

2024-12-03 11:35:43,281 ----------------------------------------------------------------------------------------------------
2024-12-03 11:35:43,282 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
      )
    )
  )
  (dropout): Dropout(p=0.3, inplace=False)
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2148, out_features=2148, bias=True)
  (rnn): LSTM(2148, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=

  scaler = torch.cuda.amp.GradScaler(enabled=use_amp and flair.device.type != "cpu")


2024-12-03 11:36:12,060 epoch 1 - iter 13/136 - loss 4.55725944 - time (sec): 28.75 - samples/sec: 1062.44 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:36:37,433 epoch 1 - iter 26/136 - loss 2.73891578 - time (sec): 54.13 - samples/sec: 1163.74 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:37:01,490 epoch 1 - iter 39/136 - loss 2.13621505 - time (sec): 78.18 - samples/sec: 1176.11 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:37:31,060 epoch 1 - iter 52/136 - loss 1.75676127 - time (sec): 107.75 - samples/sec: 1167.10 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:37:56,110 epoch 1 - iter 65/136 - loss 1.58926555 - time (sec): 132.80 - samples/sec: 1142.43 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:38:17,977 epoch 1 - iter 78/136 - loss 1.46072149 - time (sec): 154.67 - samples/sec: 1151.15 - lr: 0.100000 - momentum: 0.000000
2024-12-03 11:39:01,094 epoch 1 - iter 91/136 - loss 1.33339771 - time (sec): 197.79 - samples/sec: 1058.68 - lr: 0.100000 - momentum: 0.00

In [2]:
# evaluate model
from flair.data import Corpus

from flair.datasets import ColumnCorpus

from flair.models import SequenceTagger

from flair.trainers import ModelTrainer



# load the trained model

model = SequenceTagger.load('/kaggle/working/flair_output/final-model.pt')



# evaluate the model on the test set

result = model.evaluate(corpus.test, gold_label_type='ner', mini_batch_size=32)



# print the results

# print("Evaluation Loss:", eval_loss)

print(result.detailed_results)  # print the precision, recall, and F1-score per entity type

Collecting flair
  Downloading flair-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting gdown>=4.4.0 (from flair)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting segtok>=1.5.11 (from flair)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/flair_output/final-model.pt'. Use `repo_type` argument if needed.

In [11]:
# make prediction



from flair.models import SequenceTagger

from flair.data import Sentence

import spacy, string

from spacy import displacy



# load trained Flair NER model

tagger = SequenceTagger.load('/kaggle/working/flair_output/final-model.pt')



resume_text = '''

John Doe lives at 1234 Elm Street in Los Angeles, CA 90001. He can be reached at +1 (555) 123-4567 or via email at john.doe@example.com. John is a results-driven software engineer with over 5 years of experience in web development and cloud infrastructure, with strong knowledge of JavaScript, Python, and cloud technologies like AWS and Azure. Currently, he works as a Software Engineer at Google LLC in San Francisco, CA, where he has been employed since August 2019. In this role, he has developed scalable web applications using JavaScript, Node.js, and React, deployed and maintained cloud infrastructure on AWS, reducing downtime by 20%, and led a team of 4 engineers to enhance backend performance by 30%. Previously, he worked as a Junior Developer at Tech Innovators Inc. in Austin, TX, from July 2017 to July 2019, where he created RESTful APIs using Python and Flask, collaborated with front-end developers to build and deploy user-facing applications, and wrote unit and integration tests, improving code coverage by 15%.



John holds a Master of Science in Computer Science from the University of California, Berkeley, with a graduation date of May 2017, and a Bachelor of Science in Information Technology from the University of Texas at Austin, graduated in May 2015. His skillset includes proficiency in programming languages like Python, JavaScript, and Java; frameworks such as React, Flask, and Django; cloud platforms including AWS, Google Cloud, and Azure; as well as other tools like Git, Docker, Kubernetes, and SQL. He is certified as an AWS Certified Solutions Architect – Associate, earned in 2020, and as a Google Professional Cloud Architect, earned in 2021'

'''
resume_text = resume_text.lower()
resume_text = resume_text.translate(str.maketrans('', '', string.punctuation))


# step 1: predict entities using Flair trained model

sentence = Sentence(resume_text)

tagger.predict(sentence)



# step 2: convert Flair predictions to spaCy doc format

# initialize a blank spaCy NLP pipeline

nlp = spacy.blank("en")

doc = nlp(resume_text)



# extract entities from Flair prediction and convert to spaCy format

ents = []

for entity in sentence.get_spans('ner'):

    start, end = entity.start_position, entity.end_position

    label = entity.tag

    span = doc.char_span(start, end, label=label)

    if span is not None:

        ents.append(span)



# set the entities in the spaCy doc

doc.ents = ents



# step 3: visualization of prediction using displacy

displacy.render(doc, style="ent", jupyter=True)


2024-11-16 15:42:07,008 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-LOC, B-LOC, E-LOC, I-LOC, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>


In [13]:
resume_text_1 = '''
Zi Qing Chew
chewziqing@gmail.com | 016-2892475 | Kuala Lumpur, Malaysia | linkedin.com/in/ziqingchew | github.com/chewzzz1014
EDUCATION

Universiti Putra Malaysia					                                                   Oct 2021 - Current
Bachelor in Computer Science with Honours
Expected to graduate in July 2025. CGPA: 3.99

WORK EXPERIENCE

Ant International 									          	July 2024 – Oct 2024
Java Engineer Intern							                               Kuala Lumpur, Malaysia
Collaborated in developing an audit logging feature for Ant Group’s internal Foreign Exchange (FX) trade strategy system that records changes made by business users to trade strategies.
Conducted comprehensive system analysis and project planning, delivering presentations to project stakeholders and QA teams prior to the development phase.
Utilised Ant Group’s internal frameworks, middleware, and tools to implement the audit logging feature.
Skills: Java, Spring, Sofaboot, Ant Group internal middlewares (ZDAL, DRM, Ant Scheduler, Msg Broker)
Howuku  									          	             Feb 2023 – Sep 2023
Software Developer Intern							                    Kuala Lumpur, Malaysia
Developed and optimized A/B testing features, including code editor and previewer for CSS and JavaScript modifications for experiment variations.
Expanded A/B testing targeting rule by incorporating website visitor's OS, device, and browser rules.
Automated experiment-stopping criteria and email notifications based on user-defined experiment termination conditions.
Collaborated with cross-functional teams to debug, troubleshoot, and enhance Howuku platform features based on user feedback and performance data.
Skills: JavaScript, Bootstrap, Vue.js, Express.js, MySQL

PROJECTS

Personal Portfolio Website (chewzzz1014.github.io/portfolio-website)
Designed, developed and deployed personalised portfolio website featuring skills, selected projects, and downloadable resume.
Skills: JavaScript, React.js, CSS, Bootstrap
Depression Level Detection Chatbot (https://github.com/chewzzz1014/health-ease-project)
Developed machine learning application that evaluates a message's depression level and provided tailored mental health advice and information based on the depression severity.
Skills: Python, pandas, scikit-learn, Keras, FastAPI, Gradio
Clothing Store Website (https://github.com/chewzzz1014/CSC3402-MVC-Project)
Worked in team to build a CRUD Spring Boot application with attractive interfaces, data persistence, authentication and authorisation.
Developed the backend of the application that involves querying the database, building REST endpoints and implementing Thymeleaf in HTML for dynamic contents.
Skills: Spring Boot, Spring MVC, Thymeleaf, Hibernate, Bootstrap

SKILLS
Programming Languages: Java, Python, HTML, CSS, JavaScript, MySQL, OracleSQL
Frameworks and Libraries: Spring, Spring Boot, TypeScript, Node.js, Express.js, React.js, Vue.js, Bootstrap, Tailwind CSS
Tools: Git, Github, Jira, Tableau, Excel, Jupyter Notebook, Google Colab, VSCode, IntelliJ
'''

In [14]:
# make prediction



from flair.models import SequenceTagger

from flair.data import Sentence

import spacy, string

from spacy import displacy



# load trained Flair NER model

tagger = SequenceTagger.load('/kaggle/working/flair_output/final-model.pt')



resume_text_1 = resume_text_1.lower()
resume_text_1 = resume_text_1.translate(str.maketrans('', '', string.punctuation))


# step 1: predict entities using Flair trained model

sentence = Sentence(resume_text_1)

tagger.predict(sentence)



# step 2: convert Flair predictions to spaCy doc format

# initialize a blank spaCy NLP pipeline

nlp = spacy.blank("en")

doc = nlp(resume_text_1)



# extract entities from Flair prediction and convert to spaCy format

ents = []

for entity in sentence.get_spans('ner'):

    start, end = entity.start_position, entity.end_position

    label = entity.tag

    span = doc.char_span(start, end, label=label)

    if span is not None:

        ents.append(span)



# set the entities in the spaCy doc

doc.ents = ents



# step 3: visualization of prediction using displacy

displacy.render(doc, style="ent", jupyter=True)


2024-11-16 15:42:34,569 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-LOC, B-LOC, E-LOC, I-LOC, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>


In [14]:
# confidence of each predicted entity label



from flair.models import SequenceTagger

from flair.data import Sentence



# load the pretrained NER model

tagger = SequenceTagger.load("/kaggle/working/flair_output/final-model.pt")



# create a Sentence object

sentence = Sentence(resume_text)



# predict entities

tagger.predict(sentence)



# print the detected entities

for entity in sentence.get_spans("ner"):

    print(f"Entity: {entity.text}, Type: {entity.get_label('ner').value}, Confidence: {entity.score}")

2024-11-15 10:23:13,820 SequenceTagger predicts: Dictionary with 47 tags: O, S-SKILL, B-SKILL, E-SKILL, I-SKILL, S-JOB, B-JOB, E-JOB, I-JOB, S-WORK, B-WORK, E-WORK, I-WORK, S-COMPANY, B-COMPANY, E-COMPANY, I-COMPANY, S-LOC, B-LOC, E-LOC, I-LOC, S-UNI, B-UNI, E-UNI, I-UNI, S-DEG, B-DEG, E-DEG, I-DEG, S-NAME, B-NAME, E-NAME, I-NAME, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-STUDY, B-STUDY, E-STUDY, I-STUDY, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, <START>, <STOP>
Entity: john doe, Type: NAME, Confidence: 0.9760492444038391
Entity: los angeles, Type: LOC, Confidence: 0.9690457582473755
Entity: 1 555 1234567, Type: PHONE, Confidence: 0.7879879077275594
Entity: johndoeexamplecom, Type: EMAIL, Confidence: 0.9108949899673462
Entity: john, Type: NAME, Confidence: 0.9642440676689148
Entity: software engineer, Type: JOB, Confidence: 0.7308900654315948
Entity: web development, Type: SKILL, Confidence: 0.9023480117321014
Entity: cloud infrastructure, Type: SKILL, Confidence: 0.864546149969101
Entity: javas