<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train NER Models

In [48]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
import json
import random

def load_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

def save_json(data, file_path):
    with open(file_path, 'w') as f:
        json.dump(data, f)

def split_data(data, train_ratio=0.8):
    # Shuffle the data
    random.shuffle(data)

    # Calculate the split index
    split_index = int(len(data) * train_ratio)

    # Split the data
    train_data = data[split_index:]
    test_data = data[:split_index]

    return train_data, test_data

# Load the JSON data
json_file_path = '/content/drive/MyDrive/FYP/Implementation/Resume Dataset/10_resumes_annotated.json'
data = load_json(json_file_path)

# Split the data
train_data, test_data = split_data(data)

# Save the splits
save_json(train_data, 'train_data.json')
save_json(test_data, 'test_data.json')

## Spacy NER

In [78]:
# create dir to place spacy ner data
!mkdir spacy_ner_data

# convert CONLL2003 annotation data into spacy data
!python -m spacy convert 'train_data.json' spacy_ner_data -c ner
!python -m spacy convert "test_data.json" spacy_ner_data -c ner

mkdir: cannot create directory ‘spacy_ner_data’: File exists
[38;5;3m⚠ Can't automatically detect NER format. Conversion may not succeed.
See https://spacy.io/api/cli#convert[0m
[38;5;3m⚠ No sentence boundaries found to use with option `-n 1`. Use `-s` to
automatically segment sentences or `-n 0` to disable.[0m
[38;5;3m⚠ No sentence boundaries found. Use `-s` to automatically segment
sentences.[0m
[38;5;3m⚠ No document delimiters found. Use `-n` to automatically group
sentences into documents.[0m
[38;5;2m✔ Generated output file (1 documents):
spacy_ner_data/train_data.spacy[0m
[38;5;3m⚠ Can't automatically detect NER format. Conversion may not succeed.
See https://spacy.io/api/cli#convert[0m
[38;5;3m⚠ No sentence boundaries found to use with option `-n 1`. Use `-s` to
automatically segment sentences or `-n 0` to disable.[0m
[38;5;3m⚠ No sentence boundaries found. Use `-s` to automatically segment
sentences.[0m
[38;5;3m⚠ No document delimiters found. Use `-n` to automat

In [32]:
# create base_config.cfg and paste the config generated from spacy widget
# update train and test file path
!touch base_config.cfg

In [79]:
# generate config.cfg from base_config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [81]:
# train model using hyperparameters set in config.cfg
# trained model in output/ dir
!python -m spacy train config.cfg --output ./output

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/spacy/language.py", line 1327, in initialize
    init_vocab(
  File "/usr/local/lib/python3.10/dist-packages/spacy/training/initialize.py", line 142, in init_vocab
    load_vectors_into_model(nlp, vectors)
  File "/usr/local/lib/python3.10/dist-packages/spacy/training/initialize.py", line 164, in load_vectors_into_model
    vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
  File "/usr/local/lib/python3.10/dist-packages/spacy/util.py", line 472, in load_model
    raise IOError(Errors.E050.format(name=name))
OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a Python package or a valid path to a data directory.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/r

In [None]:
# evaluate trained model performance
# store output and visualization into result/ dir
!python -m spacy evaluate output/model-best spacy_ner_data/test.spacy -dp result

In [None]:
# download trained model

## Flair NER

In [None]:
!pip install flair

In [None]:
# load data
import flair
from flair.datasets import ColumnCorpus

# Define columns
columns = {0: 'text', 1: 'ner'}

# Specify the path to your training and test data
data_folder = 'path/to/your/data'  # Update this path
train_file = 'train.txt'  # Your training file
test_file = 'test.txt'    # Your testing file

# Create the corpus
corpus = ColumnCorpus(data_folder,
                      { 'train': train_file,
                        'test': test_file },
                      columns=columns)

In [None]:
# create NER tagger
from flair.models import SequenceTagger

tagger = SequenceTagger(hidden_size=256,
                         embeddings='glove',
                         tag_dictionary=corpus.make_tag_dictionary(tag_type='ner'),
                         tag_type='ner',
                         use_crf=True)


In [None]:
# train model
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)

trainer.train('path/to/save/model',  # Update this path
               learning_rate=0.1,
               mini_batch_size=32,
               max_epochs=10)

In [None]:
# evaluate model
result, score = trainer.evaluate(corpus.test)
print(result)
print(score)

In [None]:
# make prediction
model = SequenceTagger.load('path/to/save/model')
sentence = flair.data.Sentence("Your text here.")

model.predict(sentence)

print(sentence.to_tagged_string())