<a href="https://colab.research.google.com/github/chewzzz1014/fyp/blob/master/ner/src/train_ner_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train NER Models

In [1]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# import CONLL2003 annotated dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/FYP/Implementation/Resume Dataset/10_resumes_annotated.conll", delimiter='\t', names=["word", "pos", "chunk", "ner"], skip_blank_lines=True)

# Train/test split (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the new splits to files if needed
train_df.to_csv("conll_train.conll", sep='\t', index=False, header=False)
test_df.to_csv("conll_test.conll", sep='\t', index=False, header=False)


## Spacy NER

In [5]:
# create dir to place spacy ner data
!mkdir spacy_ner_data

# convert CONLL2003 annotation data into spacy data
!python -m spacy convert "conll_train.conll" spacy_ner_data -c ner
!python -m spacy convert "conll_test.conll" spacy_ner_data -c ner

mkdir: cannot create directory ‘spacy_ner_data’: File exists
[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;3m⚠ Document delimiters found, automatic document segmentation with `-n`
disabled.[0m
[38;5;3m⚠ No sentence boundaries found. Use `-s` to automatically segment
sentences.[0m
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.10/dist-packages/spacy/cli/_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/typer/core.py", line 728, in main
    return _main(
  File

In [None]:
# create base_config.cfg and paste the config generated from spacy widget
# update train and test file path
!touch base_config.cfg

In [None]:
# generate config.cfg from base_config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# train model using hyperparameters set in config.cfg
# trained model in output/ dir
!python -m spacy train config.cfg --output ./output

In [None]:
# evaluate trained model performance
# store output and visualization into result/ dir
!python -m spacy evaluate output/model-best spacy_ner_data/test.spacy -dp result

In [None]:
# download trained model

## Flair NER

In [None]:
!pip install flair

In [None]:
# load data
import flair
from flair.datasets import ColumnCorpus

# Define columns
columns = {0: 'text', 1: 'ner'}

# Specify the path to your training and test data
data_folder = 'path/to/your/data'  # Update this path
train_file = 'train.txt'  # Your training file
test_file = 'test.txt'    # Your testing file

# Create the corpus
corpus = ColumnCorpus(data_folder,
                      { 'train': train_file,
                        'test': test_file },
                      columns=columns)

In [None]:
# create NER tagger
from flair.models import SequenceTagger

tagger = SequenceTagger(hidden_size=256,
                         embeddings='glove',
                         tag_dictionary=corpus.make_tag_dictionary(tag_type='ner'),
                         tag_type='ner',
                         use_crf=True)


In [None]:
# train model
from flair.trainers import ModelTrainer

trainer = ModelTrainer(tagger, corpus)

trainer.train('path/to/save/model',  # Update this path
               learning_rate=0.1,
               mini_batch_size=32,
               max_epochs=10)

In [None]:
# evaluate model
result, score = trainer.evaluate(corpus.test)
print(result)
print(score)

In [None]:
# make prediction
model = SequenceTagger.load('path/to/save/model')
sentence = flair.data.Sentence("Your text here.")

model.predict(sentence)

print(sentence.to_tagged_string())