<a href="https://colab.research.google.com/github/gupta24789/named-entity-recognition/blob/main/spacy_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy import displacy

# Blank English pipeline; the cells below call it on raw text to obtain the
# Doc objects that the entity spans are attached to.
nlp = spacy.blank("en")

## Utilities

In [None]:
def transform_data(sent, tag):
    """Pair one sentence line with its tag line as ``[text, spans]``.

    Both inputs are stripped of surrounding whitespace and split on a single
    space. Tokens and tags are matched positionally; if the two lines have a
    different number of items, the surplus items are ignored.

    Args:
        sent: Space-separated tokens of one sentence (a trailing newline is fine).
        tag: Space-separated tags, one per token.

    Returns:
        A two-element list: the stripped sentence text, and a list of
        ``(start, end, tag)`` tuples holding each token's character offsets in
        that text (``end`` exclusive). Every token gets a tuple, including
        tokens tagged ``O``.
    """
    text = sent.strip()
    words = text.split(" ")
    labels = tag.strip().split(" ")

    spans = []
    cursor = 0
    for word, label in zip(words, labels):
        stop = cursor + len(word)
        spans.append((cursor, stop, label))
        # Step over the single space that separates this token from the next.
        cursor = stop + 1

    return [text, spans]

## Load Data

In [None]:
def _read_lines(path):
    """Return every line of the text file at ``path``, line endings included.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines are read, and it is decoded as UTF-8 explicitly instead of
    relying on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


## train
train_sents = _read_lines("data/train/sentences.txt")
train_tags = _read_lines("data/train/labels.txt")
## val
val_sents = _read_lines("data/val/sentences.txt")
val_tags = _read_lines("data/val/labels.txt")
## test
test_sents = _read_lines("data/test/sentences.txt")
test_tags = _read_lines("data/test/labels.txt")

In [None]:
# Peek at the first two raw sentence lines.
train_sents[:2]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\n',
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "\n']

In [None]:
# Peek at the tag lines that belong to the two sentences shown above.
train_tags[:2]

['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O\n',
 'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O\n']

In [None]:
# Run the converter on the first training pair as a sanity check.
# `sent` and `tag` remain defined at notebook level after this cell.
sent, tag = train_sents[0], train_tags[0]
sample = transform_data(sent, tag)
sample

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
 [(0, 9, 'O'),
  (10, 12, 'O'),
  (13, 26, 'O'),
  (27, 31, 'O'),
  (32, 39, 'O'),
  (40, 47, 'O'),
  (48, 54, 'B-geo'),
  (55, 57, 'O'),
  (58, 65, 'O'),
  (66, 69, 'O'),
  (70, 73, 'O'),
  (74, 76, 'O'),
  (77, 81, 'B-geo'),
  (82, 85, 'O'),
  (86, 92, 'O'),
  (93, 96, 'O'),
  (97, 107, 'O'),
  (108, 110, 'O'),
  (111, 118, 'B-gpe'),
  (119, 125, 'O'),
  (126, 130, 'O'),
  (131, 135, 'O'),
  (136, 143, 'O'),
  (144, 145, 'O')]]

In [None]:
# Attach the sample's character spans to a Doc as entities.
text, annotations = sample[0], sample[1]
doc = nlp(text)
ents = []
for start, end, label in annotations:
    span = doc.char_span(start, end, label=label)
    # char_span returns None when the offsets do not map onto token
    # boundaries; None cannot be assigned to doc.ents, so skip those spans.
    if span is not None:
        ents.append(span)
doc.ents = ents

In [None]:
# Show the entity spans that were just attached to the sample Doc.
doc.ents

(Thousands,
 of,
 demonstrators,
 have,
 marched,
 through,
 London,
 to,
 protest,
 the,
 war,
 in,
 Iraq,
 and,
 demand,
 the,
 withdrawal,
 of,
 British,
 troops,
 from,
 that,
 country,
 .)

In [None]:
# Colour overrides for labels that occur in this dataset (the sample above
# carries B-geo, B-gpe and B-per). Keys are upper-cased because displaCy
# upper-cases a label before looking up its colour; labels without an entry
# fall back to displaCy's default colour.
colors = {"B-GEO": "#F67DE3", "B-GPE": "#7DF6D9", "B-PER": "#FFFFFF"}
options = {"colors": colors}
displacy.render(doc, style="ent", options=options, jupyter=True)

## Prepare Data

In [None]:
# Turn every (sentence line, tag line) pair of a split into [text, spans] form.
training_data = [
    transform_data(sentence_line, tag_line)
    for sentence_line, tag_line in zip(train_sents, train_tags)
]

validation_data = [
    transform_data(sentence_line, tag_line)
    for sentence_line, tag_line in zip(val_sents, val_tags)
]

testing_data = [
    transform_data(sentence_line, tag_line)
    for sentence_line, tag_line in zip(test_sents, test_tags)
]

## Convert Data into spacy format

In [None]:
def convert_to_spacy(data):
    """Pack ``[text, annotations]`` samples into a spaCy ``DocBin``.

    Args:
        data: Iterable of samples where ``sample[0]`` is the sentence text and
            ``sample[1]`` is an iterable of ``(start, end, label)`` character
            spans, i.e. the structure returned by ``transform_data``.

    Returns:
        DocBin: One ``Doc`` per sample, tokenised with the notebook-level
        ``nlp`` pipeline, with ``doc.ents`` set from the spans that could be
        mapped onto that tokenisation.

    Warns:
        UserWarning: If any span could not be mapped onto token boundaries and
        was therefore left out.
    """
    import warnings

    db = DocBin()
    skipped = 0
    for sample in data:
        text, annotations = sample[0], sample[1]
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            # char_span returns None when the offsets do not map onto token
            # boundaries; None cannot be assigned to doc.ents.
            if span is None:
                skipped += 1
                continue
            ents.append(span)
        doc.ents = ents
        db.add(doc)

    if skipped:
        warnings.warn(
            f"convert_to_spacy: skipped {skipped} span(s) that do not align with token boundaries"
        )

    return db

In [None]:
# Build one DocBin per split (training first, then validation).
train_db, val_db = convert_to_spacy(training_data), convert_to_spacy(validation_data)

# Write each DocBin to the path that the training command reads from.
for split_db, target in ((train_db, "./train.spacy"), (val_db, "./val.spacy")):
    split_db.to_disk(target)

## Training step


    1. Open https://spacy.io/usage/training#quickstart, pick the options that match your requirements, then copy the generated config and save it as base_config.cfg.
        - sample config :
            language : english
            components : ner
            hardware : cpu
            optimize for : accuracy

    2. Prepare the config.cfg file
        - Run the command below to fill in the remaining default values; it creates the config.cfg file that is used for model training

        ```
        python -m spacy init fill-config base_config.cfg config.cfg
        ```
        - You can then edit the config.cfg file, for example:
            - nlp.batch_size : 128
            - training.max_epochs : 10

    3. Model Training

        - paths.dev : validation data path
        - paths.train : training data path

        ```
        python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy
        ```

In [None]:
## Create config.cfg: auto-fill the settings that base_config.cfg leaves out and save the result as config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
## modify the config.cfg
## 1. training.max_epochs = 10
## 2. nlp.batch_size = 128

In [None]:
# !python -m spacy download en_core_web_lg

In [None]:
## Train with config.cfg on ./train.spacy, evaluate on ./val.spacy, and write the pipeline to ./output
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./val.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     85.24   75.92   75.17   76.70    0.76
  0     200         46.32   3140.81   94.48   94.62   94.35    0.94
  0     400         39.59   2145.42   94.92   95.02   94.81    0.95
  0     600        111.17   2497.73   95.51   95.62   95.40    0.96
  0     800         60.53   2623.06   95.71   95.79   95.64    0.96
  0    1000        135.78   3073.92   96.11   96.20   96.01    0.96
  0    1200        152.36   3717.43   96.09   96.06   96.13    0.96
  0    1400        134.14   4587.71   96.38   96.42   96.34    0.96
  0    1600        146.42   4942.98   96.49   96.52   96.45    0.96
  0    1800        177.37   5778.21   96.74   

## Load Model

In [None]:
# Load the "model-best" pipeline from the ./output directory that the training command writes to.
model = spacy.load("output/model-best")

In [None]:
# Strip the raw line first: the training texts were stripped of their trailing
# newline, so the model should see test text in the same form.
result = model(test_sents[0].strip())
# Colour overrides for labels that occur in this dataset (B-geo, B-gpe and
# B-per appear in the training sample). Keys are upper-cased because displaCy
# upper-cases a label before looking up its colour; labels without an entry
# fall back to displaCy's default colour.
colors = {"B-GEO": "#F67DE3", "B-GPE": "#7DF6D9", "B-PER": "#FFFFFF"}
options = {"colors": colors}
displacy.render(result, style="ent", options=options, jupyter=True)

## Test Accuracy

In [None]:
from seqeval import metrics
from tqdm import tqdm

In [None]:
# Collect one gold and one predicted label sequence per test sentence.
pred_labels = []
true_labels = []

for sample in tqdm(testing_data):
    text, annotations = sample[0], sample[1]
    true = [span[2] for span in annotations]
    result = model(text)
    pred = [ent.label_ for ent in result.ents]

    # Cut both sequences to the shorter length so they can be compared item by
    # item. The gold sequence must be cut from `true` itself: slicing `pred`
    # into `true` would replace the gold labels with the predictions and
    # inflate every score.
    # NOTE(review): truncation only equalises the lengths; it does not
    # guarantee that position i refers to the same token in both sequences.
    min_len = min(len(true), len(pred))
    pred = pred[:min_len]
    true = true[:min_len]

    pred_labels.append(pred)
    true_labels.append(true)

100%|██████████| 7194/7194 [02:26<00:00, 49.00it/s]


In [None]:
# Report the seqeval scores computed from the collected label sequences.
for caption, scorer in (
    ("F1 : ", metrics.f1_score),
    ("Precision : ", metrics.precision_score),
    ("Recall : ", metrics.recall_score),
):
    print(caption, scorer(true_labels, pred_labels))

F1 :  0.8203425997273428
Precision :  0.8184990241883021
Recall :  0.8221944989009683
