<a href="https://colab.research.google.com/github/girija2204/DLTopics/blob/master/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import pandas as pd
import numpy as np
import os
import random
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse

In [2]:
# Download the medium English spaCy model into the runtime.
# NOTE(review): the cells below load 'en_core_web_sm', not 'en_core_web_md' - confirm which model is intended.
!python3 -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.1MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp36-none-any.whl size=98051304 sha256=9ee892ec45f994a265e1660c31fee54c97f5a057797795aa835fd27917701d7c
  Stored in directory: /tmp/pip-ephem-wheel-cache-v6z_bax9/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
# Load the small English pipeline and show which components it contains.
nlp = spacy.load('en_core_web_sm')
print(nlp.pipe_names)
# Keep a handle to this pipeline's 'ner' component.
ner_pipe = nlp.get_pipe('ner')

['tagger', 'parser', 'ner']


In [4]:
# TP/(TP+FP)
def calc_precision(pred, true):
    """Return the fraction of predicted items that also appear in ``true``.

    Args:
        pred: sized collection of predicted items.
        true: container of reference items (only membership tests are used).

    Returns:
        ``TP / len(pred)`` where TP is the number of elements of ``pred`` found
        in ``true`` (duplicates in ``pred`` are each counted). Returns ``0.0``
        when ``pred`` is empty instead of raising ``ZeroDivisionError``.
    """
    if len(pred) == 0:
        # Nothing was predicted, so there are no true positives to credit.
        return 0.0
    true_positives = sum(1 for item in pred if item in true)
    return true_positives / len(pred)

In [5]:
# TP/(TP+FN)
def calc_recall(pred, true):
    """Return the fraction of reference items that were predicted.

    Args:
        pred: iterable of predicted items.
        true: sized container of reference items.

    Returns:
        ``TP / len(true)`` where TP is the number of elements of ``pred`` found
        in ``true``. Returns ``0.0`` when ``true`` is empty instead of raising
        ``ZeroDivisionError``.
    """
    if len(true) == 0:
        # No reference items: recall is undefined, report 0.0 rather than crash.
        return 0.0
    true_positives = sum(1 for item in pred if item in true)
    return true_positives / len(true)

In [6]:
def calc_f1score(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Args:
        precision: precision value.
        recall: recall value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when the
        denominator is zero (instead of raising ``ZeroDivisionError``).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator

In [7]:
def evaluate(ner, data):
    """Run ``ner`` over the text of every record and print prediction and record.

    Args:
        ner: callable applied to the first element of each record.
        data: sequence of records; ``record[0]`` is passed to ``ner``.

    For each record the prediction is printed first, then the full record.
    Nothing is returned; no metrics are computed here.
    """
    # All predictions are computed up front, before anything is printed.
    preds = [ner(x[0]) for x in data]
    for pred, true in zip(preds, data):
        print(pred)
        print(true)

In [8]:
def _append_sentence(dataset, words, labels):
    """Append one finished sentence to ``dataset`` if it has at least one entity."""
    if labels:
        # Every token is followed by a single space (including the last one);
        # the character offsets computed in load_data rely on this layout.
        sentence = "".join(word + " " for word in words)
        dataset.append((sentence, {'entities': labels}))


def load_data(file):
    """Read a tab-separated token/tag file into ``(text, annotations)`` tuples.

    Each line with a tab holds a token in column 0 and its tag in column 1.
    A line without a tab (e.g. a blank line) ends the current sentence.
    Tokens are lower-cased and joined with single spaces; every tag other
    than ``'O'`` becomes a ``(start_char, end_char, f"{tag}_disease")`` span.
    Sentences without any entity are skipped.

    Args:
        file: path of the TSV file to read.

    Returns:
        ``(dataset, unique_labels)`` where ``dataset`` is a list of
        ``(sentence, {'entities': [(start, end, label), ...]})`` tuples and
        ``unique_labels`` lists the distinct labels in first-seen order.
    """
    dataset = []
    unique_labels = []
    words, labels = [], []
    start_index = 0
    # The context manager closes the handle even if parsing raises.
    with open(file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip("\n").split("\t")
            if len(fields) == 1:
                # Sentence boundary: emit what was collected and reset state.
                _append_sentence(dataset, words, labels)
                words, labels = [], []
                start_index = 0
                continue
            word = str(fields[0]).lower()
            words.append(word)
            end_index = start_index + len(word)
            if fields[1] != 'O':
                label_to_add = f"{fields[1]}_disease"
                labels.append((start_index, end_index, label_to_add))
                if label_to_add not in unique_labels:
                    unique_labels.append(label_to_add)
            # Skip the single space that follows every token in the sentence.
            start_index = end_index + 1
    # Flush the final sentence when the file does not end with a blank line.
    _append_sentence(dataset, words, labels)
    return dataset, unique_labels

In [10]:
# Build the training set and collect the distinct entity labels it contains.
# NOTE(review): the '/content/' paths presumably refer to files uploaded to the Colab runtime - confirm.
dataset, unique_labels = load_data('/content/train.tsv')
# The label list derived from the test file is discarded.
test_dataset, _ = load_data('/content/test.tsv')

In [11]:
def train_spacy(train_data, labels, iterations, dropout=0.5, display_freq=1):
    """Train the NER component of ``en_core_web_sm`` on ``train_data``.

    Args:
        train_data: sequence of ``(text, annotations)`` tuples; each batch is
            unzipped into texts and annotations and passed to ``nlp.update``.
        labels: iterable of entity labels to register on the NER component.
        iterations: number of passes over ``train_data``.
        dropout: dropout rate forwarded to ``nlp.update``.
        display_freq: call ``evaluate`` on the first five records of the
            notebook-level ``test_dataset`` every ``display_freq`` iterations
            (the default of 1 evaluates after every iteration; a falsy value
            disables evaluation).

    Returns:
        The pipeline object that was trained.
    """
    nlp = spacy.load("en_core_web_sm")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')
    for label in labels:
        ner.add_label(label)

    # Shuffle a private copy so the caller's list keeps its original order.
    train_data = list(train_data)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    # Only the NER component is updated; every other component is disabled.
    with nlp.disable_pipes(*other_pipes):
        # NOTE(review): spaCy v2 documents resume_training() for continuing from a
        # pretrained pipeline; begin_training() appears to re-initialise the
        # enabled components - confirm this is intended.
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
            if display_freq and (itr + 1) % display_freq == 0:
                # Relies on the notebook-level `test_dataset` and `evaluate`.
                evaluate(nlp, test_dataset[:5])
    return nlp

In [12]:
def evaluate(nlp, data):
    """Print each record followed by the pipeline's output for its text.

    Note: this definition shadows the earlier ``evaluate`` in this notebook;
    here the record is printed first and the prediction second.

    Args:
        nlp: callable applied to the first element of every record.
        data: sequence of records; ``record[0]`` is the text to process.
    """
    # Run the pipeline over all texts before printing anything.
    outputs = list(map(nlp, (entry[0] for entry in data)))
    for output, entry in zip(outputs, data):
        print(entry)
        print(output)

In [15]:
# Text of every test record (the first element of each tuple), without annotations.
test_data_all = [str(record[0]) for record in test_dataset]

In [76]:
# Preview the first five test sentences.
test_data_all[:5]

['physical mapping 220 kb centromeric of the human mhc and dna sequence analysis of the 43 - kb segment including the ring1 , hke6 , and hke4 genes . ',
 'dna elements recognizing nf - y and sp1 regulate the human multidrug - resistance gene promoter . ',
 'radioimmunoassay of plasma gonadotropins ; problems of specificity ',
 'as tat itself dramatically increases hiv - 1 gene expression , it too is presumably regulated in the latent state , and may also be activated by mitogenic stimulation . ',
 'to clarify the difference , both the crk ii and crk ii - 23 , proteins were expressed in e . coli and examined their binding capacity in vitro . ']

In [13]:
# Train for 20 iterations; train_spacy prints evaluation output while it runs.
ner = train_spacy(dataset, unique_labels, 20)
# Persist the trained pipeline to the relative directory "model_spacy".
ner.to_disk("model_spacy")

('physical mapping 220 kb centromeric of the human mhc and dna sequence analysis of the 43 - kb segment including the ring1 , hke6 , and hke4 genes . ', {'entities': [(43, 48, 'B_disease'), (49, 52, 'I_disease'), (116, 121, 'B_disease'), (124, 128, 'B_disease'), (135, 139, 'B_disease'), (140, 145, 'I_disease')]})
physical mapping 220 kb centromeric of the human mhc and dna sequence analysis of the 43 - kb segment including the ring1 , hke6 , and hke4 genes . 
('dna elements recognizing nf - y and sp1 regulate the human multidrug - resistance gene promoter . ', {'entities': [(25, 27, 'B_disease'), (28, 29, 'I_disease'), (30, 31, 'I_disease'), (36, 39, 'B_disease'), (53, 58, 'B_disease'), (59, 68, 'I_disease'), (69, 70, 'I_disease'), (71, 81, 'I_disease'), (82, 86, 'I_disease'), (87, 95, 'I_disease')]})
dna elements recognizing nf - y and sp1 regulate the human multidrug - resistance gene promoter . 
('radioimmunoassay of plasma gonadotropins ; problems of specificity ', {'entities': [(2

In [17]:
# NOTE: `nlp` here is the notebook-level pipeline loaded from 'en_core_web_sm',
# not the pipeline returned by train_spacy (which loads its own copy), so the
# labels shown are those of the base model. This also rebinds `ner`, which
# previously held the trained pipeline.
ner = nlp.get_pipe('ner')
ner.labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [14]:
def load_model(model_path):
    """Create a blank English pipeline with an NER component and load it from disk.

    Args:
        model_path: directory previously written with ``to_disk``.

    Returns:
        The object returned by calling ``from_disk(model_path)`` on the blank
        pipeline.
    """
    pipeline = spacy.blank('en')
    ner_missing = 'ner' not in pipeline.pipe_names
    if ner_missing:
        # The component must exist in the pipeline before from_disk is called.
        pipeline.add_pipe(pipeline.create_pipe('ner'))
    return pipeline.from_disk(model_path)

In [15]:
# Reload the saved pipeline; `ner` now holds whatever load_model returns (it is called on raw text below).
ner = load_model("/content/model_spacy")

In [17]:
from spacy import displacy
# Text of the first four test records.
test_sentences = [x[0] for x in test_dataset[0:4]]
for x in test_sentences:
    doc = ner(x)
    # List every predicted entity with its character span and label.
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    # Render the predicted entities inline in the notebook.
    displacy.render(doc,jupyter=True, style = "ent")

human 43 48 B_disease
mhc 49 52 I_disease
ring1 116 121 B_disease
hke6 124 128 B_disease
hke4 135 139 B_disease
genes 140 145 I_disease


nf 25 27 B_disease
- 28 29 I_disease
y 30 31 I_disease
sp1 36 39 B_disease
human 53 58 B_disease
multidrug 59 68 I_disease
- 69 70 I_disease
resistance 71 81 I_disease
gene 82 86 I_disease
promoter 87 95 I_disease


plasma 20 26 B_disease
gonadotropins 27 40 I_disease


tat 3 6 B_disease
gene 45 49 I_disease


In [22]:
# Show the second test record (text plus its entity annotations).
test_dataset[1]

('dna elements recognizing nf - y and sp1 regulate the human multidrug - resistance gene promoter . ',
 {'entities': [(25, 27, 'B_disease'),
   (28, 29, 'I_disease'),
   (30, 31, 'I_disease'),
   (36, 39, 'B_disease'),
   (53, 58, 'B_disease'),
   (59, 68, 'I_disease'),
   (69, 70, 'I_disease'),
   (71, 81, 'I_disease'),
   (82, 86, 'I_disease'),
   (87, 95, 'I_disease')]})