In NLP task, we are using the ncbi_disease dataset (a high-quality gold standard for disease name recognition and normalization research). We will use the spaCy model. In this task, you will understand. 
1.Create a dataset for training and testing
2.Understand how to pre-process the dataset
3.Training the model
4.Evaluation the model performance by testing dataset



In [1]:
%%capture
!pip install datasets
!pip install sentencepiece
!pip install dataprep
!pip install simpletransformers
!pip install colorama
!pip install plac
!pip install spacy
!pip pathlib
!python -m spacy download en_core_web_lg

In [2]:
from __future__ import unicode_literals, print_function
from pathlib import Path

In [3]:
import spacy
from tqdm import tqdm 
from datasets import load_dataset
from spacy.training import Example
from spacy.scorer import Scorer

In [4]:
from simpletransformers.ner import NERModel, NERArgs
import gc
import re
import string
import operator
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import STOPWORDS

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from dataprep.eda import plot, plot_correlation, create_report, plot_missing
from pprint import pprint

import plac
import random

In [5]:
import re
def clean_text(text):
    text=' '.join([i for i in text.split(',')])
    text=' '.join([i for i in text.split("'")])
    text=' '.join([i for i in text.split() if i.isalpha()])
    
    return text

In [6]:
#from thinc.neural.optimizers import Adam

In [7]:
from thinc.api import Adam

# import Dataset
## In this dataset, we are using ncbi_disease (https://huggingface.co/datasets/ncbi_disease), which consists of 793 PubMed abstracts, which are separated into training (593), development (100) and test (100) subsets. The NCBI Disease corpus is annotated with disease mentions, using concept identifiers from either MeSH or OMIM. The model trained by this dataset can be used to extract disease name information from a given sentence. 

In [8]:
dataset = load_dataset("ncbi_disease")



  0%|          | 0/3 [00:00<?, ?it/s]

The data is split into a train (5433 instances), validation (924 instances) and test set (941 instances).

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})

In [10]:
df_train = pd.DataFrame(dataset['train'])
df_val = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

In [11]:
print(df_train.iloc[0].tokens)
print(df_train.iloc[0].ner_tags)

['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]


In [12]:
eda_df = pd.concat([df_train,df_val,df_test], ignore_index=True)
eda_df.shape

(7298, 3)

In [13]:
eda_df['sentences'] = eda_df['tokens'].apply(lambda x: str(x)[1:-1])

In [14]:
eda_df['content'] = eda_df['sentences'].apply(clean_text)
eda_df.iloc[0]

id                                                           0
tokens       [Identification, of, APC2, ,, a, homologue, of...
ner_tags            [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]
sentences    'Identification', 'of', 'APC2', ',', 'a', 'hom...
content      Identification of a homologue of the adenomato...
Name: 0, dtype: object

# The explaination of dataset
## From upper list, we can find the dataset contains five columns,
### The first column is the sentence id.
### The second is token which is the words separated by space.
### The third is the tag for each token. 0:non-disease, 1:initial word of the disease, and 2: the following word of the disease.
### The sentence columns is the token column removing the empty space.
### The content column is the sentence we are seeing in the document.

In [15]:
import en_core_web_lg
nlp= en_core_web_lg.load()

In [16]:
train_data = []
for i in range(len(df_train)):
    stri = ' '
    ent = {}
    ent['entities'] = []
    for word in df_train.iloc[i]['tokens']:
        stri+=f'{word} '
    for idx, word in zip(df_train.iloc[i]['ner_tags'], df_train.iloc[i]['tokens']):
        if idx == 1:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'I_Disease')
            ent['entities'].append(s)
        if idx == 2:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'B_Disease')
            ent['entities'].append(s)
    ent['entities'] = list(set(ent['entities']))
    res = (stri, ent)
    train_data.append(res)
print(len(train_data))

5433


In [17]:
val_data = []
for i in range(len(df_val)):
    stri = ' '
    ent = {}
    ent['entities'] = []
    for word in df_val.iloc[i]['tokens']:
        stri+=f'{word} '
    for idx, word in zip(df_val.iloc[i]['ner_tags'], df_val.iloc[i]['tokens']):
        if idx == 1:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'I_Disease')
            ent['entities'].append(s)
        if idx == 2:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'B_Disease')
            ent['entities'].append(s)
    ent['entities'] = list(set(ent['entities']))
    res = (stri, ent)
    val_data.append(res)
print(len(val_data))

924


In [18]:
test_data = []
for i in range(len(df_test)):
    stri = ' '
    ent = {}
    ent['entities'] = []
    for word in df_test.iloc[i]['tokens']:
        stri+=f'{word} '
    for idx, word in zip(df_test.iloc[i]['ner_tags'], df_test.iloc[i]['tokens']):
        if idx == 1:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'I_Disease')
            ent['entities'].append(s)
        if idx == 2:
            start_index = stri.find(word)
            end_index = start_index + len(word)
            s = (start_index, end_index, 'B_Disease')
            ent['entities'].append(s)
    ent['entities'] = list(set(ent['entities']))
    res = (stri, ent)
    test_data.append(res)
print(len(test_data))

941


In [30]:
directory = ''
file_folder='NER_model'
download_path=directory+file_folder
isExist = os.path.exists(download_path)
if not isExist:
 # Create a new directory because it does not exist
   os.makedirs(download_path)

In [31]:
download_path

'NER_model'

In [33]:
model = None # you can assign the path for some pre-train models
output_dir=Path(download_path)
epoche=5

In [34]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


In [35]:
if 'ner' not in nlp.pipe_names:
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

In [46]:


other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
ind_pr=0.0
with nlp.disable_pipes(*other_pipes):  # only train NER
    nlp.begin_training()
    learn_rate = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8
    L2 = 1e-6
    max_grad_norm = 1.0
    optimizer = Adam( learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps)
    for itn in range(epoche):
        random.shuffle(train_data)
        losses = {}
        for batch in spacy.util.minibatch(train_data, size=100):
             for text, annotations in batch:
                    try:
                        example = Example.from_dict(nlp.make_doc(text), annotations)
                        nlp.update([example],drop=0.2,sgd=optimizer,losses=losses)
                    except:
                        continue
    if epoche % 5 == 0:
        examples = []
        for text, annotations in val_data:
            try:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                example.predicted = nlp(str(example.predicted))
                examples.append(example)
            except:
                continue
        scorer = Scorer()
        results=scorer.score(examples)
        print(results)
        current_pr=results['ents_p']
        if current_pr>ind_pr:
            nlp.to_disk("NER_model")
            current_pr==ind_pr
        

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': 1.0, 'sents_r': 1.0, 'sents_f': 1.0, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.7820598006644518, 'ents_r': 0.7657774886141835, 'ents_f': 0.7738330046022355, 'ents_per_type': {'I_Disease': {'p': 0.768025078369906, 'r': 0.7573415765069552, 'f': 0.7626459143968871}, 'B_Disease': {'p': 0.7923875432525952, 'r': 0.7719101123595505, 'f': 0.7820147979510529}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}


In [47]:
df = pd.DataFrame(results)
df.to_csv(download_path + "validate_performance.txt")

# Question one : Adjust the learning rate (10e-5 - 10e-3) to see the performance change

**Answer:**

# Question two : add more epoch(5-10 to see the model performance)

**Answer:**

# Question three : try to write the code for testing and report the model performance

In [None]:
##Answer hint+:
examples = []
for text, annotations in test_data:
##finish the code for testing dataset and report the model performance##