In [1]:
import spacy

In [2]:
# Load the installed `en_core_web_lg` pipeline for the quick demo below.
# Note: `nlp` is rebound to a blank English pipeline later in this notebook.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Run the loaded pipeline over a short sample sentence.
doc = nlp("Donald Trump is the President of USA")

In [4]:
from spacy import displacy

# Visualize the entities the pipeline assigned to `doc` (displaCy "ent" style).
displacy.render(doc, style="ent")

In [5]:
import json

# Read the annotated corpus. The encoding is given explicitly because JSON
# text is UTF-8, while open() otherwise falls back to the platform's locale
# encoding (e.g. cp1252 on Windows), which can fail or mis-decode non-ASCII
# characters. The entity annotations are character offsets into this text,
# so it must be decoded correctly.
with open("medical.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# Echo the loaded JSON object to inspect its structure.
# NOTE(review): this displays the entire dataset (thousands of examples);
# consider showing a single example such as data['examples'][0] instead.
data

{'examples': [['SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: , She has no known medicine allergies. OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78. HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear. Neck:  Supple witho

In [7]:
# Entity annotations of the first example: a list of [start, end, label] triples.
data['examples'][0][1]['entities']

[[58, 80, 'DISEASE'],
 [100, 109, 'DISEASE'],
 [200, 208, 'CHEMICAL'],
 [214, 220, 'CHEMICAL'],
 [520, 526, 'DISEASE'],
 [666, 676, 'CHEMICAL'],
 [825, 831, 'DISEASE'],
 [987, 997, 'DISEASE'],
 [1028, 1045, 'DISEASE'],
 [1070, 1076, 'CHEMICAL'],
 [1134, 1144, 'CHEMICAL'],
 [1237, 1244, 'CHEMICAL']]

In [8]:
# Reshape every raw example ([text, {'entities': [[start, end, label], ...]}])
# into a {'text': ..., 'entities': [(start, end, label), ...]} record.
training_data = [
    {
        'text': record[0],
        'entities': [
            (annotation[0], annotation[1], annotation[2])
            for annotation in record[1]['entities']
        ],
    }
    for record in data['examples']
]

In [9]:
# Text of the first converted example.
training_data[0]['text']

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: , She has no known medicine allergies. OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78. HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear. Neck:  Supple without adenopathy. 

In [10]:
# Entities of the first converted example, now stored as (start, end, label) tuples.
training_data[0]['entities']

[(58, 80, 'DISEASE'),
 (100, 109, 'DISEASE'),
 (200, 208, 'CHEMICAL'),
 (214, 220, 'CHEMICAL'),
 (520, 526, 'DISEASE'),
 (666, 676, 'CHEMICAL'),
 (825, 831, 'DISEASE'),
 (987, 997, 'DISEASE'),
 (1028, 1045, 'DISEASE'),
 (1070, 1076, 'CHEMICAL'),
 (1134, 1144, 'CHEMICAL'),
 (1237, 1244, 'CHEMICAL')]

In [11]:
# Sanity check: slicing the text with an entity's (start, end) offsets
# should return the annotated string (here the (200, 208, 'CHEMICAL') entity).
training_data[0]['text'][200:208]

'Claritin'

In [12]:
# Total number of converted examples.
len(training_data)

3362

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# First split: 70% goes to data_1 and 30% to data_2 (fixed seed for reproducibility).
# NOTE(review): only data_2 is split further and used below; data_1 is never
# used for training in the cells shown here. Confirm that working with a 30%
# subsample is intentional.
data_1, data_2 = train_test_split(training_data, test_size=0.3, random_state=42)

In [15]:
# Sizes of the two partitions produced by the first split.
len(data_1), len(data_2)

(2353, 1009)

In [16]:
# Second split: divide data_2 80/20 into the training set and a held-out set.
# The held-out set is later passed to `spacy train` as the dev corpus.
train_data, test_data = train_test_split(data_2, test_size=0.2, random_state=42)

In [17]:
# Sizes of the training and held-out sets.
len(train_data), len(test_data)

(807, 202)

In [18]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Rebind `nlp` to a blank English pipeline; from here on it is used only to
# tokenize text via make_doc(). This replaces the en_core_web_lg pipeline
# loaded at the top of the notebook.
nlp = spacy.blank("en")
# Container that collects the annotated Doc objects before writing them to disk.
doc_bin = DocBin()

In [19]:
from spacy.util import filter_spans

# Build the training corpus in a fresh DocBin. Creating it here (rather than
# reusing the one from the previous cell) keeps this cell idempotent:
# re-running it no longer appends every training doc a second time, which
# would silently write duplicated examples to train.spacy.
doc_bin = DocBin()

for training_example in tqdm(train_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)  # tokenize only
    ents = []
    for start, end, label in labels:
        # Map character offsets onto token boundaries; char_span returns None
        # when no valid token span can be formed from the offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # doc.ents must not contain overlapping spans, so drop overlaps first.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

100%|███████████████████████████████████████████████████████████████████████████████| 807/807 [00:05<00:00, 152.65it/s]


In [20]:
# Serialize the held-out examples into their own DocBin, mirroring the
# procedure used for the training set.
doc_bin = DocBin()

for held_out_example in tqdm(test_data):
    raw_text, annotations = held_out_example['text'], held_out_example['entities']
    doc = nlp.make_doc(raw_text)
    aligned_spans = []
    for start, end, label in annotations:
        candidate = doc.char_span(start, end, label=label, alignment_mode="contract")
        if candidate is not None:
            aligned_spans.append(candidate)
            continue
        # Offsets could not be aligned to token boundaries.
        print("Skipping entity")
    # Remove overlapping spans before assigning them as the doc's entities.
    doc.ents = filter_spans(aligned_spans)
    doc_bin.add(doc)

doc_bin.to_disk("test.spacy")

100%|███████████████████████████████████████████████████████████████████████████████| 202/202 [00:01<00:00, 147.86it/s]


In [21]:
# Expand base_config.cfg into a complete config.cfg with all values filled in.
# base_config.cfg is not created by this notebook and must already exist in
# the working directory.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [22]:
# Train the pipeline described by config.cfg, using train.spacy for training
# and test.spacy as the dev (evaluation) corpus. Output is written to the
# current directory.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./test.spacy

[i] Saving to output directory: .
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    158.10    0.24    0.16    0.47    0.00
  0     200        137.81   7140.03   70.30   79.39   63.08    0.70
  0     400         43.12   2796.73   76.74   77.75   75.75    0.77
  0     600         51.66   2559.16   79.71   79.72   79.70    0.80
  1     800        367.51   2482.24   82.82   86.37   79.56    0.83
  1    1000         79.58   1669.19   81.18   81.25   81.12    0.81
  1    1200        132.31   1858.39   81.38   87.84   75.80    0.81
  1    1400        121.77   1987.09   83.76   83.90   83.62    0.84
  2    1600        224.55   1695.42   82.12   90.50   75.16    0.82
  2    1800        148.92   1427.57   82.52   88.09   77.62    0.83
  2    2000        257.31   1528.84   

In [23]:
# Load the trained pipeline from the "model-best" directory in the working
# directory (presumably written by the training run above, whose output
# directory was ./).
nlp_ner = spacy.load("model-best")

In [24]:
# Try the trained NER pipeline on an unseen passage and visualize its entities.
sample_text = "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present."
doc = nlp_ner(sample_text)

spacy.displacy.render(doc, style="ent")