# Timing SciSpacy

In [79]:
import argparse
import spacy
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
import en_ner_bc5cdr_md # make sure you download the model with pip install pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
import pandas as pd
import warnings
import gc
from tqdm.notebook import tqdm

In [80]:
kaggle_patient_notes = pd.read_csv('/home/dtank/data/volume_2/eicu_csv/patient_notes.csv')
kaggle_patient_notes_df = kaggle_patient_notes[:1150] # make a dataframe (42146 rows)

kaggle_patient_notes_str = '; '.join(kaggle_patient_notes_df['pn_history'].tolist()) # make a string

def scispacy_pipeline_df(df, column_name):
    # set environment
    spacy.require_gpu()
    print('gpu enabled')
    warnings.filterwarnings("ignore")
    print('warnings ignored')
    
    # create nlp environment
    nlp = en_ner_bc5cdr_md.load()
    nlp.add_pipe("abbreviation_detector")
    config = {
        "k": 10,
        "resolve_abbreviations": True,
        "linker_name": "umls",
        "max_entities_per_mention": 1
    }
    nlp.add_pipe("scispacy_linker", config=config)
    linker =  nlp.get_pipe("scispacy_linker")
    print('nlp environment set')
    
    # perform entity linking
    for i, sentence in enumerate(tqdm(df[column_name])):
        try:
            note = nlp(sentence)
        except:
            continue
            
        ents = []
        labels = []
        cuis = []
        linked_ent_names = []
            
        for entity in note.ents:
            ents.append(entity.text)
            labels.append(entity.label_)

            for linker_ent in entity._.kb_ents:
                cuis.append(linker.kb.cui_to_entity[linker_ent[0]][0])
                linked_ent_names.append(linker.kb.cui_to_entity[linker_ent[0]][1])
                
        df['entities'] = ', '.join(ents)
        df['labels'] = ', '.join(labels)
        df['cuis'] = ', '.join(cuis)
        df['linked_ent_names'] = ', '.join(linked_ent_names)
    
    return 

def scispacy_pipeline_str(note_str):
    # set environment
    spacy.require_gpu()
    print('gpu enabled')
    warnings.filterwarnings("ignore")
    print('warnings ignored')
    
    # create nlp environment
    nlp = en_ner_bc5cdr_md.load()
    nlp.add_pipe("abbreviation_detector")
    config = {
        "k": 10,
        "resolve_abbreviations": True,
        "linker_name": "umls",
        "max_entities_per_mention": 1
    }
    nlp.add_pipe("scispacy_linker", config=config)
    linker =  nlp.get_pipe("scispacy_linker")
    print('nlp environment set')
    
    # perform entity linking
    note = nlp(note_str)
    
    ents = []
    labels = []
    cuis = []
    linked_ent_names = []
    
    for entity in note.ents:
            ents.append(entity.text)
            labels.append(entity.label_)

            for linker_ent in entity._.kb_ents:
                cuis.append(linker.kb.cui_to_entity[linker_ent[0]][0])
                linked_ent_names.append(linker.kb.cui_to_entity[linker_ent[0]][1])
    
    return ents, labels, cuis, linked_ent_names

In [86]:
%%time
scispacy_pipeline_df(kaggle_patient_notes_df, 'pn_history')

gpu enabled
nlp environment set


  0%|          | 0/1150 [00:00<?, ?it/s]

CPU times: user 2min 2s, sys: 4.36 s, total: 2min 7s
Wall time: 1min 58s


In [None]:
# Text of length 34567170 exceeds maximum of 1.000.000.
# The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. 
# This means long texts may cause memory allocation errors. If you're not using the parser or NER, 
# it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, 
# so you can check whether your inputs are too long by checking `len(text)`.

In [83]:
%%time
ents, labels, cuis, linked_ent_names = scispacy_pipeline_str(kaggle_patient_notes_str)

gpu enabled
nlp environment set
CPU times: user 2min 1s, sys: 6.56 s, total: 2min 7s
Wall time: 1min 58s


In [87]:
kaggle_patient_notes_df

Unnamed: 0,pn_num,case_num,pn_history,entities,labels,cuis,linked_ent_names
0,0,0,"17-year-old male, has come to the student heal...","-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
1,1,0,17 yo male with recurrent palpitations for the...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
4,4,0,17yo male with no pmh here for evaluation of p...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
...,...,...,...,...,...,...,...
1145,1228,0,"HPI: 17 yo M, presenting with paplitation sinc...","-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
1146,1229,0,Dillon Cleveland is 17 yo M who presents with ...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
1147,1230,0,17 YO M WITH PALPITATIONS SINCE A FEW MONTHS A...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."
1148,1231,0,CC: Mr. Cleveland is a 17yoM p/w multiple epis...,"-heart pounding, + palpitations\r\n, SOB, whee...","DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, D...","C0030252, C0013404, C0043144, C0085619, C00186...","Palpitations, Dyspnea, Wheezing, Orthopnea, He..."


In [88]:
print(ents)
print()
print(labels)
print()
print(cuis)
print()
print(linked_ent_names)

['heart pounding', 'dispnea', 'chest pain', 'chills', 'fever', 'nausea', 'vomiting', 'palpitations', 'durign', 'headedness', 'diarrhea', 'complaints of heart pounding', 'pain', 'lightheadedness', 'shortness of breath', 'pounding', 'shortness of breath', 'chest pain', 'anxiety', 'thyroid disease', 'palpitation', 'CAN', 'NO', 'palpitations', 'weight loss', 'fevers', 'PSH', 'FHX', 'thyroid disorder', 'SHX', 'ETOH', 'shortness of breath', 'lightheadedness', 'diaphoresis', 'vomiting', 'tremor', 'loss of consciousness\r\nNo fever', 'nausea', 'vomiting', 'diarrhea', 'rash', 'anxiety', 'palpitations', 'Allergies', 'Adderall', 'cocaine', 'thyroid disease', 'heart pounding', 'shortness of breath', 'fever', 'weight loss', 'Alcohol', 'Adderall', 'Allergies', 'episodic pounding heart', 'Denies syncope', 'lightheadedness', 'heart palpitations', 'SOB', 'cough', 'swelling', 'dizziness', 'pain', 'fever', 'Allergies', 'etOH', 'palpitations', 'palpitations', 'Palpitations', 'palpitations', 'palpitations'