In [2]:
from Bio import Medline
with open("pubmed-Pembrolizu-set.txt") as handle:
    records = [record for record in Medline.parse(handle)]
records

[{'PMID': '30069757',
  'OWN': 'NLM',
  'STAT': 'MEDLINE',
  'DCOM': '20181113',
  'LR': '20181202',
  'IS': '0080-0015 (Print) 0080-0015 (Linking)',
  'VI': '211',
  'DP': '2018',
  'TI': 'Lapatinib.',
  'PG': '19-44',
  'LID': '10.1007/978-3-319-91442-8_2 [doi]',
  'AB': "The human epidermal growth factor receptor (HER) family of receptor tyrosine kinases plays an important role in the biology of many cancers. In breast and gastrointestinal cancer, and at lower rates also in additional tumor types, HER2 and its homo- or heterodimerization with HER1 or HER3 are essential for cancer cell growth and survival. Breast cancer patients overexpressing HER2 have a more aggressive course of their disease. The poor prognosis associated with HER2 overexpression can be substantially improved by adding HER2-targeted therapy to standard of care using the monoclonal antibody trastuzumab. Lapatinib, an oral dual tyrosine kinase inhibitor, blocks HER1 and HER2 tyrosine kinase activity by binding to th

In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import re
from Bio import Medline

model_checkpoint = "jsylee/scibert_scivocab_uncased-finetuned-ner"
ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=5,
                                                            id2label={0: 'O', 1: 'B-DRUG', 2: 'I-DRUG', 3: 'B-EFFECT', 4: 'I-EFFECT'}
                                                           )      
ner_pipeline = pipeline(task='ner', model=ner_model, tokenizer=ner_tokenizer)

In [4]:
entities = ner_pipeline(abstracts)

NameError: name 'abstracts' is not defined

In [5]:
entities

[[{'entity': 'B-DRUG',
   'score': 0.88066787,
   'index': 109,
   'word': 'trast',
   'start': 599,
   'end': 604},
  {'entity': 'I-DRUG',
   'score': 0.49762312,
   'index': 110,
   'word': '##uzumab',
   'start': 604,
   'end': 610},
  {'entity': 'B-DRUG',
   'score': 0.99585295,
   'index': 112,
   'word': 'lapa',
   'start': 612,
   'end': 616},
  {'entity': 'I-DRUG',
   'score': 0.9784958,
   'index': 113,
   'word': '##tin',
   'start': 616,
   'end': 619},
  {'entity': 'I-DRUG',
   'score': 0.9909567,
   'index': 114,
   'word': '##ib',
   'start': 619,
   'end': 621},
  {'entity': 'B-EFFECT',
   'score': 0.6955988,
   'index': 150,
   'word': 'inhibition',
   'start': 797,
   'end': 807},
  {'entity': 'I-EFFECT',
   'score': 0.5332807,
   'index': 151,
   'word': 'of',
   'start': 808,
   'end': 810},
  {'entity': 'I-EFFECT',
   'score': 0.48473087,
   'index': 152,
   'word': 'tumor',
   'start': 811,
   'end': 816},
  {'entity': 'I-EFFECT',
   'score': 0.77938753,
   'index'

In [16]:
a = []
for abstract in entities:
    for entity in records:
        if entity['entity'] == "B-DRUG" or entity['entity'] == "I-DRUG":
            a.append((entity['word'], entity['entity']))

In [5]:
first_abstract = ner_pipeline(abstracts[0])

NameError: name 'abstracts' is not defined

In [6]:
second_abstract = ner_pipeline(records[1])

ValueError: At least one input is required.

In [44]:
a = [{entity['word']:entity['entity']} for entity in first_abstract
     if entity['entity'] == "B-DRUG" or entity['entity'] == "I-DRUG"]
a

[{'trast': 'B-DRUG'},
 {'##uzumab': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'cape': 'B-DRUG'},
 {'##citabine': 'I-DRUG'},
 {'anthr': 'B-DRUG'},
 {'taxa': 'B-DRUG'},
 {'##nes': 'I-DRUG'},
 {'trast': 'B-DRUG'},
 {'##uzumab': 'I-DRUG'},
 {'trast': 'B-DRUG'},
 {'##uzumab': 'I-DRUG'},
 {'trast': 'B-DRUG'},
 {'##uzumab': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'let': 'B-DRUG'},
 {'##roz': 'I-DRUG'},
 {'##ole': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'trast': 'B-DRUG'},
 {'##uzumab': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'trast': 'B-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'},
 {'lapa': 'B-DRUG'},
 {'##tin': 'I-DRUG'},
 {'##ib': 'I-DRUG'}]

In [37]:
drug_names = ""
for item in a:
    for word, tag in item.items():
        if tag == "B-DRUG":
            drug_names += "DRUG-" + word
        elif tag == "I-DRUG":
            drug_names += word.removeprefix('##')
        else:
            continue

In [38]:
drug_list = drug_names.split("DRUG-")
drug_list = list(set([d for d in drug_list if d]))  # remove any empty strings

print(drug_list)

['taxanes', 'trastuzumab', 'lapatinib', 'capecitabine', 'letrozole', 'anthr', 'trast']


In [36]:
b = [{entity['word']:entity['entity']} for entity in first_abstract
     if entity['entity'] == "B-EFFECT" or entity['entity'] == "I-EFFECT"]
b

[{'inhibition': 'B-EFFECT'},
 {'of': 'I-EFFECT'},
 {'tumor': 'I-EFFECT'},
 {'cell': 'I-EFFECT'},
 {'growth': 'I-EFFECT'},
 {'diarrhea': 'B-EFFECT'}]

In [41]:
ades = ""
for item in b:
    for word, tag in item.items():
        if tag == 'B-EFFECT':
            ades += 'ADE-' + word + " "
        elif tag == 'I-EFFECT':
            ades += word.removeprefix('##') + " "
        else:
            continue
ades

'ADE-inhibition of tumor cell growth ADE-diarrhea '

In [43]:
ade_list = ades.split("ADE-")
ade_list = list(set([d.strip() for d in ade_list if d]))  # remove any empty strings

print(ade_list)

['inhibition of tumor cell growth', 'diarrhea']


In [11]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

nlp = spacy.load("en_core_sci_sm")

In [12]:
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmp9n6g5ivx
Finished download, copying /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmp9n6g5ivx to cache at /Users/macpc/.scispacy/datasets/2b79923846fb52e62d686f2db846392575c8eb5b732d9d26cd3ca9378c622d40.87bd52d0f0ee055c1e455ef54ba45149d188552f07991b765da256a1b512ca0b.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin not found in cache, downloading to /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpn8yxap4b
Finished download, copying /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpn8yxap4b to cache at /Users/macpc/.scispacy/datasets/7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json not found in cache, downloading to /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpuqk5wkqf
Finished download, copying /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpuqk5wkqf to cache at /Users/macpc/.scispacy/datasets/6238f505f56aca33290aab44097f67dd1b88880e3be6d6dcce65e56e9255b7d4.d7f77b1629001b40f1b1bc951f3a890ff2d516fb8fbae3111b236b31b33d6dcf.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2023-04-23/umls_2022_ab_cat0129.jsonl not found in cache, downloading to /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpddb1mdz2
Finished download, copying /var/folders/sg/gtmmqq4j5slbfj0931xyb33r0000gn/T/tmpddb1mdz2 to cache at /Users/macpc/.scispacy/datasets/d5e593bc2d8adeee7754be423cd64f5d331ebf26272074a2575616be55697632.0660f30a60ad00fffd8bbf084a18eb3f462fd192ac5563bf50940fc32a850a3c.umls_2022_ab_cat0129.jsonl
https://s3-us-west-2.amazonaws.com/ai

<scispacy.linking.EntityLinker at 0x7fec97650460>

In [21]:
doc = nlp('cardiac toxicity')

In [24]:
entity = doc.ents[1]

print("Name: ", entity)

Name:  toxicity


In [25]:
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

CUI: C0040539, Name: Toxicity aspects
Definition: Used with drugs and chemicals for experimental human and animal studies of their ill effects. It includes studies to determine the margin of safety or the reactions accompanying administration at various dose levels. It is used also for exposure to environmental agents. Poisoning should be considered for life-threatening exposure to environmental agents.
TUI(s): T080
Aliases: (total: 1): 
	 toxicity
CUI: C0600688, Name: Toxic effect
Definition: The finding of bodily harm due to the poisonous effects of something.
TUI(s): T037
Aliases: (total: 9): 
	 Toxic effect, toxicities, toxic, effects toxics, Toxic effect of, toxic effect, Toxicity, toxics, toxicity
CUI: C0242890, Name: Toxicity Tests
Definition: An array of tests used to determine the toxicity of a substance to living systems. These include tests on clinical drugs, foods, and environmental pollutants.
TUI(s): T060
Aliases: (total: 8): 
	 toxicity tests, testing toxicity, Test, Tox