In [1]:
import pickle
import pandas as pd
import spacy

from ade_entity_extraction import ADE_NP_Extraction


# Extracting Lists for CheckList
Lists of Drugs and ADEs \
Corpus: PsyTAR

Request access to the [PsyTAR corpus](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6495095/).

In [7]:
# Drug names used in PsyTAR; these are placeholders to be replaced by hand.
drugs = [
    "Drug_1",
    "Drug_2",
]
# Path to the PsyTAR data file; empty until the user fills it in.
path_to_data = ""

### 1. Drug Names
PsyTAR is annotated for drugs. Drug names appear with different capitalization, so each name is added in both its original and its lowercase form.

In [8]:
# Add a lowercase variant of every drug name.
# The list is rebuilt (not extended in place) and de-duplicated in first-seen
# order, so re-running this cell does not append the lowercase names again and
# names that are already lowercase are not listed twice.
drugs_lowercase = list(dict.fromkeys(d.lower() for d in drugs))
drugs = list(dict.fromkeys(drugs + drugs_lowercase))

print(drugs)
print(len(drugs))

['Drug_1', 'Drug_2', 'drug_1', 'drug_2']
4


In [3]:
# write the drug list to disk as a pickle file
with open("extraction_results/extracted_psytar_drugs.pkl", mode="wb") as drug_file:
    pickle.dump(drugs, drug_file)

### 2. ADEs
PsyTAR is annotated for ADEs. All unique ADEs are extracted from the corpus and subsequently filtered for short to medium-length noun phrases.

In [None]:
# Read the "ADR_Identified" sheet, which holds the annotated ADEs, and preview it.
psytar_ade = pd.read_excel(
    path_to_data,
    sheet_name="ADR_Identified",
)
psytar_ade.head()

In [None]:
# extract all ADEs
# NOTE(review): assumes the ADE annotations start at the fifth column of the
# sheet — confirm against the sheet layout.
ade_columns = psytar_ade.columns[4:]

# gather the distinct values of every ADE column, in order of appearance
ades_list = []
for col in ade_columns:
    ades_list += psytar_ade[col].unique().tolist()

# Remove duplicates across columns and drop non-string cells (the NaNs).
# dict.fromkeys keeps first-seen order, so `ades` is identical on every run;
# going through set() would make the order depend on string hash randomization.
ades = [a for a in dict.fromkeys(ades_list) if isinstance(a, str)]

print(len(ades))

ADEs in PsyTAR come in different phrase types. Only short or medium-length noun phrases (NPs) can be used to fill the templates. ADEs in this format will be extracted in the next steps.

In [None]:
# name of the spaCy pipeline handed to the extractor (must be installed)
spacy_modelname = "en_core_web_sm"

extractor = ADE_NP_Extraction(
    ade_list=ades,
    spacy_model=spacy_modelname,
)

# build the POS tagsets used to filter the ADEs
tagsets_np = extractor.create_NP_tagsets()

# collect the ADEs that pass the filters
# (n=-1 presumably means "no limit", i.e. all filtered ADEs — TODO confirm)
ades_extracted = extractor.extract_NP_ADEs(
    tagsets_np,
    n=-1,
)
ades_extracted[:10]

In [None]:
# Compare the ADEs that match the filters with the other ADEs in the whole corpus.
# NOTE(review): this builds a second extractor with the same arguments as the one
# created for the extraction step — confirm a fresh instance is really needed.
extractor = ADE_NP_Extraction(
    ade_list=ades,
    spacy_model=spacy_modelname,
)

ade_nouns_all, ade_other = extractor.count_matching_ADEs(
    tagsets_np,
)

In [8]:
# write the filtered ADE list to disk via the extractor's save helper
extractor.save_list(
    ades_extracted,
    output_filename="extraction_results/extracted_psytar_ades.pkl",
)