In [1]:
import os
import requests
import io
import zipfile

In [2]:
# Create the directory layout that the later cells read from and write to.
# NOTE(review): the date-stamped "cvponline_extract_20191231" directory is
# pre-created here, while a later cell renames a directory whose name starts
# with "cvponline_extract" — confirm an empty pre-created directory cannot be
# picked up instead of the freshly extracted one.
data_dirs = [
    "data", "data/cvponline_extract_20191231",
    "data/featurized_sentences", "data/knowledge_base",
    "data/pubmed_json", "data/pubmed_packed", "data/statistic",
    "data/statistic/featurize_stat", "data/statistic/tag_stats",
    "data/tagged_sentences"
]

for dir_name in data_dirs:
    # makedirs creates missing parents, so the list order no longer matters;
    # exist_ok=True keeps the cell re-runnable, yet still raises if the path
    # exists as a regular file rather than a directory.
    os.makedirs(dir_name, exist_ok=True)

In [3]:
# Download the cvponline database extract (a zip archive hosted on canada.ca)
# and unpack it into data/.

url = "https://www.canada.ca/content/dam/hc-sc/migration/hc-sc/dhp-mps/alt_formats/zip/medeff/databasdon/extract_extrait.zip"
# The timeout (seconds) bounds the connect phase and each wait for data, not
# the whole transfer; without it a stalled server would hang the cell forever.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error (404, 5xx, ...) instead of handing an error page
# to zipfile and getting a confusing BadZipFile.
response.raise_for_status()
# The whole archive is held in memory and unpacked from there.
with zipfile.ZipFile(io.BytesIO(response.content)) as thezip:
    thezip.extractall(path="data/")

In [4]:
# Give the extracted database directory a fixed name: the first suitable
# "data/cvponline_extract*" directory becomes "data/cvponline_extract", the
# path the following cells read from.
data_root = "data/"
target_dir = data_root + "cvponline_extract"

# If the target already exists (e.g. the notebook is re-run) there is nothing
# to do; renaming onto an existing non-empty directory would raise.
if not os.path.isdir(target_dir):
    # Consider only non-empty directories so an empty placeholder directory is
    # never chosen over the real extract. Sorting makes the choice
    # deterministic (os.listdir order is arbitrary); the last entry is the
    # lexicographically greatest name.
    candidates = sorted(
        name
        for name in os.listdir(data_root)
        if name.startswith("cvponline_extract")
        and os.path.isdir(data_root + name)
        and os.listdir(data_root + name)
    )
    if not candidates:
        raise FileNotFoundError(
            "no non-empty 'cvponline_extract*' directory found in " + data_root
        )
    os.rename(data_root + candidates[-1], target_dir)

In [5]:
from src.kb_preprocessing.get_drug_name_reaction_pairs import create_drug_name_reaction_pairs

# Keyword arguments for the pairing step: two files from the renamed extract
# directory and one path under data/knowledge_base.
pair_file_paths = {
    "react_data_path": "data/cvponline_extract/reactions.txt",
    "report_drug_path": "data/cvponline_extract/report_drug.txt",
    "react_drug_pair_targ_path": "data/knowledge_base/reaction_drug_brand_pairs.txt",
}

create_drug_name_reaction_pairs(**pair_file_paths)

________________________
reactions data
len reactions: 2998059
examples
[CVPReaction(reaction_identifier='2601', adverse_react_report='26', adverse_react_term='Rash', system_organ_class='Skin and subcutaneous tissue disorders'), CVPReaction(reaction_identifier='2702', adverse_react_report='27', adverse_react_term='Thrombocytopenia', system_organ_class='Blood and lymphatic system disorders'), CVPReaction(reaction_identifier='2701', adverse_react_report='27', adverse_react_term='Dermatitis bullous', system_organ_class='Skin and subcutaneous tissue disorders'), CVPReaction(reaction_identifier='2801', adverse_react_report='28', adverse_react_term='Chest pain', system_organ_class='General disorders and administration site conditions'), CVPReaction(reaction_identifier='2804', adverse_react_report='28', adverse_react_term='Vomiting', system_organ_class='Gastrointestinal disorders')]
________________________
report drug data
length rep_drug: 3289727
examples
[CVPReportDrug(report_drug_identifi

In [6]:
# Run the project's create_kb_triple step with no arguments.
# NOTE(review): presumably consumes the pair file written by the previous
# cell — confirm in src.kb_preprocessing.get_kb_triple.
from src.kb_preprocessing.get_kb_triple import create_kb_triple

create_kb_triple()

#################
Counter({'total_drug_name_reaction_pairs_database': 19628047, 'total_brand_names_database': 37233, 'total_drug_name_database_without_suffix_filtered_drug_token': 28060, 'total_reactions_database': 12508, 'total_drug_name_database_with_single_suffix_filtered_drug_token': 7828, 'total_drug_names_suffix_filtered': 2939, 'total_drug_name_database_with_multiple_suffix_filtered_drug_token': 1345, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_2': 1067, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_3': 159, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_4': 63, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_5': 31, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_6': 17, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_7': 5, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_9': 1, 'total_drug_name_database_multiple_suffix_filtered_drug_token_NO_8': 1, 'tot

In [7]:
# Run the project's extract_sorted_reaction_dict step with no arguments.
# NOTE(review): input and output locations are decided inside
# src.kb_preprocessing.reaction_name_extraction — confirm there.
from src.kb_preprocessing.reaction_name_extraction import extract_sorted_reaction_dict

extract_sorted_reaction_dict()

In [8]:
# Index range handed to every MEDLINE step below (download, JSON conversion,
# sentence parsing, featurizing).
# NOTE(review): whether end_index is inclusive is decided by the src helpers — confirm.
start_index, end_index = 1, 2

In [9]:
# Download the numbered files selected by start_index/end_index (set in an
# earlier cell).
# NOTE(review): judging by the function name this goes over FTP and needs
# network access — confirm in src.medline_preprocessing.download.
from src.medline_preprocessing.download import numbered_files_ftp_download

numbered_files_ftp_download(start_index=start_index, end_index=end_index)

In [10]:
# Run pubmed_pack_to_json over the same index range; note the two indices are
# passed positionally here, unlike the keyword calls in the neighbouring cells.
# NOTE(review): presumably converts the downloaded packs to JSON — confirm in
# src.medline_preprocessing.medl_xml_to_json.
from src.medline_preprocessing.medl_xml_to_json import pubmed_pack_to_json

pubmed_pack_to_json(start_index, end_index)



In [11]:
# Run the sentence-parsing step over the index range set in an earlier cell.
from src.medline_preprocessing.medl_sent_parse import parse_sents

# Caution: parsing even a single file is noticeably slow, and parsing many
# files takes many hours — keep the index range small while experimenting.
parse_sents(start_index=start_index, end_index=end_index)

reactions keys: dict_keys(['5', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
number drug_names: 2939


In [12]:
# Featurize the tagged PubMed sentences for the index range set in an earlier cell.
from src.medline_preprocessing.featurize import SentenceFeaturerizer

# The featurizer is built with its default constructor arguments and stays
# available in the kernel as sent_featurizer.
sent_featurizer = SentenceFeaturerizer()
# The indices are passed positionally.
# NOTE(review): confirm the parameter order is (start, end) in the method's signature.
sent_featurizer.featurize_tagged_pubmed_sents(start_index, end_index)