In [10]:
import json
import nltk
import pickle
import networkx as nx

import pmdlib as pm 
# import parse_gaf_annotation

from networkx.readwrite import json_graph
# from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize

In [11]:
# parsing pubmed articles from the database file
pmid_abstracts_dict = pm.parse_medline_rmap(medline_file="../datasets/myMEDLINE-file-cancer-04252024.text")

In [12]:
# testing the extraction
counter = 0
for pmid, abst in pmid_abstracts_dict.items():
    print("PMID: ", pmid, "ABST: ", abst)
    print("-----------------------------")
    counter += 1
    if counter > 1:
        break
    

PMID:  439 ABST:  The authors, with 67 Gallium, have obtained positive scintigraphy of the breast only in cases of carcinoma. Reliability of negative scintigraphy however is less good. Isotopic investigation of the bones is important in breast cancer and reveal early osseous metastasis.
-----------------------------
PMID:  2274 ABST:  Serum prolactin concentrations were measured by radioimmunoassays in 98 patients with established carcinoma of breast, 12 patients with cystic mastitis and 10 patients with gynaecomastia and compared with that of age matched normal control women. The serum prolactin levels in the patients with breast cancer, gynaecomastia or cystic mastitis were observed to be similar to that in normal women. It was interesting to note that the levels of prolactin in the luteal phase of the cycle were higher than that in the early follicular phase in normal women.
-----------------------------


In [13]:
def load_pathway_proteins(prot_file):
    prot_list = []
    handle = open(prot_file)
    for line in handle.readlines():
        prot_list.append(line.strip().lower())
    return prot_list
        
# test the function with the protein file    
full_pwprot_list = load_pathway_proteins("../resources/kegg-unique-cancer-proteins.txt")
full_pwprot_list[:5]

['esr1', 'er', 'esr', 'esra', 'estrr']

In [14]:
def load_nct_drug_members(drug_file):
    nct_drug_list = []
    handle = open(drug_file)
    for line in handle.readlines():
        nct_drug_list.append(line.strip().lower())
    return nct_drug_list
        
# test the function with the protein file    
nct_drug_list = load_nct_drug_members("../resources/nct-combo-members.txt")
nct_drug_list[:5]
    

['rosiglitazone', 'pioglitazone', 'alli', 'inh', 'dv']

In [15]:
# Step 3: Create a function to compute the distance between two words
def word_distance(word_list, word1, word2):
    try:
        index1 = word_list.index(word1)
        index2 = word_list.index(word2)
        return abs(index1 - index2)
    except ValueError as e:
        return str(e)

In [16]:
def find_drug_pwprot(drug_list, pwprot_list, pubs_dict):
    count = 0
    drug_pwprot_dict = {}
    for pmid, abst in pubs_dict.items():
        medline_record = []
        words = word_tokenize(abst.lower())
        # for word in words:
        for drug in drug_list:
            for pwprot in pwprot_list:
                if drug in words and pwprot in words:
                    proximity = word_distance(words, drug, pwprot)  
                    medline_record.append((drug, pwprot, proximity))                        
                    
                    count += 1
                    if count % 1000 == 0:
                        print("Current count: ", count)
                        print("PMID: ", pmid, ", Drug: ->", drug, " -> ", ", Pwprot: -> ",  pwprot, ", Proximity:  -> ", proximity)
                        # break
        if len(medline_record) > 0:              
            drug_pwprot_dict[pmid] = medline_record
    return drug_pwprot_dict

In [17]:
drug_prot_info_dict = find_drug_pwprot(nct_drug_list, full_pwprot_list, pmid_abstracts_dict)

Current count:  1000
PMID:  2903322 , Drug: -> progesterone  ->  , Pwprot: ->  pgr , Proximity:  ->  2
Current count:  2000
PMID:  7499360 , Drug: -> progesterone  ->  , Pwprot: ->  p53 , Proximity:  ->  14
Current count:  3000
PMID:  8706561 , Drug: -> testosterone  ->  , Pwprot: ->  er , Proximity:  ->  14
Current count:  4000
PMID:  9816160 , Drug: -> progesterone  ->  , Pwprot: ->  pgr , Proximity:  ->  97
Current count:  5000
PMID:  11062733 , Drug: -> danazol  ->  , Pwprot: ->  pr , Proximity:  ->  17
Current count:  6000
PMID:  12046611 , Drug: -> progesterone  ->  , Pwprot: ->  er , Proximity:  ->  9
Current count:  7000
PMID:  14529565 , Drug: -> progesterone  ->  , Pwprot: ->  p53 , Proximity:  ->  19
Current count:  8000
PMID:  15601642 , Drug: -> progesterone  ->  , Pwprot: ->  er , Proximity:  ->  55
Current count:  9000
PMID:  16334152 , Drug: -> progesterone  ->  , Pwprot: ->  er , Proximity:  ->  3
Current count:  10000
PMID:  17169390 , Drug: -> exemestane  ->  , Pwpro

In [18]:
len(drug_prot_info_dict)

18293

In [21]:
counter = 0

for key, val in drug_prot_info_dict.items():
    print("key: ", key, ", val: ", val)
    counter += 1
    if counter > 10:
        break

key:  92478 , val:  [('progesterone', 'pr', 3)]
key:  221750 , val:  [('progesterone', 'er', 48), ('progesterone', 'pr', 3)]
key:  303543 , val:  [('cyclophosphamide', 'pr', 79), ('methotrexate', 'pr', 69)]
key:  326386 , val:  [('progesterone', 'er', 118), ('progesterone', 'pgr', 52)]
key:  345042 , val:  [('progesterone', 'er', 70), ('progesterone', 'era', 74)]
key:  363253 , val:  [('cyclophosphamide', 'pr', 67), ('methotrexate', 'pr', 62), ('prednisone', 'pr', 46)]
key:  385079 , val:  [('progesterone', 'er', 110), ('progesterone', 'pgr', 3)]
key:  394576 , val:  [('cyclophosphamide', 'pr', 89), ('methotrexate', 'pr', 82)]
key:  455226 , val:  [('estradiol', 'er', 20)]
key:  467095 , val:  [('progesterone', 'er', 105), ('progesterone', 'pgr', 107)]
key:  494363 , val:  [('estradiol', 'er', 27)]


In [19]:
# File path where you want to save the JSON file
file_path = '../outputs/nct-drug-cancer-pathways-06062024.json'


# Write the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(drug_prot_info_dict, json_file, indent=4)  # indent=4 is used for pretty-printing

print(f"Dictionary has been saved to {file_path}")

Dictionary has been saved to ../outputs/nct-drug-cancer-pathways-06062024.json


In [None]:
# # File path where you want to save the JSON file
# file_path = '../outputs/nct-drug-pwprot-06012024.json'


# # Write the dictionary to a JSON file
# with open(file_path, 'w') as json_file:
#     json.dump(drug_prot_info_dict, json_file, indent=4)  # indent=4 is used for pretty-printing

# print(f"Dictionary has been saved to {file_path}")

In [66]:
# explorer = 0
# for key, value in drug_prot_info_dict.items():
#     if len(value) > 0:
        # print("Key: ", key, " Value: ", value)
    # explorer +=1
    # if explorer < 100:
    #     break

In [58]:
# pip install tqdm

In [44]:
from tqdm import tqdm
import time

# Example task: a loop that sleeps for a short time
for i in tqdm(range(10)):
    time.sleep(0.5)  # Simulate a task taking some time

100%|██████████| 10/10 [00:05<00:00,  1.98it/s]


In [102]:
import pickle
import networkx as nx

# Load graph layers from pickle files
combos_pickle_path = "../datasets/fda-validated-combos.pickle"

# Load the combo and data target graph layers
with open(combos_pickle_path, "rb") as file:
    combosGraphLayer = pickle.load(file)

In [107]:
# Load the combo and data target graph layers
drug_file = "../datasets/nct_drugs.txt"

nct_drugs = []
handle = open(drug_file)
for drug in handle.readlines():
    nct_drugs.append(drug.strip().lower())
    
   

In [109]:
# for drug in nct_drugs:
#     print(drug)

In [86]:
def calc_dist_and_chunk(text, word1, word2):
    # Find the indices of the first occurrence of each word in the text
    index1 = text.find(word1)
    index2 = text.find(word2)  
    
    # If either word is not found, return -1 indicating an error
    if index1 == -1 or index2 == -1:
        distance = -1
    else:
    
        # Calculate the distance between the words
        distance = abs(index1 - index2) 
    
    return distance


In [116]:
count = 0
hits_count = 0
current_count = 0
list_of_hits = []
for pmid, abst in pmid_abstracts_dict.items():
    current_count += 1
    for drug in nct_drugs:
        for prot in full_prot_list:
            if prot in abst and drug in abst:
                proximity, chunk = calc_dist_and_chunk(abst, drug, prot)
                if proximity > 0:
                    hits_count += 1
                    list_of_hits.append((drug, prot, proximity, pmid, abst))

    if hits_count % 101 == 0:
        print("Current number of hits: ", hits_count)
        hits_count += 1
    
    if current_count % 100000 == 0 :
        print("Publications processed : ", current_count)        


Current number of hits:  0
Current number of hits:  303
Current number of hits:  505
Current number of hits:  808
Current number of hits:  909
Current number of hits:  1010
Current number of hits:  1212
Current number of hits:  1313
Current number of hits:  1414
Current number of hits:  1616
Current number of hits:  1818
Current number of hits:  1919
Current number of hits:  2020
Current number of hits:  2222
Current number of hits:  2424
Current number of hits:  2525
Current number of hits:  2626
Current number of hits:  2727
Current number of hits:  2929
Current number of hits:  3232
Current number of hits:  3333
Current number of hits:  3535
Current number of hits:  3737
Current number of hits:  4141
Current number of hits:  4444
Current number of hits:  4545
Current number of hits:  4646
Current number of hits:  4747
Current number of hits:  4848
Current number of hits:  4949
Current number of hits:  5151
Current number of hits:  5252
Current number of hits:  5353
Current number of

In [145]:
import json

# Define the file path
file_path = "../datasets/drug-protein-prox-hits.json"

# Write the data to a JSON file
with open(file_path, "w") as json_file:
    json.dump(list_of_hits, json_file)





