In [1]:
####### JSON TO CSV FUNCTION: converts indication_paths.json --> indication_paths.csv #########

import json
import os
import pandas as pd

def json_to_csv(json_file, csv_file):
    """Flatten a JSON file into a CSV file.

    The JSON document is loaded, flattened with ``pd.json_normalize`` (nested
    dict keys become dotted column names such as ``graph.drug``) and written
    to ``csv_file`` without the DataFrame index.

    Parameters
    ----------
    json_file : str or path-like
        Path of the JSON file to read.
    csv_file : str or path-like
        Path of the CSV file to write (overwritten if it exists).

    Raises
    ------
    OSError
        If ``json_file`` cannot be opened or ``csv_file`` cannot be written.
    json.JSONDecodeError
        If ``json_file`` does not contain valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform locale, which mis-decodes non-ASCII names on e.g. Windows.
    with open(json_file, encoding="utf-8") as f:
        json_data = json.load(f)

    df = pd.json_normalize(json_data)
    df.to_csv(csv_file, index=False)

# The data folder 'DrugMechDB Data' is a sibling of the current working directory.
current_dir = os.getcwd()
drugmechdb_data_dir = os.path.join(os.path.dirname(current_dir), 'DrugMechDB Data')

# Example usage: convert the raw indication paths from JSON to CSV, side by side
# in the 'Raw Data' sub-folder.
raw_data_dir = os.path.join(drugmechdb_data_dir, 'Raw Data')
json_file = os.path.join(raw_data_dir, 'indication_paths.json')
csv_file = os.path.join(raw_data_dir, 'indication_paths.csv')
json_to_csv(json_file, csv_file)

In [2]:
#######EXTRACTING TRIPLES FROM DRUGMECHDB PATHWAYS#########
# Load the flattened indication paths written by json_to_csv.
indication_paths_csv = os.path.join(drugmechdb_data_dir, 'Raw Data', 'indication_paths.csv')
df_raw = pd.read_csv(indication_paths_csv)

# Per-path edge and node columns, plus the accumulator the extraction loop fills.
links, nodes = df_raw['links'], df_raw['nodes']
triples_list = []

def check_column_empty_at_index(i):
    """Return True when row ``i`` of ``df_raw`` has no comment-like annotation.

    The columns checked are ``comment``, ``comments``, ``references``,
    ``commments`` and ``comemnt``. The misspelled names are looked up verbatim;
    presumably they mirror misspelled keys in the raw data -- verify against
    the source file. A column that does not exist in ``df_raw`` is treated as
    empty, so the check keeps working on a data snapshot that lacks one of
    these columns instead of raising ``KeyError``.

    Parameters
    ----------
    i : index label
        Row label in the module-level ``df_raw`` frame.

    Returns
    -------
    bool
        True if every checked column that exists is null at row ``i``.
    """
    annotation_columns = ('comment', 'comments', 'references', 'commments', 'comemnt')
    # all() short-circuits on the first non-null annotation, like the chained `and`.
    return all(
        column not in df_raw.columns or pd.isnull(df_raw.at[i, column])
        for column in annotation_columns
    )
        
# Characters removed from the textual links cell before it is split into tokens.
_LINK_PUNCTUATION = str.maketrans('', '', "{}'[]")
# Edge keys that count as a direct drug -> protein relationship.
_DIRECT_RELATIONS = (
    'positively regulates',
    'negatively regulates',
    'decreases activity of',
    'increases activity of',
)

for row in range(len(nodes)):
    path_links = links[row]

    # Paths that never mention a UniProt id are faulty or contain no protein: skip.
    if path_links.find('UniProt:') == -1:
        continue
    # Only paths whose comment-like columns are all empty are processed.
    if not check_column_empty_at_index(row):
        continue

    # Drug / disease names and identifiers for this indication path.
    drug_name = df_raw.at[row, 'graph.drug'].lower()
    mesh_drug_id = df_raw.at[row, 'graph.drug_mesh']
    drugbank_id = df_raw.at[row, 'graph.drugbank'].replace('DB:DB', 'DRUGBANK:DB')
    disease_name = df_raw.at[row, 'graph.disease'].lower()
    mesh_disease_id = df_raw.at[row, 'graph.disease_mesh']

    # Strip braces, brackets and quotes from the links text, split on ", ",
    # then regroup the tokens three at a time: each group is one edge
    # (appears to be "key: ...", "source: ...", "target: ..." -- see sample output).
    tokens = path_links.translate(_LINK_PUNCTUATION).split(", ")
    links_grouped = [tokens[start:start + 3] for start in range(0, len(tokens), 3)]
    edge_texts = [', '.join(edge) for edge in links_grouped]

    # The drug may appear as edge source under its MeSH id or its 'DB:'-prefixed
    # DrugBank id, so a pattern is built for both per relation.
    drugbank_source = drugbank_id.replace('DRUGBANK:', 'DB:')
    direct_edge_patterns = [
        f"key: {relation}, source: {source_id}, target: UniProt:"
        for relation in _DIRECT_RELATIONS
        for source_id in (mesh_drug_id, drugbank_source)
    ]

    for edge, edge_text in zip(links_grouped, edge_texts):
        # Keep only edges that are a direct drug -> protein relationship.
        if not any(pattern in edge_text for pattern in direct_edge_patterns):
            continue
        protein_id = edge[2].replace("target: UniProt", "UniProtKB")
        # Gene name is a 'null' placeholder; per the original note it is
        # filled in by the later node-normalization step from the protein id.
        triples_list.append([
            drug_name.replace("'", ''),
            drugbank_id.replace("'", ''),
            disease_name.replace("'", ''),
            mesh_disease_id.replace("'", ''),
            'null',
            protein_id,
        ])
    

In [3]:
# Fixing raw data errors: overwrite the DrugBank id (element 1) of specific triples.
# NOTE(review): the keys are positions in triples_list, so they are only valid
# for the exact data snapshot and extraction order that produced this list.
drugbank_id_fixes = {
    228: "DRUGBANK:DB08902",
    2301: "DRUGBANK:DB02362",
    2302: "DRUGBANK:DB02362",
    2303: "DRUGBANK:DB02362",
    2304: "DRUGBANK:DB02362",
}
for triple_position, corrected_id in drugbank_id_fixes.items():
    triples_list[triple_position][1] = corrected_id

In [4]:
# Spot check: second edge of the last indication path the extraction loop parsed
# (links_grouped still holds the value from that final processed row).
sample_edge = links_grouped[1]
print(sample_edge)

['key: positively correlated with', 'source: UniProt:P04150', 'target: GO:0120178']


In [5]:
# Sanity check: number of drug-protein triples extracted.
triple_count = len(triples_list)
print(triple_count)

3882


In [6]:
import numpy as np
# Array form of the triples; retained so any later code reading `triples_array`
# still finds it.
triples_array = np.array(triples_list)

# Build the frame straight from the list and name the columns in the constructor.
# Going through the array and assigning df.columns afterwards fails with a
# length mismatch when triples_list is empty (the array is then 1-D with no
# columns). Column order matches the six-element rows appended to triples_list.
df = pd.DataFrame(
    triples_list,
    columns=['drug_name', 'drug_id', 'disease_name', 'disease_id', 'gene_name', 'gene_id'],
)

# The index is written as the first CSV column, as before.
df.to_csv(os.path.join(drugmechdb_data_dir, 'Processed Data', 'DrugMechDB Processed triples.csv'))