In [4]:

import pandas as pd
import requests as rq
import os
parent_dir =os.path.dirname(os.getcwd()) 

drugmech_array = pd.read_csv(os.path.join(parent_dir,'DrugmechDB Data/Processed Data/DrugMechDB Processed Triples.csv'))

robokops_array = pd.read_csv(os.path.join(parent_dir,'ROBOKOP Data/ROBOKOP Processed Triples.csv'))



In [5]:
stacked_df = pd.concat([drugmech_array, robokops_array], axis=0,ignore_index=True)
stacked_df = stacked_df.drop('Unnamed: 0', axis = 1)

In [6]:
print(stacked_df)

           drug_name           drug_id  \
0           imatinib  DRUGBANK:DB00619   
1           imatinib  DRUGBANK:DB00619   
2           imatinib  DRUGBANK:DB00619   
3      acetaminophen  DRUGBANK:DB00316   
4      acetaminophen  DRUGBANK:DB00316   
...              ...               ...   
11940   theophylline       CHEBI:28177   
11941     dyphylline        CHEBI:4728   
11942     dyphylline        CHEBI:4728   
11943     dyphylline        CHEBI:4728   
11944     dyphylline        CHEBI:4728   

                                            disease_name     disease_id  \
0                                              cml (ph+)   MESH:D015464   
1                             systemic mast cell disease   MESH:D034721   
2                             systemic mast cell disease   MESH:D034721   
3                                                   pain   MESH:D010146   
4                                                   pain   MESH:D010146   
...                                          

In [7]:
import json
import time 

def run_node_normalizer(id):
    id_url = id.replace(":","%3A")
    URL= f"https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie={id_url}&conflate=true&drug_chemical_conflate=true&description=false"
    response = rq.get(url = URL)
    response_json = response.json()
    if response.status_code == 200 and response_json[id] != None:
        identifier = response_json[id]["id"]["identifier"]
        name = response_json[id]["id"]["label"].lower()
    return identifier, name

In [8]:
drug_dict = {}

for index, row in stacked_df.iterrows():
    try:
        print(f"Iteration: {index}")
        if row['drug_id'] in drug_dict.keys():
            stacked_df.loc[index,'drug_name'] = drug_dict[row['drug_id']][1]
            stacked_df.loc[index,'drug_id'] = drug_dict[row['drug_id']][0]
        else: 
            id, name = run_node_normalizer(row['drug_id']) 
            drug_dict.update({row['drug_id']:[id, name]})
            stacked_df.loc[index,'drug_name'] = drug_dict[row['drug_id']][1]
            stacked_df.loc[index,'drug_id'] = drug_dict[row['drug_id']][0]
    except UnboundLocalError: ###if the triple can't be node normalized, remove this row
        stacked_df.loc[index,'drug_name'] = 0
        print(f"Skipped {index} bc node normalizer cant find this")
        continue
        

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Skipped 12 bc node normalizer cant find this
Iteration: 13
Skipped 13 bc node normalizer cant find this
Iteration: 14
Skipped 14 bc node normalizer cant find this
Iteration: 15
Skipped 15 bc node normalizer cant find this
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32


KeyboardInterrupt: 

In [None]:
disease_dict = {}

for index, row in stacked_df.iterrows():
    try:
        print(f"Iteration: {index}")
        if row['disease_id'] in disease_dict.keys():
            stacked_df.loc[index,'disease_name'] = disease_dict[row['disease_id']][1]
            stacked_df.loc[index,'disease_id'] = disease_dict[row['disease_id']][0]
        else: 
            id, name = run_node_normalizer(row['disease_id'])
            disease_dict.update({row['disease_id']:[id, name]})
            stacked_df.loc[index,'disease_name'] = disease_dict[row['disease_id']][1]
            stacked_df.loc[index,'disease_id'] = disease_dict[row['disease_id']][0]
    except UnboundLocalError:
        stacked_df.loc[index,'disease_name'] = 0
        print(f"Skipped {index} bc node normalizer cant find this")
        continue
    
        

In [None]:
gene_dict = {}

for index, row in stacked_df.iterrows():
    try:
        print(f"Iteration: {index}")
        if row['gene_id'] in gene_dict.keys():
            stacked_df.loc[index,'gene_name'] = gene_dict[row['gene_id']][1]
            stacked_df.loc[index,'gene_id'] = gene_dict[row['gene_id']][0]
        else: 
            id, name = run_node_normalizer(row['gene_id']) 
            gene_dict.update({row['gene_id']:[id, name]})
            stacked_df.loc[index,'gene_name'] = gene_dict[row['gene_id']][1]
            stacked_df.loc[index,'gene_id'] = gene_dict[row['gene_id']][0]
    except UnboundLocalError:
        stacked_df.loc[index,'gene_name'] = 0
        print(f"Skipped {index} bc node normalizer cant find this")
        continue
    
        

In [8]:
#####data cleaning ###remove all nodes that can't be node normalized
columns_to_check = ['drug_name','disease_name','gene_name']
stacked_df = stacked_df[~(stacked_df[columns_to_check] == 0).any(axis=1)] 

In [10]:
#####import hgnc complete set to change gene names that can be changed to protein names
json_file = os.path.join(parent_dir, 'hgnc_complete_set.json')
with open(json_file, 'r') as file:
    data = json.load(file)

data_len = len(data['response']['docs'])

gene_prot_list = []

for i in range(data_len):
    gene_symbol = data['response']['docs'][i]['symbol']
    protein_name = data['response']['docs'][i]['name']
    row = [gene_symbol,protein_name]
    gene_prot_list.append(row)

columns = ['symbol','name']

hgnc_df = pd.DataFrame(gene_prot_list, columns=['symbol','name'])


In [11]:
print(stacked_df)

          drug_name      drug_id  \
0          imatinib  CHEBI:45783   
1          imatinib  CHEBI:45783   
2          imatinib  CHEBI:45783   
3       paracetamol  CHEBI:46195   
4       paracetamol  CHEBI:46195   
...             ...          ...   
11940  theophylline  CHEBI:28177   
11941    dyphylline   CHEBI:4728   
11942    dyphylline   CHEBI:4728   
11943    dyphylline   CHEBI:4728   
11944    dyphylline   CHEBI:4728   

                                            disease_name     disease_id  \
0                                              cml (ph+)   MESH:D015464   
1                             systemic mast cell disease   MESH:D034721   
2                             systemic mast cell disease   MESH:D034721   
3                                                   pain   MESH:D010146   
4                                                   pain   MESH:D010146   
...                                                  ...            ...   
11940  asthma-chronic obstructive pulmonar

In [11]:
#####TO OBTAIN DATASET WITH ONLY PROTEIN DESCRIPTORS (IF YOU ONLY WANT GENE NAMES SKIP THIS STEP)######
import pandas as pd

def GetProteinName(gene_name):
    try:
        i = hgnc_df[hgnc_df['symbol']==gene_name.upper()].index.values
        #print(str(i) + " is the index")
        index = int(i[0])
        protein = hgnc_df.at[index, 'name']
        #print(gene + " maps to " + protein)
    except:
        print(f"Could not map gene symbol:{gene_name}")
        protein = 'null'
    return protein

for i in range(len(stacked_df)):
    protein_descriptor = GetProteinName(stacked_df.iloc[i,4]).lower()
    stacked_df.iloc[i,4] = protein_descriptor


Could not map gene symbol:parc_haein dna topoisomerase 4 subunit a (sprot)
Could not map gene symbol:rs12_ecoli small ribosomal subunit protein us12 (sprot)
Could not map gene symbol:erg11
Could not map gene symbol:pbp2_strr6 penicillin-binding protein 2b (sprot)
Could not map gene symbol:fusa
Could not map gene symbol:rl10_haein large ribosomal subunit protein ul10 (sprot)
Could not map gene symbol:parc_strpn dna topoisomerase 4 subunit a (sprot)
Could not map gene symbol:mrda_ecoli peptidoglycan d,d-transpeptidase mrda (sprot)
Could not map gene symbol:mrda_haein peptidoglycan d,d-transpeptidase mrda (sprot)
Could not map gene symbol:rl10_salti large ribosomal subunit protein ul10 (sprot)
Could not map gene symbol:dpol_hcmva dna polymerase catalytic subunit (sprot)
Could not map gene symbol:rpoc
Could not map gene symbol:dacb_ecoli d-alanyl-d-alanine carboxypeptidase dacb (sprot)
Could not map gene symbol:pbpa_strr6 penicillin-binding protein 1a (sprot)
Could not map gene symbol:ccd8

In [12]:
mask = stacked_df['gene_name'] != 'null' ###remove all rows where gene couldnt be converted to protein name
filtered_df = stacked_df[mask]
filtered_df.rename(columns ={'gene_name':'protein_name'},inplace = True) ####we only select drug, disease, and protein columns for final dataframe
final_columns = ['drug_name','disease_name','protein_name']
final_df = filtered_df.drop_duplicates(subset = final_columns)
final_df = final_df.reset_index().drop('index', axis = 1).drop('gene_id',axis=1).drop('drug_id',axis=1).drop('disease_id',axis=1)

final_df.to_csv(os.path.join(parent_dir,'ROBOKOP+DrugmechDB Data/ROBOMechDB Processed Triples.csv'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.rename(columns ={'gene_name':'protein_name'},inplace = True) ####we only select drug, disease, and protein columns for final dataframe


In [14]:
final_df.to_csv('/Users/eding/Desktop/U24 ROBOKOP Project/Step 1 Data Processing/ROBOKOP+DrugmechDB/ROBOKOP+DrugmechDB Data/ROBOMechDB Processed Triples.csv')