In [1]:
from IPython import get_ipython
from importlib import reload

# Import the desired libraries
import txgnn

# Reload the libraries
reload(txgnn)

from txgnn import TxData, TxGNN, TxEval

import numpy as np
import pandas as pd
import torch
from torch_geometric.utils import k_hop_subgraph
from txgnn.data_splits.do_obo_parser import OBOReader as DO_Reader
import os
from goatools.obo_parser import GODag
from goatools.godag.go_tasks import get_go2parents, get_go2children
from txgnn.utils import preprocess_kg, create_split, process_disease_area_split, create_dgl_graph, evaluate_graph_construct, convert2str, data_download_wrapper, map_node_id_2_idx
dirname = '/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits'

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class DataSplit:
      def __init__(self, kg_path=''): 
         self.kg, self.nodes, self.edges = self.load_kg(kg_path)
         self.edge_index = torch.LongTensor(self.edges.get(['x_index', 'y_index']).values.T)
         self.goid2parent, self.goid2children = self.load_go()
         
      def load_kg(self, pth=''):
         kg = pd.read_csv(pth+'kg.csv', low_memory=False)
         nodes = pd.read_csv(pth+'nodes.csv', low_memory=False)
         edges = pd.read_csv(pth+'edges.csv', low_memory=False)
         return kg, nodes, edges    
      
      def load_go(self):
          go_dag = GODag('/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo')
          goid2parent = get_go2parents(go_dag, relationships=set())
          goid2children = get_go2children(go_dag, relationships=set())
          return goid2parent, goid2children
      
      def get_molfunc_nodes(self):
         self.molfunc_nodes = np.array(self.nodes[(self.nodes['node_type']=='molecular_function') & (self.nodes['node_source']=='GO')].node_index)
      
      def get_children(self, go_id, obo_file=os.path.join(dirname, 'go-basic.obo')):
         go_dag = GODag('/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo')
         go_term = go_dag[go_id]

         children = go_term.get_all_children()
         children_data = [{'node_name': go_dag[child_id].name, 'go_id': child_id} for child_id in children]
         children_df = pd.DataFrame(children_data)

         return children_df

      def get_nodes_for_goid(self, code):
          children_df = self.get_children(code)
          if children_df.empty:
            children_df = pd.DataFrame(columns=['node_name', 'go_id'])
          merged_df = pd.merge(self.nodes, children_df, on='node_name', how='inner')
          return merged_df
          return merged_df.get('node_index').values
      
      def get_edge_group(self, nodes, test_size = 0.05, add_prot_mf=True): 
         test_num_edges = round(self.edge_index.shape[1]*test_size)
         
         if add_prot_mf: 
            x = self.edges.query('x_index in @nodes or y_index in @nodes').query('relation=="molfunc_protein" or relation=="rev_molfunc_protein"')
            drug_dis_edges = x.get(['x_index','y_index']).values.T
            num_random_edges = test_num_edges - drug_dis_edges.shape[1]
         else: 
            num_random_edges = test_num_edges
            
         
         subgraph_nodes, filtered_edge_index, node_map, edge_mask = k_hop_subgraph(list(nodes), 2, self.edge_index) #one hop neighborhood
         sample_idx = np.random.choice(filtered_edge_index.shape[1], num_random_edges, replace=False)
         sample_edges = filtered_edge_index[:, sample_idx].numpy()
         
         if add_prot_mf:
            test_edges = np.concatenate([drug_dis_edges, sample_edges], axis=1)
         else: 
            test_edges = sample_edges
         
         test_edges = np.unique(test_edges, axis=1)
         return test_edges 
      
      def get_test_kg_for_disease(self, goid_code, test_size = 0.05, add_prot_mf=True): 
         molfunc_nodes = self.get_nodes_for_goid(goid_code)
         molfunc_edges = self.get_edge_group(molfunc_nodes, test_size = test_size, add_prot_mf=add_prot_mf)
         molfunc_edges = pd.DataFrame(molfunc_edges.T, columns=['x_index','y_index'])
         select_kg = pd.merge(self.kg, molfunc_edges, 'right').drop_duplicates()
         return select_kg
      

In [8]:
ds = DataSplit('/Users/emmatysinger/Develop/meng/kg/')
ds.get_molfunc_nodes()

/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms


In [11]:
name2id = {
            'binding': 'GO:0005488', 
            'transporter activity': 'GO:0005215',
            'molecular function regulator': 'GO:0098772', 
            'oxidoreductase activity': 'GO:0016491',
            'hydrolase activity': 'GO:0016787',
            'transferase activity': 'GO:0016740'
        }

for name, id in name2id.items():
    data_folder = '/Users/emmatysinger/Develop/meng/kg/molecular_function_files/'
    nodes_df = ds.get_nodes_for_goid(id)
    nodes_df[['node_id', 'node_type', 'node_name', 'node_source']].to_csv(os.path.join(data_folder, name+'.csv'))

/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms


In [9]:
nodes_df = ds.get_nodes_for_goid('GO:0016491')
nodes_df[['node_id', 'node_type', 'node_name', 'node_source']].to_csv()

/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms


Unnamed: 0,node_index,node_id,node_type,node_name,node_source,go_id
0,53534,33735,molecular_function,aspartate dehydrogenase activity,GO,GO:0033735
1,53574,16709,molecular_function,"oxidoreductase activity, acting on paired dono...",GO,GO:0016709
2,53576,8395,molecular_function,steroid hydroxylase activity,GO,GO:0008395
3,53577,16703,molecular_function,"oxidoreductase activity, acting on single dono...",GO,GO:0016703
4,53589,16616,molecular_function,"oxidoreductase activity, acting on the CH-OH g...",GO,GO:0016616
...,...,...,...,...,...,...
2272,124201,103012,molecular_function,ferredoxin-thioredoxin reductase activity,GO,GO:0103012
2273,124202,52661,molecular_function,S-lactaldehyde reductase activity,GO,GO:0052661
2274,124203,52660,molecular_function,R-lactaldehyde reductase activity,GO,GO:0052660
2275,124204,36456,molecular_function,L-methionine-(S)-S-oxide reductase activity,GO,GO:0036456


In [10]:
nodes_df[['node_id', 'node_type', 'node_name', 'node_source']]

Unnamed: 0,node_id,node_type,node_name,node_source
0,33735,molecular_function,aspartate dehydrogenase activity,GO
1,16709,molecular_function,"oxidoreductase activity, acting on paired dono...",GO
2,8395,molecular_function,steroid hydroxylase activity,GO
3,16703,molecular_function,"oxidoreductase activity, acting on single dono...",GO
4,16616,molecular_function,"oxidoreductase activity, acting on the CH-OH g...",GO
...,...,...,...,...
2272,103012,molecular_function,ferredoxin-thioredoxin reductase activity,GO
2273,52661,molecular_function,S-lactaldehyde reductase activity,GO
2274,52660,molecular_function,R-lactaldehyde reductase activity,GO
2275,36456,molecular_function,L-methionine-(S)-S-oxide reductase activity,GO


In [97]:
for id in ds.goid2children['GO:0003824']:
    print(id, len(ds.get_nodes_for_goid(id)))


/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0016491 2277
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0016874 279
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0016218 0
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0016829 605
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0061783 8
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0140640 327
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0004133 0
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:1904091 2
/Users/emmatysi

In [83]:
for id in ['GO:0005488', 'GO:0005198', 'GO:0003824', 'GO:0038024', 'GO:0016209', 'GO:0140104', 'GO:0005215', 'GO:0045182', 'GO:0140110', 'GO:0098772']:
    print(id, len(ds.get_nodes_for_goid(id)))


/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0005488 1794
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0005198 41
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0003824 7102
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0038024 17
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0016209 24
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0140104 15
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0005215 950
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
GO:0045182 12
/Users/emma

In [21]:
ds.nodes[(ds.nodes['node_type']=='molecular_function') & (ds.nodes['node_source']=='GO')]

Unnamed: 0,node_index,node_id,node_type,node_name,node_source
53517,53517,8168,molecular_function,methyltransferase activity,GO
53518,53518,140101,molecular_function,"catalytic activity, acting on a tRNA",GO
53519,53519,140097,molecular_function,"catalytic activity, acting on DNA",GO
53520,53520,140096,molecular_function,"catalytic activity, acting on a protein",GO
53521,53521,140098,molecular_function,"catalytic activity, acting on RNA",GO
...,...,...,...,...,...
124217,124217,61922,molecular_function,histone propionyltransferase activity,GO
124218,124218,61995,molecular_function,ATP-dependent protein-DNA complex displacement...,GO
124219,124219,51266,molecular_function,sirohydrochlorin ferrochelatase activity,GO
124220,124220,51740,molecular_function,ethylene binding,GO


In [36]:
# Filtering the DataFrame
filtered_df = ds.nodes[(ds.nodes['node_type'] == 'molecular_function') & (ds.nodes['node_source'] == 'GO')]

# Creating the dictionary
node_dict = filtered_df.set_index('node_name')['node_index'].to_dict()

In [13]:
kg_path = '/Users/emmatysinger/Develop/meng/kg/'
split = 'binding'
preprocess_kg(kg_path, split)

Generating molecular function area using gene ontology... might take several minutes...
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
Iterating over relations...


100%|██████████| 30/30 [00:39<00:00,  1.32s/it]


Iterating over node types...


100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


In [23]:
kg_path = '/Users/emmatysinger/Develop/meng/kg/'
data_folder = '/Users/emmatysinger/Develop/meng/kg/'
seed = 42
kg_path = os.path.join(kg_path, split + '_kg', 'kg_directed.csv')
df = pd.read_csv(kg_path)
split_data_path = os.path.join(data_folder, split + '_' + str(seed))
             
if not os.path.exists(os.path.join(split_data_path, 'train.csv')):
    if not os.path.exists(split_data_path):
        os.mkdir(split_data_path)           
    print('Creating splits... it takes several minutes...')
    df_train, df_valid, df_test = create_split(df, split, None, split_data_path, seed)
else:
    print('Splits detected... Loading splits....')
    df_train = pd.read_csv(os.path.join(split_data_path, 'train.csv'))
    df_valid = pd.read_csv(os.path.join(split_data_path, 'valid.csv'))
    df_test = pd.read_csv(os.path.join(split_data_path, 'test.csv'))

Splits detected... Loading splits....


In [28]:
def process_molfunc_area_split(data_folder, df, df_test, split):
    molfunc_file_path = os.path.join(data_folder, 'molecular_function_files')
    molfunc_list = pd.read_csv(os.path.join(molfunc_file_path, split + '.csv'))
    print(molfunc_list)
    
    id2idx = dict(df[df.x_type == 'molecular_function'][['x_id', 'x_idx']].drop_duplicates().values)
    id2idx.update(dict(df[df.y_type == 'molecular_function'][['y_id', 'y_idx']].drop_duplicates().values))

    temp_dict = {}

    # for merged disease ids
    for i,j in id2idx.items():
        try:
            if '_' in i:
                for x in i.split('_'):
                    temp_dict[str(float(x))] = j
        except:
            temp_dict[str(float(i))] = j

    id2idx.update(temp_dict)

    molfunc_list['node_idx'] = molfunc_list.node_id.apply(lambda x: map_node_id_2_idx(x, id2idx))
    print(molfunc_list)

    molfunc_rel_types = ['molfunc_protein']
    temp = df_test[df_test.relation.isin(molfunc_rel_types)]
    df_test = df_test.drop(temp[~temp.x_idx.isin(molfunc_list.node_idx.unique())].index)
    
    return df_test

In [29]:
df_test = process_molfunc_area_split(data_folder, df, df_test, split)

      Unnamed: 0  node_id           node_type  \
0              0     3723  molecular_function   
1              1    90079  molecular_function   
2              2     1067  molecular_function   
3              3     5152  molecular_function   
4              4    46906  molecular_function   
...          ...      ...                 ...   
1789        1789    42924  molecular_function   
1790        1790    43426  molecular_function   
1791        1791    31267  molecular_function   
1792        1792     3786  molecular_function   
1793        1793    51740  molecular_function   

                                              node_name node_source  
0                                           RNA binding          GO  
1     translation regulator activity, nucleic acid b...          GO  
2     transcription regulatory region nucleic acid b...          GO  
3            interleukin-1 receptor antagonist activity          GO  
4                                  tetrapyrrole binding      

In [2]:
TxData_inst = TxData(data_folder_path = '/Users/emmatysinger/Develop/meng/kg/')
TxData_inst.prepare_split(split = 'molecular function regulator', seed = 42, no_kg = False)

Found local copy...
Found local copy...
Found local copy...
First time usage... Mapping TxData raw KG to directed csv... it takes several minutes...
Generating molecular function area using gene ontology... might take several minutes...
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
/Users/emmatysinger/Develop/meng/TxGNN/txgnn/data_splits/go-basic.obo: fmt(1.2) rel(2023-10-09) 46,296 Terms
Iterating over relations...


100%|██████████| 30/30 [00:39<00:00,  1.30s/it]


Iterating over node types...


100%|██████████| 10/10 [00:23<00:00,  2.36s/it]


Creating splits... it takes several minutes...
     Unnamed: 0  node_id           node_type  \
0             0     5152  molecular_function   
1             1    30695  molecular_function   
2             2    48018  molecular_function   
3             3    30547  molecular_function   
4             4    30546  molecular_function   
..          ...      ...                 ...   
221         221     1691  molecular_function   
222         222    10859  molecular_function   
223         223  1990948  molecular_function   
224         224    72587  molecular_function   
225         225    10855  molecular_function   

                                             node_name node_source  
0           interleukin-1 receptor antagonist activity          GO  
1                            GTPase regulator activity          GO  
2                             receptor ligand activity          GO  
3                signaling receptor inhibitor activity          GO  
4                signaling rece