# Subgraph creation

This notebook is used to create a subgraph using the selected datasets.

# Imports

In [1]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm

from networkx import DiGraph, connected_components

from utils import filter_graph, KG_DATA_PATH, DATA_DIR

In [2]:
# Logger for this notebook, named after the current module via __name__.
logger = logging.getLogger(__name__)

# Prerequisite

- Run **src/rcr/gene_count.py** before running this notebook. This can be done using the following command in your terminal: `python src/rcr/gene_count.py`

# Load chemicals, genes and diseases in dataset

In [3]:
# Load the combined dataset JSON (chemicals, genes and diseases of the selected datasets).
# The file is decoded explicitly as UTF-8 so that reading it does not depend on the
# platform's default locale encoding.
dataset_genes_path = os.path.join(DATA_DIR, 'combined', 'dataset_genes.json')
with open(dataset_genes_path, encoding='utf-8') as f:
    data = json.load(f)

# Constants

In [4]:
# File names passed to filter_graph() as `file_name`, one per knowledge graph.
# NOTE(review): presumably the name under which the normalized edge list is
# written/cached - confirm in utils.filter_graph.
OPENLINK_NORMALIZED_FILE = 'openbiolink_kg_normalized.tsv'  # used for the OpenBioLink KG
CUSTOM_NORMALIZED_FILE = 'custom_kg_normalized.tsv'  # used for the custom KG

# Load network

In [5]:
# Read the raw OpenBioLink edge list (tab-separated).
openbiolink_path = os.path.join(KG_DATA_PATH, 'raw', 'data', 'openbiolink_kg.tsv')
network_1 = pd.read_csv(openbiolink_path, sep='\t')

# Lower-case the prefix of every node identifier ("PREFIX:ID" -> "prefix:ID"),
# applying the same rule to both endpoint columns.
for node_column in ('source', 'target'):
    network_1[node_column] = network_1[node_column].apply(
        lambda curie: f'{curie.split(":")[0].lower()}:{curie.split(":")[1]}'
    )

# Hand the edges to utils.filter_graph together with the loaded dataset dictionary.
# NOTE(review): the progress bar suggests this normalizes identifiers - see utils for details.
network_1 = filter_graph(
    network_df=network_1,
    file_name=OPENLINK_NORMALIZED_FILE,
    data_dict=data,
)

network_1

Normalizing graph: 100%|██████████| 235881/235881 [05:32<00:00, 708.42it/s] 


Unnamed: 0,source,target,polarity
0,pubchem.compound:10000456,ncbigene:6347,-1
1,pubchem.compound:10022508,ncbigene:7157,1
2,pubchem.compound:10071166,ncbigene:3776,1
3,pubchem.compound:10074640,ncbigene:3815,1
4,pubchem.compound:10074640,ncbigene:1436,-1
...,...,...,...
48873,ncbigene:999,ncbigene:3480,-1
48874,ncbigene:999,ncbigene:4233,-1
48875,ncbigene:999,ncbigene:5594,-1
48876,ncbigene:999,ncbigene:5595,-1


In [6]:
# Read the raw custom KG edge list (tab-separated), keeping only the edge columns.
custom_kg_path = os.path.join(KG_DATA_PATH, 'raw', 'data', 'custom_kg.tsv')
network_2 = pd.read_csv(
    custom_kg_path,
    sep='\t',
    usecols=['source', 'target', 'relation'],
)

# Lower-case the prefix of every node identifier ("PREFIX:ID" -> "prefix:ID"),
# applying the same rule to both endpoint columns.
for node_column in ('source', 'target'):
    network_2[node_column] = network_2[node_column].apply(
        lambda curie: f'{curie.split(":")[0].lower()}:{curie.split(":")[1]}'
    )

# Hand the edges to utils.filter_graph together with the loaded dataset dictionary.
# NOTE(review): the progress bar suggests this normalizes identifiers - see utils for details.
network_2 = filter_graph(
    network_df=network_2,
    file_name=CUSTOM_NORMALIZED_FILE,
    data_dict=data,
)
network_2

Normalizing graph: 100%|██████████| 220369/220369 [02:42<00:00, 1356.92it/s]


Unnamed: 0,source,target,polarity
0,pubchem.compound:4943,ncbigene:2562,1
1,pubchem.compound:667468,ncbigene:147,-1
2,pubchem.compound:4011,ncbigene:1133,-1
3,pubchem.compound:4636,ncbigene:148,1
4,pubchem.compound:2083,ncbigene:154,1
...,...,...,...
52177,ncbigene:120892,mondo:0005180,1
52178,ncbigene:283120,mondo:0019004,1
52179,ncbigene:653361,mondo:0018305,1
52180,ncbigene:727897,mondo:0008345,1


# Graph analysis

In [8]:
# Components with fewer nodes than this are printed as "small" components.
SMALL_COMPONENT_MAX_NODES = 7


def _build_polarity_graph(edge_df):
    """Build a directed graph from a three-column edge table.

    Each row of ``edge_df.values`` is unpacked as (source, target, polarity);
    the third value is stored on the edge under the ``polarity`` attribute.

    :param edge_df: DataFrame whose rows hold exactly three values per edge.
    :return: a ``DiGraph`` with one edge per distinct (source, target) pair.
    """
    kg = DiGraph()
    kg.add_edges_from(
        (source, target, {'polarity': polarity})
        for source, target, polarity in edge_df.values
    )
    return kg


def _components_largest_first(kg):
    """Return the connected components of the undirected version of ``kg``.

    :param kg: directed graph to analyse.
    :return: list of node sets, sorted by size in descending order.
    """
    return sorted(connected_components(kg.to_undirected()), key=len, reverse=True)


def _print_small_components(title, components, max_nodes=SMALL_COMPONENT_MAX_NODES):
    """Print ``title`` followed by every component with fewer than ``max_nodes`` nodes."""
    print(title)
    for component in components:
        if len(component) < max_nodes:
            print(component)


# OpenBioLink KG
graph = _build_polarity_graph(network_1)
connected_components_subgraph = _components_largest_first(graph)

# Custom KG
graph1 = _build_polarity_graph(network_2)
connected_components_subgraph1 = _components_largest_first(graph1)

_print_small_components('OpenBioLink KG:', connected_components_subgraph)
_print_small_components('\nCustom KG:', connected_components_subgraph1)

OpenBioLink
{'ncbigene:374403', 'ncbigene:83874', 'ncbigene:54662', 'ncbigene:26000', 'ncbigene:11021', 'ncbigene:57465'}
{'ncbigene:10006', 'ncbigene:81624', 'ncbigene:10787', 'ncbigene:26999', 'ncbigene:10163'}
{'ncbigene:150274', 'ncbigene:51218', 'ncbigene:81689', 'ncbigene:122961'}
{'pubchem.compound:2688', 'ncbigene:90550', 'ncbigene:9187'}
{'ncbigene:887', 'pubchem.compound:4355450', 'pubchem.compound:2802894'}
{'pubchem.compound:2993', 'ncbigene:1260', 'ncbigene:1259'}
{'pubchem.compound:4046', 'ncbigene:3039', 'ncbigene:3040'}
{'ncbigene:48', 'ncbigene:3658', 'ncbigene:10539'}
{'mondo:0018544', 'ncbigene:5428', 'ncbigene:215'}
{'ncbigene:22', 'ncbigene:4682', 'ncbigene:10101'}
{'ncbigene:9146', 'ncbigene:9525', 'ncbigene:27183'}
{'ncbigene:79751', 'ncbigene:83733', 'ncbigene:2978'}
{'ncbigene:1465', 'ncbigene:6620', 'ncbigene:4520'}
{'ncbigene:5007', 'ncbigene:9217', 'ncbigene:151742'}
{'ncbigene:5286', 'pubchem.compound:105007'}
{'pubchem.compound:1464', 'ncbigene:64321'}
{'p