# KG statistics

This notebook compares graph statistics of the raw knowledge graphs (KGs) with those of their permuted counterparts.

# Imports

In [1]:
import os
import pandas as pd
import networkx as nx

from utils import KG_DATA_PATH, create_graph_from_df

# Helper Functions

In [2]:
def get_longest_shortest_path(df, graph):
    """Return the longest drug-to-disease shortest path found in ``graph``.

    Drugs are the ``source`` values of ``df`` starting with ``pubchem``;
    diseases are the ``target`` values of ``df`` starting with ``mondo``.

    The length is counted in nodes (number of edges + 1), i.e. the value of
    ``len(nx.shortest_path(graph, drug, disease))``. Pairs with no connecting
    path, and drugs that are not nodes of ``graph``, are skipped.

    :param df: edge list with string columns ``source`` and ``target``
    :param graph: networkx graph to search
    :return: node count of the longest shortest path, or 0 when no
        drug-disease pair is connected
    """
    sources = df['source']
    targets = df['target']

    # na=False keeps the boolean mask valid when an identifier is missing.
    drugs = set(sources[sources.str.startswith('pubchem', na=False)])
    diseases = set(targets[targets.str.startswith('mondo', na=False)])

    longest = 0

    for drug in drugs:
        try:
            # A single BFS per drug yields the distance (in edges) to every
            # reachable node, instead of one search per drug-disease pair.
            distances = nx.single_source_shortest_path_length(graph, drug)
        except nx.NodeNotFound:
            # The drug is listed in df but is not a node of this graph.
            continue

        # Diseases missing from `distances` are unreachable from this drug.
        for disease in diseases.intersection(distances):
            path_length = distances[disease] + 1  # edges -> nodes on the path
            if path_length > longest:
                longest = path_length

    return longest

# Load data

In [3]:
# Locations of the raw (unpermuted) KG edge lists, stored as tab-separated files.
custom_kg_file = os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv')
openbiolink_kg_file = os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv')

custom = pd.read_csv(custom_kg_file, sep='\t')
openbiolink = pd.read_csv(openbiolink_kg_file, sep='\t')

# Display the OpenBioLink edge list as the cell output.
openbiolink

Unnamed: 0,source,target,polarity
0,ncbigene:3308,ncbigene:6622,-1
1,ncbigene:4804,ncbigene:2885,1
2,ncbigene:4804,ncbigene:3265,1
3,ncbigene:4804,ncbigene:5290,1
4,ncbigene:4804,ncbigene:5295,1
...,...,...,...
41494,ncbigene:9982,ncbigene:2246,1
41495,ncbigene:9982,ncbigene:2248,1
41496,ncbigene:9982,ncbigene:2252,1
41497,ncbigene:9982,ncbigene:2255,1


In [4]:
# The shuffled edge lists are renamed so that a "relation" column, if present,
# becomes "polarity" -- the column name the statistics below read.
relation_to_polarity = {"relation": "polarity"}

permuted_custom = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'shuffled_custom.tsv'),
    sep='\t',
).rename(columns=relation_to_polarity)

permuted_openbiolink = pd.read_csv(
    os.path.join(KG_DATA_PATH, 'shuffled_openbiolink.tsv'),
    sep='\t',
).rename(columns=relation_to_polarity)

# Display the permuted OpenBioLink edge list as the cell output.
permuted_openbiolink

Unnamed: 0,source,target,polarity
0,ncbigene:29101,ncbigene:36,-1
1,ncbigene:64108,ncbigene:201456,1
2,ncbigene:6907,ncbigene:79019,1
3,ncbigene:5905,ncbigene:55055,1
4,ncbigene:54821,ncbigene:3295,1
...,...,...,...
41494,ncbigene:2966,ncbigene:5519,1
41495,pubchem.compound:5280453,ncbigene:2194,1
41496,ncbigene:1027,ncbigene:1647,1
41497,pubchem.compound:24826799,ncbigene:219952,1


# Create graph

In [5]:
# Build one graph per edge list: the raw KGs first, then the permuted ones
# (same call order as before: openbio, custom, permuted openbio, permuted custom).
graph_openbio, graph_custom = [
    create_graph_from_df(kg_df) for kg_df in (openbiolink, custom)
]

graph_permuted_openbio, graph_permuted_custom = [
    create_graph_from_df(kg_df) for kg_df in (permuted_openbiolink, permuted_custom)
]

Report on the number of relations: {-1: 12477, 1: 29022}
Report on the number of relations: {1: 43578, -1: 8045}
Report on the number of relations: {-1: 12477, 1: 29022}
Report on the number of relations: {1: 43578, -1: 8045}


# Graph stats

In [6]:
def _describe_kg(kg_graph, kg_df):
    """Collect the summary statistics of one KG, keyed by metric name.

    Node count, mean degree and edge count are stored as strings; the
    polarity counts are stored as integers.
    """
    node_list = list(kg_graph.nodes())
    degrees = [degree for _, degree in kg_graph.degree(node_list)]
    polarity = kg_df['polarity']

    return {
        'nodes': str(len(node_list)),
        'degree': str(round(sum(degrees) / len(degrees), 3)),
        'edges': str(len(kg_graph.edges())),
        'activatory': len(kg_df.index[polarity == 1]),
        'inhibitory': len(kg_df.index[polarity == -1]),
        'longest_shortest_path': get_longest_shortest_path(kg_df, kg_graph),
    }


# (graph, edge list) pairs for every KG, grouped by variant.
kg_variants = {
    'permuted': {
        'openbio': (graph_permuted_openbio, permuted_openbiolink),
        'custom': (graph_permuted_custom, permuted_custom),
    },
    'raw': {
        'openbio': (graph_openbio, openbiolink),
        'custom': (graph_custom, custom),
    },
}

# Columns of the final table appear in this order: raw, then permuted.
data = {
    'raw': {},
    'permuted': {}
}

for variant, kgs in kg_variants.items():
    per_kg = {
        kg_name: _describe_kg(kg_graph, kg_df)
        for kg_name, (kg_graph, kg_df) in kgs.items()
    }

    # Interleave the two KGs metric by metric so the rows keep the order
    # openbio_<metric>, custom_<metric> for each metric in turn.
    for metric in per_kg['openbio']:
        for kg_name, kg_stats in per_kg.items():
            data[variant][f'{kg_name}_{metric}'] = kg_stats[metric]

In [7]:
# Outer keys of `data` ('raw', 'permuted') become the columns;
# the statistic names become the row index.
df = pd.DataFrame.from_dict(data, orient='columns')

# Display the comparison table as the cell output.
df

Unnamed: 0,raw,permuted
openbio_nodes,4831.0,4862.0
custom_nodes,8489.0,8470.0
openbio_degree,17.17,17.07
custom_degree,12.161,12.187
openbio_edges,41474.0,41496.0
custom_edges,51617.0,51611.0
openbio_activatory,29022.0,29022.0
custom_activatory,43578.0,43578.0
openbio_inhibitory,12477.0,12477.0
custom_inhibitory,8045.0,8045.0
