# Kg-statistics

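This notebook compares basic graph statistics (node and edge counts, average degree, edge polarity counts, and the longest drug–disease shortest path) of the custom and the OpenBioLink filtered knowledge graphs.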

# Imports

In [1]:
import os
import pandas as pd
import networkx as nx

from utils import KG_DATA_PATH, create_graph_from_df

# Helper Functions

In [2]:
def get_longest_shortest_path(df, graph):
    """Return the longest of the shortest drug-to-disease paths in ``graph``.

    Drugs are the ``source`` values of ``df`` that start with ``'pubchem'``;
    diseases are the ``target`` values that start with ``'mondo'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with string columns ``source`` and ``target``.
    graph : networkx graph
        Graph in which the paths are searched.

    Returns
    -------
    int
        Number of *nodes* (i.e. number of edges + 1) on the longest shortest
        path between any reachable drug/disease pair, or ``0`` when no drug
        can reach any disease.
    """
    drugs = set(df.loc[df['source'].str.startswith('pubchem'), 'source'])
    diseases = set(df.loc[df['target'].str.startswith('mondo'), 'target'])

    longest = 0
    for drug in drugs:
        try:
            # One breadth-first search per drug yields the hop distance to
            # every reachable node, instead of one search per drug/disease pair.
            hops = dict(nx.single_source_shortest_path_length(graph, drug))
        except nx.NodeNotFound:
            # The drug is listed in the table but is not a node of the graph.
            continue

        # Diseases that are absent from `hops` are unreachable from this drug.
        reachable = diseases.intersection(hops)
        if reachable:
            # +1 converts an edge count into the node count of the path.
            longest = max(longest, max(hops[disease] for disease in reachable) + 1)

    return longest

# Load data

In [3]:
# Both filtered knowledge graphs are read from tab-separated files in KG_DATA_PATH.
custom = pd.read_csv(os.path.join(KG_DATA_PATH, 'custom_filtered_kg.tsv'), sep='\t')
openbiolink = pd.read_csv(os.path.join(KG_DATA_PATH, 'openbiolink_filtered_kg.tsv'), sep='\t')

# Display the OpenBioLink edge table as the cell output.
openbiolink

Unnamed: 0,source,target,polarity
0,ncbigene:3308,ncbigene:6622,-1
1,ncbigene:4804,ncbigene:2885,1
2,ncbigene:4804,ncbigene:3265,1
3,ncbigene:4804,ncbigene:5290,1
4,ncbigene:4804,ncbigene:5295,1
...,...,...,...
41494,ncbigene:9982,ncbigene:2246,1
41495,ncbigene:9982,ncbigene:2248,1
41496,ncbigene:9982,ncbigene:2252,1
41497,ncbigene:9982,ncbigene:2255,1


# Create graph

In [4]:
# Build one graph per edge table; OpenBioLink is processed first, then the custom KG.
graph_openbio, graph_custom = map(create_graph_from_df, (openbiolink, custom))

Report on the number of relations: {-1: 12477, 1: 29022}
Report on the number of relations: {1: 43578, -1: 8045}


# Graph stats

In [5]:
# Short aliases for the two graphs and the edge tables they were built from.
openbio_g, custom_g = graph_openbio, graph_custom
openbio_df, custom_df = openbiolink, custom

# Node lists and the degree value of every node, per graph.
o_nodes = list(openbio_g.nodes())
c_nodes = list(custom_g.nodes())
o_degree = [deg for _, deg in openbio_g.degree(o_nodes)]
c_degree = [deg for _, deg in custom_g.degree(c_nodes)]

# Edge counts, per graph.
o_edges = len(openbio_g.edges())
c_edges = len(custom_g.edges())

# One column of statistics per knowledge graph.
# '# degree' is the mean node degree rounded to 3 decimals; the polarity counts
# come from the edge tables (1 is reported as activatory, -1 as inhibitory).
data = {
    'openbio': {
        '# nodes': str(len(o_nodes)),
        '# degree': str(round(sum(o_degree) / len(o_degree), 3)),
        '# edges': str(o_edges),
        '# activatory': len(openbio_df[openbio_df['polarity'] == 1]),
        '# inhibitory': len(openbio_df[openbio_df['polarity'] == -1]),
        'longest_shortest_path': get_longest_shortest_path(openbio_df, openbio_g),
    },
    'custom': {
        '# nodes': str(len(c_nodes)),
        '# degree': str(round(sum(c_degree) / len(c_degree), 3)),
        '# edges': str(c_edges),
        '# activatory': len(custom_df[custom_df['polarity'] == 1]),
        '# inhibitory': len(custom_df[custom_df['polarity'] == -1]),
        'longest_shortest_path': get_longest_shortest_path(custom_df, custom_g),
    },
}

In [6]:
# Tabulate the collected statistics: one column per knowledge graph, one row per statistic.
df = pd.DataFrame.from_dict(data, orient='columns')
df

Unnamed: 0,openbio,custom
# nodes,4831.0,8489.0
# degree,17.17,12.161
# edges,41474.0,51617.0
# activatory,29022.0,43578.0
# inhibitory,12477.0,8045.0
longest_shortest_path,11.0,11.0
