In [1]:
import pickle
from os.path import exists

import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

from graph_curvature.curvature import GraphCurvature

## Load Data

In [14]:
# Cache file names, resolved relative to the notebook's working directory.
graph_name = "medium_conf_G.pkl"  # serialized interaction graph
scalar_curvatures_name = "scalar_curvatures_C800_med_conf.csv"  # saved scalar curvatures

In [None]:
# Build the gene-level STRING interaction graph and keep its largest connected
# component in `G_cc`, or reload `G_cc` from the on-disk cache when it exists.
if not exists(graph_name):
    genes_ja = pd.read_csv('run1/C800_log2normcounts.csv')
    names = pd.read_csv('9606.protein.aliases.v11.5.txt', sep='\t')
    c_genes = pd.DataFrame(genes_ja['gene'].drop_duplicates())

    # Map every gene to the first STRING protein id listed for that alias.
    # A single lookup table replaces a full scan of the alias table per gene.
    alias_to_string = (
        names.drop_duplicates(subset='alias', keep='first')
        .set_index('alias')['#string_protein_id']
    )
    c_genes['convs'] = c_genes['gene'].map(alias_to_string)

    # Genes without a STRING id cannot be matched to any interaction.
    c_genes = c_genes.dropna()

    links = pd.read_csv('9606.protein.links.v11.5.txt', sep=' ')
    # combined_score cut-offs:
    #   low confidence: 150
    #   medium confidence: 400
    #   high confidence: 700
    #   highest confidence: 900
    # The comparison is strict, so a score of exactly 400 is excluded.
    min_combined_score = 400
    string_to_gene = c_genes.set_index('convs').to_dict()['gene']

    # Translate both endpoints to gene names (NaN when the protein is not one of
    # our genes) and drop every interaction with an unmapped endpoint.
    # `.assign` works on a fresh frame, avoiding writes to a filtered slice.
    links = (
        links[links['combined_score'] > min_combined_score]
        .assign(
            gene1=lambda df: df['protein1'].map(string_to_gene),
            gene2=lambda df: df['protein2'].map(string_to_gene),
        )
        .dropna()
    )

    # When several protein pairs collapse to the same gene pair, keep the row
    # with the highest score (ascending sort + keep='last').
    links = (
        links[['gene1', 'gene2', 'combined_score']]
        .sort_values(by='combined_score')
        .drop_duplicates(subset=['gene1', 'gene2'], keep='last')
    )

    # Construct list of tuples to define edges; weight is combined_score / 1000.
    edges = [
        (gene1, gene2, {'weight': score / 1000.0})
        for gene1, gene2, score in zip(
            links['gene1'].tolist(), links['gene2'].tolist(), links['combined_score']
        )
    ]

    # Construct NetworkX graph: every mapped gene is a node, even if isolated.
    G = nx.Graph()
    G.add_nodes_from(c_genes['gene'].tolist())
    G.add_edges_from(edges)

    # Pull out largest connected component (computed once, not once per component).
    largest_component = max(nx.connected_components(G), key=len)
    G_cc = G.subgraph(largest_component).copy()

    # nx.write_gpickle / nx.read_gpickle were removed in NetworkX 3.0; they were
    # thin wrappers around pickle, so plain pickle reads and writes the same files.
    with open(graph_name, 'wb') as fh:
        pickle.dump(G_cc, fh, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook produced.
    with open(graph_name, 'rb') as fh:
        G_cc = pickle.load(fh)

## Compute Graph Curvature

In [4]:
# Reuse saved scalar curvatures when the CSV exists; otherwise compute them on
# G_cc and save them for the next run.
if exists(scalar_curvatures_name):
    scalar_curvatures = pd.read_csv(scalar_curvatures_name)
    orc = GraphCurvature.from_save(G_cc, scalar_curvatures)
else:
    # NOTE(review): n_procs is presumably the number of worker processes;
    # confirm against GraphCurvature and adjust to the machine.
    orc = GraphCurvature(G_cc, n_procs=24)
    orc.compute()
    # NOTE: `scalar_curvatures` is only bound in the cached branch above; after a
    # fresh compute the values are reachable via `orc.scalar_curvatures`.
    orc.scalar_curvatures.to_csv(scalar_curvatures_name)