## Getting the two networks

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
sys.path.append('../')
# This is only needed for jupyter notebook
import json
import networkx as nx
import pandas as pd
import numpy as np
df_human = pd.read_csv("../data/biogrid_files/human.tsv", sep = "\t", header = None)
df_mouse = pd.read_csv("../data/biogrid_files/mouse.tsv", sep = "\t", header = None)
df_human['weight'] = 1
df_mouse['weight'] = 1

In [2]:
ghuman = nx.from_pandas_edgelist(df_human, 0, 1, ['weight'])
gmouse = nx.from_pandas_edgelist(df_mouse, 0, 1, ['weight'])
human_nodelist = list(ghuman.nodes())
mouse_nodelist = list(gmouse.nodes())
"""
# SAVING THE JSON FILES
with open("human.json", "w") as hj:
    human_map = {k:i for i, k in enumerate(human_nodelist)}
    json.dump(human_map, hj)
with open("mouse.json", "w") as mj:
    mouse_map = {k:i for i, k in enumerate(mouse_nodelist)}
    json.dump(mouse_map, mj)
"""


'\n# SAVING THE JSON FILES\nwith open("human.json", "w") as hj:\n    human_map = {k:i for i, k in enumerate(human_nodelist)}\n    json.dump(human_map, hj)\nwith open("mouse.json", "w") as mj:\n    mouse_map = {k:i for i, k in enumerate(mouse_nodelist)}\n    json.dump(mouse_map, mj)\n'

In [3]:
from gmundo.alignment import isorank
# matches = isorank(ghuman, gmouse, human_map, mouse_map, 1, 50, iterations = 2)
# this isorank code takes too much space, so have to optimize this properly
def randomized_mapping(n1, n2, n_mapping):
    rn1 = np.random.permutation(len(n1))[:n_mapping]
    rn2 = np.random.permutation(len(n2))[:n_mapping]
    i_mapping = list(zip(rn1, rn2))
    mapping   = [(n1[p], n2[q]) for p, q in zip(rn1, rn2)]
    return mapping, i_mapping
mapping, i_mapping = randomized_mapping(human_nodelist, mouse_nodelist, 50)

In [3]:
# Computing the DSD matrices for human and mouse
# This function takes a lot of time, so I have saved the numpy file to save time
from gmundo.linalg import compute_dsd_embedding
mouse_dsd  = compute_dsd_embedding(gmouse, mouse_nodelist)
human_dsd  = compute_dsd_embedding(ghuman, human_nodelist)

In [8]:
import numpy as np
np.save("human_dsd_emb.npy", human_dsd)
np.save("mouse_dsd_emb.npy", mouse_dsd)

In [9]:
from scipy.spatial.distance import squareform, pdist

human_dist = squareform(pdist(human_dsd))
mouse_dist = squareform(pdist(mouse_dsd))

In [10]:
np.save("human_dist.npy", human_dist)
np.save("mouse_dist.npy", mouse_dist)

In [13]:
from sklearn.metrics.pairwise import laplacian_kernel
gamma = 1 / 10
human_rbf = laplacian_kernel(human_dist)
mouse_rbf = laplacian_kernel(mouse_dist)
np.save("human_rbf_0.1.npy", human_rbf)
np.save("mouse_rbf_0.1.npy", mouse_rbf)

In [4]:
from gmundo.coembed import coembed_networks
human_rbf = np.load("human_rbf_0.1.npy")
mouse_rbf = np.load("mouse_rbf_0.1.npy")
munk = coembed_networks(human_rbf, mouse_rbf, i_mapping, verbose = True)

	Computing RKHS for source network... 
	Embedding matrices... 
	Creating final munk matrix... 


In [6]:
np.save("munk.npy", munk)

### PREDICTION using MUNK associations computed above

In [4]:
from gmundo.prediction.go_process import get_go_labels

filter_label = {"namespace": "molecular_function", "min_level": 5}
filter_prot  = {"namespace": "molecular_function", "lower_bound": 50}
human_labels, human_go_prots_dict = get_go_labels(filter_prot, filter_label, human_nodelist, "../data/go_files/gene2go", "../data/go_files/go-basic.obo", 9606, verbose = True)

HMS:0:00:02.654168 335,350 annotations, 20,702 genes, 18,726 GOs, 1 taxids READ: ../data/go_files/gene2go 
18674 IDs in loaded association branch, molecular_function
  EXISTS: ../data/go_files/go-basic.obo
../data/go_files/go-basic.obo: fmt(1.2) rel(2021-12-15) 47,157 Terms; optional_attrs(relationship)
Number of GO-terms: 32


In [5]:
filter_label = {"namespace": "molecular_function", "min_level": 5}
filter_prot  = {"namespace": "molecular_function", "lower_bound": 20}
mouse_labels, mouse_go_prots_dict = get_go_labels(filter_prot, filter_label, mouse_nodelist, "../data/go_files/gene2go", "../data/go_files/go-basic.obo", 10090, verbose = True)

HMS:0:00:03.491622 419,936 annotations, 29,777 genes, 18,906 GOs, 1 taxids READ: ../data/go_files/gene2go 
18880 IDs in loaded association branch, molecular_function
  EXISTS: ../data/go_files/go-basic.obo
../data/go_files/go-basic.obo: fmt(1.2) rel(2021-12-15) 47,157 Terms; optional_attrs(relationship)
Number of GO-terms: 39


In [6]:
with open("human.json", "r") as hj:
    h_entrez_id = json.load(hj)
with open("mouse.json", "r") as mj:
    m_entrez_id = json.load(mj)
def get_prot_go_dict(go_prot_dict, entrez_id_map):
    prot_go = {}
    for l in go_prot_dict:
        for p in go_prot_dict[l]:
            if entrez_id_map[str(p)] not in prot_go:
                prot_go[entrez_id_map[str(p)]] = l
    return prot_go
mouse_prot_go = get_prot_go_dict(mouse_go_prots_dict, m_entrez_id)
human_prot_go = get_prot_go_dict(human_go_prots_dict, h_entrez_id)


In [16]:
# Construct the target and MUNK maps
mouse_rbf = np.load("mouse_rbf_0.1.npy")
mouse_neighbors = np.argsort(-mouse_rbf, axis = 1)
# Need to discuss this
mouse_neighbors = mouse_neighbors[:, :20]

In [19]:
munk  = np.load("munk.npy")
munk_neighbors = np.argsort(-munk, axis = 1)
munk_neighbors = munk_neighbors[:, :20]

def convert_to_dict(npy_neighbors):
    ndict = {}
    n, _  = npy_neighbors.shape
    for i in range(n):
        ndict[i] = npy_neighbors[i, :]
    return ndict

munk_neigh_dict = convert_to_dict(munk_neighbors)
mouse_neigh_dict = convert_to_dict(mouse_neighbors)

2192


IndexError: index 2192 is out of bounds for axis 0 with size 2192

In [9]:
# Constructing a predictor function
from gmundo.prediction.predict import MUNDO_predict
def construct_predictor_MUNDO(target_neighbors, munk_neighbors, source_prot_go, n_neighbors = 20, MUNK_weight = 0.5):
    def predictor(target_prot_go):
        return MUNDO_predict(target_neighbors,
                             munk_neighbors,
                             n_neighbors,
                             target_prot_go,
                             source_prot_go,
                             MUNK_weight)
    return predictor

In [10]:
from gmundo.prediction.scoring import kfoldcv
accs = kfoldcv(5,
              mouse_prot_go,
              construct_predictor_MUNDO(mouse_neighbors,
                                        munk_neighbors,
                                       human_prot_go)
              )

IndexError: invalid index to scalar variable.

In [11]:
x = np.arange(9).reshape(3,3)

In [12]:
x

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [13]:
x[0]

array([0, 1, 2])