## Analyze the dataset

In [16]:
import numpy as np
import pandas as pd
from pathlib import Path
import networkx as nx
import random

In [7]:
# Species pair under study. The same strings are used as column names of the
# alignment table and as parts of the output file name further down.
SRC, TGT = "human", "fly"

# Tab-separated edge lists, one file per species.
NETSRC = f"../../../runs/only_few_landmarks/data/intact/{SRC}.s.tsv"
NETTGT = f"../../../runs/only_few_landmarks/data/intact/{TGT}.s.tsv"

# Alignment table to analyze, wrapped in a Path.
# NOTE(review): the run directory name spells out "fly_human" literally, so it
# does not follow SRC/TGT if those are changed -- confirm this is intended.
ALIGNMENT = Path(
    f"../../../runs/only_few_landmarks/temp/outputs/fly_human_lr_0.001_ep_100_svdr_100_nL_100_dthres_10_ialpha_0.7_wB_0.66/isorank.tsv"
)

In [107]:
# Alignment table: read with its header row (columns are looked up by SRC/TGT later).
df = pd.read_csv(ALIGNMENT, sep="\t")

# Edge lists have no header row; columns 0 and 1 hold the two endpoints of each edge.
dfsrc, dftgt = (
    pd.read_csv(path, sep="\t", header=None) for path in (NETSRC, NETTGT)
)

# One undirected graph per species, built from the endpoint columns.
Gsrc, Gtgt = (
    nx.from_pandas_edgelist(edges, source=0, target=1) for edges in (dfsrc, dftgt)
)

In [108]:
# Nodes named in the alignment table; they are force-included in the sampled
# subgraphs below regardless of the random draw.
srcmusthave, tgtmusthave = (df[species].values for species in (SRC, TGT))

In [109]:
# Draw a random subset of each network and add every aligned node to it.
#
# A dedicated, seeded generator makes the draw reproducible across
# Restart & Run All; the sample size is capped at the graph size because
# random.sample raises ValueError when asked for more items than exist.
SEED          = 0
N_RANDOM_NODE = 4000
rng           = random.Random(SEED)

srcpool  = list(Gsrc.nodes)
tgtpool  = list(Gtgt.nodes)
srcnodes = set(rng.sample(srcpool, min(N_RANDOM_NODE, len(srcpool))) + srcmusthave.tolist())
tgtnodes = set(rng.sample(tgtpool, min(N_RANDOM_NODE, len(tgtpool))) + tgtmusthave.tolist())

In [110]:
# Restrict each network to its sampled node set.
Gsubsrc, Gsubtgt = Gsrc.subgraph(srcnodes), Gtgt.subgraph(tgtnodes)

## Remove high degree nodes

In [112]:
# Keep only nodes whose degree inside the current subgraph is strictly below 25.
Gsubsrc = Gsubsrc.subgraph(
    [node for node, deg in dict(Gsubsrc.degree()).items() if deg < 25]
)
Gsubtgt = Gsubtgt.subgraph(
    [node for node, deg in dict(Gsubtgt.degree()).items() if deg < 25]
)

In [113]:
# Source side: keep the largest connected component, then narrow the landmark
# list to the aligned nodes that survived into that component.
srccc       = max(nx.connected_components(Gsubsrc), key=len)
Gsubsrc     = Gsubsrc.subgraph(srccc)
srcmusthave = set(srccc).intersection(srcmusthave)

# Target side: same treatment.
tgtcc       = max(nx.connected_components(Gsubtgt), key=len)
Gsubtgt     = Gsubtgt.subgraph(tgtcc)
tgtmusthave = set(tgtcc).intersection(tgtmusthave)

## Constructing a super-matrix that combines both source and target

In [138]:
# Node -> row/column index maps for each subgraph.
smap        = {node: idx for idx, node in enumerate(Gsubsrc.nodes())}
tmap        = {node: idx for idx, node in enumerate(Gsubtgt.nodes())}

# Dense adjacency matrices; nodelist is passed explicitly so that row/column
# order is guaranteed to match the index maps above.
srcmat      = nx.adjacency_matrix(Gsubsrc, nodelist=list(smap)).toarray()
tgtmat      = nx.adjacency_matrix(Gsubtgt, nodelist=list(tmap)).toarray()

# Cross-species block: rows are target nodes, columns are source nodes.
tgt_src_mat = np.zeros((tgtmat.shape[0], srcmat.shape[0]))

# Value written for every aligned (target, source) pair that is present in
# both subgraphs.
SC          = 20
for sc, tg in df[[SRC, TGT]].values:
    # Aligned pairs whose nodes were dropped by sampling/filtering are skipped.
    # An explicit membership test is used instead of a blanket `except` so that
    # unrelated errors are not silently swallowed.
    if sc in smap and tg in tmap:
        tgt_src_mat[tmap[tg], smap[sc]] = SC

In [139]:
# Assemble the joint matrix from its four blocks. Target nodes come first,
# source nodes second; the off-diagonal blocks carry the alignment entries.
#
#     [ tgtmat          tgt_src_mat ]
#     [ tgt_src_mat.T   srcmat      ]
combinedmat = np.block([
    [tgtmat,        tgt_src_mat],
    [tgt_src_mat.T, srcmat     ],
])

## Finally generating the DSD matrix

In [140]:
from glidetools.algorithm import dsd

# Embed every node of the combined (target + source) matrix.
# NOTE(review): the meaning of is_normalized is defined by glidetools and is
# not visible here -- confirm it is appropriate for a matrix whose
# cross-species entries are SC rather than 1.
CDSD = dsd.compute_dsd_embedding(
    combinedmat,
    is_normalized=True,
)

In [141]:
from scipy.spatial.distance import pdist, squareform

In [142]:
# Pairwise Euclidean distances between embedding rows, expanded from the
# condensed form into a full square matrix.
DIST = squareform(pdist(CDSD, metric="euclidean"))

## Extract the upper-right block of the distance matrix (target rows × source columns) and inspect the five closest source nodes

In [143]:
# The first len(tgtmat) rows/columns of the combined matrix are target nodes,
# the remainder are source nodes, so this slice holds target-to-source distances.
UPPER_RIGHT = DIST[: len(tgtmat), len(tgtmat):]

In [153]:
# Distances from every target node to five specific source columns.
# NOTE(review): these column indices are hard-coded (they coincide with the
# leading entries of a CLOSEST row printed in this notebook); they are only
# meaningful for the particular node sample that produced them -- confirm
# before relying on this cell after a re-run.
UPPER_RIGHT.take([1743, 866, 1919, 685, 1614], axis=1)

array([[2.8341139 , 2.88245601, 2.9569097 , 2.99021641, 2.99163416],
       [1.50698668, 1.5851532 , 1.75954602, 1.76970928, 1.83798052],
       [1.43427177, 1.49524712, 1.65894424, 1.70206979, 1.8757339 ],
       ...,
       [1.8132829 , 1.9260618 , 2.0074604 , 2.08157445, 2.10421221],
       [1.97581172, 2.18444092, 2.28710767, 2.32496595, 2.42347029],
       [2.09585465, 2.19581816, 2.26391336, 2.33281939, 2.35112139]])

In [163]:
# Preview: for each target node, the column indices of its five nearest source
# nodes. Computed inline so this cell does not depend on CLOSEST, which is only
# assigned in a later cell (a bare `CLOSEST` here fails on a fresh kernel).
np.argsort(UPPER_RIGHT, axis=1)[:, :5]

array([[1743,  866, 1614, 1919,  685],
       [1743,  866, 1919, 1614,  685],
       [1743,  866, 1919,  685, 1614],
       ...,
       [1743,  866, 1919, 1614,  685],
       [1743,  866, 1919,  685, 1614],
       [1743,  866, 1919, 1614,  685]])

In [165]:
# Number of nearest source nodes to report per target node.
TOPK    = 250

# Per target row, source column indices ordered by increasing distance.
# The width is capped at the number of source columns so the export also works
# when fewer than TOPK source nodes are available.
CLOSEST = np.argsort(UPPER_RIGHT, axis=1)[:, :min(TOPK, UPPER_RIGHT.shape[1])]

# Reverse lookups: matrix index -> node name.
rtmap   = {idx: node for node, idx in tmap.items()}
rsmap   = {idx: node for node, idx in smap.items()}

# One row per target node: its name followed by the names of its nearest
# source nodes. Column labels follow the actual width of CLOSEST.
columns = ["target"] + [f"src-{idx}" for idx in range(CLOSEST.shape[1])]
clmat   = [
    [rtmap[i]] + [rsmap[j] for j in entries]
    for i, entries in enumerate(CLOSEST)
]

# Written next to the notebook as a tab-separated file indexed by target node.
pd.DataFrame(clmat, columns=columns).set_index("target").to_csv(f"{SRC}_{TGT}_{SC}.tsv", sep="\t")