In [27]:
import pandas as pd
import numpy as np
from numpy import linalg as LA

In [70]:
file_name = 'joint_[64,64]_[linear,relu]_[10,10]'

In [71]:
emb = pd.read_csv(f'./emb/{file_name}.csv', index_col=0)

k_emb = emb[emb['type'] == 1]
gm_emb = emb[emb['type'] == 2]
gene_emb = emb[emb['type'] == 0]

print(f'K562: {k_emb.shape[0]}')
print(f'GM12878: {gm_emb.shape[0]}')
print(f'Gene: {gene_emb.shape[0]}')

K562: 68
GM12878: 68
Gene: 8693


## Calculate embedding difference
1. Traverse all the combinations of [tf, gene], calculate L2 distance
2. Rank by smallest to furthest

In [16]:
def get_emb(name):
    return emb[emb.index == name].to_numpy()

K562: 68
GM12878: 68
Gene: 8693


In [57]:
gene_names = gene_emb.index


# K562
k_names = k_emb.index

iterables = [list(k_names), list(gene_names)]
idx = pd.MultiIndex.from_product(iterables, names=['tf', 'gene'])

k_emb_dist = pd.DataFrame([0] * len(idx), index=idx, columns=['distance'])

k_emb_dist['tf_emb'] = k_emb_dist.index.map(lambda x: emb.loc[x[0]])
k_emb_dist['gene_emb'] = k_emb_dist.index.map(lambda x: emb.loc[x[1]])

k_emb_dist
# for i, k_name in enumerate(k_names):
#     for gene_name in gene_names:
#         _k_emb = get_emb(k_name)
#         _gene_emb = get_emb(gene_name)
#         k_emb_dist[k_name][gene_name] = LA.norm(_k_emb - _gene_emb)
#     print(f'loop {i} complete')
        
    
# k_emb_dist['ATF3_k']['A2M']

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,tf_emb,gene_emb
tf,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATF3_k,@UBC,0,0 0.293517 1 0.000000 2 0.05...,0 0.000000 1 0.000000 2 0.00...
ATF3_k,A1CF,0,0 0.293517 1 0.000000 2 0.05...,0 0.017925 1 0.000000 2 0.06...
ATF3_k,A2M,1,0 0.293517 1 0.000000 2 0.05...,0 0.225240 1 0.000000 2 0.14...
ATF3_k,AAAS,0,0 0.293517 1 0.000000 2 0.05...,0 0.257550 1 0.000000 2 0.26...
ATF3_k,AACS,0,0 0.293517 1 0.000000 2 0.05...,0 0.243427 1 0.000000 2 0.26...
...,...,...,...,...
ZNF274_k,ZYG11A,0,0 0.245803 1 0.000000 2 0.00...,0 0.341797 1 0.000000 2 0.21...
ZNF274_k,ZYG11B,0,0 0.245803 1 0.000000 2 0.00...,0 0.186693 1 0.000000 2 0.00...
ZNF274_k,ZYX,0,0 0.245803 1 0.000000 2 0.00...,0 0.094624 1 0.000000 2 0.02...
ZNF274_k,ZZEF1,0,0 0.245803 1 0.000000 2 0.00...,0 0.303926 1 0.000000 2 0.20...


In [73]:
# GM12878
gm_names = gm_emb.index

iterables = [list(gm_names), list(gene_names)]
idx = pd.MultiIndex.from_product(iterables, names=['tf', 'gene'])

gm_emb_dist = pd.DataFrame([0] * len(idx), index=idx, columns=['distance'])

gm_emb_dist['tf_emb'] = gm_emb_dist.index.map(lambda x: emb.loc[x[0]])
gm_emb_dist['gene_emb'] = gm_emb_dist.index.map(lambda x: emb.loc[x[1]])

gm_emb_dist

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,tf_emb,gene_emb
tf,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATF3_gm,@UBC,0,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...
ATF3_gm,A1CF,0,0 0.000000 1 0.000000 2 0.00...,0 0.017925 1 0.000000 2 0.06...
ATF3_gm,A2M,0,0 0.000000 1 0.000000 2 0.00...,0 0.225240 1 0.000000 2 0.14...
ATF3_gm,AAAS,0,0 0.000000 1 0.000000 2 0.00...,0 0.257550 1 0.000000 2 0.26...
ATF3_gm,AACS,0,0 0.000000 1 0.000000 2 0.00...,0 0.243427 1 0.000000 2 0.26...
...,...,...,...,...
ZNF143_gm,ZYG11A,0,0 0.000000 1 0.000000 2 0.00...,0 0.341797 1 0.000000 2 0.21...
ZNF143_gm,ZYG11B,0,0 0.000000 1 0.000000 2 0.00...,0 0.186693 1 0.000000 2 0.00...
ZNF143_gm,ZYX,0,0 0.000000 1 0.000000 2 0.00...,0 0.094624 1 0.000000 2 0.02...
ZNF143_gm,ZZEF1,0,0 0.000000 1 0.000000 2 0.00...,0 0.303926 1 0.000000 2 0.20...


In [74]:
def get_dist(x):
    return LA.norm(x[0].to_numpy() - x[1].to_numpy())

k_emb_dist['distance'] = k_emb_dist[['tf_emb', 'gene_emb']].apply(get_dist, axis=1)
gm_emb_dist['distance'] = gm_emb_dist[['tf_emb', 'gene_emb']].apply(get_dist, axis=1)


# k_emb_dist.iloc[:3][['tf_emb', 'gene_emb']].apply(get_dist, axis=1)


In [77]:
k_emb_dist.sort_values(['distance'], inplace=True)
gm_emb_dist.sort_values(['distance'], inplace=True)

In [80]:
k_emb_dist[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,tf_emb,gene_emb
tf,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NBN_k,GIGYF1,1.057169,0 0.186252 1 0.000000 2 0.08...,0 0.192540 1 0.000000 2 0.14...
ZNF143_k,NOMO2,1.059281,0 0.343758 1 0.000000 2 0.20...,0 0.350530 1 0.000000 2 0.21...
NBN_k,GRN,1.059487,0 0.186252 1 0.000000 2 0.08...,0 0.274292 1 0.000000 2 0.12...
NBN_k,MED11,1.059783,0 0.186252 1 0.000000 2 0.08...,0 0.189661 1 0.000000 2 0.20...
NBN_k,RGS20,1.060948,0 0.186252 1 0.000000 2 0.08...,0 0.239964 1 0.000000 2 0.18...
ZNF143_k,FAM60A,1.063547,0 0.343758 1 0.000000 2 0.20...,0 0.375516 1 0.000000 2 0.35...
NBN_k,CCNC,1.064344,0 0.186252 1 0.000000 2 0.08...,0 0.198693 1 0.000000 2 0.07...
NBN_k,C21orf7,1.064501,0 0.186252 1 0.000000 2 0.08...,0 0.219645 1 0.000000 2 0.10...
IKZF1_k,CD177,1.065168,0 0.000000 1 0.000000 2 0.00...,0 0.0 1 0.0 2 0.0 3 0....
NBN_k,LARP7,1.065363,0 0.186252 1 0.000000 2 0.08...,0 0.138265 1 0.000000 2 0.12...


In [83]:
gm_emb_dist[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,tf_emb,gene_emb
tf,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NFE2_gm,KIF9,2.019069,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...
NFE2_gm,PITPNC1,2.020178,0 0.000000 1 0.000000 2 0.00...,0 0.026606 1 0.000000 2 0.00...
RCOR1_gm,KIF9,2.021372,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...
NFE2_gm,PCGF3,2.022163,0 0.000000 1 0.000000 2 0.00...,0 0.0 1 0.0 2 0.0 3 0....
USF1_gm,PCGF3,2.023252,0 0.000000 1 0.000000 2 0.00...,0 0.0 1 0.0 2 0.0 3 0....
RCOR1_gm,PITPNC1,2.023389,0 0.000000 1 0.000000 2 0.00...,0 0.026606 1 0.000000 2 0.00...
RAD21_gm,KIF9,2.023484,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...
ETS1_gm,PITPNC1,2.02405,0 0.000000 1 0.000000 2 0.00...,0 0.026606 1 0.000000 2 0.00...
USF1_gm,KIF9,2.024224,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...
ETS1_gm,KIF9,2.025061,0 0.000000 1 0.000000 2 0.00...,0 0.000000 1 0.000000 2 0.00...


In [84]:
k_emb_dist[['distance']].to_csv(f'./emb/{file_name}_k_dist.csv')

In [85]:
gm_emb_dist[['distance']].to_csv(f'./emb/{file_name}_gm_dist.csv')