In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
# NOTE(review): not referenced by any cell visible here — presumably the
# directory where the generated feature tables are meant to be written; confirm.
TARGET_FOLDER = Path('../../data/features/')

**Create a vector for each node; the vector encodes the TFs that the node is connected to.**
Keep in mind:
1. tf2tf is directional
2. tf2gene is directional

In [3]:
DATA_FOLDER = Path('../../data/')

# 1. Read edgelist
# All label columns are read as plain Python objects; only the edge weight is numeric.
edge_dtypes = {column: object for column in ('cell_type', 'source', 'target', 'type')}
edge_dtypes['weight'] = 'float64'
df = pd.read_csv(DATA_FOLDER / '2gm_renamed.csv', dtype=edge_dtypes)
df

Unnamed: 0,cell_type,source,target,type,weight
0,GM12878,ATF2_gm1,HES4,TSS,674.551267
1,GM12878,ATF2_gm1,ISG15,TSS,261.000000
2,GM12878,ATF2_gm1,AGRN,TSS,190.035403
3,GM12878,ATF2_gm1,ACAP3,TSS,226.268091
4,GM12878,ATF2_gm1,PUSL1,TSS,241.000000
...,...,...,...,...,...
1054219,,ZYX,NEDD8,PPI,
1054220,,ZYX,UBC,PPI,
1054221,,ZYX,VASP,PPI,
1054222,,ZZEF1,UBC,PPI,


In [6]:
# read common tfs
tf_column = pd.read_csv(DATA_FOLDER / 'common_tf.csv')['tf']
common_tf = list(set(tf_column))  # de-duplicated TF names from the `tf` column
# Build the `_gm1`- and `_gm2`-suffixed label for every TF, then merge both
# lists into one alphabetically sorted list.
common_tf_k = [tf + '_gm1' for tf in common_tf]
common_tf_gm = [tf + '_gm2' for tf in common_tf]
common_tf = sorted(common_tf_k + common_tf_gm)
len(common_tf)

['ATF3_gm1', 'ATF3_gm2', 'BCLAF1_gm1', 'BCLAF1_gm2', 'BHLHE40_gm1']

In [17]:
# From here on `common_tf` is a set (fast membership tests, but no defined order).
common_tf = set(common_tf)

# Two row selections over the full edge list: edges whose source is one of the
# common TFs, and edges typed "PPI".
has_common_tf_source = df['source'].isin(common_tf)
is_ppi_edge = df['type'] == "PPI"
df_gm = df[has_common_tf_source]
df_ppi = df[is_ppi_edge]

# Stack both selections row-wise; `df` is rebound to the reduced edge list.
df = pd.concat([df_gm, df_ppi], axis=0)
df.shape

(751104, 5)

In [19]:
# Every label that occurs as either endpoint of an edge, without duplicates.
endpoints = set(df['source']) | set(df['target'])
nodes = list(endpoints)
len(nodes)

17226

In [6]:
# `common_tf` may be a set by the time this cell runs, and a set of strings has
# no stable iteration order across kernels. Sorting first guarantees that
# column i of the feature matrix always refers to the same TF.
tf_order = sorted(common_tf)
d = {tf: i for i, tf in enumerate(tf_order)}


def node2neighbors(node, df=df):
    """Return a 0/1 vector marking which common TFs have an edge into `node`.

    Parameters
    ----------
    node
        Label compared against the `target` column of `df`.
    df : pandas.DataFrame
        Edge list with `source` and `target` columns. Defaults to the
        module-level `df` as it was when this cell was executed.

    Returns
    -------
    numpy.ndarray
        Integer vector of length `len(tf_order)`. Position `d[tf]` is 1 when
        `df` contains a row with `source == tf` and `target == node`,
        otherwise 0.
    """
    feature = np.zeros(len(tf_order), dtype=int)
    # Only edges pointing *into* the node count: source is the TF, target is
    # the node itself.
    incoming = df['target'] == node
    from_common_tf = df['source'].isin(tf_order)
    regulators = df.loc[incoming & from_common_tf, 'source'].unique()
    linked_tf_pos = [d[tf] for tf in regulators]
    feature[linked_tf_pos] = 1
    return feature


# Maps each node label to its TF-connectivity vector.
features = {node: node2neighbors(node) for node in nodes}

In [8]:
# Turn the {node: vector} mapping into a table: with orient='index' each dict
# key becomes a row label and each vector position becomes an integer-named column.
feature_df = pd.DataFrame.from_dict(features, orient='index')
feature_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
KIAA1958,0,0,0,1,1,1,0,0,0,1,...,0,0,0,0,1,1,1,0,0,0
WRNIP1,0,1,0,0,1,1,0,0,0,1,...,0,0,1,1,0,1,1,1,0,0
HEATR6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
SLC46A3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
MUC17,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HPS5,1,1,0,1,1,1,0,0,0,1,...,1,1,1,0,1,1,1,1,0,0
MT1H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ELP5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0
PDE6B,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [24]:
# Show only the rows whose label is not one of the common TFs.
is_common_tf_row = feature_df.index.isin(common_tf)
feature_df.loc[~is_common_tf_row]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
TRIM59,0,0,1,1,0,1,1,1,0,1,...,0,0,0,1,1,1,1,1,0,0
HAUS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
DKFZP761J1410,0,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
ABI1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,1,0,1,1,0,0
CCDC15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OVCH1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PES1,0,1,1,0,1,1,0,0,0,0,...,0,0,0,1,1,1,1,1,0,0
SAMD1,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,1,1,1,1,0,0
ALG14,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,1,1,1,0,0


In [14]:
# Per node, count how many common TFs have an edge into it.
# `count_1` is excluded from the sum so that re-running this cell does not add
# the previous count on top of the 0/1 feature columns.
feature_df['count_1'] = feature_df.drop(columns='count_1', errors='ignore').sum(axis=1)
# Nodes that are not the target of any common TF.
feature_df[feature_df['count_1'] == 0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,count_1
HSFY1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TUBB7P,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CSNK2A1P,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCF1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRYGC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZGP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
H2BFS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KRTAP3-2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Load a second edge list for comparison.
joint_edges_path = DATA_FOLDER / 'joint_renamed_v2.csv'
test = pd.read_csv(joint_edges_path)
test

Unnamed: 0,cell_type,source,target,type,weight
0,GM12878,ATF3_gm,SDF4,TSS,179.618811
1,GM12878,ATF3_gm,B3GALT6,TSS,178.254494
2,GM12878,ATF3_gm,SSU72,TSS,413.000000
3,GM12878,ATF3_gm,AL645728.1,TSS,413.000000
4,GM12878,ATF3_gm,CDK11A,TSS,160.000000
...,...,...,...,...,...
777916,,ZYX,NEDD8,PPI,
777917,,ZYX,UBC,PPI,
777918,,ZYX,VASP,PPI,
777919,,ZZEF1,UBC,PPI,


In [35]:
# Keep only the edges whose `cell_type` is GM12878.
is_gm12878 = test['cell_type'] == 'GM12878'
G = test[is_gm12878]
G

Unnamed: 0,cell_type,source,target,type,weight
0,GM12878,ATF3_gm,SDF4,TSS,179.618811
1,GM12878,ATF3_gm,B3GALT6,TSS,178.254494
2,GM12878,ATF3_gm,SSU72,TSS,413.000000
3,GM12878,ATF3_gm,AL645728.1,TSS,413.000000
4,GM12878,ATF3_gm,CDK11A,TSS,160.000000
...,...,...,...,...,...
350469,GM12878,JUND_gm,G6PD,TSS,401.905464
350470,GM12878,JUND_gm,VAMP7,TSS,823.020345
350471,GM12878,JUND_gm,CRLF2,TSS,1000.000000
350472,GM12878,JUND_gm,ASMTL,TSS,1363.003897


In [39]:
# Distinct target labels of the GM12878 edge subset.
G_target = set(G['target'])
# NOTE(review): `K` is not assigned in any cell visible here, so this line
# raises NameError on a fresh kernel unless `K` is defined in a cell not shown.
# Presumably it is a per-cell-type subset of `test` built like `G` — TODO
# confirm and restore the defining cell.
K_target = set(K['target'])

In [42]:
# Load the PPI edge list (the file name suggests the edges are undirected —
# not verified here); its `source` and `target` columns are used below.
ppi = pd.read_csv(DATA_FOLDER / 'ppi_undirected.csv')

In [43]:
# All labels that appear at either end of a PPI edge.
gene = set(ppi['source']) | set(ppi['target'])

In [46]:
# Number of G targets that also occur in the PPI network.
len(G_target & gene)

7791

In [47]:
# Number of K targets that also occur in the PPI network.
len(K_target & gene)

7889