In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
# Folder for feature files, relative to this notebook. It is not referenced by
# any cell visible in this part of the notebook — presumably the destination
# for the generated feature table; TODO confirm.
TARGET_FOLDER = Path('../../data/features/')

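**Create a vector for each node; the vector represents the TFs that the node is connected to.**

Keep in mind:
1. tf2tf edges are directed
2. tf2gene edges are directed

Because the edges are directed, the vector built below only marks TFs that appear as the `source` of an edge whose `target` is the node.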

In [3]:
# Root folder of the input data, relative to this notebook.
DATA_FOLDER = Path('../../data/')

# 1. Read edgelist
# The four label columns are read as generic Python objects; only the edge
# weight is parsed as a float.
df = pd.read_csv(
    DATA_FOLDER.joinpath('joint_renamed_v2.csv'),
    dtype={
        **dict.fromkeys(('cell_type', 'source', 'target', 'type'), object),
        'weight': 'float64',
    },
)
df

Unnamed: 0,cell_type,source,target,type,weight
0,GM12878,ATF2_gm1,HES4,TSS,674.551267
1,GM12878,ATF2_gm1,ISG15,TSS,261.000000
2,GM12878,ATF2_gm1,AGRN,TSS,190.035403
3,GM12878,ATF2_gm1,ACAP3,TSS,226.268091
4,GM12878,ATF2_gm1,PUSL1,TSS,241.000000
...,...,...,...,...,...
1054219,,ZYX,NEDD8,PPI,
1054220,,ZYX,UBC,PPI,
1054221,,ZYX,VASP,PPI,
1054222,,ZZEF1,UBC,PPI,


In [4]:
# Read the common TFs and drop duplicate names.
common_tf = list(set(pd.read_csv(DATA_FOLDER / 'common_tf.csv')['tf']))

# Every TF gets one node name per suffix ('_k' and '_gm'; presumably one per
# cell type — TODO confirm). Both lists are built from the same base list, so
# they share its order.
common_tf_k, common_tf_gm = (
    [tf + suffix for tf in common_tf] for suffix in ('_k', '_gm')
)

# From here on `common_tf` holds the suffixed names, in sorted order.
common_tf = sorted(common_tf_k + common_tf_gm)
len(common_tf)

138

In [6]:
# Every distinct node that appears as either endpoint of an edge.
# `Series.unique()` keeps values in order of first appearance, so the node
# order (and the row order of anything built from `nodes`) is identical on
# every run. Iterating a set of strings is not reproducible across kernel
# restarts because string hashes are randomised per process.
nodes = pd.concat([df['source'], df['target']], ignore_index=True).unique().tolist()
len(nodes)

17226

In [7]:
# Position of every common TF inside a feature vector.
d = {tf: i for i, tf in enumerate(common_tf)}


def _regulator_positions(edges):
    """Group the common-TF regulators of every target found in ``edges``.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge list with at least the columns ``source`` and ``target``.

    Returns
    -------
    dict
        Maps a target to an array holding the feature positions (values of
        ``d``) of the common TFs that are the ``source`` of an edge into that
        target. Targets without such an edge are absent from the dict.
    """
    from_tf = edges.loc[edges['source'].isin(common_tf), ['source', 'target']]
    positions = from_tf['source'].map(d)
    # groupby drops missing targets, matching `df['target'] == node`, which
    # never matches a missing value either.
    return positions.groupby(from_tf['target']).unique().to_dict()


# The lookup for the full edge list is built once. Scanning the whole edge
# list for every node (and recomputing the node-independent `isin` mask each
# time) costs O(nodes * edges); one grouping pass gives the same answers.
# Re-run this cell if `df` is modified, otherwise the lookup is stale.
_default_edges = df
_positions_by_target = _regulator_positions(df)


def node2neighbors(node, df=df):
    """Return the 0/1 common-TF regulator vector of ``node``.

    Parameters
    ----------
    node : hashable
        Node name, compared against the ``target`` column.
    df : pandas.DataFrame, optional
        Edge list with ``source`` and ``target`` columns. Defaults to the
        edge list loaded above, for which a precomputed lookup is used.

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``len(common_tf)``. Entry ``d[tf]`` is 1 when
        an edge ``tf -> node`` exists in ``df``, and 0 otherwise.
    """
    if df is _default_edges:
        lookup = _positions_by_target
    else:
        # A different edge list was passed in: only its rows that point at
        # `node` are relevant.
        lookup = _regulator_positions(df[df['target'] == node])

    feature = np.zeros(len(common_tf), dtype=int)
    linked_tf_pos = lookup.get(node)
    if linked_tf_pos is not None:
        feature[linked_tf_pos] = 1
    return feature


features = {node: node2neighbors(node) for node in nodes}

In [12]:
# Turn the {node: feature vector} mapping into a table: orient='index' makes
# each dict key (node) a row label and each vector position a column.
feature_df = pd.DataFrame.from_dict(features, orient='index')
feature_df

In [18]:
# Keep only the rows whose node name is not one of the common TFs.
feature_df[~feature_df.index.isin(common_tf)]

(17226, 138)

In [10]:
# Row sum of the 0/1 feature columns, i.e. how many common TFs have an edge
# into each node. `count_1` itself is left out of the sum so that re-running
# this cell does not fold the previous count into the new one.
feature_df['count_1'] = feature_df.drop(columns='count_1', errors='ignore').sum(axis=1)

# Nodes that are not the target of any common TF.
feature_df[feature_df['count_1'] == 0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,129,130,131,132,133,134,135,136,137,count_1
WDR45L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RGPD1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IL22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM108C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COL6A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RPL10L,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YBX1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HTR2C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TMEM85,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
