In [1]:
motif_location_output_file = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/motif_odds.tsv"

# For the TF of interest, create smaller output file

In [2]:
import requests
import os
import pandas as pd
import numpy as np

In [3]:
def get_interacting_tfs(annot_url):
    r = requests.get(annot_url)
    tmp_annot_file = "example_annotations.tsv"
    with open(tmp_annot_file, 'wb') as f:
        f.write(r.content)
    df_tmp = pd.read_csv(tmp_annot_file, sep="\t")
    os.remove(tmp_annot_file)
    all_nodes = df_tmp.iloc[:, 0].values
    return list(all_nodes)

In [4]:
tf = "atf2"
# top 25 obtained from STRING interaction networks: https://version-11-5.string-db.org/cgi/network?networkId=bCDZU3zOYfL5
tf_annot_url = "https://version-11-5.string-db.org/cgi/generatetaskspecificdownloadfile?taskId=bEhvDpvv6bKQ&downloadDataFormat=annotations&cpnonce=bs4pIOY9yWSW&downloadFileName=string_protein_annotations.tsv"
potentially_interacting_tfs = get_interacting_tfs(tf_annot_url)

In [5]:
potentially_interacting_tfs

['ATF2',
 'ATF3',
 'ATF7',
 'BATF3',
 'CEBPG',
 'CREB5',
 'CREBBP',
 'EP300',
 'FOS',
 'FOSL2',
 'IRF3',
 'JDP2',
 'JUN',
 'JUNB',
 'JUND',
 'MAPK1',
 'MAPK10',
 'MAPK11',
 'MAPK13',
 'MAPK14',
 'MAPK3',
 'MAPK8',
 'MAPK9',
 'RB1',
 'RPS6KA5',
 'RUVBL2']

In [6]:
# append motifs of your choice if any
potentially_interacting_tfs.append("ATF4")

In [7]:
# map of potentially interacting TFs with their homer motif names
homer_motif_file = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/homer.motifs"
hmf  = open(homer_motif_file, "r")
motif_names = []
for line in hmf:
    if line.startswith(">"):
        motif_name = line.split("\t")[1]
        motif_names.append(motif_name)

In [8]:
pim_dict = {tf:[] for tf in potentially_interacting_tfs}

In [9]:
for m in motif_names:
    m_proc = m.lower()
    for t in potentially_interacting_tfs:
        t_proc = t.lower()
        if t_proc in m_proc:
            pim_dict[t].append(m)

In [10]:
rmotifs = list(set([lv for k in pim_dict.keys() for lv in pim_dict[k]]))

In [11]:
rmotifs

['Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer',
 'Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer',
 'Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer',
 'JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer',
 'Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer',
 'Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer',
 'Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer',
 'Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer',
 'IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer',
 'JunD(bZIP)/K562-JunD-ChIP-Seq/Homer',
 'CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer',
 'c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer']

In [12]:
# in the mega motif tsv file look if either of the above motifs are present
mega_motif = open(motif_location_output_file, "r")
small_motif = open("/data5/deepro/starrseq/main_lib/results/activity_prediction/data/atf2_related_motifs.csv", "w")


In [13]:
for line in mega_motif:
    curr_line = line.split("\t")
    if curr_line[3] in rmotifs:
        small_motif.write(",".join(curr_line))   

In [14]:
mega_motif.close()
small_motif.close()

# Need to parse the output file to be compatible with activity

## Motif probability

In [15]:
atf2_motif_file = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/atf2_related_motifs.csv"

motif_raw_df = pd.read_csv(atf2_motif_file, header=None)

In [16]:
def process(df_row):
    chrm = df_row[0].split(":")[0]
    start,end = df_row[0].split(":")[1].split("-")
    prob = np.exp(df_row[5])/(1 + np.exp(df_row[5]))
    return pd.Series({"chrm":chrm, "start":start, "end": end, "motif": df_row[3], "strand": df_row[4], "probability": prob})

In [17]:
motif_df_path = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/atf2/motif_df_tmp.tsv.gz"

In [18]:
# taking 5 minutes
if not os.path.exists(motif_df_path):
    motif_df = motif_raw_df.apply(process, axis=1)
    motif_df.to_pickle(motif_df_path)
else:
    motif_df = pd.read_pickle(motif_df_path)

In [19]:
motif_df.head()

Unnamed: 0,chrm,start,end,motif,strand,probability
0,chrX,154799250,154799750,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,+,0.999635
1,chrX,154799250,154799750,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,-,0.999798
2,chrX,154799200,154799700,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,+,0.999635
3,chrX,154799200,154799700,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,-,0.999798
4,chrX,154799150,154799650,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,+,0.999635


In [20]:
motif_df_pivot = motif_df.pivot_table('probability', ["chrm", "start", "end"], ["motif", "strand"], fill_value=0.0)

In [21]:
motif_df_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,motif,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer,...,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer
Unnamed: 0_level_1,Unnamed: 1_level_1,strand,+,-,+,-,+,-,+,-,+,-,...,+,-,+,-,+,-,+,-,+,-
chrm,start,end,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
chr1,100048750,100049250,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048800,100049300,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048850,100049350,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048900,100049400,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100061300,100061800,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.999793,0.999785,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341750,9342250,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.999985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9341800,9342300,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.999875,0.999985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9341850,9342350,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.999875,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9341900,9342400,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.999875,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Region wise activity

In [22]:
input_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/input.cov.bed"
cc_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/cc.cov.bed"
ko_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/atf2.cov.bed"

In [23]:
df_in_cov = pd.read_csv(input_cov_bed, header=None, sep="\t")
df_cc_cov = pd.read_csv(cc_cov_bed, header=None, sep="\t")
df_ko_cov = pd.read_csv(ko_cov_bed, header=None, sep="\t")

In [78]:
df_cc_cov.loc[((df_cc_cov[0]=="chr17") & (df_cc_cov[1]==38510550) & (df_cc_cov[2]==38511050))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
364082,chr17,38510550,38511050,chr17:38510550-38511050,0,.,0,0,500,0.0


In [77]:
# chr17	38510550	38511050
df_in_cov.loc[((df_in_cov[0]=="chr17") & (df_in_cov[1]==38510550) & (df_in_cov[2]==38511050))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
364082,chr17,38510550,38511050,chr17:38510550-38511050,0,.,0,0,500,0.0


In [24]:
df_cc_act = df_cc_cov.iloc[:, [0,1,2]].astype(str).merge(df_cc_cov[6]/df_in_cov[6], left_index=True, right_index=True).set_index([0,1,2])
df_ko_act = df_cc_cov.iloc[:, [0,1,2]].astype(str).merge(df_ko_cov[6]/df_in_cov[6], left_index=True, right_index=True).set_index([0,1,2])

In [79]:
df_cc_act.loc[df_cc_act.isna().any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,activity
0,1,2,Unnamed: 3_level_1
chr1,39081000,39081500,
chr1,39081050,39081550,
chr1,39081100,39081600,
chr1,39081150,39081650,
chr1,65253950,65254450,
...,...,...,...
chrX,25013450,25013950,
chrX,25013500,25014000,
chrX,25013550,25014050,
chrX,25013600,25014100,


In [25]:
df_cc_act.rename(columns={6: "activity"}, inplace=True)
df_ko_act.rename(columns={6: "activity"}, inplace=True)


In [26]:
motif_df_pivot.columns = [f'{i}|{j}' if j != '' else f'{i}' for i,j in motif_df_pivot.columns]

In [27]:
motif_df_with_cc_act = motif_df_pivot.merge(df_cc_act, left_on=["chrm", "start", "end"], right_on=[0,1,2])
motif_df_with_ko_act = motif_df_pivot.merge(df_ko_act, left_on=["chrm", "start", "end"], right_on=[0,1,2])

In [28]:
motif_df_with_cc_act.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|+,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|-,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|+,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|-,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|+,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|-,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|+,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|-,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|+,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|-,...,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|-,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|+,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|-,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|+,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|-,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|+,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|-,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|+,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048750,100049250,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.851064
chr1,100048800,100049300,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.881356
chr1,100048850,100049350,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.887273
chr1,100048900,100049400,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.842718
chr1,100061300,100061800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999793,0.999785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585887


In [29]:
motif_df_with_ko_act.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|+,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|-,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|+,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|-,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|+,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|-,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|+,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|-,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|+,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|-,...,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|-,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|+,Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|-,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|+,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|-,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|+,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|-,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|+,c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048750,100049250,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.648936
chr1,100048800,100049300,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659322
chr1,100048850,100049350,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.669091
chr1,100048900,100049400,0.0,0.0,0.998378,0.998412,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666019
chr1,100061300,100061800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999793,0.999785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.522748


## Motif wise concentration 

In [30]:
# TODO: need to get it from metadata
motif2conc_dict = {
    'Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer': 'ATF2',
    'Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer': 'FOS', 
    'IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer': 'IRF3', 
    'Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer': 'ATF4', 
    'JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer': 'JUNB', 
    'Jun-AP1(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer': '', 
    'Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer': 'ATF3', 
    'c-Jun-CRE(bZIP)/K562-cJun-ChIP-Seq(GSE31477)/Homer': '', 
    'CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer': 'CREB5', 
    'Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer': 'ATF7',  
    'JunD(bZIP)/K562-JunD-ChIP-Seq/Homer': 'JUND', 
    'Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer': 'FOSL2'
}

In [31]:
atf2_conc = "/data5/deepro/starrseq/rnaseq/count/rep3/ATF2_R3_S6_R1_001_normalized_lognormalized.tsv"
cc_conc = "/data5/deepro/starrseq/rnaseq/count/rep3/CC_R3_S24_R1_001_normalized_lognormalized.tsv"

In [32]:
atf2_conc_df = pd.read_csv(atf2_conc, sep="\t", header=None, index_col=0)
cc_conc_df = pd.read_csv(cc_conc, sep="\t", header=None, index_col=0)

In [33]:
ratf2conc_df = atf2_conc_df.loc[[m for m in motif2conc_dict.values() if m]]
rccconc_df = cc_conc_df.loc[[m for m in motif2conc_dict.values() if m]]

In [34]:
conc_df = rccconc_df.merge(ratf2conc_df, left_index=True, right_index=True)

In [35]:
ncdf = conc_df.divide(conc_df.max(axis=1), axis=0)

In [36]:
ncdf.columns = [c.split("_")[1] for c in ncdf.columns]

In [37]:
motif_conc_df = ncdf.groupby(by= ncdf.columns, axis=1).agg("mean")

# Rebuilding final dataframe as a training set

Final table format

chrm | start | end | motif1+ | motif1- | motif1conc_cc | motif2+ | motif2- | motif2conc_cc | ... | cc_activity

.
.
.

chrm | start | end | motif1+ | motif1- | motif1conc_ko | motif2+ | motif2- | motif2conc_ko | ... | ko_activity

In [38]:
keep_columns = [k+ f"|{s}" for s in ("+", "-") for k,v in motif2conc_dict.items() if v]
keep_columns += ["activity"]

In [39]:
motif_df_with_cc_act = motif_df_with_cc_act.loc[:, keep_columns]
motif_df_with_ko_act = motif_df_with_ko_act.loc[:, keep_columns]

In [40]:
motif_df_with_ko_act

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|+,Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer|+,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|+,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|+,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|+,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|+,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|+,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|+,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|+,Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer|+,...,Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer|-,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|-,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|-,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|-,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|-,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|-,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|-,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|-,Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048750,100049250,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.648936
chr1,100048800,100049300,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.659322
chr1,100048850,100049350,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.669091
chr1,100048900,100049400,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.666019
chr1,100061300,100061800,0.0,0.0,0.000000,0.0,0.0,0.000000,0.999793,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.999785,0.0,0.0,0.0,0.522748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341750,9342250,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.999985,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.357977
chrX,9341800,9342300,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.999985,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.367263
chrX,9341850,9342350,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.340237
chrX,9341900,9342400,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.310569


In [41]:
motif_df_with_cc_act

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|+,Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer|+,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|+,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|+,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|+,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|+,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|+,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|+,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|+,Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer|+,...,Fos(bZIP)/TSC-Fos-ChIP-Seq(GSE110950)/Homer|-,IRF3(IRF)/BMDM-Irf3-ChIP-Seq(GSE67343)/Homer|-,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|-,JunB(bZIP)/DendriticCells-Junb-ChIP-Seq(GSE36099)/Homer|-,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|-,CREB5(bZIP)/LNCaP-CREB5.V5-ChIP-Seq(GSE137775)/Homer|-,Atf7(bZIP)/3T3L1-Atf7-ChIP-Seq(GSE56872)/Homer|-,JunD(bZIP)/K562-JunD-ChIP-Seq/Homer|-,Fosl2(bZIP)/3T3L1-Fosl2-ChIP-Seq(GSE56872)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048750,100049250,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.851064
chr1,100048800,100049300,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.881356
chr1,100048850,100049350,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.887273
chr1,100048900,100049400,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.998412,0.000000,0.0,0.0,0.0,0.842718
chr1,100061300,100061800,0.0,0.0,0.000000,0.0,0.0,0.000000,0.999793,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.999785,0.0,0.0,0.0,0.585887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341750,9342250,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.999985,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.286641
chrX,9341800,9342300,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.999985,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.288858
chrX,9341850,9342350,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.266272
chrX,9341900,9342400,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.253659


In [42]:
renamed_cols = [motif2conc_dict[colname[:-2]] + colname[-1:] for colname in keep_columns[:-1]]
renamed_cols += ["activity"]

In [43]:
motif_df_with_cc_act.columns = renamed_cols
motif_df_with_ko_act.columns = renamed_cols

In [44]:
new_cols = [x+"conc" for x in motif_conc_df.index]

In [45]:
motif_conc_df.loc["ATF2", "x"]

0.945075627796863

In [46]:
for col in new_cols:
    motif_df_with_cc_act[col] = motif_conc_df.loc[col.replace("conc", ""), "x"]
    motif_df_with_ko_act[col] = motif_conc_df.loc[col.replace("conc", ""), "y"]
    

In [70]:
final_activity_df = pd.concat((motif_df_with_cc_act, motif_df_with_ko_act), axis=0)

In [71]:
final_activity_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ATF2+,FOS+,IRF3+,ATF4+,JUNB+,ATF3+,CREB5+,ATF7+,JUND+,FOSL2+,...,ATF2conc,FOSconc,IRF3conc,ATF4conc,JUNBconc,ATF3conc,CREB5conc,ATF7conc,JUNDconc,FOSL2conc
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048750,100049250,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.945076,0.560994,0.881392,0.907077,0.624504,0.467825,0.90614,0.940654,0.620068,0.169023
chr1,100048800,100049300,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.945076,0.560994,0.881392,0.907077,0.624504,0.467825,0.90614,0.940654,0.620068,0.169023
chr1,100048850,100049350,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.945076,0.560994,0.881392,0.907077,0.624504,0.467825,0.90614,0.940654,0.620068,0.169023
chr1,100048900,100049400,0.0,0.0,0.000000,0.0,0.0,0.998378,0.000000,0.0,0.0,0.0,...,0.945076,0.560994,0.881392,0.907077,0.624504,0.467825,0.90614,0.940654,0.620068,0.169023
chr1,100061300,100061800,0.0,0.0,0.000000,0.0,0.0,0.000000,0.999793,0.0,0.0,0.0,...,0.945076,0.560994,0.881392,0.907077,0.624504,0.467825,0.90614,0.940654,0.620068,0.169023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341750,9342250,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.500809,0.947392,0.957930,0.386333,0.878609,0.915016,0.85843,0.846648,0.913771,0.898798
chrX,9341800,9342300,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.500809,0.947392,0.957930,0.386333,0.878609,0.915016,0.85843,0.846648,0.913771,0.898798
chrX,9341850,9342350,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.500809,0.947392,0.957930,0.386333,0.878609,0.915016,0.85843,0.846648,0.913771,0.898798
chrX,9341900,9342400,0.0,0.0,0.999875,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.500809,0.947392,0.957930,0.386333,0.878609,0.915016,0.85843,0.846648,0.913771,0.898798


In [54]:
refined_motifs = [v for k,v in motif2conc_dict.items() if v]

In [66]:
all_cols = [m + suf for m in refined_motifs for suf in ("+", "-", "conc")]

In [67]:
all_cols += ["activity"]

In [72]:
final_activity_df = final_activity_df.loc[:, all_cols].reset_index()

In [74]:
final_activity_df.to_csv(os.path.join(os.path.dirname(motif_df_path), "activity_data.csv.gz"), index=False)