In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Tab-separated motif-scan output, one row per motif hit (loaded below with sep="\t").
# NOTE(review): motif names end in "/Homer", so this is presumably HOMER output — confirm.
motif_location_output_file = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/motif_odds.tsv"

# Parse the motif output file into a form compatible with the activity data

## Motif probability

In [3]:
# read_table defaults to a tab delimiter, so no explicit separator is needed.
motif_raw_df = pd.read_table(motif_location_output_file)

In [4]:
motif_raw_df.head()

Unnamed: 0,PositionID,Offset,Sequence,Motif Name,Strand,MotifScore
0,chrX:154800550-154801050,-93,ATGAGTAAGC,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,6.562859
1,chrX:154800550-154801050,-85,GATGAGTAAG,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,-,6.561993
2,chrX:154800500-154801000,-43,ATGAGTAAGC,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,6.562859
3,chrX:154800500-154801000,-35,GATGAGTAAG,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,-,6.561993
4,chrX:154800450-154800950,7,ATGAGTAAGC,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,6.562859


In [5]:
def process(df_row):
    """Convert one raw motif-hit row into region coordinates plus a hit probability.

    Fields are read by position (not by label): 0 is the "chrom:start-end"
    position id, 3 the motif name, 4 the strand and 5 the motif score.

    Parameters
    ----------
    df_row : pandas.Series
        One row of the raw motif table, as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    pandas.Series
        ``chrm``, ``start``, ``end`` (start/end stay strings), ``motif``,
        ``strand`` and ``prob`` (the logistic transform of the score).

    Raises
    ------
    IndexError
        If the position id contains no ":".
    ValueError
        If the part after ":" is not exactly "start-end".
    """
    # .iloc makes the positional access explicit; plain [] with an integer on a
    # string-labelled Series depends on a deprecated pandas fallback.
    position_parts = df_row.iloc[0].split(":")
    chrm = position_parts[0]
    start, end = position_parts[1].split("-")

    score = df_row.iloc[5]
    # Overflow-safe logistic: exp(-log(1 + exp(-score))) == exp(score) / (1 + exp(score)),
    # but it never evaluates exp() of a large positive number (which would give inf/inf = nan).
    prob = np.exp(-np.logaddexp(0.0, -score))

    return pd.Series({
        "chrm": chrm,
        "start": start,
        "end": end,
        "motif": df_row.iloc[3],
        "strand": df_row.iloc[4],
        "prob": prob,
    })

In [6]:
# Cache location for the parsed motif table.
# NOTE: despite the ".tsv.gz" name, the cache cell below writes this file with
# DataFrame.to_pickle and reads it with pd.read_pickle, so it is a pickle, not a TSV.
motif_df_path = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/motif_df_tmp.tsv.gz"

In [7]:
# Build (or reload) the parsed motif table: chrm, start, end, motif, strand, prob.
# The previous row-wise `motif_raw_df.apply(process, axis=1)` took ~246 minutes;
# the vectorised construction below produces the same columns in one pass.
if not os.path.exists(motif_df_path):
    # Columns are taken by position, matching process(): 0 = "chrom:start-end",
    # 3 = motif name, 4 = strand, 5 = motif score.
    chrm_and_span = motif_raw_df.iloc[:, 0].str.split(":", expand=True)
    start_and_end = chrm_and_span[1].str.split("-", expand=True)
    motif_scores = motif_raw_df.iloc[:, 5].to_numpy(dtype=float)

    motif_df = pd.DataFrame({
        "chrm": chrm_and_span[0],
        "start": start_and_end[0],  # kept as strings, as process() returns them
        "end": start_and_end[1],
        "motif": motif_raw_df.iloc[:, 3],
        "strand": motif_raw_df.iloc[:, 4],
        # Overflow-safe logistic transform of the score: exp(s) / (1 + exp(s)).
        "prob": np.exp(-np.logaddexp(0.0, -motif_scores)),
    })
    # The cache is a pickle (compression inferred from the ".gz" suffix).
    motif_df.to_pickle(motif_df_path)
else:
    # SECURITY: unpickling executes arbitrary code; only load cache files that
    # this notebook itself produced.
    motif_df = pd.read_pickle(motif_df_path)

In [8]:
motif_df.head()

Unnamed: 0,chrm,start,end,motif,strand,prob
0,chrX,154800550,154801050,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,0.99859
1,chrX,154800550,154801050,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,-,0.998589
2,chrX,154800500,154801000,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,0.99859
3,chrX,154800500,154801000,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,-,0.998589
4,chrX,154800450,154800950,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,+,0.99859


In [9]:
# One row per region, one column per (motif, strand). Several hits of the same
# motif/strand in a region are averaged (pivot_table's default aggregation);
# region/motif combinations with no hit are filled with 0.0.
motif_df_pivot = motif_df.pivot_table(
    values="prob",
    index=["chrm", "start", "end"],
    columns=["motif", "strand"],
    aggfunc="mean",
    fill_value=0.0,
)

In [10]:
motif_df_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,motif,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer,...,n-Myc(bHLH)/mES-nMyc-ChIP-Seq(GSE11431)/Homer,n-Myc(bHLH)/mES-nMyc-ChIP-Seq(GSE11431)/Homer,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer
Unnamed: 0_level_1,Unnamed: 1_level_1,strand,+,-,+,-,+,-,+,-,+,-,...,+,-,+,-,+,-,+,-,+,-
chrm,start,end,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
chr1,100048200,100048700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048250,100048750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048300,100048800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048350,100048850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chr1,100048400,100048900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341850,9342350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9341900,9342400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9341950,9342450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrX,9342000,9342500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Region wise activity

In [11]:
# Per-region coverage tables (tab-separated, no header row) for the input
# library and the two conditions whose activity is computed below.
# NOTE(review): "ko" points at the atf2 directory — presumably the ATF2 knock-out line; confirm.
input_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/input.cov.bed"
cc_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/cc/cc.cov.bed"
ko_cov_bed = "/data5/deepro/starrseq/main_lib/results/activity_prediction/data/atf2/atf2.cov.bed"

In [12]:
# Load the three coverage tables with identical settings (no header, tab-separated),
# in the same order as before: input, cc, ko.
df_in_cov, df_cc_cov, df_ko_cov = (
    pd.read_csv(bed_path, sep="\t", header=None)
    for bed_path in (input_cov_bed, cc_cov_bed, ko_cov_bed)
)

In [13]:
# Spot check: show the cc coverage row for region chr17:38510550-38511050.
df_cc_cov.loc[((df_cc_cov[0]=="chr17") & (df_cc_cov[1]==38510550) & (df_cc_cov[2]==38511050))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
364082,chr17,38510550,38511050,chr17:38510550-38511050,0,.,0,0,500,0.0


In [14]:
# Same spot check (chr17:38510550-38511050) against the input coverage table.
df_in_cov.loc[((df_in_cov[0]=="chr17") & (df_in_cov[1]==38510550) & (df_in_cov[2]==38511050))]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
364082,chr17,38510550,38511050,chr17:38510550-38511050,0,.,0,0,500,0.0


In [15]:
# Activity = condition coverage (column 6) divided by input coverage (column 6),
# computed row-by-row across the coverage tables.
# NOTE(review): column 6 is presumably the per-region read count — confirm.

# The division pairs rows purely by position, so fail loudly if the region
# columns (0 = chrom, 1 = start, 2 = end) differ between the tables.
_input_regions = df_in_cov.iloc[:, [0, 1, 2]]
for _label, _cov in (("cc", df_cc_cov), ("ko", df_ko_cov)):
    if not _cov.iloc[:, [0, 1, 2]].equals(_input_regions):
        raise ValueError(
            f"{_label} coverage regions are not row-aligned with the input coverage regions"
        )


def _activity_over_input(cov_df):
    """Return coverage/input ratios for ``cov_df``, indexed by its own (chrom, start, end) as strings."""
    ratio = cov_df[6] / df_in_cov[6]
    # x/0 yields +/-inf, which a later dropna() would keep; turn it into NaN so
    # regions without input coverage are dropped together with the 0/0 cases.
    ratio = ratio.replace([np.inf, -np.inf], np.nan)
    return (
        cov_df.iloc[:, [0, 1, 2]]
        .astype(str)
        .merge(ratio, left_index=True, right_index=True)
        .set_index([0, 1, 2])
    )


df_cc_act = _activity_over_input(df_cc_cov)
# Coordinates now come from the ko table itself (previously taken from df_cc_cov).
df_ko_act = _activity_over_input(df_ko_cov)

In [18]:
# The ratio column still carries the coverage column label 6; give it a readable name.
df_cc_act = df_cc_act.rename(columns={6: "activity"})
df_ko_act = df_ko_act.rename(columns={6: "activity"})


In [23]:
# Discard regions whose activity is NaN.
df_cc_act, df_ko_act = (
    activity_df.dropna(axis=0, how="any")
    for activity_df in (df_cc_act, df_ko_act)
)

In [28]:
# Collapse the (motif, strand) column pairs into flat "motif|strand" labels.
# Not re-runnable: once the labels are flat strings the pair unpacking fails.
motif_df_pivot.columns = [
    f'{motif}' if strand == '' else f'{motif}|{strand}'
    for motif, strand in motif_df_pivot.columns
]

In [29]:
# Inner-join the motif features with each activity table on the region key:
# index levels chrm/start/end on the left, index levels 0/1/2 on the right.
motif_df_with_cc_act = pd.merge(
    motif_df_pivot,
    df_cc_act,
    how="inner",
    left_on=["chrm", "start", "end"],
    right_on=[0, 1, 2],
)
motif_df_with_ko_act = pd.merge(
    motif_df_pivot,
    df_ko_act,
    how="inner",
    left_on=["chrm", "start", "end"],
    right_on=[0, 1, 2],
)

In [30]:
motif_df_with_cc_act.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer|+,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer|-,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer|+,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer|-,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer|+,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer|-,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|+,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|-,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer|+,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer|-,...,n-Myc(bHLH)/mES-nMyc-ChIP-Seq(GSE11431)/Homer|-,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer|+,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer|-,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer|+,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer|-,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer|+,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer|-,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer|+,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048200,100048700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54717
chr1,100048250,100048750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.696721
chr1,100048300,100048800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.786378
chr1,100048350,100048850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75
chr1,100048400,100048900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.698876


In [31]:
motif_df_with_ko_act.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer|+,AMYB(HTH)/Testes-AMYB-ChIP-Seq(GSE44588)/Homer|-,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer|+,AP-1(bZIP)/ThioMac-PU.1-ChIP-Seq(GSE21512)/Homer|-,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer|+,AP-2alpha(AP2)/Hela-AP2alpha-ChIP-Seq(GSE31477)/Homer|-,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|+,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|-,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer|+,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer|-,...,n-Myc(bHLH)/mES-nMyc-ChIP-Seq(GSE11431)/Homer|-,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer|+,p53(p53)/Saos-p53-ChIP-Seq(GSE15780)/Homer|-,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer|+,p53(p53)/mES-cMyc-ChIP-Seq(GSE11431)/Homer|-,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer|+,p63(p53)/Keratinocyte-p63-ChIP-Seq(GSE17611)/Homer|-,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer|+,p73(p53)/Trachea-p73-ChIP-Seq(PRJNA310161)/Homer|-,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048200,100048700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471698
chr1,100048250,100048750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.520492
chr1,100048300,100048800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.597523
chr1,100048350,100048850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609694
chr1,100048400,100048900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.604494


## Motif-wise concentration

In [32]:
# Motif metadata: maps each motif name to a row label of the expression tables.
# An empty string means "not annotated"; such motifs are skipped downstream.
import json
motif_metadata_file = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/metadata_motif.json"

# Template with every observed motif mapped to "".
motif_meta_dict = dict.fromkeys(motif_df["motif"].unique(), "")

if os.path.exists(motif_metadata_file):
    # An existing file replaces the template entirely.
    with open(motif_metadata_file, "r") as f:
        motif_meta_dict = json.load(f)
else:
    # First run: write the empty template to disk.
    with open(motif_metadata_file, "w") as f:
        json.dump(motif_meta_dict, f, indent=4)

In [33]:
# Expression tables used as the "concentration" of each motif's factor:
# tab-separated, no header, first column used as the row index when loaded below.
# TODO: add concentration from all KO lines
atf2_conc = "/data5/deepro/starrseq/rnaseq/count/rep3/ATF2_R3_S6_R1_001_normalized_lognormalized.tsv"
cc_conc = "/data5/deepro/starrseq/rnaseq/count/rep3/CC_R3_S24_R1_001_normalized_lognormalized.tsv"

In [34]:
# Load both expression tables with identical settings; the first column becomes the index.
atf2_conc_df, cc_conc_df = (
    pd.read_csv(conc_path, sep="\t", header=None, index_col=0)
    for conc_path in (atf2_conc, cc_conc)
)

In [35]:
# Keep only the expression rows referenced by annotated motifs (non-empty metadata values).
annotated_ids = [row_id for row_id in set(motif_meta_dict.values()) if row_id]
ratf2conc_df = atf2_conc_df.loc[annotated_ids]
rccconc_df = cc_conc_df.loc[annotated_ids]

In [36]:
# Side-by-side expression per row id. Overlapping column labels get the default
# suffixes: "_x" for the cc table (left) and "_y" for the atf2 table (right).
conc_df = pd.merge(
    rccconc_df,
    ratf2conc_df,
    left_index=True,
    right_index=True,
    suffixes=("_x", "_y"),
)

In [37]:
# Scale every row by its own maximum across all columns; NaN results become 0.0.
row_maximum = conc_df.max(axis=1)
ncdf = conc_df.div(row_maximum, axis=0).fillna(0.0)

In [38]:
# Reduce each column label to the part after the first "_" (the merge suffix).
# Not re-runnable: after one pass the labels no longer contain "_".
ncdf.columns = ncdf.columns.map(lambda label: label.split("_")[1])

In [39]:
# Average the columns that share a label, giving one column per label.
# groupby(..., axis=1) is deprecated in recent pandas, so group the transposed
# frame by its index (the former column labels) and transpose back.
motif_conc_df = ncdf.T.groupby(level=0).mean().T

# Rebuilding final dataframe as a training set

Final table format

chrm | start | end | motif1+ | motif1- | motif1conc_cc | motif2+ | motif2- | motif2conc_cc | ... | cc_activity

.
.
.

chrm | start | end | motif1+ | motif1- | motif1conc_ko | motif2+ | motif2- | motif2conc_ko | ... | ko_activity

In [40]:
# Strand-specific feature columns of annotated motifs (all "+" first, then all "-"),
# followed by the activity target.
annotated_motifs = [motif for motif, row_id in motif_meta_dict.items() if row_id]
keep_columns = [
    motif + f"|{strand}"
    for strand in ("+", "-")
    for motif in annotated_motifs
]
keep_columns.append("activity")

In [41]:
# Restrict both tables to the annotated-motif columns plus activity.
motif_df_with_cc_act, motif_df_with_ko_act = (
    table.loc[:, keep_columns]
    for table in (motif_df_with_cc_act, motif_df_with_ko_act)
)

In [44]:
# One "<motif>|conc" column label per annotated motif.
new_cols = [
    motif + "|conc"
    for motif, row_id in motif_meta_dict.items()
    if row_id
]

In [45]:
# Sanity check: look up the "x" concentration of the first annotated motif
# (strip "|conc" to get the motif name, map it through the metadata to a row label).
motif_conc_df.loc[motif_meta_dict[new_cols[0].replace("|conc", "")], "x"]

0.8889495161404644

In [46]:
# Attach one constant "<motif>|conc" column per annotated motif to each table.
# "x" holds the cc-derived concentration and "y" the atf2-derived one (the
# suffixes assigned when the two expression tables were merged).
# All columns are built first and attached with a single concat: inserting
# them one at a time fragments the frame and triggers a PerformanceWarning.
cc_conc_values = {}
ko_conc_values = {}
for col in new_cols:
    conc_row_id = motif_meta_dict[col.replace("|conc", "")]
    cc_conc_values[col] = motif_conc_df.loc[conc_row_id, "x"]
    ko_conc_values[col] = motif_conc_df.loc[conc_row_id, "y"]

# Dropping any existing "|conc" columns first keeps the cell re-runnable
# (the old loop simply overwrote them).
motif_df_with_cc_act = pd.concat(
    [
        motif_df_with_cc_act.drop(columns=new_cols, errors="ignore"),
        pd.DataFrame(cc_conc_values, index=motif_df_with_cc_act.index),
    ],
    axis=1,
)
motif_df_with_ko_act = pd.concat(
    [
        motif_df_with_ko_act.drop(columns=new_cols, errors="ignore"),
        pd.DataFrame(ko_conc_values, index=motif_df_with_ko_act.index),
    ],
    axis=1,
)
    

  motif_df_with_cc_act[col] = motif_conc_df.loc[motif_meta_dict[col.replace("|conc", "")], "x"]
  motif_df_with_ko_act[col] = motif_conc_df.loc[motif_meta_dict[col.replace("|conc", "")], "y"]


In [47]:
# Stack the cc rows on top of the ko rows; the original region index is kept,
# so a region present in both tables appears twice.
final_activity_df = pd.concat(
    [motif_df_with_cc_act, motif_df_with_ko_act],
    axis=0,
    ignore_index=False,
)

In [48]:
final_activity_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|+,Ap4(bHLH)/AML-Tfap4-ChIP-Seq(GSE45738)/Homer|+,"FOXA1:AR(Forkhead,NR)/LNCAP-AR-ChIP-Seq(GSE27824)/Homer|+",Arnt:Ahr(bHLH)/MCF7-Arnt-ChIP-Seq(Lo_et_al.)/Homer|+,Ascl1(bHLH)/NeuralTubes-Ascl1-ChIP-Seq(GSE55840)/Homer|+,Ascl2(bHLH)/ESC-Ascl2-ChIP-Seq(GSE97712)/Homer|+,Atf1(bZIP)/K562-ATF1-ChIP-Seq(GSE31477)/Homer|+,Atf2(bZIP)/3T3L1-Atf2-ChIP-Seq(GSE56872)/Homer|+,Atf3(bZIP)/GBM-ATF3-ChIP-Seq(GSE33912)/Homer|+,Atf4(bZIP)/MEF-Atf4-ChIP-Seq(GSE35681)/Homer|+,...,ZNF519(Zf)/HEK293-ZNF519.GFP-ChIP-Seq(GSE58341)/Homer|conc,ZNF528(Zf)/HEK293-ZNF528.GFP-ChIP-Seq(GSE58341)/Homer|conc,ZNF652/HepG2-ZNF652.Flag-ChIP-Seq(Encode)/Homer|conc,ZNF669(Zf)/HEK293-ZNF669.GFP-ChIP-Seq(GSE58341)/Homer|conc,ZNF675(Zf)/HEK293-ZNF675.GFP-ChIP-Seq(GSE58341)/Homer|conc,ZNF692(Zf)/HEK293-ZNF692.GFP-ChIP-Seq(GSE58341)/Homer|conc,ZNF711(Zf)/SHSY5Y-ZNF711-ChIP-Seq(GSE20673)/Homer|conc,ZNF768(Zf)/Rajj-ZNF768-ChIP-Seq(GSE111879)/Homer|conc,ZNF7(Zf)/HepG2-ZNF7.Flag-ChIP-Seq(Encode)/Homer|conc,ZSCAN22(Zf)/HEK293-ZSCAN22.GFP-ChIP-Seq(GSE58341)/Homer|conc
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048200,100048700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630057,0.024305,0.652352,0.921915,0.337694,0.923242,0.067274,0.965006,0.728655,0.745794
chr1,100048250,100048750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630057,0.024305,0.652352,0.921915,0.337694,0.923242,0.067274,0.965006,0.728655,0.745794
chr1,100048300,100048800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630057,0.024305,0.652352,0.921915,0.337694,0.923242,0.067274,0.965006,0.728655,0.745794
chr1,100048350,100048850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630057,0.024305,0.652352,0.921915,0.337694,0.923242,0.067274,0.965006,0.728655,0.745794
chr1,100048400,100048900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630057,0.024305,0.652352,0.921915,0.337694,0.923242,0.067274,0.965006,0.728655,0.745794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrX,9341850,9342350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.850625,0.890435,0.974039,0.592056,0.867670,0.439885,0.922270,0.414931,0.925236,0.912235
chrX,9341900,9342400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.850625,0.890435,0.974039,0.592056,0.867670,0.439885,0.922270,0.414931,0.925236,0.912235
chrX,9341950,9342450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.850625,0.890435,0.974039,0.592056,0.867670,0.439885,0.922270,0.414931,0.925236,0.912235
chrX,9342000,9342500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.850625,0.890435,0.974039,0.592056,0.867670,0.439885,0.922270,0.414931,0.925236,0.912235


In [49]:
# Column order that places each motif's "+", "-" and "conc" columns next to
# each other, with the activity target last.
cols_together = []
for motif, row_id in motif_meta_dict.items():
    if not row_id:
        continue
    cols_together.extend(motif + suffix for suffix in ("|+", "|-", "|conc"))
cols_together.append("activity")

In [50]:
len(cols_together)

1042

In [51]:
# Reorder the columns so each motif's three columns are contiguous.
final_activity_df = final_activity_df[cols_together]

In [52]:
final_activity_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|+,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|-,AP-2gamma(AP2)/MCF7-TFAP2C-ChIP-Seq(GSE21234)/Homer|conc,Ap4(bHLH)/AML-Tfap4-ChIP-Seq(GSE45738)/Homer|+,Ap4(bHLH)/AML-Tfap4-ChIP-Seq(GSE45738)/Homer|-,Ap4(bHLH)/AML-Tfap4-ChIP-Seq(GSE45738)/Homer|conc,"FOXA1:AR(Forkhead,NR)/LNCAP-AR-ChIP-Seq(GSE27824)/Homer|+","FOXA1:AR(Forkhead,NR)/LNCAP-AR-ChIP-Seq(GSE27824)/Homer|-","FOXA1:AR(Forkhead,NR)/LNCAP-AR-ChIP-Seq(GSE27824)/Homer|conc",Arnt:Ahr(bHLH)/MCF7-Arnt-ChIP-Seq(Lo_et_al.)/Homer|+,...,ZNF768(Zf)/Rajj-ZNF768-ChIP-Seq(GSE111879)/Homer|+,ZNF768(Zf)/Rajj-ZNF768-ChIP-Seq(GSE111879)/Homer|-,ZNF768(Zf)/Rajj-ZNF768-ChIP-Seq(GSE111879)/Homer|conc,ZNF7(Zf)/HepG2-ZNF7.Flag-ChIP-Seq(Encode)/Homer|+,ZNF7(Zf)/HepG2-ZNF7.Flag-ChIP-Seq(Encode)/Homer|-,ZNF7(Zf)/HepG2-ZNF7.Flag-ChIP-Seq(Encode)/Homer|conc,ZSCAN22(Zf)/HEK293-ZSCAN22.GFP-ChIP-Seq(GSE58341)/Homer|+,ZSCAN22(Zf)/HEK293-ZSCAN22.GFP-ChIP-Seq(GSE58341)/Homer|-,ZSCAN22(Zf)/HEK293-ZSCAN22.GFP-ChIP-Seq(GSE58341)/Homer|conc,activity
chrm,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
chr1,100048200,100048700,0.0,0.0,0.88895,0.0,0.0,0.989811,0.0,0.0,0.636163,0.0,...,0.0,0.0,0.965006,0.0,0.0,0.728655,0.0,0.0,0.745794,0.54717
chr1,100048250,100048750,0.0,0.0,0.88895,0.0,0.0,0.989811,0.0,0.0,0.636163,0.0,...,0.0,0.0,0.965006,0.0,0.0,0.728655,0.0,0.0,0.745794,0.696721
chr1,100048300,100048800,0.0,0.0,0.88895,0.0,0.0,0.989811,0.0,0.0,0.636163,0.0,...,0.0,0.0,0.965006,0.0,0.0,0.728655,0.0,0.0,0.745794,0.786378
chr1,100048350,100048850,0.0,0.0,0.88895,0.0,0.0,0.989811,0.0,0.0,0.636163,0.0,...,0.0,0.0,0.965006,0.0,0.0,0.728655,0.0,0.0,0.745794,0.75
chr1,100048400,100048900,0.0,0.0,0.88895,0.0,0.0,0.989811,0.0,0.0,0.636163,0.0,...,0.0,0.0,0.965006,0.0,0.0,0.728655,0.0,0.0,0.745794,0.698876


# Group the columns by motif

In [53]:
import re

In [54]:
# Everything before the last "|" that is still followed by at least one character.
# Raw string: "\|" in a normal string literal is an invalid escape sequence.
_MOTIF_PREFIX_PATTERN = re.compile(r"(.+)\|.+")


def grouping_function(index_val):
    """Return the motif part of a "<motif>|<suffix>" column label.

    Parameters
    ----------
    index_val : str
        Column label such as "<motif>|+", "<motif>|-" or "<motif>|conc".

    Returns
    -------
    str
        The label with its final "|<suffix>" removed.

    Raises
    ------
    ValueError
        If the label has no "|<suffix>" part (previously an opaque
        AttributeError on ``None``).
    """
    label_match = _MOTIF_PREFIX_PATTERN.match(index_val)
    if label_match is None:
        raise ValueError(f"column label {index_val!r} has no '|<suffix>' part")
    return label_match.group(1)

In [55]:
def aggregate_function(cols):
    """Collapse one motif's three columns into a single per-row value.

    The first two columns are reduced to their row-wise maximum, which is then
    multiplied by the third column. Columns are taken by position, so the
    caller must pass them in that order.
    """
    first_two_columns = cols.iloc[:, :2]
    third_column = cols.iloc[:, 2]
    row_maximum = first_two_columns.max(axis=1)
    return row_maximum * third_column

In [56]:
# Collapse each motif's columns into one feature via aggregate_function.
# groupby(..., axis=1) is deprecated in recent pandas, so the column groups
# are built explicitly; every column except the last (activity) is grouped
# by the motif name that grouping_function extracts from its label.
_columns_by_motif = {}
for _column in final_activity_df.columns[:-1]:
    _columns_by_motif.setdefault(grouping_function(_column), []).append(_column)

# Motifs are emitted in sorted order, matching groupby's default key sorting.
# Values are passed as plain arrays together with the original index so no
# index alignment takes place (the index contains repeated regions).
fadf = pd.DataFrame(
    {
        _motif: aggregate_function(final_activity_df[_member_columns]).to_numpy()
        for _motif, _member_columns in sorted(_columns_by_motif.items())
    },
    index=final_activity_df.index,
)

In [57]:
fadf.shape

(1893193, 347)

In [58]:
# Re-attach the activity target, which was excluded from the per-motif aggregation.
# NOTE(review): this aligns on the row index, which presumably matches final_activity_df's exactly — confirm.
fadf["activity"] = final_activity_df["activity"]

In [63]:
# Write the training table next to the motif cache; the region index levels
# are turned into ordinary columns first.
output_dir = os.path.dirname(motif_df_path)
activity_csv_path = os.path.join(output_dir, "activity_data.csv.gz")
fadf.reset_index().to_csv(activity_csv_path, index=False)