# Motifs that were enriched in categorized fragments responding to the depletion

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def get_top_features(filename, ntop=10):
    """Return the leading entries of the first column of a header-less CSV.

    Args:
        filename: Anything ``pd.read_csv`` accepts as its input.
        ntop: Number of leading rows to keep.

    Returns:
        A list holding the column-0 values of the rows labelled
        ``0`` through ``ntop - 1``.
    """
    ranked = pd.read_csv(filename, header=None)
    first_column = ranked[0]
    # ``.loc`` slices by label and includes the end label, hence ``ntop - 1``.
    last_label = ntop - 1
    return first_column.loc[:last_label].to_list()

def get_scores(filename):
    """Read a CSV (first line used as header) and return it transposed.

    After the transpose the CSV's column names form the row index and each
    data row of the file becomes one column.

    Args:
        filename: Anything ``pd.read_csv`` accepts as its input.

    Returns:
        The transposed ``DataFrame``.
    """
    score_table = pd.read_csv(filename)
    return score_table.transpose()


# Top 10 motifs for each KO line by enhancer category

In [10]:
# Category names; each one is a sub-directory below a line's directory and a
# column label in the tables built further down.
categories = [
    "peaks_vs_notpeaks",
    "responsive_vs_nonresponsive",
    "induced_vs_repressed",
]
# Line identifiers; each one is a sub-directory of ``store_dir``.
ko_lines = [
    "ATF2",
    "CTCF",
    "FOXA1",
    "LEF1",
    "SCRT1",
    "TCF7L2",
    "16P12_1",
]
# Root directory that holds the per-line result folders.
store_dir = "../data/"

def get_feature_file(store_dir, ko, cat):
    """Build the path ``<store_dir>/<ko>/<cat>/features.csv``.

    Args:
        store_dir: Root directory of the stored results.
        ko: Name of the line's sub-directory.
        cat: Name of the category sub-directory.

    Returns:
        The joined path as a string; the file is not opened or checked.
    """
    path_parts = (store_dir, ko, cat, "features.csv")
    return os.path.join(*path_parts)

def get_score_file(store_dir, ko, cat):
    """Build the path ``<store_dir>/<ko>/<cat>/scores.csv``.

    Args:
        store_dir: Root directory of the stored results.
        ko: Name of the line's sub-directory.
        cat: Name of the category sub-directory.

    Returns:
        The joined path as a string; the file is not opened or checked.
    """
    path_parts = (store_dir, ko, cat, "scores.csv")
    return os.path.join(*path_parts)

def create_ko_specific_df(store_dir, ko, categories):
    """Assemble the top-feature table and the score table for one line.

    For every category the function reads
    ``<store_dir>/<ko>/<category>/features.csv`` (through ``get_feature_file``
    and ``get_top_features``) and ``<store_dir>/<ko>/<category>/scores.csv``
    (through ``get_score_file`` and ``get_scores``).

    Args:
        store_dir: Root directory of the stored results.
        ko: Name of the line's sub-directory.
        categories: Ordered category names; used both as sub-directory names
            and as the column labels of the two returned frames.

    Returns:
        A ``(features_df, score_df)`` tuple. ``features_df`` has one column
        per category listing that category's top features; ``score_df`` has
        one column per category holding the transposed contents of the
        category's score file. Both frames are empty when ``categories`` is
        empty.
    """
    features_by_category = {}
    score_frames = []
    for category in categories:
        top_features = get_top_features(get_feature_file(store_dir, ko, category))
        # Keyed by the category itself so the keys always match
        # ``columns=categories`` below. Wrapping in a Series lets the
        # DataFrame constructor pad shorter lists with NaN instead of raising
        # when one file holds fewer rows than another.
        features_by_category[category] = pd.Series(top_features)
        score_frames.append(get_scores(get_score_file(store_dir, ko, category)))

    features_df = pd.DataFrame(data=features_by_category, columns=categories)

    if score_frames:
        score_df = pd.concat(score_frames, axis=1)
        score_df.columns = categories
    else:
        # ``pd.concat`` raises ValueError on an empty list; return an empty
        # frame so the no-category case mirrors ``features_df``.
        score_df = pd.DataFrame(columns=categories)
    return features_df, score_df




# ATF2

In [11]:
# Load the top-feature table and the score table for ko_lines[0] ("ATF2").
atf2_features_df, atf2_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[0],
    categories=categories,
)

In [12]:
# Show the ATF2 top-motif table, one column per category (cell output).
atf2_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|+,ZFX_HUMAN.H11MO.0.A|-
1,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|-,ZFX_HUMAN.H11MO.0.A|+
2,P63_HUMAN.H11MO.0.A|-,ATF4_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|+
3,P63_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-
4,ATF4_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|+
5,CREB1_HUMAN.H11MO.0.A|-,CEBPG_HUMAN.H11MO.0.B|-,P63_HUMAN.H11MO.0.A|-
6,CEBPG_HUMAN.H11MO.0.B|-,ETS2_HUMAN.H11MO.0.B|+,NFIA_HUMAN.H11MO.0.C|+
7,TYY1_HUMAN.H11MO.0.A|+,SUH_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|+
8,P73_HUMAN.H11MO.0.A|+,IKZF1_HUMAN.H11MO.0.C|-,ATF4_HUMAN.H11MO.0.A|-
9,P73_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|+,SUH_HUMAN.H11MO.0.A|+


In [13]:
# Show the ATF2 score table, one column per category (cell output).
atf2_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.045371,0.154215,0.708835
Precision,0.010298,0.123425,0.634789
Recall,0.639896,0.526944,0.71406
Accuracy,0.707807,0.594089,0.727434
F1_score,0.020253,0.199788,0.671833
ROCAUC,0.779731,0.633072,0.803681


# CTCF

In [14]:
# Load the top-feature table and the score table for ko_lines[1] ("CTCF").
ctcf_features_df, ctcf_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[1],
    categories=categories,
)

In [21]:
# Show the CTCF top-motif table, one column per category (cell output).
ctcf_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+
1,P53_HUMAN.H11MO.0.A|-,CEBPG_HUMAN.H11MO.0.B|-,ATF4_HUMAN.H11MO.0.A|+
2,P63_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-
3,P63_HUMAN.H11MO.0.A|-,CEBPG_HUMAN.H11MO.0.B|+,ATF2_HUMAN.H11MO.0.B|+
4,ATF4_HUMAN.H11MO.0.A|+,ATF2_HUMAN.H11MO.0.B|+,CEBPG_HUMAN.H11MO.0.B|-
5,P73_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-,ATF4_HUMAN.H11MO.0.A|-
6,P73_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|-
7,ATF4_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|+
8,CEBPG_HUMAN.H11MO.0.B|-,P63_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|-
9,TYY1_HUMAN.H11MO.0.A|+,ATF3_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|+


In [22]:
# Show the CTCF score table, one column per category (cell output).
ctcf_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.058259,0.154977,0.936802
Precision,0.025347,0.103001,0.867614
Recall,0.546461,0.597864,0.830514
Accuracy,0.688942,0.679895,0.848253
F1_score,0.048439,0.174724,0.847536
ROCAUC,0.756005,0.743409,0.923748


# FOXA1

In [15]:
# Load the top-feature table and the score table for ko_lines[2] ("FOXA1").
foxa1_features_df, foxa1_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[2],
    categories=categories,
)

In [23]:
# Show the FOXA1 top-motif table, one column per category (cell output).
foxa1_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|-
1,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-,CREM_HUMAN.H11MO.0.C|+
2,P63_HUMAN.H11MO.0.A|+,FOSL1_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+
3,P63_HUMAN.H11MO.0.A|-,FOSL2_HUMAN.H11MO.0.A|-,TYY1_HUMAN.H11MO.0.A|-
4,ATF4_HUMAN.H11MO.0.A|+,MBD2_HUMAN.H11MO.0.B|+,P63_HUMAN.H11MO.0.A|+
5,ATF4_HUMAN.H11MO.0.A|-,ATF3_HUMAN.H11MO.0.A|-,ZFP42_HUMAN.H11MO.0.A|+
6,P73_HUMAN.H11MO.0.A|+,MECP2_HUMAN.H11MO.0.C|-,SUH_HUMAN.H11MO.0.A|-
7,CEBPG_HUMAN.H11MO.0.B|-,KAISO_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|-
8,CREB1_HUMAN.H11MO.0.A|-,CREB1_HUMAN.H11MO.0.A|-,CREB1_HUMAN.H11MO.0.A|-
9,P73_HUMAN.H11MO.0.A|-,ZFX_HUMAN.H11MO.0.A|-,ZFP42_HUMAN.H11MO.0.A|-


In [24]:
# Show the FOXA1 score table, one column per category (cell output).
foxa1_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.059363,0.374243,0.980916
Precision,0.031999,0.269844,0.954224
Recall,0.566147,0.588537,0.840219
Accuracy,0.68102,0.664412,0.805793
F1_score,0.060558,0.366838,0.892366
ROCAUC,0.738903,0.717628,0.899497


# LEF1

In [16]:
# Load the top-feature table and the score table for ko_lines[3] ("LEF1").
lef1_features_df, lef1_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[3],
    categories=categories,
)

In [25]:
# Show the LEF1 top-motif table, one column per category (cell output).
lef1_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,ZFX_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|-
1,ATF4_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|+
2,ATF4_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|-,P73_HUMAN.H11MO.0.A|-
3,TYY1_HUMAN.H11MO.0.A|-,FOSL2_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|-
4,CREB1_HUMAN.H11MO.0.A|-,ATF4_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|+
5,TYY1_HUMAN.H11MO.0.A|+,JUND_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|+
6,ZFX_HUMAN.H11MO.0.A|+,ERR3_HUMAN.H11MO.0.B|-,ZBT14_HUMAN.H11MO.0.C|-
7,FOSL2_HUMAN.H11MO.0.A|-,STAT1_HUMAN.H11MO.0.A|+,FOSL1_HUMAN.H11MO.0.A|+
8,SRY_HUMAN.H11MO.0.B|+,AP2C_HUMAN.H11MO.0.A|+,FOSL2_HUMAN.H11MO.0.A|+
9,ARI5B_HUMAN.H11MO.0.C|+,ARNT_HUMAN.H11MO.0.B|+,ATF2_HUMAN.H11MO.0.B|-


In [26]:
# Show the LEF1 score table, one column per category (cell output).
lef1_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.04047,0.155028,0.760753
Precision,0.023561,0.078579,0.646093
Recall,0.520179,0.519862,0.745342
Accuracy,0.664416,0.632797,0.770725
F1_score,0.045067,0.136409,0.690784
ROCAUC,0.732671,0.673067,0.855581


# SCRT1

In [18]:
# Load the top-feature table and the score table for ko_lines[4] ("SCRT1").
scrt1_features_df, scrt1_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[4],
    categories=categories,
)


In [27]:
# Show the SCRT1 top-motif table, one column per category (cell output).
scrt1_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+
1,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-
2,P63_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|-,ATF4_HUMAN.H11MO.0.A|-
3,P63_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|+
4,ATF4_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|-
5,P73_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|-,P73_HUMAN.H11MO.0.A|+
6,P73_HUMAN.H11MO.0.A|-,CEBPD_HUMAN.H11MO.0.C|-,P63_HUMAN.H11MO.0.A|-
7,ATF4_HUMAN.H11MO.0.A|-,ZN449_HUMAN.H11MO.0.C|+,CEBPG_HUMAN.H11MO.0.B|+
8,CEBPG_HUMAN.H11MO.0.B|-,ZBTB6_HUMAN.H11MO.0.C|+,E2F5_HUMAN.H11MO.0.B|-
9,CEBPG_HUMAN.H11MO.0.B|+,ZN528_HUMAN.H11MO.0.C|+,ATF4_HUMAN.H11MO.0.A|+


In [28]:
# Show the SCRT1 score table, one column per category (cell output).
scrt1_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.071207,0.191692,0.981112
Precision,0.028874,0.064055,0.952255
Recall,0.568831,0.592113,0.956468
Accuracy,0.698012,0.721455,0.617787
F1_score,0.054943,0.115453,0.954287
ROCAUC,0.768553,0.776798,0.789935


# TCF7L2

In [19]:
# Load the top-feature table and the score table for ko_lines[5] ("TCF7L2").
tcf7l2_features_df, tcf7l2_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[5],
    categories=categories,
)

In [29]:
# Show the TCF7L2 top-motif table, one column per category (cell output).
tcf7l2_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|-,ZFX_HUMAN.H11MO.0.A|+
1,ATF4_HUMAN.H11MO.0.A|+,ATF4_HUMAN.H11MO.0.A|+,SNAI2_HUMAN.H11MO.0.A|+
2,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-,MYCN_HUMAN.H11MO.0.A|+
3,ATF4_HUMAN.H11MO.0.A|-,CREB1_HUMAN.H11MO.0.A|+,SNAI2_HUMAN.H11MO.0.A|-
4,ZFX_HUMAN.H11MO.0.A|-,CEBPG_HUMAN.H11MO.0.B|-,ZIC3_HUMAN.H11MO.0.B|+
5,TYY1_HUMAN.H11MO.0.A|-,CEBPG_HUMAN.H11MO.0.B|+,BHE40_HUMAN.H11MO.0.A|+
6,CREB1_HUMAN.H11MO.0.A|-,ATF2_HUMAN.H11MO.0.B|-,KLF8_HUMAN.H11MO.0.C|-
7,TYY1_HUMAN.H11MO.0.A|+,CREB1_HUMAN.H11MO.0.A|-,ZN768_HUMAN.H11MO.0.C|+
8,ZFP42_HUMAN.H11MO.0.A|-,P63_HUMAN.H11MO.0.A|-,REL_HUMAN.H11MO.0.B|+
9,P63_HUMAN.H11MO.0.A|+,ATF1_HUMAN.H11MO.0.B|+,ATF4_HUMAN.H11MO.0.A|+


In [30]:
# Show the TCF7L2 score table, one column per category (cell output).
tcf7l2_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.052164,0.073462,0.990097
Precision,0.029923,0.062629,0.977569
Recall,0.558496,0.50016,0.875385
Accuracy,0.673213,0.606432,0.661119
F1_score,0.056787,0.111244,0.923264
ROCAUC,0.733372,0.644628,0.790658


# 16p12.1

In [31]:
# Load the top-feature table and the score table for ko_lines[6] ("16P12_1").
cnv_features_df, cnv_scores_df = create_ko_specific_df(
    store_dir=store_dir,
    ko=ko_lines[6],
    categories=categories,
)

In [32]:
# Show the 16p12.1 top-motif table, one column per category (cell output).
cnv_features_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
0,P53_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|+,ZFX_HUMAN.H11MO.0.A|-
1,P53_HUMAN.H11MO.0.A|-,ZFX_HUMAN.H11MO.0.A|+,ZFX_HUMAN.H11MO.0.A|+
2,ATF4_HUMAN.H11MO.0.A|+,P53_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|-
3,ATF4_HUMAN.H11MO.0.A|-,ZFX_HUMAN.H11MO.0.A|-,P53_HUMAN.H11MO.0.A|+
4,TYY1_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|-,P73_HUMAN.H11MO.0.A|+
5,TYY1_HUMAN.H11MO.0.A|-,ZN335_HUMAN.H11MO.0.A|+,P73_HUMAN.H11MO.0.A|-
6,CREB1_HUMAN.H11MO.0.A|-,LHX3_HUMAN.H11MO.0.C|-,MYCN_HUMAN.H11MO.0.A|+
7,P63_HUMAN.H11MO.0.A|-,ELF5_HUMAN.H11MO.0.A|+,P63_HUMAN.H11MO.0.A|-
8,P63_HUMAN.H11MO.0.A|+,TF7L1_HUMAN.H11MO.0.B|+,P63_HUMAN.H11MO.0.A|+
9,ELK4_HUMAN.H11MO.0.A|-,SP1_HUMAN.H11MO.0.A|-,E2F5_HUMAN.H11MO.0.B|-


In [33]:
# Show the 16p12.1 score table, one column per category (cell output).
cnv_scores_df

Unnamed: 0,peaks_vs_notpeaks,responsive_vs_nonresponsive,induced_vs_repressed
APS,0.069576,0.089286,0.858173
Precision,0.030844,0.060217,0.785539
Recall,0.566829,0.570169,0.795309
Accuracy,0.690346,0.687484,0.828563
F1_score,0.058491,0.108627,0.788642
ROCAUC,0.75916,0.752576,0.908111
