# Check notebook is using the correct conda environment

In [1]:
# Report the interpreter that this kernel is actually running.
# `!which python` only shows the first `python` found on the subshell's PATH,
# which is not necessarily the kernel's interpreter, so it cannot confirm
# which conda environment the notebook is using.
import sys

print(sys.executable)

/data5/deepro/miniconda3/envs/starrseq/bin/python


# Import modules

In [2]:
import subprocess
import sys

import pandas as pd

In [3]:
# internal modules
# Make the project's helper modules under ./src/ importable. The path is
# relative, so it is resolved against the kernel's current working directory.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once.
SRC_DIR = "./src/"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [4]:
from create_arguments import create_args
from utils.process_regions import get_intersects, get_non_intersects
from utils.validation_helper import get_tsv, get_cradle_activated, get_mea

# Arguments

In [5]:
# TODO: make the arguments proper
# NOTE(review): the reference paths below are absolute, machine-specific
# locations; consider deriving them from a configurable data directory.

# ---- run flags (forwarded to create_args as keyword arguments) ----
RUN = True
INPUT_FLAG = True
CONTROL_FLAG = True

# ---- knockout selection and reference files ----
# KO_NAME can be 16p12.1, atf2, ctcf, foxa1, lef1, scrt1, tcf7l2
KO_NAME = "atf2"
REFGEN = "/data5/deepro/genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
ROIMASTER = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/enhancer_master_list_hek293.bed"
ROISORTED = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/master.sorted.bed"

# ---- metadata ----
meta_file = "./data/metadata.json"

# Every later cell reads its file paths and flags from `args`.
args = create_args(
    meta_file,
    KO_NAME,
    REFGEN,
    ROIMASTER,
    ROISORTED,
    run=RUN,
    input_flag=INPUT_FLAG,
    control_flag=CONTROL_FLAG,
)

# Peaks - preliminary validation

## Direct active regions
Peaks/regions present in the Control file but not in the KO file must be active when the KO TF is present

In [6]:
# Rebuild the "direct active" peak sets only when the run flag is on.
if args.run:
    # --- STARRPeaker ---
    # Inputs are (control peaks, KO peaks); the result is stored in
    # args.ko_dactive_file. Presumably this keeps control peaks that have no
    # overlapping KO peak -- confirm in utils.process_regions.
    get_non_intersects(
        args.control_peak_file,
        args.ko_peak_file,
        args.ko_dactive_file,
    )

    # --- CRADLE ---
    # Step 1: derive the "activated" peak file from each CRADLE peak file,
    # control first and then KO.
    get_cradle_activated(
        args.cradle_control_peak_file,
        args.cradle_control_activated_file,
    )
    get_cradle_activated(
        args.cradle_ko_peak_file,
        args.cradle_ko_activated_file,
    )
    # Step 2: same control-vs-KO comparison as above, applied to the two
    # activated files produced in step 1.
    get_non_intersects(
        args.cradle_control_activated_file,
        args.cradle_ko_activated_file,
        args.cradle_ko_dactive_file,
    )
    

### Check number of direct active peaks corresponding to the KO'd TF already annotated in the master file

In [11]:
# Annotate the direct-active peak sets against the master region list.
# Skipped entirely when the run flag is off.
if args.run:
    # STARRPeaker direct-active peaks -> args.ko_daannotated_file
    get_intersects(
        args.region_of_interest_master,
        args.ko_dactive_file,
        args.ko_daannotated_file,
    )
    # CRADLE direct-active peaks -> args.cradle_ko_daannotated_file
    get_intersects(
        args.region_of_interest_master,
        args.cradle_ko_dactive_file,
        args.cradle_ko_daannotated_file,
    )

In [16]:
# The master table is read with its header row; the annotated files are read
# without one and borrow the master table's column names.
df_master = pd.read_csv(args.region_of_interest_master, sep="\t")
master_columns = df_master.columns
df_averify = pd.read_csv(args.ko_daannotated_file, sep="\t", header=None, names=master_columns)
df_caverify = pd.read_csv(args.cradle_ko_daannotated_file, sep="\t", header=None, names=master_columns)

# Keep rows whose column for the knocked-out TF equals 1, reduced to the
# coordinate columns plus that TF column.
ko_column = args.ko_name
validated_columns = ["Chrom", "Start", "End", ko_column]
df_validated = df_averify.loc[df_averify[ko_column] == 1, validated_columns]
df_cvalidated = df_caverify.loc[df_caverify[ko_column] == 1, validated_columns]

# Share of annotated direct-active rows that carry the KO'd TF, per peak caller.
starrpeaker_percent = round(len(df_validated) * 100 / len(df_averify), 3)
print(f"{starrpeaker_percent} percent of the starrpeaker direct active peaks are already validated")
cradle_percent = round(len(df_cvalidated) * 100 / len(df_caverify), 3)
print(f"{cradle_percent} percent the cradle direct active peaks are already validated")

33.463 percent of the starrpeaker direct active peaks are already validated
16.605 percent the cradle direct active peaks are already validated


### Look at sum of columns to see which TFs have maximal associations with the activated sites

In [17]:
# Column-wise totals for the STARRPeaker direct-active annotations: drop the
# first four columns, sum each remaining column, and show the 25 largest sums.
# Presumably the remaining columns are per-TF indicator columns -- confirm
# against the master file's header.
(
    df_averify.iloc[:, 4:]
    .sum()
    .sort_values(ascending=False)
    .head(25)
)

eGFP-ZBTB17    509
eGFP-ZNF335    482
eGFP-SP7       469
eGFP-PRDM6     465
eGFP-ZNF629    465
eGFP-PRDM10    461
eGFP-ZNF366    429
eGFP-ZEB2      428
eGFP-FEZF1     426
eGFP-ZNF692    393
eGFP-OSR2      383
eGFP-GLIS1     379
eGFP-IKZF3     375
TRIM28         373
eGFP-ZNF660    367
eGFP-PATZ1     352
eGFP-ZNF600    350
eGFP-ATF2      344
eGFP-ZIC2      333
eGFP-ZXDB      333
eGFP-ZNF189    317
eGFP-ZBTB44    317
eGFP-ZNF843    311
eGFP-ZNF24     306
eGFP-ZFP69B    291
dtype: int64

In [18]:
# Column-wise totals for the CRADLE direct-active annotations: drop the first
# four columns, sum each remaining column, and show the 25 largest sums.
(
    df_caverify.iloc[:, 4:]
    .sum()
    .sort_values(ascending=False)
    .head(25)
)

eGFP-PRDM6     10688
eGFP-FEZF1      8028
eGFP-OSR2       7712
eGFP-ZBTB17     7238
eGFP-SP7        7228
eGFP-PRDM10     6711
eGFP-ZNF629     6565
eGFP-GLIS1      6176
eGFP-ZNF843     6163
eGFP-ZNF335     6052
eGFP-ZNF366     5953
eGFP-ZXDB       5896
eGFP-ZEB2       5835
eGFP-ZNF600     5710
eGFP-IKZF3      5603
eGFP-WT1        5301
PKNOX1          5041
TRIM28          4956
eGFP-ZNF189     4950
eGFP-ZIC2       4864
eGFP-ZBTB44     4581
eGFP-ZNF692     4555
eGFP-PATZ1      4546
eGFP-PRDM1      4517
eGFP-PRDM4      4430
dtype: int64

## Direct inactive regions
Peaks/regions present in the KO file but not in the Control file must be inactive when the KO TF is present

In [19]:
# Gate on args.run, as the other pipeline cells do, so that one flag controls
# every step (this cell previously read the module-level RUN constant).
if args.run:
    # starrpeaker
    # Same call as for the direct-active set but with KO and control swapped;
    # the result is stored in args.ko_dinactive_file.
    get_non_intersects(args.ko_peak_file, args.control_peak_file, args.ko_dinactive_file)
    # cradle
    ## get the non intersects between ko and control activated peaks
    ## (the two *_activated_file inputs are written by the get_cradle_activated
    ## calls in the direct-active cell, so that cell has to run first)
    get_non_intersects(args.cradle_ko_activated_file, args.cradle_control_activated_file, args.cradle_ko_dinactive_file)


### Look at sum of columns to see which TFs have maximal associations with the repressed sites

In [20]:
# repressed
# Gate on args.run, as the other pipeline cells do, so that one flag controls
# every step (this cell previously read the module-level RUN constant).
if args.run:
    # starrpeaker: annotate the direct-inactive peaks against the master list;
    # the result is stored in args.ko_diannotated_file
    get_intersects(args.region_of_interest_master, args.ko_dinactive_file, args.ko_diannotated_file)
    # cradle: same annotation for the CRADLE direct-inactive peaks
    get_intersects(args.region_of_interest_master, args.cradle_ko_dinactive_file, args.cradle_ko_diannotated_file)

In [21]:
# Load the annotated direct-inactive tables. The files are read without a
# header row and take their column names from df_master, which is created in
# the direct-active validation cell.
inactive_read_options = dict(sep="\t", header=None, names=df_master.columns)
df_rverify = pd.read_csv(args.ko_diannotated_file, **inactive_read_options)
df_crverify = pd.read_csv(args.cradle_ko_diannotated_file, **inactive_read_options)

In [23]:
# Column-wise totals for the STARRPeaker direct-inactive annotations: drop the
# first four columns, sum each remaining column, and show the 25 largest sums.
(
    df_rverify.iloc[:, 4:]
    .sum()
    .sort_values(ascending=False)
    .head(25)
)

eGFP-ZNF629    97
eGFP-SP7       93
eGFP-ZBTB17    90
eGFP-ZEB2      79
eGFP-ZNF366    78
eGFP-OSR2      74
eGFP-KLF8      69
eGFP-GLIS1     69
eGFP-PRDM10    68
eGFP-ZNF600    67
eGFP-ZNF335    66
eGFP-ZNF189    65
eGFP-ZNF398    65
eGFP-ZXDB      63
eGFP-FEZF1     62
eGFP-PATZ1     62
eGFP-HIC1      61
eGFP-PRDM6     60
eGFP-ZNF394    59
eGFP-ATF2      58
eGFP-ZNF692    58
eGFP-ZFP69B    57
eGFP-ZNF24     57
eGFP-ZNF639    56
eGFP-INSM2     56
dtype: int64

In [24]:
# Column-wise totals for the CRADLE direct-inactive annotations: drop the
# first four columns, sum each remaining column, and show the 25 largest sums.
(
    df_crverify.iloc[:, 4:]
    .sum()
    .sort_values(ascending=False)
    .head(25)
)

eGFP-PRDM6     1167
eGFP-SP7       1009
eGFP-FEZF1      989
eGFP-ZBTB17     978
eGFP-OSR2       943
eGFP-ZNF629     926
eGFP-PRDM10     924
eGFP-GLIS1      904
eGFP-ZNF366     860
eGFP-ZNF335     848
eGFP-ZEB2       804
eGFP-ZNF600     796
eGFP-ZNF843     782
eGFP-ZXDB       777
eGFP-ZNF189     736
eGFP-ZIC2       735
eGFP-IKZF3      733
eGFP-WT1        702
eGFP-PATZ1      698
eGFP-ZNF692     685
PKNOX1          630
eGFP-ZNF398     613
eGFP-PRDM1      594
eGFP-SCRT2      590
TRIM28          586
dtype: int64

## Comparison with CRADLE

In [30]:
def _count_lines(path):
    """Return the number of lines in the file at ``path``.

    A final line without a trailing newline is counted too, unlike ``wc -l``.
    """
    with open(path, "rb") as handle:
        return sum(1 for _ in handle)


def _count_overlapping(a_file, b_file):
    """Return how many records of ``a_file`` overlap at least one record of ``b_file``.

    Runs ``bedtools intersect -a a_file -b b_file -u`` and counts the lines it
    writes. The command is run without a shell, so paths containing spaces or
    shell metacharacters are passed through unchanged.

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero
            status, rather than silently reporting a count of 0.
        FileNotFoundError: if the ``bedtools`` executable is not on PATH.
    """
    result = subprocess.run(
        ["bedtools", "intersect", "-a", str(a_file), "-b", str(b_file), "-u"],
        stdout=subprocess.PIPE,
        check=True,
    )
    return result.stdout.count(b"\n")


_SEPARATOR = "**************************************"

# One row per comparison: (sample label, peak-set label, STARRPeaker file, CRADLE file).
_COMPARISONS = (
    ("CONTROL", "PEAKS", args.control_peak_file, args.cradle_control_activated_file),
    ("KO", "PEAKS", args.ko_peak_file, args.cradle_ko_activated_file),
    ("KO", "DIRECT ACTIVE PEAKS", args.ko_dactive_file, args.cradle_ko_dactive_file),
    ("KO", "DIRECT INACTIVE PEAKS", args.ko_dinactive_file, args.cradle_ko_dinactive_file),
)

print(_SEPARATOR)
for sample, peak_kind, starrpeaker_file, cradle_file in _COMPARISONS:
    # Number of peaks each caller reported.
    print(f"{sample} {peak_kind} CALLED")
    print("STARRPEAKER")
    print(_count_lines(starrpeaker_file))
    print("CRADLE")
    print(_count_lines(cradle_file))
    # Overlap is counted in both directions because the two counts differ:
    # each one is the number of records from the "-a" side with any overlap.
    print(f"{sample} INTERSECTING {peak_kind}: STARRPEAKER IN CRADLE")
    print(_count_overlapping(starrpeaker_file, cradle_file))
    print(f"{sample} INTERSECTING {peak_kind}: CRADLE IN STARRPEAKER")
    print(_count_overlapping(cradle_file, starrpeaker_file))
    print(_SEPARATOR)

**************************************
CONTROL PEAKS CALLED
STARRPEAKER
1437
CRADLE
27500
CONTROL INTERSECTING PEAKS: STARRPEAKER IN CRADLE
1421
CONTROL INTERSECTING PEAKS: CRADLE IN STARRPEAKER
1625
**************************************
KO PEAKS CALLED
STARRPEAKER
669
CRADLE
11197
KO INTERSECTING PEAKS: STARRPEAKER IN CRADLE
594
KO INTERSECTING PEAKS: CRADLE IN STARRPEAKER
836
**************************************
KO DIRECT ACTIVE PEAKS CALLED
STARRPEAKER
912
CRADLE
20119
KO INTERSECTING DIRECT ACTIVE PEAKS: STARRPEAKER IN CRADLE
364
KO INTERSECTING DIRECT ACTIVE PEAKS: CRADLE IN STARRPEAKER
420
**************************************
KO DIRECT INACTIVE PEAKS CALLED
STARRPEAKER
146
CRADLE
2528
KO INTERSECTING DIRECT INACTIVE PEAKS: STARRPEAKER IN CRADLE
29
KO INTERSECTING DIRECT INACTIVE PEAKS: CRADLE IN STARRPEAKER
39
**************************************


## Motif Enrichment Analysis using HOMER

### Direct Active Regions

In [6]:
# Motif enrichment for the direct-active peak sets; skipped when the run flag
# is off. Both calls share the same reference genome and background file and
# differ only in the peak file and the output directory.
if args.run:
    # STARRPeaker direct-active peaks
    get_mea(
        args.ko_dactive_file,
        args.reference_genome,
        args.region_of_interest_homer_background,
        args.ko_dactive_mea_dir,
    )
    # CRADLE direct-active peaks
    get_mea(
        args.cradle_ko_dactive_file,
        args.reference_genome,
        args.region_of_interest_homer_background,
        args.cradle_ko_dactive_mea_dir,
    )

### Direct Inactive Regions

In [None]:
# Motif enrichment for the direct-inactive peak sets; skipped when the run
# flag is off. Both calls share the same reference genome and background file
# and differ only in the peak file and the output directory.
if args.run:
    # STARRPeaker direct-inactive peaks
    get_mea(
        args.ko_dinactive_file,
        args.reference_genome,
        args.region_of_interest_homer_background,
        args.ko_dinactive_mea_dir,
    )
    # CRADLE direct-inactive peaks
    get_mea(
        args.cradle_ko_dinactive_file,
        args.reference_genome,
        args.region_of_interest_homer_background,
        args.cradle_ko_dinactive_mea_dir,
    )