In [1]:
import os
import shutil

import pandas as pd
import functools
import pybedtools

In [2]:
submitted_datasets = pd.read_table("/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161120/ALLDATASETS_submittedonly.txt")

In [3]:
# submitted_datasets.CLIP_rep1 = submitted_datasets.CLIP_rep1.apply(lambda x: x.replace("encode_v12", "encode_master"))
# submitted_datasets.CLIP_rep1 = submitted_datasets.CLIP_rep1.apply(lambda x: x.replace("encode_v13", "encode_master"))

# submitted_datasets.CLIP_rep2 = submitted_datasets.CLIP_rep2.apply(lambda x: x.replace("encode_v12", "encode_master"))
# submitted_datasets.CLIP_rep2 = submitted_datasets.CLIP_rep2.apply(lambda x: x.replace("encode_v13", "encode_master"))

# submitted_datasets.INPUT = submitted_datasets.INPUT.apply(lambda x: x.replace("encode_v12", "encode_master"))
# submitted_datasets.INPUT = submitted_datasets.INPUT.apply(lambda x: x.replace("encode_v13", "encode_master"))

In [4]:
narrow_peak_dir = "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161120/for_encode_submission"
high_confidence_peaks = "/home/gpratt/projects/encode/analysis/encode_high_confidence_peaks"
medium_confidence_peaks = "/home/gpratt/projects/encode/analysis/encode_meduim_confidence_peaks"

In [5]:
def get_peak_file(uID, rep):
    encode_id = "_".join([uID, rep])
    peak_file = os.path.join(narrow_peak_dir, "{0}.basedon_{0}.peaks.l2inputnormnew.bed.compressed.bed.narrowPeak.encode.bed.gz").format(encode_id)
    return peak_file

def stringent_filter_peak(row, from_file, rep):
    bedtool = pybedtools.BedTool(row[from_file])
    filtered_bed_file = os.path.join(high_confidence_peaks, os.path.basename(bedtool.fn))
    filtered_bed_file = os.path.splitext(filtered_bed_file)[0]
    bedtool.filter(lambda x: int(x[4]) == 1000).sort().saveas(filtered_bed_file)
    
    bb_file = "{}_{}_{}_uID{}.bb".format(row.RBP, row['Cell line'], rep, row.uID)
    bb_file = os.path.join(high_confidence_peaks, bb_file)
    !bedToBigBed -type=bed6+4 $filtered_bed_file /projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes $bb_file
    return bb_file

def medium_filter_peak(row, from_file, rep):
    bedtool = pybedtools.BedTool(row[from_file])
    filtered_bed_file = os.path.join(medium_confidence_peaks, os.path.basename(bedtool.fn))
    filtered_bed_file = os.path.splitext(filtered_bed_file)[0]
    bedtool.filter(lambda x: float(x[6]) > 0).sort().saveas(filtered_bed_file)
    
    bb_file = "{}_{}_{}_uID{}.bb".format(row.RBP, row['Cell line'], rep, row.uID)
    bb_file = os.path.join(medium_confidence_peaks, bb_file)

    !bedToBigBed -type=bed6+4 $filtered_bed_file /projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes $bb_file
    return bb_file

def copy_bw_file(row, out_dir, from_file, rep, strand):
    out_file = "{}_{}_{}_uID{}.{}.bw".format(row.RBP, row['Cell line'], rep, row.uID, strand)
    out_file = os.path.join(out_dir, out_file)
    shutil.copy(row[from_file], out_file)

In [6]:
rep1_func = functools.partial(get_peak_file, rep="01")
rep2_func = functools.partial(get_peak_file, rep="02")

high_confidence_bw = functools.partial(copy_bw_file, out_dir=high_confidence_peaks)
medium_confidence_bw = functools.partial(copy_bw_file, out_dir=medium_confidence_peaks)

bw_high_pos_rep1 = functools.partial(high_confidence_bw, rep="Rep1", strand="pos", from_file="CLIP_rep1_bw_pos")
bw_high_neg_rep1 = functools.partial(high_confidence_bw, rep="Rep1", strand="neg", from_file="CLIP_rep1_bw_neg")

bw_high_pos_rep2 = functools.partial(high_confidence_bw, rep="Rep2", strand="pos", from_file="CLIP_rep2_bw_pos")
bw_high_neg_rep2 = functools.partial(high_confidence_bw, rep="Rep2", strand="neg", from_file="CLIP_rep2_bw_neg")

bw_medium_pos_rep1 = functools.partial(medium_confidence_bw, rep="Rep1", strand="pos", from_file="CLIP_rep1_bw_pos")
bw_medium_neg_rep1 = functools.partial(medium_confidence_bw, rep="Rep1", strand="neg", from_file="CLIP_rep1_bw_neg")

bw_medium_pos_rep2 = functools.partial(medium_confidence_bw, rep="Rep2", strand="pos", from_file="CLIP_rep2_bw_pos")
bw_medium_neg_rep2 = functools.partial(medium_confidence_bw, rep="Rep2", strand="neg", from_file="CLIP_rep2_bw_neg")

bw_high_INPUT_pos = functools.partial(high_confidence_bw, rep="Input", strand="pos", from_file="INPUT_bw_pos")
bw_high_INPUT_neg = functools.partial(high_confidence_bw, rep="Input", strand="neg", from_file="INPUT_bw_neg")

bw_medium_INPUT_pos = functools.partial(medium_confidence_bw, rep="Input", strand="pos", from_file="INPUT_bw_pos")
bw_medium_INPUT_neg = functools.partial(medium_confidence_bw, rep="Input", strand="neg", from_file="INPUT_bw_neg")

In [7]:
submitted_datasets["CLIP_rep1_bw_pos"] = submitted_datasets.CLIP_rep1.apply(lambda x: x.replace(".bam", ".norm.pos.bw"))
submitted_datasets["CLIP_rep1_bw_neg"] = submitted_datasets.CLIP_rep1.apply(lambda x: x.replace(".bam", ".norm.neg.bw"))

submitted_datasets.apply(bw_high_pos_rep1, axis=1)
submitted_datasets.apply(bw_high_neg_rep1, axis=1)
submitted_datasets.apply(bw_medium_pos_rep1, axis=1)
submitted_datasets.apply(bw_medium_neg_rep1, axis=1)

submitted_datasets["CLIP_rep2_bw_pos"] = submitted_datasets.CLIP_rep2.apply(lambda x: x.replace(".bam", ".norm.pos.bw"))
submitted_datasets["CLIP_rep2_bw_neg"] = submitted_datasets.CLIP_rep2.apply(lambda x: x.replace(".bam", ".norm.neg.bw"))

submitted_datasets.apply(bw_high_pos_rep2, axis=1)
submitted_datasets.apply(bw_high_neg_rep2, axis=1)
submitted_datasets.apply(bw_medium_pos_rep2, axis=1)
submitted_datasets.apply(bw_medium_neg_rep2, axis=1)

submitted_datasets["INPUT_bw_pos"] = submitted_datasets.INPUT.apply(lambda x: x.replace(".bam", ".norm.pos.bw"))
submitted_datasets["INPUT_bw_neg"] = submitted_datasets.INPUT.apply(lambda x: x.replace(".bam", ".norm.neg.bw"))

submitted_datasets.apply(bw_high_INPUT_pos, axis=1)
submitted_datasets.apply(bw_high_INPUT_neg, axis=1)
submitted_datasets.apply(bw_medium_INPUT_pos, axis=1)
submitted_datasets.apply(bw_medium_INPUT_neg, axis=1)

0      None
1      None
2      None
3      None
4      None
5      None
6      None
7      None
8      None
9      None
10     None
11     None
12     None
13     None
14     None
15     None
16     None
17     None
18     None
19     None
20     None
21     None
22     None
23     None
24     None
25     None
26     None
27     None
28     None
29     None
       ... 
151    None
152    None
153    None
154    None
155    None
156    None
157    None
158    None
159    None
160    None
161    None
162    None
163    None
164    None
165    None
166    None
167    None
168    None
169    None
170    None
171    None
172    None
173    None
174    None
175    None
176    None
177    None
178    None
179    None
180    None
dtype: object

In [8]:
submitted_datasets['CLIP_rep1_peak'] = submitted_datasets.uID.apply(rep1_func)
submitted_datasets['CLIP_rep2_peak'] = submitted_datasets.uID.apply(rep2_func)

stringent_filter_peak_rep1 = functools.partial(stringent_filter_peak, from_file="CLIP_rep1_peak", rep="Rep1")
stringent_filter_peak_rep2 = functools.partial(stringent_filter_peak, from_file="CLIP_rep2_peak", rep="Rep2")

submitted_datasets['CLIP_rep1_peak_bb'] = submitted_datasets.apply(stringent_filter_peak_rep1, axis=1)
submitted_datasets['CLIP_rep2_peak_bb'] = submitted_datasets.apply(stringent_filter_peak_rep2, axis=1)

medium_filter_peak_rep1 = functools.partial(medium_filter_peak, from_file="CLIP_rep1_peak", rep="Rep1")
medium_filter_peak_rep2 = functools.partial(medium_filter_peak, from_file="CLIP_rep2_peak", rep="Rep2")

submitted_datasets['CLIP_rep1_peak_bb'] = submitted_datasets.apply(medium_filter_peak_rep1, axis=1)
submitted_datasets['CLIP_rep2_peak_bb'] = submitted_datasets.apply(medium_filter_peak_rep2, axis=1)

pass1 - making usageList (24 chroms): 4 millis
pass2 - checking and writing primary data (5547 records, 10 fields): 28 millis
pass1 - making usageList (24 chroms): 4 millis
pass2 - checking and writing primary data (10221 records, 10 fields): 62 millis
pass1 - making usageList (24 chroms): 6 millis
pass2 - checking and writing primary data (5316 records, 10 fields): 27 millis
pass1 - making usageList (25 chroms): 3 millis
pass2 - checking and writing primary data (7661 records, 10 fields): 37 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (1772 records, 10 fields): 10 millis
pass1 - making usageList (24 chroms): 4 millis
pass2 - checking and writing primary data (3002 records, 10 fields): 14 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (1735 records, 10 fields): 10 millis
pass1 - making usageList (24 chroms): 1 millis
pass2 - checking and writing primary data (1774 records, 10 fields): 1

In [None]:
!make_trackhubs.py --genome hg19 --hub 20170109_encode_stringent_master_peaks /home/gpratt/projects/encode/analysis/encode_high_confidence_peaks/*bb --num_sep 1
!make_trackhubs.py --genome hg19 --hub 20170109_encode_stringent_master_wigs /home/gpratt/projects/encode/analysis/encode_high_confidence_peaks/*bw --num_sep 1

In [None]:
!make_trackhubs.py --genome hg19 --hub 20170109_encode_lenient_master_peaks /home/gpratt/projects/encode/analysis/encode_meduim_confidence_peaks/*bb --num_sep 1
!make_trackhubs.py --genome hg19 --hub 20170109_encode_lenient_master_wigs /home/gpratt/projects/encode/analysis/encode_meduim_confidence_peaks/*bw --num_sep 1