In [1]:
%matplotlib inline 

import pandas as pd
import functools
import os
import pybedtools
from IPython.core.display import HTML
import itertools
import seaborn as sns
from collections import defaultdict



In [2]:
# Load the tab-delimited table of submitted datasets; later cells read its
# CLIP_rep1, CLIP_rep2, INPUT, uID, RBP and "Cell line" columns.
submitted_datasets = pd.read_csv(
    "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODEclip_20160718/ALLDATASETS_submittedonly.txt",
    sep="\t",
)

In [3]:
def get_gps_peak(fn):
    """Return the expected GEM events BED path for a CLIP file.

    Only the file name of ``fn`` is used: everything before its first "."
    becomes the sample name, and the result is
    ``<gps_peaks>/<sample>/<sample>_outputs/<sample>_2_GEM_events.bed``.
    The path is only constructed here; its existence is not checked.
    """
    sample = os.path.basename(fn).split(".", 1)[0]
    relative_path = "{}/{}_outputs/{}_2_GEM_events.bed".format(*[sample] * 3)
    return os.path.join("/projects/ps-yeolab3/encode/analysis/gps_peaks", relative_path)

In [4]:
# For each replicate, derive the expected GPS peak file from its CLIP file path.
for gps_col, clip_col in (("gps_rep1", "CLIP_rep1"), ("gps_rep2", "CLIP_rep2")):
    submitted_datasets[gps_col] = submitted_datasets[clip_col].apply(get_gps_peak)

In [5]:
def get_input_norm_peaks(fn, rep, run_dir):
    """Build the path of the input-normalized, compressed peak file for one replicate.

    ``fn`` (a dataset identifier) and ``rep`` (a replicate label such as "01")
    are joined as ``<fn>_<rep>``; that id appears twice in the file name,
    which is then placed under ``run_dir``.
    """
    replicate_id = "{}_{}".format(fn, rep)
    peak_file = "{}.basedon_{}.peaks.l2inputnormnew.bed.compressed.bed".format(replicate_id, replicate_id)
    return os.path.join(run_dir, peak_file)

In [6]:
# Add the per-replicate input-normalized peak paths for both pipeline runs:
# "old" (ENCODEclip_20160718) first, then "new" (ENCODE_CLIPperv2_20161017).
for run_label, run_dir in (
    ("old", "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODEclip_20160718"),
    ("new", "/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161017"),
):
    clip_rep1 = functools.partial(get_input_norm_peaks, rep="01", run_dir=run_dir)
    clip_rep2 = functools.partial(get_input_norm_peaks, rep="02", run_dir=run_dir)
    submitted_datasets["{}_clip_rep1".format(run_label)] = submitted_datasets.uID.apply(clip_rep1)
    submitted_datasets["{}_clip_rep2".format(run_label)] = submitted_datasets.uID.apply(clip_rep2)

In [7]:
def sort_and_return_peaks(fn, out_dir="/projects/ps-yeolab3/encode/analysis/gps_peaks"):
    """Return the path of a sorted copy of the BED file ``fn``.

    The sorted copy is stored in ``out_dir`` under the same file name as
    ``fn``. If that file already exists it is reused as-is (it is not
    re-sorted or validated); otherwise ``fn`` is sorted with pybedtools and
    saved there.

    Parameters
    ----------
    fn : str
        Path of the BED file to sort.
    out_dir : str, optional
        Directory holding the sorted copies. Defaults to the location that
        was previously hard-coded.

    Returns
    -------
    str
        Path of the sorted BED file.
    """
    out_fn = os.path.join(out_dir, os.path.basename(fn))
    # Check the cache first so no BedTool is built when there is nothing to do.
    if os.path.exists(out_fn):
        return out_fn
    return pybedtools.BedTool(fn).sort().saveas(out_fn).fn

In [8]:
# Build a table of peak-file paths indexed by (uID, RBP, Cell line).
processed_peaks = submitted_datasets.copy()
processed_peaks.index = pd.MultiIndex.from_tuples([row.values for name, row in processed_peaks[['uID', 'RBP', "Cell line"]].iterrows()])
processed_peaks = processed_peaks.drop(["CLIP_rep1", "CLIP_rep2", "INPUT", "uID", "RBP", "Cell line"], axis=1)

# Check every path on disk once; a row is kept only if all of its files exist.
# Assumes every remaining column holds a file path -- TODO confirm.
all_files_exist = processed_peaks.applymap(os.path.exists).apply(all, axis=1)

# How many datasets (rows) have at least one missing peak file
print(len(processed_peaks[~all_files_exist]))

# Filter out datasets with missing peak files
processed_peaks = processed_peaks[all_files_exist]
# Split "<kind>_<rep>" column names into (kind, rep) so the replicate level
# can be stacked into the row index, leaving one column per peak kind.
processed_peaks.columns = pd.MultiIndex.from_tuples([("_".join(item.split("_")[:-1]), item.split("_")[-1]) for item in processed_peaks.columns])
processed_peaks = processed_peaks.stack()
# Uncomment to debug on a small subset:
#processed_peaks = processed_peaks[:15]
# Replace each GPS peak path by the path of its sorted copy.
processed_peaks['gps'] = processed_peaks['gps'].apply(sort_and_return_peaks)

14


# Does RBFOX2 have the known motif?

In [9]:
# Intervals from the hg19 GCATG motif BED file; intersected with peaks by
# calc_percent_motif_in_peaks.
GCATG_hg19_motif = pybedtools.BedTool("/projects/ps-yeolab3/oolite_backup/gpratt/projects/pipeline_analysis/motifs/hg19/motif_GCATG.BED")

def calc_percent_motif_in_peaks(peaks, motif=None):
    """Count how many peaks overlap at least one motif interval.

    Parameters
    ----------
    peaks : BedTool-like
        Must support ``intersect(other, u=True)`` and ``len()``.
    motif : BedTool-like, optional
        Intervals to test against. Defaults to the module-level
        ``GCATG_hg19_motif``.

    Returns
    -------
    tuple
        ``(n_peaks, n_peaks_with_motif, fraction)``. ``fraction`` is NaN
        when ``peaks`` is empty (previously a ZeroDivisionError).
    """
    if motif is None:
        motif = GCATG_hg19_motif
    # Take each length once instead of twice.
    n_peaks = len(peaks)
    n_with_motif = len(peaks.intersect(motif, u=True))
    if n_peaks == 0:
        return n_peaks, n_with_motif, float("nan")
    return n_peaks, n_with_motif, n_with_motif / (n_peaks * 1.0)

In [10]:
# Select the RBFOX2 rows; level 1 of the row index is the RBP name
# (the index was built from uID, RBP, Cell line).
rbfox2 = processed_peaks.xs("RBFOX2", level=1)

In [11]:
# NOTE(review): this cell used to evaluate `bedtool.fn`, but no module-level
# `bedtool` exists at this point in a top-to-bottom run, so it raised
# NameError and aborted "Run All". The stray debugging expression was removed.

In [None]:
# For each RBFOX2 GPS peak file, print (n_peaks, n_peaks_with_motif, fraction).
# The parenthesised print works on both Python 2 and Python 3.
for fn in rbfox2.gps:
    bedtool = pybedtools.BedTool(fn)
    print(calc_percent_motif_in_peaks(bedtool))

# Does SLBP bind histone genes?

In [None]:
# Select the SLBP rows; level 1 of the row index is the RBP name.
slbp = processed_peaks.xs("SLBP", level=1)

In [None]:
# Headerless gene list; column 0 holds the names matched against the `name`
# field of the gene BED below (presumably histone genes -- TODO confirm).
hist_genes = pd.read_table(
    "/home/elvannostrand/data/clip/CLIPseq_analysis/RNA_type_analysis/genelists.HISTall",
    header=None,
)
genes = pybedtools.BedTool("/home/gpratt/clipper/clipper/data/regions/hg19_v19_genes.bed")
# Build the lookup set once instead of re-reading and scanning the column for
# every interval in the gene BED.
hist_gene_names = set(hist_genes[0])
hist_genes_bed = genes.filter(lambda x: x.name in hist_gene_names).saveas()

def calc_percent_in_hist(peaks, regions=None):
    """Count how many peaks overlap at least one region of the gene-list BED.

    Parameters
    ----------
    peaks : BedTool-like
        Must support ``intersect(other, u=True)`` and ``len()``.
    regions : BedTool-like, optional
        Intervals to test against. Defaults to the module-level
        ``hist_genes_bed``.

    Returns
    -------
    tuple
        ``(n_peaks, n_overlapping_peaks, fraction)``. ``fraction`` is NaN
        when ``peaks`` is empty (previously a ZeroDivisionError).
    """
    if regions is None:
        regions = hist_genes_bed
    # Take each length once instead of twice.
    n_peaks = len(peaks)
    n_overlapping = len(peaks.intersect(regions, u=True))
    if n_peaks == 0:
        return n_peaks, n_overlapping, float("nan")
    return n_peaks, n_overlapping, n_overlapping / (n_peaks * 1.0)

In [None]:
# For each SLBP GPS peak file, print (n_peaks, n_overlapping_peaks, fraction).
# The parenthesised print works on both Python 2 and Python 3.
for fn in slbp.gps:
    bedtool = pybedtools.BedTool(fn)
    print(calc_percent_in_hist(bedtool))

# Do FXR1 and FXR2 bind only CDS?

In [None]:
from clipper.src import CLIP_analysis

In [None]:
# Select the FXR1 rows; level 1 of the row index is the RBP name.
fxr1 = processed_peaks.xs("FXR1", level=1)

In [None]:
# For each FXR1 GPS peak file, assign peaks to regions and print, per key of
# the result, the number of 'real' entries (keys are presumably region names
# -- TODO confirm against CLIP_analysis.assign_to_regions).
for fn in fxr1.gps:

    result = CLIP_analysis.assign_to_regions(pybedtools.BedTool(fn), assigned_dir=".", species="hg19", nrand=0)

    for x in result:
        # Formatting into one string keeps the "key count" output on both Python 2 and 3.
        print("{} {}".format(x, len(result[x]['real'])))

In [None]:
# Select the FMR1 rows; level 1 of the row index is the RBP name.
fmr1 = processed_peaks.xs("FMR1", level=1)

# For each FMR1 GPS peak file, assign peaks to regions and print, per key of
# the result, the number of 'real' entries (keys are presumably region names
# -- TODO confirm against CLIP_analysis.assign_to_regions).
for fn in fmr1.gps:

    result = CLIP_analysis.assign_to_regions(pybedtools.BedTool(fn), assigned_dir=".", species="hg19", nrand=0)

    for x in result:
        # Formatting into one string keeps the "key count" output on both Python 2 and 3.
        print("{} {}".format(x, len(result[x]['real'])))

In [None]:
# Select the FXR2 rows; level 1 of the row index is the RBP name.
fxr2 = processed_peaks.xs("FXR2", level=1)

# For each FXR2 GPS peak file, assign peaks to regions and print, per key of
# the result, the number of 'real' entries (keys are presumably region names
# -- TODO confirm against CLIP_analysis.assign_to_regions).
for fn in fxr2.gps:

    result = CLIP_analysis.assign_to_regions(pybedtools.BedTool(fn), assigned_dir=".", species="hg19", nrand=0)

    for x in result:
        # Formatting into one string keeps the "key count" output on both Python 2 and 3.
        print("{} {}".format(x, len(result[x]['real'])))

# Do the splicing factors overlap the correct 3' or 5' splice site?

In [None]:
# Region sets intersected with peaks by calc_percent_5_prime_overlap and
# calc_percent_3_prime_overlap. File names indicate hg19 v19 five-prime and
# three-prime ends -- exact definition of the regions: TODO confirm.
five_prime_ends = pybedtools.BedTool("/home/gpratt/clipper/clipper/data/regions/hg19_v19_five_prime_ends.bed")
three_prime_ends = pybedtools.BedTool("/home/gpratt/clipper/clipper/data/regions/hg19_v19_three_prime_ends.bed")

def _overlap_stats(peaks, regions):
    """Return (n_peaks, n_overlapping, fraction) for ``peaks`` against ``regions``.

    ``fraction`` is NaN when ``peaks`` is empty, instead of raising
    ZeroDivisionError.
    """
    n_peaks = len(peaks)
    n_overlapping = len(peaks.intersect(regions, u=True))
    if n_peaks == 0:
        return n_peaks, n_overlapping, float("nan")
    return n_peaks, n_overlapping, n_overlapping / (n_peaks * 1.0)


def calc_percent_3_prime_overlap(peaks, regions=None):
    """Count how many peaks overlap at least one three-prime-end region.

    Parameters
    ----------
    peaks : BedTool-like
        Must support ``intersect(other, u=True)`` and ``len()``.
    regions : BedTool-like, optional
        Intervals to test against. Defaults to the module-level
        ``three_prime_ends``.

    Returns
    -------
    tuple
        ``(n_peaks, n_overlapping_peaks, fraction)``; ``fraction`` is NaN
        for an empty peak set.
    """
    if regions is None:
        regions = three_prime_ends
    return _overlap_stats(peaks, regions)


def calc_percent_5_prime_overlap(peaks, regions=None):
    """Count how many peaks overlap at least one five-prime-end region.

    Parameters
    ----------
    peaks : BedTool-like
        Must support ``intersect(other, u=True)`` and ``len()``.
    regions : BedTool-like, optional
        Intervals to test against. Defaults to the module-level
        ``five_prime_ends``.

    Returns
    -------
    tuple
        ``(n_peaks, n_overlapping_peaks, fraction)``; ``fraction`` is NaN
        for an empty peak set.
    """
    if regions is None:
        regions = five_prime_ends
    return _overlap_stats(peaks, regions)

In [None]:
# Select the U2AF2 rows; level 1 of the row index is the RBP name.
# NOTE(review): the following cell repeats this exact assignment before its
# loop, so this cell is redundant.
u2af2 = processed_peaks.xs("U2AF2", level=1)


In [None]:
# Select the U2AF2 rows; level 1 of the row index is the RBP name.
u2af2 = processed_peaks.xs("U2AF2", level=1)

# For each U2AF2 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in u2af2.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# Select the U2AF1 rows; level 1 of the row index is the RBP name.
u2af1 = processed_peaks.xs("U2AF1", level=1)

# For each U2AF1 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in u2af1.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# Splice-end overlap for RBFOX2 (presumably a comparison against the splicing
# factors above -- TODO confirm).
# NOTE(review): this cell used to start by re-slicing U2AF1 into `u2af1`, a
# copy-paste leftover the loop never used; `u2af1` is still defined by the
# preceding cell.
for fn in rbfox2.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# For each FXR1 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in fxr1.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# Select the PRPF8 rows; level 1 of the row index is the RBP name.
prpf8 = processed_peaks.xs("PRPF8", level=1)

# For each PRPF8 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in prpf8.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# Select the RBM22 rows; level 1 of the row index is the RBP name.
rbm22 = processed_peaks.xs("RBM22", level=1)

# For each RBM22 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in rbm22.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# Select the RBM5 rows; level 1 of the row index is the RBP name.
rbm5 = processed_peaks.xs("RBM5", level=1)

# For each RBM5 GPS peak file, print (n_peaks, n_overlapping, fraction)
# against the three-prime and five-prime end regions.
for fn in rbm5.gps:
    bedtool = pybedtools.BedTool(fn)
    # Formatting into one string keeps the "label (tuple)" output on both Python 2 and 3.
    print("3 prime {}".format(calc_percent_3_prime_overlap(bedtool)))
    print("5 prime {}".format(calc_percent_5_prime_overlap(bedtool)))

In [None]:
# NOTE(review): leftover scratch cell; kept, but written so it also runs on Python 3.
print("foo")