In [1]:
import subprocess
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
from IPython.core.display import HTML
import pybedtools
import os

In [2]:
def annotatePeaks_stranded(peaks, motif, species="hg19", hist=10, size=1000):
    
    """
    
    runs homer with standard args
    output location is saved
    
    foreground - str, location of fasta file for the foreground distribution
    background - str, location of fasta file for the background distribution
    k - different k-mers to examine
    outloc - directory to output homer results 

    --make optional make work off locations and not fasta files 
    
    """
    #findMotifs.pl clusters.fa fasta outloc -nofacts p 4 -rna -S 10 -len 5,6,7,8,9 -noconvert -nogo -fasta background.fa
    #converts k to a string for use in subprocess
    #I'm forgetting strand specific information here
    try:
        with open("tmp.txt", 'w') as out:
            subprocess.call(["annotatePeaks.pl",
                             peaks,
                             species,
                             "-m", motif,
                             #"-hist", str(hist),
                             "-size", "given",
                             "-noann"], shell=False, stdout=out)
    except OSError:
        print "Homer not installed, ignoring motif generation, install homer for this to work"  
        raise   
    annotation = pd.read_table("tmp.txt", index_col=0)
    annotation['stranded_peaks'] = annotation.apply(only_stranded_motifs, axis=1)
    return annotation

def only_stranded_motifs(row):
    motifs = row[-1]
    try:
        true_peaks = "),".join([motif for motif in motifs.split("),") if row.Strand in motif])
        if len(true_peaks) == 0:
            return np.nan
        
        return true_peaks
    except AttributeError:
        return motifs
    
def get_strand_of_motif(motifs):
    try:
        return ",".join(list(set(motif.split(",")[1] for motif in motifs.split("),"))))
    except AttributeError:
        return np.nan
    
def bound(row):
    try:
        return row.Strand in row.motif_strands
    except TypeError:
        return False
    
def get_bound_motifs(bedtool, motif_file, motif_name):
    annoted_peaks = annotatePeaks(bedtool.fn, motif_file, "mm9")
    annoted_peaks['motif_strands'] = annoted_peaks['1-{} Distance From Peak(sequence,strand,conservation)'.format(motif_name)].apply(get_strand_of_motif)
    annoted_peaks['motif_present'] = annoted_peaks.apply(bound, axis=1)
    return annoted_peaks

In [3]:
def findMotifsGenome(peaks, motif, species="hg19"):
    try:
        with open("tmp.txt", 'w') as out:
            subprocess.call(["findMotifsGenome.pl",
                             peaks,
                             species,
                             "foo",
                             "-size", "given ",
                             "-rna",
                             "-find", motif,
                            ], shell=False, stdout=out)
    except OSError:
        print "Homer not installed, ignoring motif generation, install homer for this to work"  
        raise   
    return pd.read_table("tmp.txt", index_col=0)  

In [4]:
def findMotifsGenome(peaks, motif, species="hg19"):
    !findMotifsGenome.pl $peaks $species foo -size given -rna -find $motif > tmp.txt
    return pd.read_table("tmp.txt", index_col=0)  

In [5]:
rbfox2_motif = "/home/gpratt/projects/encode/analysis/encode_v8/homer/293XT_CLIP_RBFOX2_1120_RBFOX2.merged.r2.peaks.bed/all/homerResults/motif1.motif"

In [19]:
#dir_name = "/projects/ps-yeolab2/encode/analysis/encode_v9/assigned/"


In [42]:
import glob

In [None]:
v12 = glob.glob("/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/*")
v8 = glob.glob("/home/gpratt/projects/encode/analysis/encode_v8/assigned/*")


In [None]:
v12_peak_sizes = {os.path.basename(item): pybedtools.BedTool(item) for item in v12}
v8_peak_sizes = {os.path.basename(item): pybedtools.BedTool(item) for item in 8}

In [24]:
dir_name = "/home/gpratt/projects/encode/analysis/encode_v8/assigned"
peaks_to_annotate_v8 = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.real.BED"),}

dir_name = "/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/"
peaks_to_annotate_v12 = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.real.BED"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.real.BED"),}


peaks_to_rand_1_annotate = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.rand.1.BED"),}

peaks_to_rand_2_annotate = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.rand.2.BED"),}

peaks_to_rand_3_annotate = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.rand.0.BED"),}

In [36]:
dir_name = "/home/gpratt/projects/encode/analysis/encode_v8/"
peaks_to_annotate_v8 = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed"),}


In [39]:
dir_name = "/projects/ps-yeolab2/encode/analysis/encode_v12/"
peaks_to_annotate_v12 = {"RBFOX2_293XT_1to200_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.bam"),
                     "RBFOX2_293XT_1to100_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to50_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to50_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to25_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to25_gel_RBFOX2.merged.r2.peaks.bed"),
                     "RBFOX2_293XT_1to12_gel_RBFOX2": os.path.join(dir_name, "RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP10_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP8_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP1_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP5_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed"),
                     "EVN_041315_CLIP6_RBFOX2": os.path.join(dir_name, "EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed"),}





In [40]:
for name, fn in sorted(peaks_to_annotate_v12.items(), key=lambda x: x[0]):
    print fn, name, len(pybedtools.BedTool(fn))

/projects/ps-yeolab2/encode/analysis/encode_v12/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP10_RBFOX2 36715
/projects/ps-yeolab2/encode/analysis/encode_v12/EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP1_RBFOX2 29073
/projects/ps-yeolab2/encode/analysis/encode_v12/EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP5_RBFOX2 22687
/projects/ps-yeolab2/encode/analysis/encode_v12/EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP6_RBFOX2 1137
/projects/ps-yeolab2/encode/analysis/encode_v12/EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP8_RBFOX2 22113
/projects/ps-yeolab2/encode/analysis/encode_v12/RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to100_gel_RBFOX2 37600
/projects/ps-yeolab2/encode/analysis/encode_v12/RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to12_gel_RBFOX2 36222
/projects/ps-yeolab2/encode/analysis/encode_v12/RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to200_

In [41]:
for name, fn in sorted(peaks_to_annotate_v8.items(), key=lambda x: x[0]):
    try:
        
        print fn, name, len(pybedtools.BedTool(fn))
    except:
        pass

/home/gpratt/projects/encode/analysis/encode_v8/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP10_RBFOX2 36715
/home/gpratt/projects/encode/analysis/encode_v8/EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP1_RBFOX2 29073
/home/gpratt/projects/encode/analysis/encode_v8/EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP5_RBFOX2 22687
/home/gpratt/projects/encode/analysis/encode_v8/EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP6_RBFOX2 1137
/home/gpratt/projects/encode/analysis/encode_v8/EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed EVN_041315_CLIP8_RBFOX2 22113
/home/gpratt/projects/encode/analysis/encode_v8/RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to100_gel_RBFOX2 37600
/home/gpratt/projects/encode/analysis/encode_v8/RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to12_gel_RBFOX2 36222
/home/gpratt/projects/encode/analysis/encode_v8/RBFOX2_293XT_1to200_gel_RBFOX2.merged.r2.peaks.bed RBFOX2_293XT_1to200_

In [33]:
def bound_in_same_strand(row):
    try:
        motifs = row['1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_0.6(0.999) Distance From Peak(sequence,strand,conservation)'].split("),")
        return any(row.Strand in item.split(",")[1] for item in motifs)
    except:
        return False

In [10]:
# results_annotate = {}
# for peak in peaks_to_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_annotate[peak], rbfox2_motif)
#     results_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_1_annotate = {}
# for peak in peaks_to_rand_1_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_1_annotate[peak], rbfox2_motif)
#     #annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_1_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_2_annotate = {}
# for peak in peaks_to_rand_2_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_2_annotate[peak], rbfox2_motif)
#     annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_2_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
#     #
# results_rand_3_annotate = {}
# for peak in peaks_to_rand_3_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_3_annotate[peak], rbfox2_motif)
#     #annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_3_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}


In [21]:
results_v8 = {}
for peak in peaks_to_annotate_v8:
    annotated_peak = findMotifsGenome(peaks_to_annotate[peak], rbfox2_motif)
    results[peak] = {"peaks_with_motif": len(set(annotated_peak.index)), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
 
results_v12 = {}
for peak in peaks_to_annotate_v12:
    annotated_peak = findMotifsGenome(peaks_to_annotate[peak], rbfox2_motif)
    results[peak] = {"peaks_with_motif": len(set(annotated_peak.index)), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
    
# results_rand_1 = {}
# for peak in peaks_to_rand_1_annotate:
#     annotated_peak = findMotifsGenome(peaks_to_rand_1_annotate[peak], rbfox2_motif)
#     annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_1[peak] = {"peaks_with_motif": len(set(annotated_peak.index)), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_2 = {}
# for peak in peaks_to_rand_2_annotate:
#     annotated_peak = findMotifsGenome(peaks_to_rand_2_annotate[peak], rbfox2_motif)
#     annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_2[peak] = {"peaks_with_motif": len(set(annotated_peak.index)), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_3 = {}
# for peak in peaks_to_rand_3_annotate:
#     annotated_peak = findMotifsGenome(peaks_to_rand_3_annotate[peak], rbfox2_motif)
#     annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_3[peak] = {"peaks_with_motif": len(set(annotated_peak.index)), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}



	Position file = /home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED
	Genome = hg19
	Output Directory = foo
	Using actual sizes of regions (-size given)
	Fragment size set to given
	Operating in RNA mode
	Will find motif(s) in /home/gpratt/projects/encode/analysis/encode_v8/homer/293XT_CLIP_RBFOX2_1120_RBFOX2.merged.r2.peaks.bed/all/homerResults/motif1.motif
	Found mset for "human", will check against vertebrates motifs
	Peak/BED file conversion summary:
		BED/Header formatted lines: 33194
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 33194
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Background fragment size set to 38 (avg size of targets)
	Background files for 38 bp fragments found.

	Extracting sequence

In [22]:
real = pd.DataFrame(results).T

random = pd.concat([pd.DataFrame(results_rand_1).T, 
           pd.DataFrame(results_rand_2).T, 
           pd.DataFrame(results_rand_3).T,])

In [41]:
real.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/real_counts_v12.txt")

In [18]:
#v12
real

Unnamed: 0,num_peaks,peaks_with_motif
EVN_041315_CLIP10_RBFOX2,33194,2178
EVN_041315_CLIP1_RBFOX2,27038,2420
EVN_041315_CLIP5_RBFOX2,20718,4011
EVN_041315_CLIP6_RBFOX2,914,260
EVN_041315_CLIP8_RBFOX2,20495,1165
RBFOX2_293XT_1to100_gel_RBFOX2,34934,1945
RBFOX2_293XT_1to12_gel_RBFOX2,33665,3056
RBFOX2_293XT_1to200_gel_RBFOX2,22370,1023
RBFOX2_293XT_1to25_gel_RBFOX2,27088,1821
RBFOX2_293XT_1to50_gel_RBFOX2,28458,1898


In [23]:
#v8
real

Unnamed: 0,num_peaks,peaks_with_motif
EVN_041315_CLIP10_RBFOX2,33194,2178
EVN_041315_CLIP1_RBFOX2,27038,2420
EVN_041315_CLIP5_RBFOX2,20718,4011
EVN_041315_CLIP6_RBFOX2,914,260
EVN_041315_CLIP8_RBFOX2,20495,1165
RBFOX2_293XT_1to100_gel_RBFOX2,34934,1945
RBFOX2_293XT_1to12_gel_RBFOX2,33665,3056
RBFOX2_293XT_1to200_gel_RBFOX2,22370,1023
RBFOX2_293XT_1to25_gel_RBFOX2,27088,1821
RBFOX2_293XT_1to50_gel_RBFOX2,28458,1898


In [40]:
random.groupby(level=0).sum().to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/random_counts_v12.txt")

In [14]:
random.groupby(level=0).sum()

Unnamed: 0,num_peaks,peaks_with_motif
EVN_041315_CLIP10_RBFOX2,99582,1217
EVN_041315_CLIP1_RBFOX2,81114,782
EVN_041315_CLIP5_RBFOX2,62154,649
EVN_041315_CLIP6_RBFOX2,2742,35
EVN_041315_CLIP8_RBFOX2,61485,583
RBFOX2_293XT_1to100_gel_RBFOX2,104802,1214
RBFOX2_293XT_1to12_gel_RBFOX2,100995,1267
RBFOX2_293XT_1to200_gel_RBFOX2,67110,730
RBFOX2_293XT_1to25_gel_RBFOX2,81264,955
RBFOX2_293XT_1to50_gel_RBFOX2,85374,1099
