In [2]:
import subprocess
import pandas as pd
import numpy as np
from scipy.stats import hypergeom
from IPython.core.display import HTML
import pybedtools
import os
import glob
from IPython.core.display import HTML
from Bio import SeqIO

In [3]:
def annotatePeaks_stranded(peaks, motif, species="hg19", hist=10, size=1000):
    """
    Annotate peaks with HOMER motif hits and add a strand-filtered hit column.

    Runs ``annotatePeaks.pl <peaks> <species> -m <motif> -size given -noann``,
    captures its stdout in ``tmp.txt`` (current working directory, overwritten
    on every call) and parses that file as a tab-delimited table.

    peaks - str, location of the peak file handed to annotatePeaks.pl
    motif - str, location of the HOMER motif file passed to -m
    species - str, genome name handed to annotatePeaks.pl
    hist - accepted for backward compatibility, currently unused
           (the -hist option is commented out below)
    size - accepted for backward compatibility, currently unused
           ("-size given" is always passed)

    returns a DataFrame indexed by the first output column, with an extra
    'stranded_peaks' column computed row by row with only_stranded_motifs

    raises OSError if annotatePeaks.pl cannot be executed
    """
    command = ["annotatePeaks.pl",
               peaks,
               species,
               "-m", motif,
               #"-hist", str(hist),
               "-size", "given",
               "-noann"]
    try:
        with open("tmp.txt", 'w') as out:
            # The exit status is not inspected; a failed run shows up when
            # tmp.txt is parsed below.
            subprocess.call(command, shell=False, stdout=out)
    except OSError:
        print("Homer not installed, ignoring motif generation, install homer for this to work")
        raise
    annotation = pd.read_table("tmp.txt", index_col=0)
    # The motif column must still be the last column when the rows are
    # filtered, so the new column is added only after apply() has run.
    annotation['stranded_peaks'] = annotation.apply(only_stranded_motifs, axis=1)
    return annotation

def only_stranded_motifs(row):
    """
    Keep only the motif hits that lie on the same strand as the peak.

    row - one row of the annotation table; it must expose a ``Strand`` field
          and carry the motif-hit string in its last column. Hits are expected
          in the "distance(sequence,strand,conservation)" form, separated by
          commas.

    returns the matching hits re-joined with "),", np.nan when no hit matches,
    or the last-column value unchanged when it is not a string (e.g. NaN).
    Hits are re-joined exactly as they were split, so a result that does not
    end with the final hit of the input has no closing parenthesis.
    """
    # Explicit positional access; plain row[-1] depends on pandas falling back
    # from label lookup to position.
    motifs = row.iloc[-1]
    try:
        same_strand = []
        for hit in motifs.split("),"):
            fields = hit.split(",")
            # Compare the strand field itself: a substring test over the whole
            # hit also matches the "-" sign of a negative distance.
            if len(fields) > 1 and fields[1] == row.Strand:
                same_strand.append(hit)
        true_peaks = "),".join(same_strand)
        if len(true_peaks) == 0:
            return np.nan

        return true_peaks
    except AttributeError:
        return motifs
    
def get_strand_of_motif(motifs):
    """
    Summarise which strands the motif hits of one peak fall on.

    motifs - str of hits in the "distance(sequence,strand,conservation)" form,
             separated by commas; anything without a ``split`` method (e.g.
             NaN) is treated as "no hits"

    returns the distinct strand fields joined with "," in sorted order (so the
    result is the same on every run), or np.nan when motifs is not a string
    """
    try:
        strands = set(hit.split(",")[1] for hit in motifs.split("),"))
    except AttributeError:
        return np.nan
    # Sets have no defined order; sort so the joined string is reproducible.
    return ",".join(sorted(strands))
    
def bound(row):
    """
    Tell whether the peak's own strand appears among its motif strands.

    row - object exposing ``Strand`` and ``motif_strands``

    returns the result of ``Strand in motif_strands``, or False when that
    membership test raises TypeError (e.g. motif_strands is NaN)
    """
    try:
        is_bound = row.Strand in row.motif_strands
    except TypeError:
        is_bound = False
    return is_bound
    
def get_bound_motifs(bedtool, motif_file, motif_name):
    """
    Annotate a peak set and flag the peaks that carry a same-strand motif.

    bedtool - object whose ``fn`` attribute is the path of the peak file
    motif_file - motif file handed to annotatePeaks
    motif_name - motif name used to build the
                 '1-<name> Distance From Peak(sequence,strand,conservation)'
                 column label

    returns the annotated table with two added columns: 'motif_strands'
    (get_strand_of_motif applied to the distance column) and 'motif_present'
    (bound applied to every row)

    NOTE(review): annotatePeaks is not defined in any cell shown here (only
    annotatePeaks_stranded is) and the genome is fixed to "mm9" - confirm both.
    """
    peaks_table = annotatePeaks(bedtool.fn, motif_file, "mm9")
    distance_column = '1-{} Distance From Peak(sequence,strand,conservation)'.format(motif_name)
    peaks_table['motif_strands'] = peaks_table[distance_column].apply(get_strand_of_motif)
    peaks_table['motif_present'] = peaks_table.apply(bound, axis=1)
    return peaks_table

In [4]:
def findMotifsGenome(peaks, motif, species="hg19"):
    """
    Locate instances of a known motif inside a set of peaks with HOMER.

    Runs ``findMotifsGenome.pl <peaks> <species> foo -size given -rna -find
    <motif>``, captures its stdout in ``tmp.txt`` (current working directory,
    overwritten on every call) and parses that file as a tab-delimited table.
    "foo" is the HOMER output directory.

    peaks - str, location of the peak file
    motif - str, location of the HOMER motif file passed to -find
    species - str, genome name handed to findMotifsGenome.pl

    returns a DataFrame indexed by the first output column

    raises OSError if findMotifsGenome.pl cannot be executed
    """
    command = ["findMotifsGenome.pl",
               peaks,
               species,
               "foo",
               # No trailing space: with shell=False the argument reaches
               # HOMER verbatim, and "given " is not the keyword "given".
               "-size", "given",
               "-rna",
               "-find", motif]
    try:
        with open("tmp.txt", 'w') as out:
            # The exit status is not inspected; a failed run shows up when
            # tmp.txt is parsed below.
            subprocess.call(command, shell=False, stdout=out)
    except OSError:
        print("Homer not installed, ignoring motif generation, install homer for this to work")
        raise
    return pd.read_table("tmp.txt", index_col=0)

In [5]:
def findMotifsGenome(peaks, motif, species="hg19"):
    """
    Shell-escape variant of findMotifsGenome.

    When the cells are run in order this definition replaces the
    subprocess-based findMotifsGenome from the previous cell. It only works
    inside IPython/Jupyter because of the ``!`` line.

    peaks - str, location of the peak file
    motif - str, location of the HOMER motif file passed to -find
    species - str, genome name handed to findMotifsGenome.pl

    returns a DataFrame read from tmp.txt, indexed by the first column
    """
    # IPython substitutes $peaks, $species and $motif with the Python values
    # and hands the line to the shell; stdout is redirected to tmp.txt.
    !findMotifsGenome.pl $peaks $species foo -size given -rna -find $motif > tmp.txt
    return pd.read_table("tmp.txt", index_col=0)  

In [6]:
# HOMER motif file that is searched for in every peak set below
# (motif1 of the homerResults for the 293XT RBFOX2 CLIP peaks, judging by the path).
rbfox2_motif = "/home/gpratt/projects/encode/analysis/encode_v8/homer/293XT_CLIP_RBFOX2_1120_RBFOX2.merged.r2.peaks.bed/all/homerResults/motif1.motif"

In [7]:
#dir_name = "/projects/ps-yeolab2/encode/analysis/encode_v9/assigned/"


In [8]:
# v12 = glob.glob("/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/*")
# v8 = [item.replace("/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/", 
#                    "/home/gpratt/projects/encode/analysis/encode_v8/assigned/") for item in v12]


In [9]:
# v12_peak_sizes = {}
# for item in v12:
#     try:
#         if "all" in item and "real" in item:
#             v12_peak_sizes[os.path.basename(item)] = len(pybedtools.BedTool(item)) 
#     except:
#         pass
    
# v8_peak_sizes = {}
# for item in v8:
#     try:
#         if "all" in item and "real" in item:
#             v8_peak_sizes[os.path.basename(item)] = len(pybedtools.BedTool(item))
#     except:
#         pass

In [10]:
# foo = pd.concat({"v12": pd.Series(v12_peak_sizes),
#                  "v8": pd.Series(v8_peak_sizes)}).unstack().T

In [11]:
# foo = foo[["all" in item for item in foo.index]]
# foo = foo[["real" in item for item in foo.index]]

In [12]:
# foo['same'] = foo.v12 == foo.v8

In [13]:
# HTML(foo.to_html())

In [14]:
# for item in foo.index:
#     !diff /projects/ps-yeolab2/encode/analysis/encode_v12/assigned/$item /home/gpratt/projects/encode/analysis/encode_v8/assigned/$item

In [41]:
# Sample names shared by every peak-file dictionary below.
_RBFOX2_SAMPLES = (
    "RBFOX2_293XT_1to200_gel_RBFOX2",
    "RBFOX2_293XT_1to100_gel_RBFOX2",
    "RBFOX2_293XT_1to50_gel_RBFOX2",
    "RBFOX2_293XT_1to25_gel_RBFOX2",
    "RBFOX2_293XT_1to12_gel_RBFOX2",
    "EVN_041315_CLIP10_RBFOX2",
    "EVN_041315_CLIP8_RBFOX2",
    "EVN_041315_CLIP1_RBFOX2",
    "EVN_041315_CLIP5_RBFOX2",
    "EVN_041315_CLIP6_RBFOX2",
)


def _peak_paths(directory, kind):
    """Map each sample name to <directory>/<sample>.merged.r2.peaks.bed.all.<kind>.BED."""
    return {sample: os.path.join(directory, "{}.merged.r2.peaks.bed.all.{}.BED".format(sample, kind))
            for sample in _RBFOX2_SAMPLES}


# Real peaks from the encode_v8 run. peaks_to_annotate_v8 is the explicit name
# used by the v8 listing cell; it is a copy so the two can be edited separately.
dir_name = "/home/gpratt/projects/encode/analysis/encode_v8/assigned"
peaks_to_annotate = _peak_paths(dir_name, "real")
peaks_to_annotate_v8 = dict(peaks_to_annotate)

# Real peaks from the encode_v12 run. To analyse these instead of the v8
# files, rebind peaks_to_annotate = peaks_to_annotate_v12.
dir_name = "/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/"
peaks_to_annotate_v12 = _peak_paths(dir_name, "real")

# Random peak sets. dir_name points at encode_v12 here, so these come from the
# v12 directory while peaks_to_annotate comes from the v8 directory.
peaks_to_rand_1_annotate = _peak_paths(dir_name, "rand.1")
peaks_to_rand_2_annotate = _peak_paths(dir_name, "rand.2")
# The third random set is read from the rand.0 files.
peaks_to_rand_3_annotate = _peak_paths(dir_name, "rand.0")

In [42]:
# List every encode_v12 real-peak file with its sample name and number of
# intervals, ordered by sample name. The single-argument print call produces
# the same line under Python 2 and Python 3.
for name in sorted(peaks_to_annotate_v12):
    fn = peaks_to_annotate_v12[name]
    print("{} {} {}".format(fn, name, len(pybedtools.BedTool(fn))))

/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP10_RBFOX2 33194
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP1_RBFOX2 27038
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP5_RBFOX2 20718
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP6_RBFOX2 914
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP8_RBFOX2 20495
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED RBFOX2_293XT_1to100_gel_RBFOX2 34934
/projects/ps-yeolab2/encode/analysis/encode_v12/assigned/RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED RB

In [43]:
# List every encode_v8 real-peak file with its sample name and number of
# intervals, ordered by sample name. peaks_to_annotate_v8 must have been
# defined by an earlier cell.
for name in sorted(peaks_to_annotate_v8):
    fn = peaks_to_annotate_v8[name]
    try:
        n_peaks = len(pybedtools.BedTool(fn))
    except Exception as error:
        # Still best-effort (a missing or unreadable file does not stop the
        # listing), but the skipped file is reported instead of vanishing.
        print("{} {} could not be counted: {}".format(fn, name, error))
        continue
    print("{} {} {}".format(fn, name, n_peaks))

/home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP10_RBFOX2 33194
/home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP1_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP1_RBFOX2 27038
/home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP5_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP5_RBFOX2 20718
/home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP6_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP6_RBFOX2 914
/home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP8_RBFOX2.merged.r2.peaks.bed.all.real.BED EVN_041315_CLIP8_RBFOX2 20495
/home/gpratt/projects/encode/analysis/encode_v8/assigned/RBFOX2_293XT_1to100_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED RBFOX2_293XT_1to100_gel_RBFOX2 34934
/home/gpratt/projects/encode/analysis/encode_v8/assigned/RBFOX2_293XT_1to12_gel_RBFOX2.merged.r2.peaks.bed.all.real.BED RB

In [44]:
# Column holding the RBFOX2 motif hits; used as the default by bound_in_same_strand.
_RBFOX2_DISTANCE_COLUMN = '1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_0.6(0.999) Distance From Peak(sequence,strand,conservation)'


def bound_in_same_strand(row, column=_RBFOX2_DISTANCE_COLUMN):
    """
    Tell whether any motif hit of a peak lies on the peak's own strand.

    row - one table row exposing ``Strand`` and the motif-hit column
    column - label of the motif-hit column; defaults to the RBFOX2 (TGCATG)
             "Distance From Peak(sequence,strand,conservation)" column

    returns True when the strand field of at least one hit contains
    row.Strand. Returns False when the column is absent, its value is not a
    string (e.g. NaN), a hit has no strand field, or Strand is not a string.
    Any other exception is no longer swallowed.
    """
    try:
        motifs = row[column].split("),")
        return any(row.Strand in item.split(",")[1] for item in motifs)
    except (KeyError, AttributeError, IndexError, TypeError):
        return False

In [45]:
#This approach is incorrect, doesn't get strand quite right and isn't as stringent with the kmer finding as the approach below
# results_annotate = {}
# for peak in peaks_to_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_annotate[peak], rbfox2_motif)
#     results_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_1_annotate = {}
# for peak in peaks_to_rand_1_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_1_annotate[peak], rbfox2_motif)
#     #annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_1_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_2_annotate = {}
# for peak in peaks_to_rand_2_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_2_annotate[peak], rbfox2_motif)
#     annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_2_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}
    
# results_rand_3_annotate = {}
# for peak in peaks_to_rand_3_annotate:
#     annotated_peak = annotatePeaks_stranded(peaks_to_rand_3_annotate[peak], rbfox2_motif)
#     #annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
#     results_rand_3_annotate[peak] = {"peaks_with_motif": len(annotated_peak.stranded_peaks.dropna()), "num_peaks": len(pybedtools.BedTool(peaks_to_annotate[peak]))}


In [178]:
# For the real peak set and each of the three random peak sets, record per
# sample how many distinct peaks have at least one motif instance
# ("peaks_with_motif") together with a peak total ("num_peaks").
results = {}
results_rand_1 = {}
results_rand_2 = {}
results_rand_3 = {}

peak_sets = [(results, peaks_to_annotate),
             (results_rand_1, peaks_to_rand_1_annotate),
             (results_rand_2, peaks_to_rand_2_annotate),
             (results_rand_3, peaks_to_rand_3_annotate)]

# `peak` and `annotated_peak` are left at module level on purpose: later
# cells read whatever the final iteration assigned to them.
for counts, peak_files in peak_sets:
    for peak in peak_files:
        annotated_peak = findMotifsGenome(peak_files[peak], rbfox2_motif)
        # Added for inspection only; the counts below do not use this column.
        annotated_peak['bound_in_same_strand'] = annotated_peak.apply(bound_in_same_strand, axis=1)
        n_with_motif = len(set(annotated_peak.index))
        # NOTE(review): the total is always taken from the real peak file,
        # also for the random sets - confirm that this is intended.
        n_peaks = len(pybedtools.BedTool(peaks_to_annotate[peak]))
        counts[peak] = {"peaks_with_motif": n_with_motif, "num_peaks": n_peaks}



	Position file = /home/gpratt/projects/encode/analysis/encode_v8/assigned/EVN_041315_CLIP10_RBFOX2.merged.r2.peaks.bed.all.real.BED
	Genome = hg19
	Output Directory = foo
	Using actual sizes of regions (-size given)
	Fragment size set to given
	Operating in RNA mode
	Will find motif(s) in /home/gpratt/projects/encode/analysis/encode_v8/homer/293XT_CLIP_RBFOX2_1120_RBFOX2.merged.r2.peaks.bed/all/homerResults/motif1.motif
	Found mset for "human", will check against vertebrates motifs
	Peak/BED file conversion summary:
		BED/Header formatted lines: 33194
		peakfile formatted lines: 0

	Peak File Statistics:
		Total Peaks: 33194
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Background fragment size set to 38 (avg size of targets)
	Background files for 38 bp fragments found.

	Extracting sequence

In [170]:
# Intervals of the current peak file whose name is in the motif table but not
# in `found` (the set filled by the FASTA k-mer scan cell). The name set is
# built once here rather than once per interval inside the filter.
peaks_missing_from_scan = set(annotated_peak.index) - found
foo = pybedtools.BedTool(peaks_to_annotate[peak]).filter(lambda interval: interval.name in peaks_missing_from_scan).saveas()

In [173]:
# Label-based lookup of one peak; .loc replaces .ix, which newer pandas no longer provides.
annotated_peak.loc['ENSG00000078900.10_1_42_262']

Offset                                                                 51
Sequence                                                           TGCATG
Motif Name              1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...
Strand                                                                  +
MotifScore                                                        8.24193
bound_in_same_strand                                                False
Name: ENSG00000078900.10_1_42_262, dtype: object

In [164]:
# Motif-table rows for peaks that the FASTA k-mer scan (`found`) did not pick up.
# .loc replaces .ix, and the labels are passed as a list because a set is not
# a supported indexer in newer pandas.
annotated_peak.loc[list(set(annotated_peak.index) - found)]

Unnamed: 0_level_0,Offset,Sequence,Motif Name,Strand,MotifScore,bound_in_same_strand
PositionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000105983.14_4_9_26031,2,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000159720.7_1_9_10474,20,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000031823.10_5_29_14272,9,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000092439.9_0_9_8522,0,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000135476.7_11_8_6130,11,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000185483.7_1_5_1372,1,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000198952.6_4_13_2140,8,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000163931.11_5_15_21041,9,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000009954.6_2_7_25260,9,TGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,8.241931,False
ENSG00000181220.11_5_5_25863,18,CGCATG,"1-TGCATG,BestGuess:A2BP1_ENSG00000078328_M298_...",+,5.438040,False


In [135]:
# Intervals of the current peak file whose name is absent from the motif table.
# NOTE(review): annotated_peak_v2 is not defined in any cell shown here - confirm its origin.
# The name set is built once here rather than once per interval inside the filter.
peaks_with_hit = set(annotated_peak_v2.index)
foo = pybedtools.BedTool(peaks_to_annotate[peak]).filter(lambda interval: interval.name not in peaks_with_hit).saveas()

In [150]:
# Write the sequence of every interval in the current peak file to foo.fasta,
# using the hg19 FASTA given by fi.
# name=True and s=True presumably map to bedtools getfasta -name / -s
# (name-based FASTA headers, strand-aware extraction) - TODO confirm.
fasta_sequence = pybedtools.BedTool(peaks_to_annotate[peak]).sequence(fi="/projects/ps-yeolab/genomes/hg19/chromosomes/all.fa", name=True, fo="foo.fasta", s=True)

In [176]:
# Scan foo.fasta for records whose sequence contains CGCATG or TGCATG,
# ignoring case. found_motif counts the matching records and found collects
# their names.
found_motif = 0
found = set()
for record in SeqIO.parse("foo.fasta", format="fasta"):
    sequence = record.seq.lower()
    if any(kmer.lower() in sequence for kmer in ("CGCATG", "TGCATG")):
        found_motif += 1
        found.add(record.name)

In [179]:
# One row per sample, one column per count, for the real peaks.
real = pd.DataFrame(results).T

# The same layout for the three random peak sets, stacked on top of each
# other (each sample name therefore appears three times in the index).
random = pd.concat([pd.DataFrame(rand_counts).T
                    for rand_counts in (results_rand_1, results_rand_2, results_rand_3)])

In [180]:
# Save the per-sample counts for the real peaks.
# NOTE(review): the file name says v12, but peaks_to_annotate is built from the
# encode_v8 directory in this notebook - confirm which run these counts describe.
real.to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/real_counts_v12.txt")

In [181]:
# Add up the counts of the three random peak sets per sample (rows sharing an
# index label are summed) and save the totals.
random.groupby(level=0).sum().to_csv("/home/gpratt/Dropbox/encode_integration/for_eric/random_counts_v12.txt")