In [1]:
import pandas as pd
import pybedtools
import numpy as np
from pybedtools import featurefuncs
import glob
import os

In [7]:
# Load the GENCODE v19 exon annotation (GTF), sort it, and persist the sorted copy with saveas().
annotated_exons = pybedtools.BedTool("/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.exon.gtf").sort().saveas()
# BED-formatted copy of the same records, converted one feature at a time with featurefuncs.gff2bed.
# NOTE(review): this name is not referenced by any of the cells visible in this notebook.
annotated_exons_bed = annotated_exons.each(featurefuncs.gff2bed).saveas()

#annotated_exons = pybedtools.BedTool("/home/gpratt/clipper/clipper/data/regions/hg19_exons.bed").sort().saveas()
# Gene intervals (BED); used further down to keep only exon calls that fall inside a known gene.
annotated_genes = pybedtools.BedTool("/home/gpratt/clipper/clipper/data/regions/hg19_genes.bed")

In [8]:
def read_sj_table(fn):
    """Load a headerless, tab-delimited splice-junction table as a DataFrame.

    Parameters
    ----------
    fn : path or file-like object accepted by ``pandas.read_table``.

    Returns
    -------
    pandas.DataFrame with one row per line of the file and the nine columns
    chrom, start, stop, strand, intron_motif, annotated, unique_reads,
    multi_reads and max_overhang, in that order.
    """
    # The file carries no header row, so the column labels are supplied here.
    column_names = [
        'chrom',
        'start',
        'stop',
        'strand',
        'intron_motif',
        'annotated',
        'unique_reads',
        'multi_reads',
        'max_overhang',
    ]
    sj_table = pd.read_table(fn, names=column_names, header=None)
    return sj_table

In [10]:
# Read the splice-junction table of each sample and stack them into one frame.
# Passing a dict to pd.concat uses its keys ("ctrl", "kd") as the outer index level;
# `names` labels the two index levels 'condition' (sample key) and 'sj' (row within that table).
results = pd.concat({"ctrl": read_sj_table("/home/gpratt/projects/cryptic_exons/analysis/ling_et_al_v1/SRR2002765_1.polyATrim.adapterTrim.rmRep.bamSJ.out.tab"),
                     "kd":  read_sj_table("/home/gpratt/projects/cryptic_exons/analysis/ling_et_al_v1/SRR2002766_1.polyATrim.adapterTrim.rmRep.bamSJ.out.tab"),},
                     names=['condition', 'sj'])


IOError: File /home/gpratt/projects/cryptic_exons/analysis/ling_et_al_v1/SRR2002765_1.polyATrim.adapterTrim.rmRep.bamSJ.out.tab does not exist

In [None]:
# Splice junctions for SRR2096921 (presumably the STAR two-pass run, going by the variable name).
# Parsed with read_sj_table so the column layout matches the other junction tables; the
# previous version of this cell ended in a stray "}, names=[...]" and could not be executed.
two_pass = read_sj_table("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/SRR2096921_1.polyATrim.adapterTrim.rmRep.bamSJ.out.tab")

#Identify all possible exons given constraints
1. Exons are no more than max_exon_size away from each other 
2. They are on the same strand

In [None]:
# Total support per junction = uniquely mapped + multi-mapped reads crossing it.
results['total_reads'] = results.unique_reads + results.multi_reads
# NOTE: despite the "ctrl" in the name, this selects the 'kd' rows of the 'condition' level;
# the name is kept because later cells refer to it.
# .loc is the label-based replacement for the deprecated .ix indexer.
ctrl_unfiltered = results.loc['kd']

In [None]:
# Data filtering: keep only junctions backed by at least 5 reads in total,
# to gain some confidence that the splice junctions are real.
ctrl = ctrl_unfiltered.loc[ctrl_unfiltered.total_reads >= 5]
# The threshold is deliberately applied to total reads rather than unique reads:
# requiring unique reads drops a number of real splice junctions.

In [None]:
def _exon_spans(intron_starts, intron_stops, strand_symbol, max_exon_size):
    """Pair introns on one chromosome/strand into (start, stop) exon spans.

    ``intron_starts`` and ``intron_stops`` are numpy arrays holding the start and
    stop column of the junctions. On "+" every intron end is paired with the
    nearest intron start downstream of it; on "-" every intron start is paired
    with the nearest intron end upstream of it. A pair is only made when the two
    positions are less than ``max_exon_size`` apart.
    """
    spans = set()
    if strand_symbol == "+":
        for intron_stop in intron_stops:
            gaps = intron_starts - intron_stop
            gaps = gaps[(gaps > 0) & (gaps < max_exon_size)]
            if gaps.size:
                # Exon runs from this intron's end to one base before the next intron's start.
                spans.add((int(intron_stop), int(intron_stop + gaps.min() - 1)))
    else:
        for intron_start in intron_starts:
            gaps = intron_start - intron_stops
            gaps = gaps[(gaps > 0) & (gaps < max_exon_size)]
            if gaps.size:
                # Exon runs from the closest preceding intron end to one base before this intron's start.
                spans.add((int(intron_start - gaps.min()), int(intron_start - 1)))
    return spans


def call_exons(sj_df, max_exon_size=1000):
    """Call putative exons from a table of splice junctions.

    For every chromosome and strand, junctions are paired with their nearest
    neighbouring junction (see ``_exon_spans``); the sequence between the two
    introns is reported as an exon.

    Parameters
    ----------
    sj_df : pandas.DataFrame
        Junction table with at least the columns ``chrom``, ``start``, ``stop``
        and ``strand``. Rows with strand 1 are treated as "+" and rows with
        strand 2 as "-"; any other strand value is ignored.
    max_exon_size : int, optional
        Two junctions are only paired when they are fewer than this many bases
        apart. Defaults to 1000, the value that used to be hard-coded.

    Returns
    -------
    pybedtools.BedTool
        Sorted, saved BedTool with one six-field record per distinct exon:
        chrom, start, stop, the placeholder name "foo", score "0" and strand.
    """
    exons = set()
    for strand_code, strand_symbol in ((1, "+"), (2, "-")):
        sj_strand = sj_df[sj_df.strand == strand_code]
        for chrom, sj_chrom in sj_strand.groupby("chrom"):
            spans = _exon_spans(sj_chrom.start.values,
                                sj_chrom.stop.values,
                                strand_symbol,
                                max_exon_size)
            for start, stop in spans:
                exons.add((chrom, start, stop, "foo", "0", strand_symbol))

    # create_interval_from_list is documented to take string fields, so stringify the coordinates.
    intervals = [pybedtools.create_interval_from_list([str(field) for field in exon])
                 for exon in exons]
    # Hand the calls back to the caller; previously the BedTool was built and then discarded.
    return pybedtools.BedTool(intervals).sort().saveas()

In [None]:
#only get exons in known genes
# Bind the exon calls for the filtered junction table to `exons`; no other cell defines
# that name, so on a fresh kernel the intersect below would otherwise raise NameError.
exons = call_exons(ctrl)
# s=True: same strand; f=1.0: the whole exon must be covered by the gene; u=True: report each exon once.
gene_exons = exons.intersect(annotated_genes, s=True, f=1.0, u=True).saveas()

In [None]:
# Persist the gene-restricted exon calls to a BED file on disk.
gene_exons.saveas("/home/gpratt/Dropbox/cryptic_splicing/data/splice_juncions/SRR2002766_1.bed")

In [None]:
# Number of exon calls before and after restricting them to known genes.
len(exons), len(gene_exons)
#Doesn't remove too many exons, but enough to take care of some false positives

In [None]:
#only get cryptic exons that don't overlap known exons
#this should really be called cryptic exons
# v=True keeps exon calls with no same-strand overlap to any annotated exon.
kd_unannotated_exons = gene_exons.intersect(annotated_exons, s=True, v=True).saveas()
# u=True keeps exon calls that overlap at least one annotated exon on the same strand.
kd_possible_annotated_exons = gene_exons.intersect(annotated_exons, u=True, s=True).saveas()

# Split the overlapping calls by whether some annotated exon matches them exactly:
# f=1.0 with r=True requires a 100% reciprocal overlap, i.e. identical coordinates.
kd_annotated_exons = kd_possible_annotated_exons.intersect(annotated_exons, s=True, u=True, f=1.0, r=True, sorted=True).saveas()
# Same test inverted with v=True: calls that overlap annotation but match no annotated exon exactly.
kd_not_annotated_exons = kd_possible_annotated_exons.intersect(annotated_exons, s=True, v=True, f=1.0, r=True, sorted=True).saveas()

In [None]:
# Show the first 100 exon calls that overlap no annotated exon.
kd_unannotated_exons.head(n=100)

In [None]:
# Spot check: look for an annotation-overlapping call that starts at position 242608196.
kd_possible_annotated_exons.filter(lambda x: x.start == 242608196 ).saveas().head()

In [None]:
#Total number of exons, annotated and detected and unannotated and detected using my method.  
# Order: all annotated exons, calls overlapping annotation, exact matches, inexact overlaps, no overlap.
len(annotated_exons), len(kd_possible_annotated_exons), len(kd_annotated_exons), len(kd_not_annotated_exons), len(kd_unannotated_exons)

In [None]:
# Display the calls that overlap an annotated exon without matching one exactly.
kd_not_annotated_exons

Mostly fixed the negative-strand exon identification bug, but there is still a subtle one left. It could actually just be a splicing error; I should keep digging.

In [None]:
#Number of exons on chr 1 on positive strand
# Counts records of the annotation (annotated_exons), not the exons called in this notebook.
len(annotated_exons.filter(lambda x: x.chrom == "chr1" and x.strand == "+"))

Looks like I might have a high false negative rate.  Let me filter out false positives a bit more and I'll figure out what I'm missing.

Notes on Progress:
1. Taking the next splice junction isn't a bad strategy; it identifies 6365 / ??? expressed exons
2. Identified false positives from junction-spanning reads that span two genes (fixed by intersecting with the gene BED to make sure the exon is within a gene)
3. Identified false positives where one or two reads were leading to stuff that didn't quite look like exons.  Filter via heuristics?  

I need a way to assess my false positive and negative rate.  Read bowtie paper for this?  Figure out how many annotated exons are expressed vs how many I identify?

#Odd things:
This should have been filtered: 

1. Filter splice sites in genes without exons.  Need to dig into gencode tags for this one.  Might just create a list of all the RPs and remove them; I think they are the main culprit for now.
* chr1	353809	353865
* chr1	355353	355386
 
2. Filter odd short jumps.  Can do this by only taking half-annotated splice junctions.  

3. These look wrong, but I don't have a good pattern for them yet

chr1	2324708	2324794	foo	0	+ --double splicing event, might want to handle this eventually
chr1	46466630	46466651 -- looks real just real short, possibly noise
chr1	100474843	100474879 -- looks really short, just ugly
chr1	144048708	144048769 -- really short
chr1	156165803	156165964 -- major mis-splicing
chr1	196644579	196644659 -- doesn't look super real, not enough reads spanning splice junctions
chr1	179013907	179013976 -- multiple 5' splice sites, what should I do about this?
chr1	203803064	203803230 -- doesn't look super real, come back to this
chr1	203803064	203803230 -- doesn't look super real, come back to this
chr1	225898204	225898270 -- looks real, but not really in an annotated, gene, splice sites don't reconstruct well
chr1	225898204	225898270 -- looks real, but not really in an annotated, gene, splice sites don't reconstruct well
chr1	100474843	100474879 -- lots of noise, shouldn't keep.  Need to figure out how to remove
 
 
chr1	109438938	109439053 -- bound by TDP43
chr1	154214175	154214273 -- slightly bound by TDP-43
 chr1	213299108	213299186 -- bound nicely by TDP-43

#Random Thoughts
AGRN has some crazy splicing patterns
1. chr1	980267	980367	foo	0	+
2. chr1	980271	980367	foo	0	+
3. chr1	980355	980367	foo	0	+

#Possible Next options.
1. Try to figure out if I've got false negatives
1. Keep filtering
3. Add in negative strand + rest of the chromosomes.  
4. Think ahead even more

#Assessing False Negatives
The general strategy will be to look at the featureCounts counts and see which features are covered (and have splice junctions) that I should know about

In [None]:
# Load the per-feature read-count table; skiprows=1 drops the first line of the file
# (presumably a comment/command line written by the counting tool; confirm).
all_exons = pd.read_table("/home/gpratt/projects/cryptic_exons/analysis/ling_et_al_v1/hg19_counts.txt", skiprows=1)

In [None]:
# An exon counts as expressed when its count for SRR2002766 is anything other than zero.
expressed_exons = all_exons.loc[all_exons['SRR2002766_1.polyATrim.adapterTrim.rmRep.bam'].ne(0)]

In [None]:
# Expressed exons restricted to the positive strand of chr1.
expressed_exons_chr1 = expressed_exons.loc[(expressed_exons.Strand == "+") & (expressed_exons.Chr == "chr1")]

In [None]:
# Every exon in the count table on the positive strand of chr1, expressed or not.
all_exons_chr1 = all_exons.loc[(all_exons.Strand == "+") & (all_exons.Chr == "chr1")]

In [None]:
#Number of exons on chr1 and the expressed exons on chr1
# Both frames are already restricted to the positive strand of chr1.
print len(all_exons_chr1), len(expressed_exons_chr1)

#Get the exons I didn't detect with my algorithm

In [None]:
# Convert the expressed chr1 "+" exons into a BedTool with six fields per record:
# chrom, start, end, name (Geneid), score (read count for SRR2002766) and strand.
# The name first holds a plain list of intervals and is then rebound to the BedTool built from it.
# NOTE(review): Start/End are copied into the BED fields unchanged; if the count table uses
# 1-based starts, the resulting BED start is off by one. Confirm against the tool that wrote it.
expressed_exons_chr1_bed = []
for name, row in expressed_exons_chr1.iterrows():
    expressed_exons_chr1_bed.append(pybedtools.create_interval_from_list([row.Chr, 
                                          row.Start, 
                                          row.End, 
                                          row.Geneid, 
                                          row['SRR2002766_1.polyATrim.adapterTrim.rmRep.bam'], 
                                          row.Strand]))
expressed_exons_chr1_bed = pybedtools.BedTool(expressed_exons_chr1_bed)

chr1	321032	321290	ENSG00000237094.6

In [None]:
# Expressed chr1 "+" exons that overlap none of the exactly-matching exon calls (v=True),
# i.e. expressed annotated exons the junction-based caller did not recover; first 100 shown.
expressed_exons_chr1_bed.intersect(kd_annotated_exons, v=True).head(n=100)

Ok, so it looks like I'm not detecting a lot of the annotated exons because there aren't any reads that span splice junctions, or meet my cutoff for what a "true" splice junction looks like.  

This is both good and bad.  Good in that I'm not going to over-call peaks.  Bad in that I could be missing a lot of cryptic sites because I'm taking a splice-junction-centric view.  I still like this approach.  I'm going to try using the STAR two-pass approach to try and recover a few more reads and have things pass my thresholds a bit better.  In the meantime I'll generalize my algorithm to all chromosomes / strands.  

#How many novel exons overlap with exons called by Ling et al.?

In [None]:
# Exon list written as "chrom:start-end", one per line. The two replace() calls at the end
# turn ":" and "-" into spaces so every line becomes a three-field "chrom start end" record.
ling_et_al_exons = """chr5:111602907-111602981
chr5:648215-648338
chr19:7169645-7169842
chr5:153416928-153417015
chr10:3141749-3142011
chr11:8797739-8801798
chr12:117226441-117226517
chr12:117227839-117228527
chr12:85689446-85689514
chr1:980268-980367
chr1:980272-980367
chr2:242608197-242608400
chr1:980268-980460
chr1:980272-980460
chr11:8801680-8801798
chr3:9510260-9510300
chr11:108368528-108368891
chr19:9112157-9112234
chr19:14560900-14561129
chr14:24630911-24631099
chr10:11968921-11971863
chr1:109438939-109439053
chr13:21374252-21374294
chr7:102227839-102228081
chr7:102128667-102128909
chr15:72557611-72557753
chr12:52631354-52631501
chr2:3462025-3462286
chr22:20110103-20110220
chr4:89319549-89319596
chr1:169337730-169337948
chr11:58384466-58384527
chr19:4492012-4492149
chr10:30731500-30731565
chr2:182775853-182775960
chr5:64836755-64836979
chr6:33626324-33626446
chr2:143722222-143728961
chr14:24634266-24634392
chrX:107465095-107465143
chr2:143728740-143728961""".replace(":", " ").replace("-", " ")

# Rebind the name from the text to a BedTool parsed from it, and save a copy as
# ling_et_al_exons.bed in the current working directory.
# NOTE(review): the records carry no strand, and the coordinates are used exactly as typed.
ling_et_al_exons = pybedtools.BedTool(ling_et_al_exons, from_string=True).saveas("ling_et_al_exons.bed")

In [None]:
# Number of exons in the Ling et al. list.
len(ling_et_al_exons)

In [None]:
# Ling et al. exons overlapped by at least one unannotated exon call (u=True; strand is not considered).
len(ling_et_al_exons.intersect(kd_unannotated_exons, u=True))

#of the ones that I miss, why?

In [None]:
# Ling et al. exons with no overlapping unannotated exon call (v=True), i.e. the ones missed here.
len(ling_et_al_exons.intersect(kd_unannotated_exons, v=True))

In [None]:
# List up to 30 of the missed Ling et al. exons for manual inspection.
ling_et_al_exons.intersect(kd_unannotated_exons, v=True).head(n=30)

chr19	7169645	7169842 -- no splice junction on the 5' end
chr12	117226441	117226517 -- not quite enough splice junctions (maybe have some off by one bugs)
chr12	117227839	117228527 -- no splice junctions on the 3' end
chr12	85689446	85689514 -- not enough splice junctions on the 5' end
chr2	242608197	242608400 -- possibly called a slightly shorter exon, need to look into this; also might just not be enough reads (overlaps with previous exon; there is TDP-43 binding)
chr3	9510260	9510300 -- miscalled: exon is near but not overlapping, I think (but there is TDP-43 binding)
chr11	108368528	108368891 -- not enough reads on the 3' end (but TDP-43 binding)
chr19	9112157	9112234 --I shouldn't have missed this one, look into it (no gene?)
chr19	14560900	14561129 --not real cryptic exon
chr14	24630911	24631099 -- no 5' reads
chr10	11968921	11971863 -- no spliced reads detected
chr13	21374252	21374294 -- 5' extension event, I wouldn't have detected this
chr7	102227839	102228081 -- extension event I think.  Wouldn't have detected this
chr7	102128667	102128909 -- extension event I think.  Wouldn't have detected this
chr15	72557611	72557753  -- extension event I think.  Wouldn't have detected this
chr12	52631354	52631501  -- extension event I think.  Wouldn't have detected this
chr2	3462025	3462286  -- no 5' reads (there is another exon near here, may have called that)
chr22	20110103	20110220 -- known exon (with TDP-43 binding though)
chr4	89319549	89319596 -- should have detected this, don't know why I didn't (overlaps with known exon)
chr1	169337730	169337948 -- no 5' reads 
chr5	64836755	64836979 -- no 5' reads
chr6	33626324	33626446 -- looks like an extension
chr2	143722222	143728961 -- this looks just kind of fucked up
chr14	24634266	24634392 -- overlaps with a known exon
chr2	143728740	143728961 -- no 3' reads 

#Scaling up

In [None]:
# Collect every junction table in the av_resequencing_v1 directory whose name matches *rmRep*.tab.
cryptic_splicing = glob.glob("/home/gpratt/projects/cryptic_exons/analysis/av_resequencing_v1/*rmRep*.tab")
# Stack them into one frame keyed by the part of each file name before the first ".";
# the index levels are named 'condition' (that key) and 'sj' (row within the table).
splice_junction_predictions = pd.concat({os.path.basename(item).split(".")[0]: read_sj_table(item) for item in cryptic_splicing},
                                        names=['condition', 'sj'])

In [None]:
# Same filter as for the single-sample analysis: total reads = unique + multi-mapped,
# and only junctions with at least 5 total reads are kept.
splice_junction_predictions['total_reads'] = splice_junction_predictions.unique_reads + splice_junction_predictions.multi_reads 
splice_junction_predictions_filtered = splice_junction_predictions[splice_junction_predictions.total_reads  >= 5]

In [None]:
# Run the exon caller once per sample: group the filtered junctions by the 'condition'
# index level, print the sample key as a progress marker, and store whatever call_exons
# returns for that sample under its key.
result = {}
for name, df in splice_junction_predictions_filtered.groupby(level="condition"):
    print name
    result[name] = call_exons(df)