In [65]:
%matplotlib inline

import collections
import os
import clipper
from itertools import izip 
import HTSeq
from clipper.src.bam_helpers import Robust_BAM_Reader


from clipper.src import count_features
from clipper.src import CLIP_analysis
from gscripts.general import region_helpers
import pandas as pd
from gscripts.general import parsers
from IPython.core.display import HTML

In [66]:
def bed_to_genomic_interval(bed):
    for interval in bed:
        yield HTSeq.GenomicInterval(str(interval.chrom), interval.start, interval.stop, str(interval.strand))

        
class RegionCounter():
    def __init__(self, species):

        regions = collections.OrderedDict()
        regions["all"] = "All"
        regions["cds"] = "CDS"
        regions["three_prime_utrs"] = "3' UTR"
        regions["five_prime_utrs"] = "5' UTR"
        regions["proxintron500"] = "Proximal\nIntron"
        regions["distintron500"] = "Distal\nIntron"
        regions['exons'] = "Exons"

        assigned_regions = regions.copy()
        del assigned_regions['all']
        self.species = species
        self.assigned_regions = assigned_regions
        self.features = self.make_features()

    def make_features(self):
        Region = collections.namedtuple("Region", ["region", "gene_id"])

        bedtracks = {}
        for region in self.assigned_regions:
            bedtracks[region] = pybedtools.BedTool(os.path.join(clipper.data_dir(),
                                                                "regions", "%s_%s.bed" % (self.species, region)))



        features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for region, bedtrack in bedtracks.items():
            for iv, interval in izip(CLIP_analysis.bed_to_genomic_interval(bedtrack), bedtrack):
                features[iv] = set([Region(region, interval.name)])
        return features


    def count_features(self, bam_file):
        bam_file = bed_to_genomic_interval(bam_file)
        counts = collections.defaultdict(collections.Counter)
        for x, read in enumerate(bam_file):
            region_ids = set()
            for iv, val in self.features[read].steps():
                region_ids |= val

            gene_ids = {region_id.gene_id for region_id in region_ids}
            if len(gene_ids) == 1:

                cur_regions = {region_id.region for region_id in region_ids}

                for region in self.assigned_regions:
                    if region in cur_regions:
                        break

                gene_id = list(region_ids)[0]

                counts[gene_id.gene_id][region] += 1

            elif len(gene_ids) == 0:
                counts["_no_feature"]['none'] += 1
            else:
                counts["_ambiguous"]['none'] += 1

        return pd.DataFrame(counts)

In [67]:
region_counter = RegionCounter("mm9")

In [104]:
!mkdir -p /nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5
!scp tscc-login.sdsc.edu:/home/gpratt/projects/fet_family/analysis/mouse_clip_v5/*WholeBrain.merged.bam /nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5
!scp -q tscc-login.sdsc.edu:/home/gpratt/projects/fet_family/analysis/mouse_clip_v5/*.{metrics,final.out,peaks.bed} /nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5 

TAF15_WholeBrain.merged.bam                   100% 1208MB  25.2MB/s   00:48    
TDP43_WholeBrain.merged.bam                   100%  427MB  30.5MB/s   00:14    
TLS_WholeBrain.merged.bam                     100%  240MB  26.7MB/s   00:09    


In [4]:
clip_rnaseq_df = parsers.clipseq_metrics("/nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5", iclip=False)
filtered_df = clip_rnaseq_df[[ "Input Reads",
                             "Too short reads",
                             "Reads that were too short percent",
                             'repetitive_count',
                        "Reads Passing Quality Filter", 
                        "Uniquely Mapped Reads",
                        "Uniquely mapped reads %",
                        "Usable Reads",
                            
                             
                            #"spot",
                            "Num Peaks",
                            #"Fraction Collapsed",
                            # "Fraction Usable"                             
                            ]]


HTML(filtered_df.to_html(formatters={"Input Reads" : parsers.commas,
                                     'repetitive_count': parsers.commas,
                                     "Too short reads": parsers.commas,
                                     "Reads Passing Quality Filter" : parsers.commas,
                                     "Reads after Quality Filtering" : parsers.commas,
                                     "Uniquely Mapped Reads" : parsers.commas,
                                     "Usable Reads" : parsers.commas,
                                     "Num Peaks": parsers.commas
                                     } ))

Unnamed: 0,Input Reads,Too short reads,Reads that were too short percent,repetitive_count,Reads Passing Quality Filter,Uniquely Mapped Reads,Uniquely mapped reads %,Usable Reads,Num Peaks
20090209_MP_none1_mouse_brain_CLIPseq_TDP43_lower_band,4472151,4062,0.1,1267649,3993240.0,1354679,33.92%,216923,2246
20090209_MP_none2_mouse_brain_3A_CLIPseq_TLS,5940464,661,0.0,655254,5792395.0,1866037,32.22%,1694735,4840
20090601_MP_none3_mouse_brain_2A_CLIPseq_TLS,6754559,10796,0.2,342212,6661709.0,1223067,18.36%,1096610,3181
20090601_MP_none4_mouse_brain_10A_CLIPseq_TDP43_highMNase,5341577,763,0.0,806946,5185955.0,735875,14.19%,520223,20426
20090601_MP_none5_mouse_brain_13E_CLIPseq_empty_beads,6342111,961,0.0,1095800,5963457.0,1086740,18.22%,132641,5717
20091230_MP10_1_TLS_clip_mouse_brain_3A,8321534,1094,0.0,46569,8308383.0,282219,3.40%,266699,535
20091230_MP11_1_TLS_clip_mouse_brain_3B,11679461,1907,0.0,194283,11627895.0,1076942,9.26%,996634,3153
20101110_MP41_1_mouse_brain_CLIPseq_TDP43_lower_band,70496200,684826,1.0,20292044,65591046.0,18875696,28.78%,3412759,71138
20101110_MP42_1_mouse_brain_CLIPseq_TDP43_upper_band,71776915,674998,0.9,29352592,65012667.0,19378128,29.81%,4407855,58207
TAF15_1,288984903,11548509,4.0,50320764,262881597.0,122174227,46.48%,22801935,33459


In [68]:
def gene_id_to_type_fun(genes):
    for gene_id in genes:
        try:
            yield gene_id_to_type[gene_id]
        except:
            yield "no_type"

In [69]:
gene_id_to_name = region_helpers.gene_id_to_name("/nas3/gpratt/gencode/gencode.vM1.annotation.gtf.db")
gene_id_to_type = region_helpers.gene_id_to_type("/nas3/gpratt/gencode/gencode.vM1.annotation.gtf.db")

name_to_gene_id = {value: key for key, value in gene_id_to_name.items()}

In [72]:
taf15_df = region_counter.count_features(pybedtools.BedTool("/nas3/gpratt/projects/fet_family/data/TAF15.all.BED"))
taf15_1_df = region_counter.count_features(pybedtools.BedTool("/nas3/gpratt/projects/fet_family/data/TAF15_195.all.BED"))
taf15_2_df = region_counter.count_features(pybedtools.BedTool("/nas3/gpratt/projects/fet_family/data/TAF15_196.all.BED"))
# tdp43_df = region_counter.count_features("/nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5/TDP43_WholeBrain.merged.bam")
# tls_df = region_counter.count_features("/nas3/gpratt/projects/fet_family/analysis/mouse_clip_v5/TLS_WholeBrain.merged.bam")

taf15_df = pd.DataFrame(taf15_df.fillna(0).stack(), columns=["count"])
taf15_1_df = pd.DataFrame(taf15_1_df.fillna(0).stack(), columns=["count"])
taf15_2_df = pd.DataFrame(taf15_2_df.fillna(0).stack(), columns=["count"])
# tdp43_df= pd.DataFrame(tdp43_df.fillna(0).stack(), columns=["count"])
# tls_df = pd.DataFrame(tls_df.fillna(0).stack(), columns=["count"])

taf15_df['gene_type'] = list(gene_id_to_type_fun(taf15_df.index.get_level_values(level=1)))
taf15_1_df['gene_type'] = list(gene_id_to_type_fun(taf15_1_df.index.get_level_values(level=1)))
taf15_2_df['gene_type'] = list(gene_id_to_type_fun(taf15_2_df.index.get_level_values(level=1)))
# tdp43_df['gene_type'] = list(gene_id_to_type_fun(tdp43_df.index.get_level_values(level=1)))
# tls_df['gene_type'] = list(gene_id_to_type_fun(tls_df.index.get_level_values(level=1)))

In [73]:
non_coding_genes = {'IG_C_gene',
 'IG_D_gene',
 'IG_J_gene',
 'IG_V_gene',
 'Mt_rRNA',
 'Mt_tRNA',
 'lincRNA',
 'miRNA',
 'misc_RNA',
 'polymorphic_pseudogene',
 'processed_transcript',
 'pseudogene',
 'rRNA',
 'snRNA',
 'snoRNA'}

In [74]:
total = taf15_df['count'].sum()
total

5975094.0

In [75]:
total_protein_coding = taf15_df[taf15_df['gene_type'] == "protein_coding"]['count'].sum() 

In [76]:
print taf15_1_df['count'].sum()
print taf15_2_df['count'].sum()

3173314.0
2801780.0


In [77]:
print taf15_1_df[taf15_1_df['gene_type'] == "protein_coding"]['count'].sum() 
print taf15_2_df[taf15_2_df['gene_type'] == "protein_coding"]['count'].sum() 


2729033.0
2399782.0


In [78]:
total_noncoding = taf15_df[taf15_df['gene_type'].isin(non_coding_genes)]['count'].sum()

In [79]:
total_protein_coding

5128815.0

In [80]:
total_protein_coding / total

0.85836557550391679

In [81]:
total_noncoding

139382.0

In [82]:
total_noncoding / total

0.023327164392727546

In [83]:
#taf15_premrna = taf15_df[taf15_df['gene_type'] == "protein_coding"].ix[['proxintron500', 'distintron500']]
#taf15_premrna['count'].sum()

In [84]:
#This is stupid, these are not pre-mrna
#print taf15_1_df[taf15_1_df['gene_type'] == "protein_coding"].ix[['proxintron500', 'distintron500']]['count'].sum()
#print taf15_2_df[taf15_2_df['gene_type'] == "protein_coding"].ix[['proxintron500', 'distintron500']]['count'].sum()

In [85]:
#len(set(taf15_premrna[taf15_premrna['count'] > 10].index.get_level_values(level=1)))

In [86]:
81.1 + 1.5 + 17.3

99.89999999999999

In [87]:
taf15_df[taf15_df.gene_type == "no_type"]['count'].sum()

706897.0

In [88]:
taf15_df[taf15_df.gene_type == "no_type"]['count'].sum() / total

0.1183072601033557