In [1]:
%matplotlib inline

from collections import Counter
from collections import defaultdict
import glob
import itertools
from itertools import izip
import os

import seaborn as sns
from matplotlib import pyplot
from matplotlib_venn import venn2, venn3
import pybedtools
import numpy as np
from IPython.core.display import HTML
import pandas as pd

from gscripts import GO
from gscripts.rnaseq import splicing_map
from gscripts.general import dataviz
from gscripts.general import region_helpers

# Directory constants for this analysis (absolute paths under the author's home).
# NOTE(review): neither name is used in the cells visible here; presumably later
# cells write splicing-map and heatmap figures to them -- confirm.
img_dir = "/home/gpratt/Dropbox/cryptic_splicing/data/human_data/splicing_maps"
heatmap_dir = "/home/gpratt/Dropbox/cryptic_splicing/data/human_data/heatmaps"

In [2]:
# Gene-level lookup tables, both built from the same GENCODE v19 annotation database.
gene_id_to_name = region_helpers.gene_id_to_name(
    "/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db"
)
gene_id_to_type = region_helpers.gene_id_to_type(
    "/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db"
)

# Same id -> name mapping, keyed by the part of the gene id before the first ".".
ensembl_id_to_name = dict(
    (versioned_id.split(".")[0], gene_name)
    for versioned_id, gene_name in gene_id_to_name.items()
)

# Reverse lookup (name -> gene id). If several ids share a name, the one
# iterated last wins.
name_to_gene_id = dict(
    (gene_name, versioned_id)
    for versioned_id, gene_name in gene_id_to_name.items()
)

In [3]:
# Master list of ENCODE datasets. The first line of the file is skipped and the
# column names below are supplied instead.
# NOTE(review): the last column is named "bar", which looks like a placeholder -- confirm.
master_processing_table = pd.read_table(
    "/home/elvannostrand/data/ENCODE/project_stuff/ENCODE_MASTER_ID_LIST_20160301_AllDatasets.txt",
    sep="\t",
    skiprows=1,
    names=[
        "UID", "RBP_gID", "CellLine", "RBP_ENSG", "Antibody", "Lot",
        "CLIP_ENCODEAccID", "CLIP_Rep1ENC", "CLIP_Rep2ENC", "CLIP_InputENC",
        "RNASEQ_ENCODEAccID", "Duplicate_RNASEQ_ENCODEAccID", "RNASEQ_ControlENC",
        "RNASEQ_KDRep1Bam", "RNASEQ_KDRep2Bam",
        "RNASEQ_ControlRep1Bam", "RNASEQ_ControlRep2Bam",
        "bar",
    ],
)

In [7]:
def annotate_cryptic_events(row):
    """Load one comparison's events table and flag the events that are cryptic.

    Parameters
    ----------
    row : object with ``normal`` and ``cryptic`` attributes
        Paths to two tab-separated tables whose first column is the event id.

    Returns
    -------
    pandas.DataFrame or None
        The ``normal`` table, indexed by its first column, with an added
        boolean ``is_cryptic`` column that is True where the event id also
        appears in the index of the ``cryptic`` table. Returns None when
        loading raises ValueError (for example on an empty file); the path of
        the normal table is printed so the skipped comparison is visible.
    """
    try:
        normal = pd.read_table(row.normal, index_col=0)
        cryptic = pd.read_table(row.cryptic, index_col=0)
        normal["is_cryptic"] = normal.index.isin(cryptic.index)
    except ValueError:
        # Best effort: report the comparison and let the caller skip it.
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(row.normal)
        return None
    return normal


#I have to resort to this iterator strategy to keep my memory footprint low
#this takes all merged events and returns a group_id (count) and all the samples that have the event, along with the event itself (value)
def events_dict_iter(grouped_events):
    """Yield ``(group_id, members)`` for every entry of ``grouped_events``.

    Parameters
    ----------
    grouped_events : mapping
        Group key -> collection of row labels (the caller passes
        ``DataFrame.groupby(...).groups``).

    Yields
    ------
    (int, object)
        ``group_id`` is the position of the group in the mapping's iteration
        order (starting at 0); the second item is the mapping's value for that
        group. The group key itself is not returned.
    """
    # Iterating the mapping directly is lazy on both Python 2 and Python 3,
    # unlike ``iteritems`` (Python 2 only) or ``items`` (a full list on Python 2).
    for group_id, group_key in enumerate(grouped_events):
        yield group_id, grouped_events[group_key]

#This takes all the actual events and makes a mapping of group_id to the original dataframe id
def real_results_iter(grouped_events):
    """Yield ``(row_label, group_id)`` for every row label in every group.

    Groups and their ids come from ``events_dict_iter``; each label in a
    group is paired with that group's id, so ``dict(...)`` of this generator
    maps row label -> group_id.
    """
    for group_id, members in events_dict_iter(grouped_events):
        for row_label in members:
            yield row_label, group_id
            
def annotate_se_events(stress_events):
    """Annotate skipped-exon events as cryptic/normal and group identical events.

    Parameters
    ----------
    stress_events : iterable of str
        Root rMATS directories, one per comparison. Each is expected to
        contain ``ASEvents/fromGTF.SE.txt`` and
        ``ASEvents/fromGTF.novelEvents.SE.txt``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (condition, ID), where ``condition`` is the basename of the
        root directory. The gene and coordinate columns are dropped; the
        remaining columns include ``is_cryptic`` and ``group_id``. Events that
        share chromosome, strand and all six exon coordinates get the same
        ``group_id``, across conditions.

    Comparisons whose tables cannot be read (``annotate_cryptic_events``
    returns None) are skipped.
    """
    # One row per comparison, indexed by directory basename, holding the two
    # table paths that annotate_cryptic_events reads.
    root_dirs = pd.Series({os.path.basename(item): item for item in stress_events}, name="events")
    event_paths = pd.DataFrame(root_dirs)
    event_paths['normal'] = event_paths.events.apply(lambda x: os.path.join(x, "ASEvents", "fromGTF.SE.txt"))
    event_paths['cryptic'] = event_paths.events.apply(lambda x: os.path.join(x, "ASEvents", "fromGTF.novelEvents.SE.txt"))
    event_paths = event_paths.drop("events", axis=1)

    #annotate all events as either cryptic or normal and combine them into one dataframe
    annotated = {}
    for name, row in event_paths.iterrows():
        table = annotate_cryptic_events(row)
        if table is not None:  # unreadable comparison, already reported
            annotated[name] = table
    event_annotations = pd.concat(annotated, names=["condition", "ID"])

    #Group events by location
    # chr and strand are part of the location: without them, events with the
    # same coordinates on different chromosomes/strands would share a group_id.
    location_columns = ["chr", "strand",
                        "downstreamEE", "downstreamES",
                        "upstreamEE", "upstreamES",
                        "exonEnd", "exonStart_0base"]
    grouped = event_annotations.groupby(location_columns)

    real_results_df = pd.DataFrame(pd.Series(dict(real_results_iter(grouped.groups)), name="group_id"))

    #Assign that value back to the full list of event annotations
    annotated_combined_events = pd.concat([event_annotations, real_results_df], axis=1)

    annotated_combined_events = annotated_combined_events.drop([u'GeneID', u'geneSymbol', u'chr', u'strand', u'exonStart_0base',
           u'exonEnd', u'upstreamES', u'upstreamEE', u'downstreamES',
           u'downstreamEE',],axis=1)
    return annotated_combined_events

def _read_rmats_tables(paths):
    """Read one rMATS table per condition and stack them under a (condition, event_id) index."""
    tables = {}
    # zip over index/values works on every pandas version and on Python 2 and 3
    # (Series.iteritems was removed in pandas 2).
    for condition, path in zip(paths.index, paths.values):
        tables[condition] = pd.read_table(path, index_col=0)
    return pd.concat(tables, names=["condition", "event_id"])


def get_rMATS_events(events_list):
    """Load the rMATS JunctionCountOnly results for a list of comparisons.

    Parameters
    ----------
    events_list : iterable of str
        Root rMATS directories, one per comparison; the basename of each is
        used as the ``condition`` label.

    Returns
    -------
    pandas.DataFrame
        All SE, MXE, A5SS, A3SS and RI events stacked under an
        (event_type, condition, event_id) index. SE events are joined with the
        output of ``annotate_se_events`` and rows with any missing value after
        that join are dropped.

    Comparisons without an SE results file are printed and skipped. The other
    four files are not checked; a missing one makes the read fail.
    """
    #might want to eventually convert that into an events list
    df = pd.DataFrame(pd.Series({os.path.basename(item): item for item in events_list}, name="events"))

    event_types = ["SE", "MXE", "A5SS", "A3SS", "RI"]
    for event_type in event_types:
        file_name = event_type + ".MATS.JunctionCountOnly.txt"
        df[event_type] = [os.path.join(root, "MATS_output", file_name) for root in df.events]

    # Report, then drop, comparisons whose SE output is missing.
    has_se = df['SE'].apply(os.path.exists)
    print(df[~has_se].values)
    df = df[has_se]

    tables = {}
    for event_type in event_types:
        tables[event_type] = _read_rmats_tables(df[event_type])

    # Only SE events carry the cryptic / group_id annotation.
    se_annotated_events = annotate_se_events(df.events)
    tables["SE"] = pd.concat([tables["SE"], se_annotated_events], axis=1).dropna()

    all_events = pd.concat(tables, names=["event_type", "condition", "event_id"])
    return all_events

def get_significant_events(events, fdr_cutoff=.05):
    """Keep significant events and label each as included or excluded.

    Parameters
    ----------
    events : pandas.DataFrame
        Must have ``FDR`` and ``IncLevelDifference`` columns. The index is
        expected to have the levels (event_type, condition, event_id).
    fdr_cutoff : float, optional
        Events with ``FDR`` strictly below this value are kept (default .05).

    Returns
    -------
    pandas.DataFrame
        The kept rows with an extra ``direction`` index level, placed second:
        (event_type, direction, condition, event_id). ``direction`` is
        "excluded" when ``IncLevelDifference`` < 0 and "included" when it is
        > 0. Rows with a difference of exactly 0 (or NaN) are dropped.
    """
    sig_events = events[events.FDR < fdr_cutoff]
    by_direction = {
        "excluded": sig_events[sig_events.IncLevelDifference < 0],
        "included": sig_events[sig_events.IncLevelDifference > 0],
    }
    sig_events = pd.concat(by_direction,
                           names=["direction", "event_type", "condition", "event_id"])
    # concat puts "direction" first; swap it with event_type.
    return sig_events.swaplevel(0, 1)

In [8]:
#THIS NEEDS LARGE AMOUNTS OF MEMORY, RUN WITH ppn=4 minimum, will have to optimize very soon for scaling
encode_events = glob.glob("/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/*")
stress_events = glob.glob("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/*")

#filters out events that I won't use anyway, small speedups and hopefully will help with this next memory / scaling issue
# Keep only ENCODE comparisons whose first accession (the part of the directory
# name before "_vs_") is listed in the master table. The accessions are put in
# a set once so each lookup does not rescan the whole column.
known_accessions = set(master_processing_table.RNASEQ_ENCODEAccID.values)
encode_events = [encode_event for encode_event in encode_events
                 if os.path.basename(encode_event).split("_vs_")[0] in known_accessions]
both_events = encode_events + stress_events

all_events = get_rMATS_events(both_events)

[[ '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC/MATS_output/SE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC/MATS_output/MXE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC/MATS_output/A5SS.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC/MATS_output/A3SS.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR028ITN_vs_ENCSR491FOC/MATS_output/RI.MATS.JunctionCountOnly.txt']
 [ '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR113PYX_vs_ENCSR419JMU'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR113PYX_vs_ENCSR419JMU/MATS_output/SE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR113PYX_vs_ENCSR419JMU/MAT

In [9]:
# Lookup tables from the sample id (the part of a condition name before "_vs_")
# to the RBP label and to the cell line, taken from the ENCODE master table.
rbp_gid_dict = dict(zip(master_processing_table.RNASEQ_ENCODEAccID, master_processing_table.RBP_gID.values))
cell_type_dict = dict(zip(master_processing_table.RNASEQ_ENCODEAccID, master_processing_table.CellLine.values))

# Samples added by hand to both lookups: (sample id prefix, label stored as
# "rbp"). Each exists for both cell lines, with ids of the form
# "<prefix>_<cell line>".
extra_sample_labels = [("CA", "CA"), ("PQ1", "PQ"), ("PQCA1", "PQCA"), ("ars1", "ars"), ("hs1", "hs")]
for sample_prefix, sample_label in extra_sample_labels:
    for cell_line in ("HepG2", "K562"):
        sample_id = sample_prefix + "_" + cell_line
        rbp_gid_dict[sample_id] = sample_label
        cell_type_dict[sample_id] = cell_line

all_events['RNASEQ_ENCODEAccID'] = [condition.split("_vs_")[0] for condition in all_events.index.get_level_values("condition")]
# Indexing the dicts (rather than Series.map) keeps the KeyError for a sample id
# that is missing from the lookups.
all_events['rbp'] = all_events.RNASEQ_ENCODEAccID.apply(lambda x: rbp_gid_dict[x])
all_events['cell_type'] = all_events.RNASEQ_ENCODEAccID.apply(lambda x: cell_type_dict[x])
all_events['both'] = all_events['rbp'] + "_" + all_events['cell_type']

# Append rbp / cell_type / both as index levels while keeping them as columns.
# set_index does this without building a Python tuple per row and without
# itertools.izip, which only exists on Python 2.
# NOTE: this cell changes all_events; re-running it appends the levels again.
all_events = all_events.set_index(['rbp', 'cell_type', 'both'], append=True, drop=False)

all_events = all_events.swaplevel("event_id", "both")

In [11]:
# Write the merged events table, including its multi-level index, to CSV.
all_events.to_csv("/home/gpratt/projects/cryptic_exons/analysis/ipython_data/merged_cryptic_events.csv")

In [12]:
# Preview only the first rows; the full table is very large.
all_events.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,1stExonEnd,1stExonStart_0base,2ndExonEnd,2ndExonStart_0base,FDR,GeneID,ID.1,IJC_SAMPLE_1,IJC_SAMPLE_2,IncFormLen,...,riExonStart_0base,shortEE,shortES,strand,upstreamEE,upstreamES,RNASEQ_ENCODEAccID,rbp,cell_type,both
event_type,condition,both,rbp,cell_type,event_id,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,10551,,,,,0.000000e+00,ENSG00000198563.9,10551,4765101,155131136,197,...,,31502631,31502587,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,12061,,,,,0.000000e+00,ENSG00000038382.13,12061,23913,789175,197,...,,14485355,14485284,+,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,1400,,,,,0.000000e+00,ENSG00000213983.7,1400,003,222644,197,...,,24034408,24034329,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,15680,,,,,0.000000e+00,ENSG00000152795.13,15680,020,258286251,197,...,,83346820,83346715,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,20126,,,,,0.000000e+00,ENSG00000234741.3,20126,163,657465,197,...,,173835705,173835665,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,23445,,,,,0.000000e+00,ENSG00000141505.7,23445,353842,199186156,197,...,,7081952,7081812,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,24842,,,,,0.000000e+00,ENSG00000100445.12,24842,181719,597556,197,...,,24910132,24910059,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,26774,,,,,0.000000e+00,ENSG00000183258.7,26774,752,769370,179,...,,176942259,176942186,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,26775,,,,,0.000000e+00,ENSG00000183258.7,26775,752,9011888,197,...,,176942259,176942186,-,,,CA_HepG2,CA,HepG2,CA_HepG2
A3SS,CA_HepG2_vs_WT_HepG2,CA_HepG2,CA,HepG2,29826,,,,,0.000000e+00,ENSG00000169045.13,29826,51014,715864,164,...,,179048036,179047892,-,,,CA_HepG2,CA,HepG2,CA_HepG2
