In [17]:
%matplotlib inline

from collections import Counter
from collections import defaultdict
import glob
import itertools
from itertools import izip
import os

import seaborn as sns
from matplotlib import pyplot
from matplotlib_venn import venn2, venn3
import pybedtools
import numpy as np
from IPython.core.display import HTML
import pandas as pd

from gscripts import GO
from gscripts.rnaseq import splicing_map
from gscripts.general import dataviz
from gscripts.general import region_helpers

# Absolute output directories on the author's machine; neither name is referenced in the cells visible here.
# NOTE(review): presumably where splicing-map and heatmap figures get written -- confirm against later cells.
img_dir = "/home/gpratt/Dropbox/cryptic_splicing/data/human_data/splicing_maps"
heatmap_dir = "/home/gpratt/Dropbox/cryptic_splicing/data/human_data/heatmaps"

In [18]:
# Gene lookup tables, both built from the same GENCODE v19 annotation database file.
gene_id_to_name = region_helpers.gene_id_to_name(
    "/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db"
)
gene_id_to_type = region_helpers.gene_id_to_type(
    "/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db"
)

# Same names, keyed by the part of each gene id that precedes the first ".".
ensembl_id_to_name = dict(
    (key.split(".")[0], value)
    for key, value in gene_id_to_name.items()
)

# Reverse lookup; when several ids share one name, the last one iterated wins.
name_to_gene_id = dict(
    (value, key)
    for key, value in gene_id_to_name.items()
)

In [19]:
# Earlier loader, kept commented out for reference: it read the 20160301 master list
# and supplied the column names explicitly.
# master_processing_table = pd.read_table("/home/elvannostrand/data/ENCODE/project_stuff/ENCODE_MASTER_ID_LIST_20160301_AllDatasets.txt", 
#                                         sep="\t",
#                                         skiprows=1,
#                                         names=["UID", "RBP_gID", "CellLine", "RBP_ENSG", "Antibody", "Lot", "CLIP_ENCODEAccID", "CLIP_Rep1ENC", "CLIP_Rep2ENC", "CLIP_InputENC", "RNASEQ_ENCODEAccID",  "Duplicate_RNASEQ_ENCODEAccID", "RNASEQ_ControlENC", "RNASEQ_KDRep1Bam", "RNASEQ_KDRep2Bam", "RNASEQ_ControlRep1Bam", "RNASEQ_ControlRep2Bam", "bar"]
# )

# Current master table; the first column becomes the index. No `sep` is passed, so
# read_table's default delimiter (tab) is used even though the file is named .csv.
# Other cells read its RNASEQ_ENCODEAccID, RBP_gID and CellLine columns.
master_processing_table = pd.read_table("/home/gpratt/Dropbox/encode_integration/20160408_ENCODE_MASTER_ID_LIST_AllDatasets.csv", index_col=0 )

In [134]:
def parse_rMATS(fn):
    """Load one rMATS result table and flag which of its events are cryptic.

    The path is expected to look like ``<run_dir>/<subdir>/<splice_type>.<rest>``.
    Two annotation tables are read from ``<run_dir>/ASEvents``:
    ``fromGTF.<splice_type>.txt`` and ``fromGTF.novelEvents.<splice_type>.txt``.
    An event id from the first table is marked cryptic when it also appears in
    the second one.

    Parameters
    ----------
    fn : str
        Path to a tab-delimited result file whose first column is the event id.

    Returns
    -------
    pandas.DataFrame
        The table read from ``fn`` with an extra ``is_cryptic`` column. The
        column is joined on the index, so ids absent from the fromGTF table
        end up with a missing value.
    """
    df = pd.read_table(fn, index_col=0)
    splice_type = os.path.basename(fn).split(".")[0]
    dir_name = os.path.dirname(os.path.dirname(fn))
    events_dir = os.path.join(dir_name, "ASEvents")

    # Format the file name only. Formatting the whole joined path raises (or
    # silently rewrites the path) when a directory name contains "{" or "}".
    normal = pd.read_table(os.path.join(events_dir, "fromGTF.{}.txt".format(splice_type)), index_col=0)
    cryptic = pd.read_table(os.path.join(events_dir, "fromGTF.novelEvents.{}.txt".format(splice_type)), index_col=0)

    # Only the flag is carried over; every other annotation column is left out.
    is_cryptic = pd.DataFrame({"is_cryptic": normal.index.isin(cryptic.index)}, index=normal.index)
    return df.join(is_cryptic)

#Generators are used on purpose here: they keep the memory footprint low
def events_dict_iter(grouped_events):
    """Yield ``(group_id, members)`` for every entry of the grouped events.

    ``group_id`` is a running counter that starts at 0; ``members`` is the
    value stored under each key (the samples/rows sharing that event). The
    key itself is not passed on.
    """
    group_id = 0
    for _, members in grouped_events.iteritems():
        yield group_id, members
        group_id += 1

#Flattens the grouped events into a mapping of original dataframe id -> group_id
def real_results_iter(grouped_events):
    """Yield ``(item, group_id)`` once for every item of every group."""
    flattened = (
        (item, group_id)
        for group_id, members in events_dict_iter(grouped_events)
        for item in members
    )
    for pair in flattened:
        yield pair
            
def annotate_se_events(se_events):
    """Attach a ``group_id`` column shared by rows with identical exon coordinates.

    Rows are grouped on the six coordinate columns listed below; every row of
    a group receives the same integer id. The ids come from enumerating the
    groups, so they are labels only and carry no ordering meaning.

    NOTE(review): the grouping key holds coordinates only, with no chromosome
    or strand column. If the input carries such columns, events with equal
    coordinates on different chromosomes would share a group_id -- confirm
    whether that is intended.
    """
    #Columns that together define the location of an event
    location_columns = ["downstreamEE", "downstreamES", "upstreamEE", "upstreamES", "exonEnd", "exonStart_0base"]
    grouped = se_events.groupby(location_columns)

    #Row label -> group_id, turned into a one-column frame
    group_ids = pd.Series(dict(real_results_iter(grouped.groups)), name="group_id")

    #Put the ids next to the full list of event annotations
    return pd.concat([se_events, group_ids.to_frame()], axis=1)

def get_rMATS_events(events_list):
    """Load the rMATS JunctionCountOnly tables of many runs into one frame.

    Parameters
    ----------
    events_list : iterable of str
        Run directories. The basename of each directory becomes its
        ``condition`` label, and the result tables are looked up in
        ``<run>/MATS_output/<TYPE>.MATS.JunctionCountOnly.txt`` for the event
        types SE, MXE, A5SS, A3SS and RI.

    Returns
    -------
    pandas.DataFrame
        All events, indexed by ``(event_type, condition, event_id)``. SE
        events additionally go through ``annotate_se_events``.

    Raises
    ------
    ValueError
        If no run has a complete set of result tables.

    Runs with any missing result table are printed and skipped.
    """
    event_types = ["SE", "MXE", "A5SS", "A3SS", "RI"]

    #might want to eventually convert that into an events list
    df = pd.DataFrame(pd.Series({os.path.basename(item): item for item in events_list}, name="events"))

    # One column per event type, holding the path of that run's result table.
    for event_type in event_types:
        file_name = "{}.MATS.JunctionCountOnly.txt".format(event_type)
        df[event_type] = df.events.apply(lambda x, file_name=file_name: os.path.join(x, "MATS_output", file_name))

    # A run is usable only when every table exists. Checking SE alone let a
    # partially written run through, which then failed inside parse_rMATS.
    is_complete = pd.Series(
        [all(os.path.exists(path) for path in paths) for paths in df[event_types].values],
        index=df.index,
        dtype=bool,
    )
    print("paths that didn't exist " + str(df[~is_complete].values))
    df = df[is_complete]
    if df.empty:
        raise ValueError("no rMATS run with a complete set of MATS_output tables was found")

    events_by_type = {}
    for event_type in event_types:
        events_by_type[event_type] = pd.concat(
            {key: parse_rMATS(value) for key, value in df[event_type].iteritems()},
            names=["condition", "event_id"],
        )

    # Only skipped-exon events get a group_id linking identical events across conditions.
    events_by_type["SE"] = annotate_se_events(events_by_type["SE"])

    return pd.concat(events_by_type, names=["event_type", "condition", "event_id"])

In [135]:
#THIS NEEDS LARGE AMOUNTS OF MEMORY: run with ppn=4 minimum; will have to optimize very soon for scaling
encode_events = glob.glob("/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/*")
stress_events = glob.glob("/home/gpratt/projects/cryptic_exons/analysis/ad-hoc/rMATS/*")

#Drop ENCODE runs that won't be used anyway: a run is kept only when the part of its
#directory name before "_vs_" is listed in the master table. Small speedup, and
#hopefully it helps with the memory / scaling issue in the load below.
encode_events = [
    encode_event
    for encode_event in encode_events
    if os.path.basename(encode_event).split("_vs_")[0] in master_processing_table.RNASEQ_ENCODEAccID.values
]
both_events = encode_events + stress_events

all_events = get_rMATS_events(both_events)

paths that didn't exist [[ '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG/MATS_output/SE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG/MATS_output/MXE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG/MATS_output/A5SS.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG/MATS_output/A3SS.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR060KRD_vs_ENCSR092WKG/MATS_output/RI.MATS.JunctionCountOnly.txt']
 [ '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR362XMY_vs_ENCSR620PUP'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR362XMY_vs_ENCSR620PUP/MATS_output/SE.MATS.JunctionCountOnly.txt'
  '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/ENCSR3

In [136]:
# Lookup tables from RNA-seq accession to RBP label and to cell line, seeded from the master table.
rbp_gid_dict = dict(zip(master_processing_table.RNASEQ_ENCODEAccID, master_processing_table.RBP_gID.values))
cell_type_dict = dict(zip(master_processing_table.RNASEQ_ENCODEAccID, master_processing_table.CellLine.values))

# Hand-entered labels for the runs keyed by sample name rather than by accession.
# NOTE(review): the KK_MN_shFUS / shTAF / shTDP samples are labelled "puro" like the
# puro samples around them -- confirm that this is intended.
rbp_gid_dict.update({
    "CA_HepG2": 'CA',
    "CA_K562": 'CA',
    "PQ1_HepG2": 'PQ',
    "PQ1_K562": 'PQ',
    "PQCA1_HepG2": 'PQCA',
    "PQCA1_K562": 'PQCA',
    "ars1_HepG2": 'ars',
    "ars1_K562": 'ars',
    "hs1_HepG2": 'hs',
    "hs1_K562": 'hs',
    "AV_Kin1ALS17_3_puro_S48_L007_R1_001": "puro",
    "AV_ALS17_5_puro_1_S46_L007_R1_001": "puro",
    "AV_GY6_2_puro_1_S44_L007_R1_001": "puro",
    "KK_MN_shFUS_7_S55_L008_R1_001": "puro",
    "AV_47d_puro_1_S51_L008_R1_001": "puro",
    "AV_CVB_puro_S49_L007_R1_001": "puro",
    "KK_MN_shTAF_5_S54_L008_R1_001": "puro",
    "KK_MN_shTDP_1_S52_L008_R1_001": "puro",
})

cell_type_dict.update({
    "CA_HepG2": 'HepG2',
    "CA_K562": 'K562',
    "PQ1_HepG2": 'HepG2',
    "PQ1_K562": 'K562',
    "PQCA1_HepG2": 'HepG2',
    "PQCA1_K562": 'K562',
    "ars1_HepG2": 'HepG2',
    "ars1_K562": 'K562',
    "hs1_HepG2": 'HepG2',
    "hs1_K562": 'K562',
    "AV_Kin1ALS17_3_puro_S48_L007_R1_001": "MN",
    "AV_ALS17_5_puro_1_S46_L007_R1_001": "MN",
    "AV_GY6_2_puro_1_S44_L007_R1_001": "MN",
    "KK_MN_shFUS_7_S55_L008_R1_001": "MN",
    "AV_47d_puro_1_S51_L008_R1_001": "MN",
    "AV_CVB_puro_S49_L007_R1_001": "MN",
    "KK_MN_shTAF_5_S54_L008_R1_001": "MN",
    "KK_MN_shTDP_1_S52_L008_R1_001": "MN",
})


# The accession is the part of the condition label before "_vs_". The dictionary
# lookups below raise KeyError for an accession that has no label.
all_events['RNASEQ_ENCODEAccID'] = [
    condition.split("_vs_")[0]
    for condition in all_events.index.get_level_values("condition")
]
all_events['rbp'] = all_events.RNASEQ_ENCODEAccID.apply(lambda x: rbp_gid_dict[x])
all_events['cell_type'] = all_events.RNASEQ_ENCODEAccID.apply(lambda x: cell_type_dict[x])
all_events['both'] = all_events['rbp'] + "_" + all_events['cell_type']

# Extend every index entry with the three new labels (they also remain as columns).
# NOTE(review): all_events is rewritten in place here, so this cell looks runnable
# only once per load -- confirm before re-executing it.
all_events.index = pd.MultiIndex.from_tuples(
    [
        list(index) + [rbp, cell_type, both]
        for index, rbp, cell_type, both in izip(all_events.index, all_events.rbp, all_events.cell_type, all_events.both)
    ],
    names=["event_type", "condition", "event_id", 'rbp', 'cell_type', "both"],
)

all_events = all_events.swaplevel("event_id", "both")

In [None]:
# Write the merged event table, index levels included, to a CSV file on disk.
all_events.to_csv("/home/gpratt/projects/cryptic_exons/analysis/ipython_data/merged_cryptic_events.csv")

In [29]:
# Scratch cell: only prints a marker string.
print("foo")

foo
