In [207]:
%matplotlib inline
from collections import Counter
import glob
import itertools
import os

import numpy as np
import pandas as pd
import pybedtools
import seaborn as sns

from gscripts.general import region_helpers


In [12]:
splicing_events = sorted(glob.glob("/home/gpratt/projects/cryptic_exons/analysis/encode_stress_v1/miso/*"))

In [13]:
!mkdir -p /home/gpratt/projects/cryptic_exons/analysis/encode_stress_miso_out/SE

In [110]:
event_to_gene_id = pd.read_table("/projects/ps-yeolab/genomes/hg19/miso/miso_se_to_ensembl_gene.tsv")
event_to_gene_id = {key: value for key, value in zip(event_to_gene_id.event_name, event_to_gene_id.ensembl_gene)}

gene_id_to_name = region_helpers.gene_id_to_name("/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db")
gene_id_to_type = region_helpers.gene_id_to_type("/projects/ps-yeolab/genomes/hg19/gencode_v19/gencode.v19.annotation.gtf.db")
ensembl_id_to_name = {key.split(".")[0]: value for key, value in gene_id_to_name.items()}

name_to_gene_id = {value: key for key, value in gene_id_to_name.items()}

In [14]:
miso_commands = []
for rep1, rep2 in itertools.permutations(splicing_events, 2):
    outfile = os.path.join("/home/gpratt/projects/cryptic_exons/analysis/encode_stress_miso_out/SE", os.path.basename(rep1) + "_vs_" + os.path.basename(rep2))
    #print os.path.basename(rep1), os.path.basename(rep2)
    miso_commands.append("compare_miso --compare-samples {} {} {}".format(os.path.join(rep1, "SE"), os.path.join(rep2, "SE"), outfile))

In [15]:
def eplouge(name,count, run_dir="/home/gpratt/projects/encode/analysis/idr_analysis_p_value/"):
    return """#!/bin/bash
#PBS -N {0}
#PBS -l nodes=1:ppn=1
#PBS -o {0}.out
#PBS -e {0}.err
#PBS -V
#PBS -q home-yeo
#PBS -W group_list=yeo-group
#PBS -t 1-{1}
#PBS -l walltime=4:00:00
cd {2}
echo "hello, starting"
""".format(name, count, run_dir)

prolouge = "eval ${cmd[$PBS_ARRAYID]}"

In [20]:
total = 0 
result = []
num_out = 0
for cmd in miso_commands:
    total += 1
    result.append('cmd[{}]="{}"'.format(total, cmd))
    if total >= 500:
        with open(os.path.join("/home/gpratt/projects/cryptic_exons/scripts/", "miso_compare_{}.sh".format(num_out)), 'w') as out_file:
            out_file.write(eplouge("miso_compare", total, "/home/gpratt/projects/cryptic_exons/analysis/encode_stress_miso_out/SE" ))
            for line in result:
                out_file.write(line + "\n\n")
            out_file.write(prolouge + "\n")
        total = 0 
        num_out += 1
        result = []
            
with open(os.path.join("/home/gpratt/projects/cryptic_exons/scripts/", "miso_compare_{}.sh".format(num_out)), 'w') as out_file:
    out_file.write(eplouge("miso_compare", total, "/home/gpratt/projects/cryptic_exons/analysis/encode_stress_miso_out/SE" ))
    for line in result:
        out_file.write(line + "\n\n")
    out_file.write(prolouge + "\n")

#Lets do stuffs, in this case stuffs is looking at splicing maps around confident events

In [29]:
metadata = pd.read_excel("/home/gpratt/Dropbox/cryptic_splicing/data/human_data/data_annotations.xlsx", "Sheet1")


In [33]:
comparative_splicing_df = glob.glob("/home/gpratt/projects/cryptic_exons/analysis/encode_stress_miso_out/SE/*")
comparative_splicing_df = pd.Series({tuple(os.path.basename(item).split("_vs_")): item for item in  comparative_splicing_df})
comparative_splicing_df = pd.DataFrame(comparative_splicing_df)

extra_df = {}
for item in comparative_splicing_df.index:
    rep1 = metadata.ix[item[0]]
    rep1.index = ["rep1_" + index for index in rep1.index]
    
    rep2 = metadata.ix[item[1]]
    rep2.index = ["rep2_" + index for index in rep2.index]
    
    reps = pd.concat([rep1, rep2])
    extra_df[item] = reps
    
extra_df = pd.DataFrame(extra_df).T

comparative_splicing_df = pd.concat([comparative_splicing_df, extra_df], axis=1)
comparative_splicing_df[0] = comparative_splicing_df[0].apply(lambda x: os.path.join(x, "SE_vs_SE/bayes-factors/SE_vs_SE.miso_bf"))

In [155]:
#comparative_splicing_df['number_of_sites'] = comparative_splicing_df[0].apply(lambda x: len(pd.read_table(x, index_col=0)))
foo = comparative_splicing_df[(comparative_splicing_df.rep1_cell_type == "HepG2") & (comparative_splicing_df.rep1_cell_type == "HepG2")]
foo = foo[(foo.rep1_stress == "ars1") & (foo.rep2_stress == "WT")]


In [62]:
interesting_combinations = {('ars1', 'WT'),
#                             ('q331k', 'rev'),
#                             ('q331k', 'hWT'),
#                             ('hWT', 'gt'),
#                             ('hWT', 'rev'),
#                             ('gt', 'rev'),
                           }

In [None]:
diff_threshold = .1
bayes_factor_threshold = 1

def sig_event(event, diff=.1, bayes_factor = 1):
    return np.abs(event['diff']) > diff and event['bayes_factor'] > bayes_factor

In [66]:
#Stringent Method
results = {}
total_counts = {}
for genotype_1, genotype_2 in interesting_combinations:

    combination = foo[(foo.rep1_stress  == genotype_1) & (foo.rep2_stress  == genotype_2)]
    #total_counts[(genotype_1, genotype_2)] = combination.number_of_sites.values
    result = pd.concat({"rep{}".format(x): pd.read_table(fn, index_col=0) for x, fn in enumerate(combination[0])}, names=['rep', 'event'])
    
    results[(genotype_1, genotype_2)] = result
interesting_splicing_df = pd.concat(results)

In [105]:
#less stringent filtering
results = {}
merged_df = {}
for genotype_name, df in interesting_splicing_df.groupby(level=[0,1]):
    counts = Counter()
    for x, (name, df) in enumerate(df.groupby(level="event")):
        counts[len(df)] += 1
        #The last arg is an xor both events must be changing in the same direction
        if len(df) >= 2:
#             one, two = df['diff'] > 0
            if any([sig_event(row) for name, row in df.iterrows()]) and (all(df['diff'] > 0) or all(df['diff'] < 0)):
                counts['confident'] += 1
                merged_df[tuple(list(genotype_name) + [name[-1]])] = df.mean()
#         elif len(df) >= 2 and all(df.bayes_factor > bayes_factor_threshold) and all(np.abs(df['diff']) > diff_threshold) and not (all(df['diff'] > 0) or all(df['diff'] < 0)):
#             counts['discordent'] += 1
#         break
        counts["diff_" + str(sum(np.abs(df['diff']) > diff_threshold))] += 1
        counts["bayes_factor_" + str(sum(np.abs(df['bayes_factor']) > bayes_factor_threshold))] += 1
    results[genotype_name] = counts

In [111]:
significant_df = pd.DataFrame(merged_df).T
significant_df['gene_id'] = [event_to_gene_id[event] for event in significant_df.index.get_level_values(level=2)]

unfiltered_interesting_splicing_df = interesting_splicing_df.groupby(level=[0,1,3]).mean()
unfiltered_interesting_splicing_df['event'] = unfiltered_interesting_splicing_df.index.get_level_values(level="event")


In [112]:
included_significant_df = significant_df[significant_df['diff'] > 0]
excluded_significant_df = significant_df[significant_df['diff'] < 0]

pd.concat({"included":included_significant_df.groupby(level=[0,1]).count().sample1_posterior_mean,
           "excluded": excluded_significant_df.groupby(level=[0,1]).count().sample1_posterior_mean}).swaplevel(0,2).swaplevel(0,1).unstack()

Unnamed: 0,Unnamed: 1,excluded,included
ars1,WT,473,571


In [122]:
def miso_to_bed(miso_list):
    result = []
    for exon in miso_list:
        chrom, start, stop, strand = exon.split(":")
        result.append(pybedtools.create_interval_from_list([chrom, start, stop, "0", "0", strand]))
    return pybedtools.BedTool(result)

def get_miso_regions(miso_names):
    upstream_exon = miso_to_bed([item.split("@")[0] for item in miso_names]).saveas()
    skipped_exon = miso_to_bed([item.split("@")[1] for item in miso_names]).saveas()
    downstream_exon = miso_to_bed([item.split("@")[2] for item in miso_names]).saveas()
    return upstream_exon, skipped_exon, downstream_exon

In [133]:
names = ['cryptic_chrom', 'cryptic_start', "cryptic_stop", "cryptic_name", "cryptic_score", "cryptic_strand",
         'rbp_chrom', 'rbp_start', 'rbp_stop', 'rbp_name', 'rbp_score', 'rbp_strand', "rbp_foo", 'rbp_bar', 'rbp_color', 'foo', 'bar', 'baz', 'buz']


In [160]:
included_upstream_exon, included_skipped_exon, included_downstream_exon = get_miso_regions(included_significant_df.index.get_level_values(level=2))
excluded_upstream_exon, excluded_skipped_exon, excluded_downstream_exon = get_miso_regions(excluded_significant_df.index.get_level_values(level=2))
background_upstream_exon, background_skipped_exon, background_downstream_exon = get_miso_regions(set(interesting_splicing_df.index.get_level_values("event")))

In [150]:
#don't forget, I'm also interested in "super interactior events", events with lots of RBPS binding and regulating an exon

In [148]:
def count_frequent_events(bedtool):
    cryptic_events_slop = bedtool.slop(b=500, g="/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes").intersect("/home/gpratt/Dropbox/cryptic_splicing/data/clip/encode_v12_filelist.allencode_20160226.txt.K562_allpeaks_ENCODEv12.colored.bed",
                          wo=True)
    cryptic_events_slop_df = cryptic_events_slop.to_dataframe(names=names)
    cryptic_events_slop_counts = cryptic_events_slop_df.groupby(["cryptic_chrom", 'cryptic_start', 'cryptic_stop', 'rbp_name']).count().rbp_chrom 
    #cryptic_events_slop_counts.unstack()

    return cryptic_events_slop_counts.unstack().count().sort_values()

In [182]:
included = pd.concat({("upstream_exon"): count_frequent_events(included_upstream_exon),
("skipped_exon"): count_frequent_events(included_skipped_exon),
("downstream_exon"): count_frequent_events(included_downstream_exon),}).unstack().T
                      
excluded = pd.concat({( "upstream_exon"): count_frequent_events(excluded_upstream_exon),
("skipped_exon"): count_frequent_events(excluded_skipped_exon),
( "downstream_exon"): count_frequent_events(excluded_downstream_exon),}).unstack().T

background = pd.concat({("upstream_exon"): count_frequent_events(background_upstream_exon),
( "skipped_exon"): count_frequent_events(background_skipped_exon),
( "downstream_exon"): count_frequent_events(background_downstream_exon),}).unstack().T


In [220]:
included_enriched = (included / included.sum()) / (background / background.sum())
excluded_enriched = (excluded / excluded.sum()) / (background / background.sum())

#Get rid of noise enrichments
included_enriched[included < 10] = np.nan
excluded_enriched[excluded < 10] = np.nan

In [221]:
included_enriched.sort_values("downstream_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRA2A,2.466063,3.066594,2.285082
U2AF1,2.157862,2.417198,2.454919
KHSRP,1.919926,1.997101,1.829321
GPKOW,1.721801,1.282000,1.944594
SMNDC1,1.587240,2.183319,1.957323
U2AF2,1.575984,1.955537,2.195549
RBM22,1.241309,2.161672,2.225106
SRSF1,1.141208,1.956779,1.007307
SF3B4,1.122527,0.927379,1.009730
IGF2BP1,1.088469,,


In [228]:
included.ix['QKI'] 

downstream_exon     8
skipped_exon       11
upstream_exon       7
Name: QKI, dtype: float64

In [229]:
background.ix['QKI']

downstream_exon    192
skipped_exon       267
upstream_exon      150
Name: QKI, dtype: float64

In [222]:
included_enriched.sort_values("skipped_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TRA2A,2.466063,3.066594,2.285082
U2AF1,2.157862,2.417198,2.454919
SMNDC1,1.587240,2.183319,1.957323
RBM22,1.241309,2.161672,2.225106
QKI,,2.116180,
KHSRP,1.919926,1.997101,1.829321
SRSF1,1.141208,1.956779,1.007307
U2AF2,1.575984,1.955537,2.195549
SF3B1,0.935013,1.306159,0.823294
GPKOW,1.721801,1.282000,1.944594


In [223]:
# sns.distplot(included.fillna(0).upstream_exon)
# sns.distplot(included.fillna(0).downstream_exon)
# sns.distplot(included.fillna(0).skipped_exon)

# sns.distplot(excluded.fillna(0).upstream_exon)
# sns.distplot(excluded.fillna(0).downstream_exon)
# sns.distplot(excluded.fillna(0).skipped_exon)

In [224]:
included_enriched.sort_values("upstream_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
U2AF1,2.157862,2.417198,2.454919
TRA2A,2.466063,3.066594,2.285082
RBM22,1.241309,2.161672,2.225106
U2AF2,1.575984,1.955537,2.195549
SMNDC1,1.587240,2.183319,1.957323
GPKOW,1.721801,1.282000,1.944594
PTBP1,,,1.860358
KHSRP,1.919926,1.997101,1.829321
PRPF8,0.989149,0.983433,1.257324
CSTF2T,,,1.138246


In [225]:
excluded_enriched.sort_values("downstream_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SMNDC1,2.157019,2.067034,2.535962
SRSF1,1.832849,1.906593,1.820441
RBM22,1.557145,1.234421,
BUD13,1.316399,1.062761,0.855555
SF3B1,1.232154,0.910668,1.373200
DDX24,1.099919,0.958469,0.969005
SF3B4,1.052059,1.044027,0.992916
KHSRP,0.996213,0.910074,0.976776
RBM15,0.860379,,0.869647
PRPF8,0.852437,0.846703,0.908912


In [226]:
excluded_enriched.sort_values("skipped_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RBM5,,3.347696,
AKAP8L,,2.749893,
SMNDC1,2.157019,2.067034,2.535962
SRSF1,1.832849,1.906593,1.820441
GTF2F1,,1.769122,1.029703
DROSHA,,1.640920,0.841842
RBM22,1.557145,1.234421,
GPKOW,,1.127410,1.182472
BUD13,1.316399,1.062761,0.855555
SF3B4,1.052059,1.044027,0.992916


In [227]:
excluded_enriched.sort_values("upstream_exon", ascending=False)

Unnamed: 0_level_0,downstream_exon,skipped_exon,upstream_exon
rbp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SMNDC1,2.157019,2.067034,2.535962
SRSF1,1.832849,1.906593,1.820441
NONO,,,1.577642
SF3B1,1.232154,0.910668,1.373200
U2AF2,0.759966,1.016204,1.370309
GPKOW,,1.127410,1.182472
CSTF2T,,,1.153577
RDBP,,,1.133154
GTF2F1,,1.769122,1.029703
SF3B4,1.052059,1.044027,0.992916
