In [78]:
import numpy as np
import pandas as pd
import pyranges as pr
from pandas import DataFrame
import os
from collections import defaultdict
import pdb

In [79]:
import yaml
from snakemake.io import expand

# Resolve every input path from the Snakemake config. Paths in the config are
# prefixed with '../../' here, and the first result of each expand() is used.
config_file = '../../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

data_cfg = config['data']
lib_meta = '../../' + expand(data_cfg['meta'], species='human')[0]
cerb_psi_file = '../../' + expand(data_cfg['psi'], species='human')[0]

# SUPPA psi tables: one per event type (AF and AL).
suppa_psi_pattern = data_cfg['suppa']['psi']
suppa_af_file = '../../' + expand(suppa_psi_pattern, species='human', event='AF')[0]
suppa_al_file = '../../' + expand(suppa_psi_pattern, species='human', event='AL')[0]
print(suppa_af_file)
    

In [80]:
def extract_genes_from_cerberus_across_samples(cerberus, sample, mapping, name, thres_1 = 0.25, thres_2 = 0.75):
    """Return the keys that have at least two features with mid-range mean psi.

    Parameters
    ----------
    cerberus : DataFrame with at least the columns 'dataset', 'psi' and `name`.
    sample : key into `mapping`.
    mapping : dict-like, sample -> collection of dataset names.
    name : column holding the feature id; ids are split on '_' and the part
        before the first underscore is used as the grouping key
        (presumably the gene id -- confirm against the input table).
    thres_1, thres_2 : inclusive lower / upper bound on the mean psi.

    Returns
    -------
    list of keys for which two or more features have a mean psi (averaged
    over the sample's datasets) inside [thres_1, thres_2].
    """
    from collections import Counter

    # Restrict to the datasets of this sample, then average psi per feature.
    in_sample = cerberus['dataset'].isin(mapping[sample])
    mean_psi = cerberus.loc[in_sample, [name, 'psi']].groupby([name]).mean()['psi']

    # Features whose mean psi falls inside the closed interval.
    events = set(mean_psi[mean_psi.between(thres_1, thres_2)].index)

    # Count qualifying features per key and keep keys with two or more.
    per_key = Counter(event.split('_')[0] for event in events)
    return [key for key, n_events in per_key.items() if n_events >= 2]

def extract_genes_from_suppa_across_samples(psi, sample, mapping, thres_1 = 0.25, thres_2 = 0.75):
    """Return the gene ids of SUPPA events with mid-range mean psi in a sample.

    Parameters
    ----------
    psi : DataFrame indexed by event id, one column per dataset. The event id
        is split on ';' and its first field is taken as the gene id.
    sample : key into `mapping`.
    mapping : dict-like, sample -> list of dataset (column) names of `psi`.
    thres_1, thres_2 : inclusive lower / upper bound on the row mean.

    Returns
    -------
    set of gene ids with everything from the first '.' onward removed.
    """
    sample_psi = psi[mapping[sample]]

    # Average each event over the sample's datasets once and reuse the result.
    row_mean = sample_psi.mean(axis=1)
    in_range = (row_mean >= thres_1) & (row_mean <= thres_2)
    kept = sample_psi[in_range]

    return {fields[0].split('.')[0] for fields in kept.index.str.split(';')}

def compare_cerberus_and_suppa_across_events(df, psi, threshold_1, threshold_2, mapping=None):
    """Flag, per sample and gene, whether SUPPA and cerberus each call the gene.

    Parameters
    ----------
    df : cerberus psi table with at least 'dataset', 'gid_stable', 'feat_id'
        and 'psi' columns.
    psi : SUPPA psi table (event ids as index, one column per dataset).
    threshold_1, threshold_2 : inclusive psi bounds forwarded to both
        extraction helpers.
    mapping : dict-like, sample -> list of dataset names. When omitted, the
        notebook-level variable `d` is used; it is looked up at call time.

    Returns
    -------
    DataFrame with columns 'gid', 'sample', 'suppa', 'cerberus'. The last two
    hold the strings 'TRUE' / 'FALSE'. Each sample's rows keep their own
    0..n-1 index. An empty DataFrame is returned for an empty mapping.
    """
    if mapping is None:
        # The previous default `mapping=d` was evaluated when the `def` ran,
        # which fails if `d` does not exist yet and freezes whatever object
        # `d` pointed to at that moment. Resolve it lazily instead.
        mapping = d

    per_sample = []
    for sample, datasets in mapping.items():
        genes = list(set(df.loc[df['dataset'].isin(datasets), 'gid_stable']))

        # Sets give constant-time membership tests in the loops below.
        suppa_genes = set(extract_genes_from_suppa_across_samples(
            psi, sample, mapping, thres_1=threshold_1, thres_2=threshold_2))
        cerberus_genes = set(extract_genes_from_cerberus_across_samples(
            df, sample, mapping, 'feat_id', thres_1=threshold_1, thres_2=threshold_2))

        per_sample.append(DataFrame({
            'gid': genes,
            'sample': sample,
            # 'FALSE' for SUPPA means either the psi is outside the range or
            # the gene has no event at all.
            'suppa': ['TRUE' if gene in suppa_genes else 'FALSE' for gene in genes],
            'cerberus': ['TRUE' if gene in cerberus_genes else 'FALSE' for gene in genes],
        }))

    if not per_sample:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(per_sample, axis=0)

def get_cerb_suppa_matching_events(cerb_psi_file,
                                   suppa_file,
                                   ofile,
                                   lib_meta,
                                   kind='tss'):
    """Compare cerberus and SUPPA calls per sample and write the table to disk.

    Parameters
    ----------
    cerb_psi_file : path to the tab-separated cerberus psi table; it must have
        a 'feat' column, which is filtered to `kind`.
    suppa_file : path to the tab-separated SUPPA psi table.
    ofile : path of the tab-separated output file (written with the index).
    lib_meta : path to the tab-separated library metadata with 'dataset' and
        'sample' columns.
    kind : value of the 'feat' column to keep (default 'tss').
    """
    # get sample <-> dataset mapping
    metadata = pd.read_csv(lib_meta, sep='\t')
    sample_to_datasets = defaultdict(list)
    for dataset, sample in metadata[['dataset', 'sample']].values.tolist():
        sample_to_datasets[sample].append(dataset)

    # cerberus psi values, restricted to the requested feature type
    cerberus_psi = pd.read_csv(cerb_psi_file, sep='\t')
    cerberus_psi = cerberus_psi[cerberus_psi['feat'] == kind]

    # suppa psi values
    suppa_psi = pd.read_csv(suppa_file, sep='\t')

    # get matching events and dump to ofile.
    # The mapping built from `lib_meta` is passed explicitly. Without it the
    # comparison falls back to its default mapping (a notebook-level global),
    # so the metadata read above would be ignored.
    df = compare_cerberus_and_suppa_across_events(cerberus_psi,
                                                  suppa_psi,
                                                  threshold_1=0.25,
                                                  threshold_2=0.75,
                                                  mapping=dict(sample_to_datasets))
    df.to_csv(ofile, sep='\t')

In [None]:
# TSS comparison against the SUPPA AF (alternative first exon) psi table.
# NOTE: the result is written to 'test.tsv', the same path the TES call uses.
get_cerb_suppa_matching_events(
    cerb_psi_file=cerb_psi_file,
    suppa_file=suppa_af_file,
    ofile='test.tsv',
    lib_meta=lib_meta,
    kind='tss',
)

> <ipython-input-80-bad341c5db56>(11)extract_genes_from_suppa_across_samples()
      9     dataset = mapping[sample]
     10     pdb.set_trace()
---> 11     df = psi[dataset]
     12     df = df[(df.mean(axis=1) >= thres_1) & (df.mean(axis=1) <= thres_2)]
     13     return(set([

ipdb>  psi.head()


                                                    h9_neural_crest_1_1  \
ENSG00000000938.12;AF:chr1:27625151-27626046:27...                  NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...                  NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...                  NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...                  NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...                  NaN   

                                                    a673_1_1  h9_chondro_1_3  \
ENSG00000000938.12;AF:chr1:27625151-27626046:27...       NaN             NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...       NaN             NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...       NaN             NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...       NaN             NaN   
ENSG00000000938.12;AF:chr1:27625151-27626046:27...       NaN             NaN   

                                                    hl60_m2_24hr_1_2

In [75]:
# TES comparison against the SUPPA AL (alternative last exon) psi table.
# NOTE: the result is written to 'test.tsv', the same path the TSS call uses.
get_cerb_suppa_matching_events(
    cerb_psi_file=cerb_psi_file,
    suppa_file=suppa_al_file,
    ofile='test.tsv',
    lib_meta=lib_meta,
    kind='tes',
)

In [45]:
# Notebook-level lookup: sample -> list of datasets, read from the library
# metadata table. `d` is the name the comparison function uses as its default
# mapping.
metadata = pd.read_csv(lib_meta, sep='\t')[['dataset', 'sample']]
d = defaultdict(list)
for dataset_name, sample_name in zip(metadata['dataset'].tolist(), metadata['sample'].tolist()):
    d[sample_name].append(dataset_name)

In [57]:
# Full cerberus psi table (all feature types); tab-separated.
cerberus_psi = pd.read_csv(
    cerb_psi_file,
    sep='\t',
)

In [58]:
# SUPPA psi tables for AF and AL events.
# The original read from `cerb_af_file` / `cerb_al_file`, names that are never
# defined in this notebook; the configuration cell defines the SUPPA paths as
# `suppa_af_file` / `suppa_al_file`, and these frames are used as SUPPA psi.
AF_psi = pd.read_csv(suppa_af_file, sep='\t')
AL_psi = pd.read_csv(suppa_al_file, sep='\t')

In [59]:
# Split the cerberus psi table by feature type.
is_tss = cerberus_psi['feat'] == 'tss'
is_tes = cerberus_psi['feat'] == 'tes'
cerberus_tss = cerberus_psi.loc[is_tss]
cerberus_tes = cerberus_psi.loc[is_tes]

In [60]:
def extract_genes_from_cerberus_across_samples(cerberus, sample, mapping, name, thres_1 = 0.25, thres_2 = 0.75):
    """Return the keys that have two or more features with mid-range mean psi.

    NOTE: a function with this name is also defined earlier in the notebook;
    running this cell replaces that earlier definition.

    `cerberus` needs the columns 'dataset', 'psi' and `name`. Rows are limited
    to the datasets listed in `mapping[sample]`, psi is averaged per value of
    `name`, and features whose mean lies in [thres_1, thres_2] (both ends
    inclusive) are kept. Each feature id is reduced to the text before its
    first '_'; keys seen at least twice are returned as a list.
    """
    from collections import Counter

    sample_rows = cerberus['dataset'].isin(mapping[sample])
    feature_mean = cerberus.loc[sample_rows, [name, 'psi']].groupby([name]).mean()['psi']
    events = set(feature_mean[feature_mean.between(thres_1, thres_2)].index)

    hits_per_key = Counter(event.split('_')[0] for event in events)
    return [key for key in hits_per_key if hits_per_key[key] >= 2]

In [61]:
def extract_genes_from_suppa_across_samples(psi, sample, mapping, thres_1 = 0.25, thres_2 = 0.75):
    """Return gene ids of SUPPA events whose mean psi in a sample is mid-range.

    NOTE: a function with this name is also defined earlier in the notebook;
    running this cell replaces that earlier definition.

    The columns listed in `mapping[sample]` are selected from `psi`, each row
    is averaged across them, and rows with a mean in [thres_1, thres_2] (both
    ends inclusive) are kept. The gene id is the first ';'-separated field of
    the row label, cut at its first '.'. A set of gene ids is returned.
    """
    columns = mapping[sample]
    subset = psi[columns]

    # Compute the per-event mean once and apply both bounds to it.
    event_mean = subset.mean(axis=1)
    keep = (event_mean >= thres_1) & (event_mean <= thres_2)

    gene_ids = set()
    for fields in subset[keep].index.str.split(';'):
        gene_ids.add(fields[0].split('.')[0])
    return gene_ids

In [65]:
def compare_cerberus_and_suppa_across_events(df, ofile, psi, name, threshold_1, threshold_2, mapping=None):
    """Flag, per sample and gene, whether SUPPA and cerberus each call the gene.

    Parameters
    ----------
    df : cerberus psi table with at least 'dataset', 'gid_stable', 'feat_id'
        and 'psi' columns.
    ofile : accepted for interface compatibility; the body does not use it.
    psi : SUPPA psi table (event ids as index, one column per dataset).
    name : label used as the last component of the output directory
        'cerberus_suppa/<threshold_1>_<threshold_2>/<name>'.
    threshold_1, threshold_2 : inclusive psi bounds forwarded to both
        extraction helpers.
    mapping : dict-like, sample -> list of dataset names. When omitted, the
        notebook-level variable `d` is used; it is looked up at call time.

    Returns
    -------
    DataFrame with columns 'gid', 'sample', 'suppa', 'cerberus'. The last two
    hold the strings 'TRUE' / 'FALSE'. Each sample's rows keep their own
    0..n-1 index. An empty DataFrame is returned for an empty mapping.

    Side effects
    ------------
    Creates the output directory if it does not exist. No file is written:
    the per-sample write is commented out.
    """
    if mapping is None:
        # The previous default `mapping=d` was evaluated when the `def` ran,
        # which fails if `d` does not exist yet and freezes whatever object
        # `d` pointed to at that moment. Resolve it lazily instead.
        mapping = d

    path = '/'.join(['cerberus_suppa', str(threshold_1) + '_' + str(threshold_2), name])
    os.makedirs(path, exist_ok=True)

    per_sample = []
    for sample, datasets in mapping.items():
        genes = list(set(df.loc[df['dataset'].isin(datasets), 'gid_stable']))

        # Sets give constant-time membership tests in the loops below.
        suppa_genes = set(extract_genes_from_suppa_across_samples(
            psi, sample, mapping, thres_1=threshold_1, thres_2=threshold_2))
        cerberus_genes = set(extract_genes_from_cerberus_across_samples(
            df, sample, mapping, 'feat_id', thres_1=threshold_1, thres_2=threshold_2))

        res = DataFrame({
            'gid': genes,
            'sample': sample,
            # 'FALSE' for SUPPA means either the psi is outside the range or
            # the gene has no event at all.
            'suppa': ['TRUE' if gene in suppa_genes else 'FALSE' for gene in genes],
            'cerberus': ['TRUE' if gene in cerberus_genes else 'FALSE' for gene in genes],
        })
        per_sample.append(res)

        # res.to_csv(''.join([path, '/', sample, '.tsv']), sep = '\t', index = False, header = True)

    if not per_sample:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(per_sample, axis=0)

In [63]:
# `ofile` is a required parameter of compare_cerberus_and_suppa_across_events
# as defined in the cell above; omitting it raises TypeError. The function body
# never reads it, so None is passed.
df = compare_cerberus_and_suppa_across_events(df=cerberus_tss,
                                              ofile=None,
                                              psi=AF_psi,
                                              name='tss',
                                              threshold_1=0.25,
                                              threshold_2=0.75)
# compare_cerberus_and_suppa_across_events(df = cerberus_tes, psi = AL_psi, name = 'tes', threshold_1=0.25, threshold_2=0.75)

In [64]:
# Row count of the comparison table, then a preview of its first rows.
n_rows = df.shape[0]
print(n_rows)
df.head()

623356


Unnamed: 0,gid,sample,suppa,cerberus
0,ENSG00000034677,a673,False,False
1,ENSG00000163046,a673,False,False
2,ENSG00000204271,a673,False,False
3,ENSG00000138670,a673,False,False
4,ENSG00000160877,a673,False,False


In [32]:
# Build one combined table per feature type from the per-sample TSV files:
#   1. `head -n 1 ... >`  writes the first line of brain.tsv (used as the header),
#      replacing any existing combined file;
#   2. `tail -n +2 -q ... >>` appends every *.tsv in the directory from its
#      second line onward; -q suppresses the per-file name banners.
# NOTE(review): these commands expect <sample>.tsv files under
# cerberus_suppa/0.25_0.75/{tss,tes}/; the only code in this notebook that
# would write such files is commented out -- confirm the files exist.
! head -n 1 cerberus_suppa/0.25_0.75/tss/brain.tsv > cerberus_suppa/0.25_0.75/tss_combined.tsv
! tail -n +2 -q cerberus_suppa/0.25_0.75/tss/*.tsv >> cerberus_suppa/0.25_0.75/tss_combined.tsv

# Same two steps for the tes directory.
! head -n 1 cerberus_suppa/0.25_0.75/tes/brain.tsv > cerberus_suppa/0.25_0.75/tes_combined.tsv
! tail -n +2 -q cerberus_suppa/0.25_0.75/tes/*.tsv >> cerberus_suppa/0.25_0.75/tes_combined.tsv

In [None]:
from collections import defaultdict

# Per gene, collect the two exon coordinate strings ('chrom:start-end:strand')
# encoded in each event id of AF_events['event_id'].
AF_dict = defaultdict(set)

for AF_event in AF_events['event_id']:
    # '<gene>;<event>': only the first two ';'-separated fields are used.
    gene, event = AF_event.split(';')[:2]
    chrom = event.split(':')[1]
    dash_fields = event.split('-')

    if event.endswith('-'):
        # Event string ends with '-': treated as minus strand. The trailing '-'
        # adds an empty final element to the split, so the coordinate pairs
        # are the first two ':' fields of elements 2 and 1.
        s1, e1 = dash_fields[2].split(':')[:2]
        s2, e2 = dash_fields[1].split(':')[:2]
        strand = '-'
    else:
        # Otherwise plus strand: element 0 carries '<type>:<chrom>:' before its
        # pair, element 1 carries one leading field before its pair.
        s1, e1 = dash_fields[0].split(':')[2:]
        s2, e2 = dash_fields[1].split(':')[1:]
        strand = '+'

    exon_1 = '{}:{}-{}:{}'.format(chrom, s1, e1, strand)
    exon_2 = '{}:{}-{}:{}'.format(chrom, s2, e2, strand)

    AF_dict[gene].update((exon_1, exon_2))