In [1]:
import numpy as np
import pandas as pd
import pyranges as pr
from pandas import DataFrame
import os
from collections import defaultdict

In [24]:
# Switch to the project directory so that the relative paths used by the other cells resolve.
# NOTE(review): absolute, cluster-specific path — the notebook cannot run elsewhere without editing this line.
os.chdir('/gpfs/gibbs/pi/gerstein/bb.shared.projects/RNAWG/S7_C')

In [3]:
# Build the sample -> [dataset, ...] lookup from the two relevant metadata columns.
metadata = pd.read_csv('swan_metadata.txt', sep='\t')[['dataset', 'sample']]

d = defaultdict(list)
for dataset_id, sample_id in zip(metadata['dataset'], metadata['sample']):
    d[sample_id].append(dataset_id)

In [5]:
# Load the tab-separated cerberus PSI table.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, i.e. the file carries a
# saved row index; index_col=0 would drop it — confirm nothing depends on that column first.
cerberus_psi = pd.read_csv('cerberus_psi.tsv', sep = '\t')

In [6]:
# Display the loaded table (pandas elides the middle rows of a large frame).
cerberus_psi

Unnamed: 0,feat_id,dataset,feat_tpm,gid_stable,tpm_gene,psi,feat
0,ENSG00000000003_1,mcf7_1_1,33.719579,ENSG00000000003,33.719579,1.000000,tss
1,ENSG00000000419_1,mcf7_1_1,62.755882,ENSG00000000419,87.108911,0.720430,tss
2,ENSG00000000419_8,mcf7_1_1,24.353029,ENSG00000000419,87.108911,0.279570,tss
3,ENSG00000000457_1,mcf7_1_1,3.746620,ENSG00000000457,3.746620,1.000000,tss
4,ENSG00000000460_1,mcf7_1_1,13.113169,ENSG00000000460,18.733099,0.700000,tss
...,...,...,...,...,...,...,...
8744615,ENSG00000285756_16,left_lung_2_1,1.751774,ENSG00000285756,7.007095,0.250000,tes
8744616,ENSG00000285906_1,left_lung_2_1,3.503547,ENSG00000285906,3.503547,1.000000,tes
8744617,ENSG00000285933_1,left_lung_2_1,10.510642,ENSG00000285933,10.510642,1.000000,tes
8744618,ENSG00000285976_1,left_lung_2_1,24.524831,ENSG00000285976,26.276605,0.933333,tes


In [30]:
# Load the SUPPA PSI tables for the AF and AL event types.
# assumes rows are indexed by event id and columns are dataset names, as the later
# psi[dataset] / index.str.split(';') usage implies — TODO confirm against the files
AF_psi = pd.read_csv('suppa/psi/cerberus_AF.psi', sep = '\t')
AL_psi = pd.read_csv('suppa/psi/cerberus_AL.psi', sep = '\t')

In [47]:
# Split the combined PSI table by feature type; the 'tss' rows are later compared against
# AF_psi and the 'tes' rows against AL_psi.
cerberus_tss = cerberus_psi[cerberus_psi['feat'] == 'tss']
cerberus_tes = cerberus_psi[cerberus_psi['feat'] == 'tes']
# Alternative kept for reference: read pre-split per-feature tables from psi_by_cerberus/
# instead of filtering the combined table.
#cerberus_tss = pd.read_csv('psi_by_cerberus/tss_psi.tsv', sep = '\t', header = 0, low_memory = False)
#cerberus_tes = pd.read_csv('psi_by_cerberus/tes_psi.tsv', sep = '\t', header = 0, low_memory = False)
#cerberus_ic = pd.read_csv('psi_by_cerberus/ic_psi.tsv', sep = '\t', header = 0, low_memory = False)

In [48]:
# Display the TSS subset to check the filter (only rows with feat == 'tss' are expected).
cerberus_tss

Unnamed: 0,feat_id,dataset,feat_tpm,gid_stable,tpm_gene,psi,feat
0,ENSG00000000003_1,mcf7_1_1,33.719579,ENSG00000000003,33.719579,1.00000,tss
1,ENSG00000000419_1,mcf7_1_1,62.755882,ENSG00000000419,87.108911,0.72043,tss
2,ENSG00000000419_8,mcf7_1_1,24.353029,ENSG00000000419,87.108911,0.27957,tss
3,ENSG00000000457_1,mcf7_1_1,3.746620,ENSG00000000457,3.746620,1.00000,tss
4,ENSG00000000460_1,mcf7_1_1,13.113169,ENSG00000000460,18.733099,0.70000,tss
...,...,...,...,...,...,...,...
2180952,ENSG00000285756_12,left_lung_2_1,1.751774,ENSG00000285756,7.007095,0.25000,tss
2180953,ENSG00000285756_2,left_lung_2_1,5.255321,ENSG00000285756,7.007095,0.75000,tss
2180954,ENSG00000285906_1,left_lung_2_1,3.503547,ENSG00000285906,3.503547,1.00000,tss
2180955,ENSG00000285933_1,left_lung_2_1,10.510642,ENSG00000285933,10.510642,1.00000,tss


In [50]:
def extract_genes_from_cerberus_across_samples(cerberus, sample, mapping, name, thres_1 = 0.25, thres_2 = 0.75):
    """Return genes with at least two cerberus features of intermediate mean PSI in a sample.

    Parameters
    ----------
    cerberus : pandas.DataFrame
        Table with a ``dataset`` column, a ``psi`` column and the feature-id column ``name``.
    sample : hashable
        Key into ``mapping``.
    mapping : mapping
        sample -> collection of dataset names belonging to that sample.
    name : str
        Column holding feature ids of the form ``<gene id>_<feature number>``.
    thres_1, thres_2 : float
        Inclusive lower / upper bound on the per-feature mean PSI.

    Returns
    -------
    list
        Gene ids having two or more features whose PSI, averaged over the sample's
        datasets, lies in ``[thres_1, thres_2]``. Genes appear in the order in which
        they are first met among the sorted feature ids, so the result is reproducible.
    """
    # Local import keeps the function usable without touching the notebook's import cell.
    from collections import Counter

    in_sample = cerberus['dataset'].isin(mapping[sample])
    # groupby sorts the feature ids, which makes the iteration order below deterministic.
    mean_psi = cerberus.loc[in_sample, [name, 'psi']].groupby(name)['psi'].mean()
    in_range = (mean_psi >= thres_1) & (mean_psi <= thres_2)
    kept_features = mean_psi.index[in_range.to_numpy()]

    # Strip only the trailing "_<feature number>": a gene id may itself contain underscores,
    # and cutting at the first one would merge distinct genes and inflate their counts.
    features_per_gene = Counter(feature.rsplit('_', 1)[0] for feature in kept_features)
    return [gene for gene, n_features in features_per_gene.items() if n_features >= 2]

In [51]:
def extract_genes_from_suppa_across_samples(psi, sample, mapping, thres_1 = 0.25, thres_2 = 0.75):
    """Return the genes owning at least one SUPPA event of intermediate mean PSI in a sample.

    ``psi`` is indexed by event ids whose first ';'-separated field is the gene id; its
    columns are dataset names. The PSI of every event is averaged over the datasets listed
    in ``mapping[sample]`` and events whose mean lies in ``[thres_1, thres_2]`` (both ends
    inclusive) are kept. The result is the set of their gene ids, truncated at the first '.'.
    """
    sample_psi = psi[mapping[sample]]
    mean_psi = sample_psi.mean(axis=1)
    kept_events = mean_psi[mean_psi.between(thres_1, thres_2)].index
    return {event_id.split(';')[0].split('.')[0] for event_id in kept_events}

In [61]:
def compare_cerberus_and_suppa_across_events(df, psi, name, threshold_1, threshold_2, mapping = d):
    """Write, per sample, a table saying which genes SUPPA and cerberus each call.

    For every sample in ``mapping`` the genes present in ``df`` for that sample's datasets
    are listed together with two 'TRUE'/'FALSE' flags: whether the gene is returned by
    ``extract_genes_from_suppa_across_samples`` and by
    ``extract_genes_from_cerberus_across_samples`` for the given thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Cerberus PSI rows with ``dataset``, ``gid_stable``, ``feat_id`` and ``psi`` columns.
    psi : pandas.DataFrame
        SUPPA PSI table passed through to ``extract_genes_from_suppa_across_samples``.
    name : str
        Label used as the last component of the output directory.
    threshold_1, threshold_2 : float
        PSI bounds forwarded to both extraction functions.
    mapping : mapping
        sample -> list of dataset names. The default is the module-level ``d`` as it
        existed when this function was defined.

    Side effects
    ------------
    Creates ``cerberus_suppa/<threshold_1>_<threshold_2>/<name>/`` if needed and writes one
    ``<sample>.tsv`` (columns gid, sample, suppa, cerberus) per sample into it.
    """
    path = os.path.join('cerberus_suppa', str(threshold_1) + '_' + str(threshold_2), name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(path, exist_ok=True)

    for sample in mapping.keys():
        # unique() keeps first-appearance order, so the output rows are reproducible.
        genes = list(df.loc[df['dataset'].isin(mapping[sample]), 'gid_stable'].unique())

        # Sets give constant-time membership tests in the per-gene flags below.
        gene_by_suppa = set(extract_genes_from_suppa_across_samples(
            psi, sample, mapping, thres_1=threshold_1, thres_2=threshold_2))
        gene_by_cerberus = set(extract_genes_from_cerberus_across_samples(
            df, sample, mapping, 'feat_id', thres_1=threshold_1, thres_2=threshold_2))

        res = DataFrame({
            'gid': genes,
            'sample': sample,
            # 'FALSE' for SUPPA means either (1) the PSI is outside the threshold range
            # or (2) the gene has no event of this type at all.
            'suppa': ['TRUE' if gene in gene_by_suppa else 'FALSE' for gene in genes],
            'cerberus': ['TRUE' if gene in gene_by_cerberus else 'FALSE' for gene in genes],
        })

        res.to_csv(os.path.join(path, sample + '.tsv'), sep='\t', index=False, header=True)

In [63]:
# Run the comparison twice: cerberus TSS rows against the SUPPA AF table and cerberus TES rows
# against the SUPPA AL table, both with PSI bounds 0.25-0.75. Each call writes one TSV per
# sample under cerberus_suppa/0.25_0.75/<name>/.
compare_cerberus_and_suppa_across_events(df = cerberus_tss, psi = AF_psi, name = 'tss', threshold_1=0.25, threshold_2=0.75)
compare_cerberus_and_suppa_across_events(df = cerberus_tes, psi = AL_psi, name = 'tes', threshold_1=0.25, threshold_2=0.75)

In [64]:
# Concatenate the per-sample TSVs into one table per feature type: write the header once
# (taken from brain.tsv), then append every file's rows with its own header line skipped.
# NOTE(review): relies on a sample named 'brain' existing. The combined file is written one
# directory above the globbed files, so it is not picked up by the *.tsv glob on a re-run.
! head -n 1 cerberus_suppa/0.25_0.75/tss/brain.tsv > cerberus_suppa/0.25_0.75/tss_combined.tsv
! tail -n +2 -q cerberus_suppa/0.25_0.75/tss/*.tsv >> cerberus_suppa/0.25_0.75/tss_combined.tsv

! head -n 1 cerberus_suppa/0.25_0.75/tes/brain.tsv > cerberus_suppa/0.25_0.75/tes_combined.tsv
! tail -n +2 -q cerberus_suppa/0.25_0.75/tes/*.tsv >> cerberus_suppa/0.25_0.75/tes_combined.tsv

In [None]:
from collections import defaultdict

# gene -> set of exon strings "<chrom>:<start>-<end>:<strand>" collected from the event ids.
AF_dict = defaultdict(set)

# NOTE(review): AF_events is not defined in any cell visible here — it has to be loaded
# (with an 'event_id' column) before this cell can run.
# presumably the ids are SUPPA AF event ids of the form "<gene>;AF:<chrom>:...:<strand>" — TODO confirm
for AF_event in AF_events['event_id']:
    # First ';'-separated field is kept as the gene, the second as the event description.
    gene, event = AF_event.split(';')[:2]
    # The second ':'-separated field of the description is used as the chromosome.
    chrom = event.split(':')[1]
    
    if event.endswith('-'):
        # Ends with '-' (taken as minus strand): the trailing '-' is itself a split point, so
        # split('-') yields an empty last piece. The two coordinate pairs are the first two
        # ':'-fields of pieces 2 and 1 respectively.
        s1, e1 = event.split('-')[2].split(':')[:2]
        s2, e2 = event.split('-')[1].split(':')[:2]
        strand = '-'
    else:
        # Otherwise treated as plus strand: the first pair is whatever follows the first two
        # ':'-fields of piece 0, the second pair whatever follows the first field of piece 1.
        # Each slice must hold exactly two values or the unpacking raises ValueError.
        s1, e1 = event.split('-')[0].split(':')[2:]
        s2, e2 = event.split('-')[1].split(':')[1:]
        strand = '+'
          
    exon_1 = chrom + ':' + s1 + '-' + e1 + ':' + strand
    exon_2 = chrom + ':' + s2 + '-' + e2 + ':' + strand
    
    # Both exons of the event are recorded under the same gene; the set removes duplicates.
    AF_dict[gene].add(exon_1)
    AF_dict[gene].add(exon_2)