In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus
import subprocess

p = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
# Input paths used by this notebook. All are relative, so they resolve
# against the kernel's current working directory.

# file in the working directory (read with cerberus.read further down)
c_annot = 'cerberus_annot_triplets.h5'

# files one directory up
h5 = '../cerberus_annot.h5'
gtf = '../cerberus.gtf'
filt_ab = '../cerberus_filtered_abundance.tsv'

# files under ../swan/
swan_file = '../swan/swan.p'
major_set = '../swan/isos_sample_gene_90.tsv'

In [47]:
def get_med_exons_per_gene(ca):
    """
    Get the median number of exons per gene across
    all identified transcripts in cerberus

    Parameters:
        ca: object exposing two DataFrames: `t_map` (needs a
            'transcript_id' column) and `ic` (needs 'Name' and
            'Coordinates' columns)

    Returns:
        gene_exon_df (pandas DataFrame): one row per gene with columns
            'gid' and 'median_exons'
    """
    # work on a copy so nothing below can write back into ca.t_map
    exon_df = ca.t_map[['transcript_id']].copy()
    exon_df = add_feat(exon_df, kind='ic', col='transcript_id')
    exon_df = exon_df.drop_duplicates()

    # merge w/ ic table to get # exons
    exon_df = exon_df.merge(ca.ic[['Name', 'Coordinates']],
                            how='left',
                            left_on='ic',
                            right_on='Name')

    # NOTE(review): n_exons is the number of '-' separators in Coordinates;
    # confirm this matches the intended exon count for multi-intron chains
    exon_df['n_exons'] = exon_df['Coordinates'].str.count('-')

    # a bare '-' would otherwise be counted as 1, so reset those rows to 0.
    # the target column must be 'n_exons'; writing to any other label just
    # creates a new column and leaves the count untouched
    exon_df.loc[exon_df.Coordinates == '-', 'n_exons'] = 0

    # get gene id (text before the first '_' of the ic name) and compute median
    exon_df['gid'] = exon_df.ic.str.split('_', expand=True)[0]
    gene_exon_df = exon_df[['gid', 'n_exons']].groupby('gid').median().reset_index()
    gene_exon_df = gene_exon_df.rename({'n_exons': 'median_exons'}, axis=1)

    return gene_exon_df

In [3]:
# read the file at `c_annot` with cerberus; the resulting `ca` object is
# what the cells below query (ca.triplets) and pass to get_med_exons_per_gene
ca = cerberus.read(c_annot)

## 1,1,1 isoform

In [66]:
# get all sample det isoforms
source = 'sample_det'
# .copy() so the `triplet` column added below is written to an independent
# frame instead of a slice of ca.triplets (avoids SettingWithCopyWarning)
df = ca.triplets.loc[ca.triplets.source == source].copy()

# get triplet id ("n_tss,n_ic,n_tes") and limit to 1,1,1 genes
df['triplet'] = (df.n_tss.astype(int).astype(str) + ',' +
                 df.n_ic.astype(int).astype(str) + ',' +
                 df.n_tes.astype(int).astype(str))
df = df.loc[df.triplet == '1,1,1']

# get the median number of exons per gene
ge_df = get_med_exons_per_gene(ca)
df = df.merge(ge_df, how='left', on='gid')

# remove genes without a lot of exons
df = df.loc[df.median_exons > 3]

# show the 20 most highly expressed remaining rows
df.sort_values(by='gene_tpm', ascending=False).head(20)

Unnamed: 0,source,gid,n_tss,n_tes,n_ic,n_iso,splicing_ratio,gname,sample,gene_tpm,triplet,median_exons
84888,sample_det,ENSG00000164879,1.0,1.0,1.0,1.0,1.0,CA3,muscle,1181.528564,111,5.0
110312,sample_det,ENSG00000142676,1.0,1.0,1.0,1.0,1.0,RPL11,mcf7,1067.494995,111,7.0
167491,sample_det,ENSG00000142676,1.0,1.0,1.0,1.0,1.0,RPL11,hl60,941.440308,111,7.0
168037,sample_det,ENSG00000171314,1.0,1.0,1.0,1.0,1.0,PGAM1,hl60,941.217407,111,5.0
139089,sample_det,ENSG00000171314,1.0,1.0,1.0,1.0,1.0,PGAM1,adipose,843.580444,111,5.0
84309,sample_det,ENSG00000143318,1.0,1.0,1.0,1.0,1.0,CASQ1,muscle,793.289673,111,15.0
151674,sample_det,ENSG00000142676,1.0,1.0,1.0,1.0,1.0,RPL11,panc1,748.990295,111,7.0
47977,sample_det,ENSG00000148965,1.0,1.0,1.0,1.0,1.0,SAA4,liver,743.821594,111,5.0
48175,sample_det,ENSG00000157131,1.0,1.0,1.0,1.0,1.0,C8A,liver,721.939087,111,13.0
126781,sample_det,ENSG00000142676,1.0,1.0,1.0,1.0,1.0,RPL11,pc3,709.075439,111,7.0


In [63]:
# sample_det triplet rows for COL1A1, highest gene_tpm first
(
    ca.triplets
    .loc[(ca.triplets.source == 'sample_det') & (ca.triplets.gname == 'COL1A1')]
    .sort_values(by='gene_tpm', ascending=False)
)

Unnamed: 0,source,gid,n_tss,n_tes,n_ic,n_iso,splicing_ratio,gname,sample,gene_tpm
525202,sample_det,ENSG00000108821,3.0,5.0,4.0,7.0,1.0,COL1A1,h9_chondro,63934.390625
140365,sample_det,ENSG00000108821,2.0,3.0,2.0,4.0,0.8,COL1A1,pgp1_astro,17760.802734
128010,sample_det,ENSG00000108821,2.0,2.0,2.0,2.0,1.0,COL1A1,imr90,9858.12207
487597,sample_det,ENSG00000108821,2.0,3.0,2.0,3.0,0.8,COL1A1,hffc6,1834.368774
210700,sample_det,ENSG00000108821,2.0,3.0,2.0,3.0,0.8,COL1A1,vessels,680.690735
252630,sample_det,ENSG00000108821,1.0,1.0,1.0,1.0,1.0,COL1A1,ovary,547.942749
572436,sample_det,ENSG00000108821,2.0,2.0,2.0,2.0,1.0,COL1A1,h9_osteocyte,442.539429
58620,sample_det,ENSG00000108821,1.0,2.0,1.0,2.0,0.666667,COL1A1,lung,175.406296
466298,sample_det,ENSG00000108821,1.0,1.0,1.0,1.0,1.0,COL1A1,adipose,131.496017
87306,sample_det,ENSG00000108821,1.0,2.0,1.0,2.0,0.666667,COL1A1,colon,131.359818


In [48]:
# per-gene median exon table (columns: gid, median_exons) built from `ca`
ge_df = get_med_exons_per_gene(ca)

## Highly-expressed non 1,1,1 isoform

In [70]:
# restrict the triplet table to rows from the chosen source
source = 'sample_det'
# .copy() gives an independent frame, so columns added to `df` in later
# cells do not write to a slice of ca.triplets (avoids SettingWithCopyWarning)
df = ca.triplets.loc[ca.triplets.source == source].copy()

In [78]:
# get triplet id ("n_tss,n_ic,n_tes") and limit to non 1,1,1 genes
# .assign returns a new frame, so this never writes into a slice of
# ca.triplets (avoids SettingWithCopyWarning)
df = df.assign(triplet=df.n_tss.astype(int).astype(str) + ',' +
                       df.n_ic.astype(int).astype(str) + ',' +
                       df.n_tes.astype(int).astype(str))
df = df.loc[df.triplet != '1,1,1']
print(df.head())

# let's try to get genes w/ a lot of ic variation
df = df.loc[df.n_ic > 5]

# add tf designation
gene_df, _, _ = get_gtf_info(how='gene', ver='v40_cerberus')
gene_df['gid_stable'] = cerberus.get_stable_gid(gene_df, 'gid')
# left merge keeps every row of df; rows with no matching gid_stable
# get missing values in the added columns
df = df.merge(gene_df[['tf', 'gid_stable']],
              how='left', left_on='gid',
              right_on='gid_stable',
              suffixes=('', '_gtf'))

        source              gid  n_tss  n_tes  n_ic  n_iso  splicing_ratio  \
0   sample_det  ENSG00000000003    1.0    5.0   6.0   18.0             2.0   
1   sample_det  ENSG00000000419    1.0    4.0   8.0   11.0             3.2   
4   sample_det  ENSG00000001036    1.0    4.0   6.0   11.0             2.4   
19  sample_det  ENSG00000002586    1.0    2.0   9.0    9.0             6.0   
23  sample_det  ENSG00000002822    4.0    2.0  18.0   19.0             6.0   

     gname sample    gene_tpm triplet  
0   TSPAN6  caco2  347.757385   1,6,5  
1     DPM1  caco2  126.812943   1,8,4  
4    FUCA2  caco2  261.958191   1,6,4  
19    CD99  caco2  408.017517   1,9,2  
23  MAD1L1  caco2   97.261612  4,18,2  


In [80]:
# ovary rows flagged as tf with 100 < gene_tpm < 1000.
# (a preceding sorted-ovary expression was removed: a notebook cell only
# displays its last expression, so that result was computed and discarded)
df.loc[
    (df['sample'] == 'ovary')
    & (df.gene_tpm > 100)
    & (df.gene_tpm < 1000)
    & (df.tf == True)
]

Unnamed: 0,source,gid,n_tss,n_tes,n_ic,n_iso,splicing_ratio,gname,sample,gene_tpm,triplet,tf,gid_stable
39390,sample_det,ENSG00000067066,2.0,5.0,13.0,15.0,3.714286,SP100,ovary,102.093597,2135,True,ENSG00000067066
39391,sample_det,ENSG00000067082,2.0,5.0,7.0,9.0,2.0,KLF6,ovary,197.415817,275,True,ENSG00000067082
39624,sample_det,ENSG00000100219,3.0,2.0,6.0,6.0,2.4,XBP1,ovary,136.043945,362,True,ENSG00000100219
39762,sample_det,ENSG00000103495,5.0,2.0,9.0,9.0,2.571429,MAZ,ovary,149.227646,592,True,ENSG00000103495
39855,sample_det,ENSG00000106624,4.0,1.0,7.0,7.0,2.8,AEBP1,ovary,388.119843,471,True,ENSG00000106624
40072,sample_det,ENSG00000116044,4.0,5.0,7.0,9.0,1.555556,NFE2L2,ovary,156.31012,475,True,ENSG00000116044
40205,sample_det,ENSG00000123358,5.0,3.0,11.0,11.0,2.75,NR4A1,ovary,179.301956,5113,True,ENSG00000123358
40352,sample_det,ENSG00000130844,7.0,7.0,10.0,23.0,1.428571,ZNF331,ovary,204.941071,7107,True,ENSG00000130844
40495,sample_det,ENSG00000136574,6.0,5.0,16.0,18.0,2.909091,GATA4,ovary,199.088608,6165,True,ENSG00000136574
40583,sample_det,ENSG00000140262,1.0,5.0,9.0,12.0,3.0,TCF12,ovary,107.840759,195,True,ENSG00000140262
