In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import cerberus

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
filt_ab = 'cerberus_filtered_abundance.tsv'
c_annot = 'cerberus_annot.h5'
ver = 'v40_cerberus'

## PolyA gene transcript detection

What fraction of polyA genes from the annotation do we detect with at least 1 known isoform >= 1 TPM in any library?

In [11]:
df = pd.read_csv(filt_ab, sep='\t')
df, tids = get_tpm_table(df,
               how='iso',
               nov=['Known'],
               min_tpm=1,
               gene_subset='polya')

# merge with gene id
gene_df = pd.read_csv(filt_ab, sep='\t')
gene_df = gene_df[['annot_gene_id', 'annot_transcript_id']]
df = df.merge(gene_df, how='left', left_index=True, right_on='annot_transcript_id')

gene_df, _, _ = get_gtf_info(how='gene', ver=ver)

polya_biotypes = ['protein_coding', 'pseudogene', 'lncRNA']
polya_genes = gene_df.loc[gene_df.biotype_category.isin(polya_biotypes), 'gid'].tolist()
n_polya = len(polya_genes)
n_det_polya = len(df.annot_gene_id.unique().tolist())

print('Detected {} / {} ({:.2f}%) annotated polyA genes w/ at least 1 known transcript >= 1 TPM'.format(n_det_polya, n_polya, (n_det_polya/n_polya)*100))

Calculating iso TPM values
Subsetting for novelty categories ['Known']
Subsetting for polya genes


  df[total_col] = df[d].sum()
  df[tpm_col] = (df[d]*1000000)/df[total_col]


Enforcing minimum TPM
Total # isos detected: 245379
# isos >= 1 tpm: 223477
Applying gene type and novelty subset
Number of isos reported: 132274
Detected 26446 / 52274 (50.59%) annotated polyA genes w/ at least 1 known transcript >= 1 TPM


## TF gene transcript detection

What fraction of TF genes from the annotation do we detect with at least 1 known isoform >= 1 TPM in any library?

In [12]:
df = pd.read_csv(filt_ab, sep='\t')
df, tids = get_tpm_table(df,
                   how='iso',
                   nov=['Known'],
                   min_tpm=1,
                   gene_subset='tf')

# merge with gene id
gene_df = pd.read_csv(filt_ab, sep='\t')
gene_df = gene_df[['annot_gene_id', 'annot_transcript_id']]
df = df.merge(gene_df, how='left', left_index=True, right_on='annot_transcript_id')

gene_df, _, _ = get_gtf_info(how='gene', subset='tf', ver=ver)

n_tf = len(gene_df.index)
n_det_tf = len(df.annot_gene_id.unique().tolist())

print('Detected {} / {} ({:.2f}%) annotated TF genes w/ at least 1 known transcript >= 1 TPM'.format(n_det_tf, n_tf, (n_det_tf/n_tf)*100))

Calculating iso TPM values
Subsetting for novelty categories ['Known']
Subsetting for tf genes
Enforcing minimum TPM
Total # isos detected: 245379
# isos >= 1 tpm: 223477
Applying gene type and novelty subset
Number of isos reported: 9102
Detected 1326 / 1419 (93.45%) annotated TF genes w/ at least 1 known transcript >= 1 TPM


Which TFs have the most isoforms?

In [7]:
df = pd.read_csv(filt_ab, sep='\t')
df = get_isos_per_gene(df,
                       min_tpm=1,
                       gene_subset='tf',
                       groupby='all', 
                       nov=['Known', 'NIC', 'NNC', 'ISM_rescue'])
gene_df, _, _ = get_gtf_info(how='gene', ver=ver, add_stable_gid=True)

df.reset_index(inplace=True)
df['gid_stable'] = cerberus.get_stable_gid(df, 'annot_gene_id')
df.set_index('annot_gene_id')
      
df = df.merge(gene_df[['gid_stable', 'gname']], 
              how='left', 
              on='gid_stable')
df = df.sort_values(by='all', ascending=False)
df.head(10)

Calculating iso TPM values
Subsetting for novelty categories ['Known', 'NIC', 'NNC', 'ISM_rescue']
Subsetting for tf genes
Enforcing minimum TPM
Total # isos detected: 245379
# isos >= 1 tpm: 223477
Applying gene type and novelty subset
Number of isos reported: 13122


Unnamed: 0,annot_gene_id,all,gid_stable,gname
87,ENSG00000081189.15,74,ENSG00000081189,MEF2C
5,ENSG00000005801.17,71,ENSG00000005801,ZNF195
68,ENSG00000071564.14,67,ENSG00000071564,TCF3
120,ENSG00000095794.19,66,ENSG00000095794,CREM
44,ENSG00000062194.15,64,ENSG00000062194,GPBP1
843,ENSG00000175387.15,64,ENSG00000175387,SMAD2
62,ENSG00000068305.17,59,ENSG00000068305,MEF2A
302,ENSG00000120837.7,57,ENSG00000120837,NFYB
404,ENSG00000130844.17,57,ENSG00000130844,ZNF331
100,ENSG00000085274.15,56,ENSG00000085274,MYNN


Which TF has the most isoforms expressed in a single sample? 

In [9]:
df = pd.read_csv(filt_ab, sep='\t')
df = get_isos_per_gene(df,
                       min_tpm=1,
                       gene_subset='tf',
                       groupby='sample', 
                       nov=['Known', 'NIC', 'NNC', 'ISM_rescue'])
df['max_sample'] = df.max(1)
df = df['max_sample'].to_frame()

gene_df, _, _ = get_gtf_info(how='gene', ver=ver, add_stable_gid=True)
df.reset_index(inplace=True)
df['gid_stable'] = cerberus.get_stable_gid(df, 'annot_gene_id')
df.set_index('annot_gene_id')
df = df.merge(gene_df[['gid_stable', 'gname']], 
              how='left', 
              on='gid_stable')

df = df.sort_values(by='max_sample', ascending=False)
df.head(10)

Calculating iso TPM values
Subsetting for novelty categories ['Known', 'NIC', 'NNC', 'ISM_rescue']
Subsetting for tf genes
Enforcing minimum TPM
Total # isos detected: 245379
# isos >= 1 tpm: 223477
Applying gene type and novelty subset
Number of isos reported: 13122
Found 50 total samples


Unnamed: 0,annot_gene_id,max_sample,gid_stable,gname
250,ENSG00000115415.18,44.0,ENSG00000115415,STAT1
120,ENSG00000095794.19,42.0,ENSG00000095794,CREM
87,ENSG00000081189.15,41.0,ENSG00000081189,MEF2C
62,ENSG00000068305.17,41.0,ENSG00000068305,MEF2A
44,ENSG00000062194.15,41.0,ENSG00000062194,GPBP1
471,ENSG00000137504.13,38.0,ENSG00000137504,CREBZF
728,ENSG00000168610.14,36.0,ENSG00000168610,STAT3
5,ENSG00000005801.17,36.0,ENSG00000005801,ZNF195
364,ENSG00000126561.16,35.0,ENSG00000126561,STAT5A
101,ENSG00000085276.17,34.0,ENSG00000085276,MECOM


## Protein coding gene transcript detection

What fraction of protein-coding genes do we detect with at least 1 known isoform >= 1 TPM in any library?

In [10]:
df = pd.read_csv(filt_ab, sep='\t')
# no gene subset here so we can look at all other genes
df, tids = get_tpm_table(df,
                   how='iso',
                   nov=['Known'],
                   min_tpm=1)

# merge with gene id
gene_df = pd.read_csv(filt_ab, sep='\t')
gene_df = gene_df[['annot_gene_id', 'annot_transcript_id']]
df = df.merge(gene_df, how='left', left_index=True, right_on='annot_transcript_id')

gene_df, b_counts, b_cat_counts = get_gtf_info(how='gene', ver=ver, add_stable_gid=True)
df['gid_stable'] = cerberus.get_stable_gid(df, 'annot_gene_id')
print('gene_df')
print(gene_df.head())
print('df')
print(df.head())
df = df.merge(gene_df, how='left', left_on='annot_gene_id', right_on='gid')

temp = df[['gid', 'biotype_category']].drop_duplicates().groupby('biotype_category').count()
temp.reset_index(inplace=True)
temp.rename({'gid': 'detected_counts'}, axis=1, inplace=True)

temp = temp.merge(b_cat_counts, on='biotype_category', how='outer')
temp['prop_detected'] = temp.detected_counts/temp.gencode_counts
temp.sort_values(by='prop_detected', ascending=False, inplace=True)
temp

Calculating iso TPM values
Subsetting for novelty categories ['Known']


  df[total_col] = df[d].sum()
  df[tpm_col] = (df[d]*1000000)/df[total_col]


Enforcing minimum TPM
Total # isos detected: 245379
# isos >= 1 tpm: 223477
Applying gene type and novelty subset
Number of isos reported: 135125
gene_df
                  gid     gname  length         biotype biotype_category  \
0  ENSG00000000460.17  C1orf112  192174  protein_coding   protein_coding   
1  ENSG00000000971.17       CFH  100823  protein_coding   protein_coding   
2  ENSG00000001461.17    NIPAL3   57275  protein_coding   protein_coding   
3  ENSG00000004487.18     KDM1A   64347  protein_coding   protein_coding   
4  ENSG00000007933.13      FMO3   27019  protein_coding   protein_coding   

      tf       gid_stable  
0  False  ENSG00000000460  
1  False  ENSG00000000971  
2  False  ENSG00000001461  
3  False  ENSG00000004487  
4  False  ENSG00000007933  
df
    mcf7_1_1  k562_1_1  heart_right_ventricle_2_1  gm12878_3_1  \
8        0.0       0.0                        0.0          0.0   
9        0.0       0.0                        0.0          0.0   
10       0.0       0

Unnamed: 0,biotype_category,detected_counts,gencode_counts,prop_detected
2,other,2182,7391,0.295224
1,miRNA,466,1879,0.248004
4,pseudogene,3430,14538,0.235933
0,lncRNA,3389,17748,0.190951
3,protein_coding,787,19988,0.039374


## What fraction of protein coding gene transcripts do we detect?

Of the transcripts associated with protein coding genes GENCODE, how many do we detect >=1 TPM?

In [3]:
ca = cerberus.read(c_annot)

In [4]:
temp = ca.t_map[['original_transcript_id', 'original_transcript_name'

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name,tss_first_sd_issue,tes_last_sa_issue
0,ENCODEHT000206942,1,ENCODEHG000058846_1,ENCODEHG000058846_1,1,ENCODEHG000058846_1,1,ENCODEHG000058846,ENCODEHG000058846,ENCODEHT000206942,"[1,1,1]","ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",False,False
1,ENCODEHT000206942#0,1,ENCODEHG000058846_1,ENCODEHG000058846_1,1,ENCODEHG000058846_1,1,ENCODEHG000058846,ENCODEHG000058846,ENCODEHT000206942,"[1,1,1]","ENCODEHG000058846[1,1,1]","ENCODEHG000058846[1,1,1]",False,False
2,ENCODEHT000206867,4,ENCODEHG000058837_4,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206867,"[2,4,1]","ENCODEHG000058837[2,4,1]","ENCODEHG000058837[2,4,1]",False,False
3,ENCODEHT000206867#0,4,ENCODEHG000058837_4,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206867,"[2,4,1]","ENCODEHG000058837[2,4,1]","ENCODEHG000058837[2,4,1]",False,False
4,ENCODEHT000206868,2,ENCODEHG000058837_2,ENCODEHG000058837_2,2,ENCODEHG000058837_1,1,ENCODEHG000058837,ENCODEHG000058837,ENCODEHT000206868,"[2,2,1]","ENCODEHG000058837[2,2,1]","ENCODEHG000058837[2,2,1]",False,False
