In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os

# Make the directory two levels above the current working directory
# importable (presumably the repository root) so that the `scripts` package
# imported below can be resolved.
p = os.path.dirname(os.path.dirname(os.getcwd()))
# Guard the append: re-running this cell would otherwise push another
# duplicate entry onto sys.path on every execution.
if p not in sys.path:
    sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

## PolyA gene transcript detection

What fraction of annotated polyA genes do we detect, i.e. how many have at least one known isoform expressed at >= 1 TPM in at least one library?

In [2]:
# Read the abundance TSV and hand it straight to get_tpm_table, requesting
# isoform-level output (how='iso') restricted to nov=['Known'] (presumably
# the novelty category) with a minimum TPM of 1.
df = get_tpm_table(
    pd.read_csv('human_talon_abundance.tsv', sep='\t'),
    how='iso',
    nov=['Known'],
    min_tpm=1,
)

85348
Total # isos detected: 85348
# isos >= 1 tpm: 77128


In [5]:
# Merge with gene id: attach annot_gene_id to each transcript row of `df`.
# Only the two ID columns are needed, so restrict parsing to them with
# `usecols` instead of loading the full abundance table a second time.
id_cols = ['annot_gene_id', 'annot_transcript_id']
gene_df = pd.read_csv('human_talon_abundance.tsv', sep='\t', usecols=id_cols)
# usecols keeps the file's column order; enforce the order used originally.
gene_df = gene_df[id_cols]

# df's index is matched against annot_transcript_id. After the merge the
# index of `df` no longer holds transcript IDs, so executing the merge a
# second time would not line up with gene_df. Skip it when the gene ID
# column is already present so that the cell is safe to re-run.
if 'annot_gene_id' not in df.columns:
    df = df.merge(gene_df, how='left', left_index=True, right_on='annot_transcript_id')

In [6]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,gm12878_1_1,gm12878_1_2,gm12878_1_3,gm12878_1_4,gm12878_3_1,gm12878_2_1,gm12878_2_2,pgp1_1_1,pgp1_1_2,h9_1_1,...,right_cardiac_atrium_3_1,right_cardiac_atrium_4_1,right_cardiac_atrium_2_1,right_lobe_of_liver_1_1,right_lobe_of_liver_2_1,right_ventricle_myocardium_inferior_1_1,right_ventricle_myocardium_superior_1_1,upper_lobe_of_right_lung_1_1,annot_gene_id,annot_transcript_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSG00000243485.5,ENST00000469289.1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.737626,1.618796,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSG00000237613.2,ENST00000417324.1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.737626,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSG00000237613.2,ENST00000461467.1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSG00000233750.3,ENST00000442987.3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.737626,6.475185,1.242064,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSG00000268903.1,ENST00000494149.2


In [8]:
# Count how many annotated polyA genes appear in `df`.
gene_df = get_gtf_info(how='gene')

# Biotype categories that are treated as polyA here.
polya_biotypes = ['protein_coding', 'pseudogene', 'lncRNA']
polya_genes = gene_df.loc[gene_df['biotype_category'].isin(polya_biotypes), 'gid'].tolist()
n_polya = len(polya_genes)

# Rows of df whose gene ID belongs to the polyA gene list, then the number
# of distinct gene IDs among them (NaN counted as a value, as unique() does).
temp = df[df['annot_gene_id'].isin(polya_genes)]
n_det_polya = temp['annot_gene_id'].nunique(dropna=False)

print('Detected {} / {} annotated polyA genes'.format(n_det_polya, n_polya))

Detected 24936 / 49472 annotated polyA genes
