In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os

# Add the directory two levels above the current working directory to the
# import path; the `scripts` package imported below is presumably located there.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

## PolyA gene detection

What fraction of polyA genes from the annotation do we detect with >= 1 TPM in any library?

In [2]:
# Read the tab-separated TALON abundance file and pass it straight to
# get_tpm_table at gene level with a 1 TPM cutoff.
# NOTE(review): presumably the helper keeps genes that reach the cutoff in at
# least one library -- confirm against scripts.utils.
df = get_tpm_table(
    pd.read_csv('human_talon_abundance.tsv', sep='\t'),
    how='gene',
    min_tpm=1,
)

39285
Total # genes detected: 39285
# genes >= 1 tpm: 33946


In [3]:
# Gene-level annotation table from the project helper; later cells read its
# `gid` and `biotype_category` columns.
gene_df = get_gtf_info(how='gene')

In [6]:
# Biotype categories counted as polyA genes in this analysis.
polya_biotypes = ['protein_coding', 'pseudogene', 'lncRNA']

# IDs of every annotated gene whose biotype category is in that set.
polya_genes = gene_df[gene_df['biotype_category'].isin(polya_biotypes)]['gid'].tolist()
n_polya = len(polya_genes)

# Rows of the TPM-filtered table whose index value is one of those IDs
# (assumes df is still indexed by gene ID at this point -- TODO confirm).
temp = df[df.index.isin(polya_genes)]
n_det_polya = temp.shape[0]

print('Detected {} / {} annotated polyA genes'.format(n_det_polya, n_polya))

Detected 32160 / 49472 annotated polyA genes


How many genes of each biotype are there in the annotation?

In [8]:
# Number of annotated genes per biotype category, as a two-column table
# (`biotype_category`, `counts`).
temp = (
    gene_df[['gid', 'biotype_category']]
    .groupby('biotype_category')
    .count()
    .reset_index()
    .rename(columns={'gid': 'counts'})
)
temp

Unnamed: 0,biotype_category,counts
0,lncRNA,15006
1,miRNA,1881
2,other,7426
3,protein_coding,19969
4,pseudogene,14497


How many genes of each biotype do we detect?

In [9]:
# Attach the annotation columns to the detected-gene table by matching df's
# index against gene_df's `gid` column. The merge rebinds `df`, and the merged
# frame's index is not guaranteed to still hold gene IDs, so the merge is
# guarded: re-running this cell would otherwise merge the already-merged
# table a second time. Cells that rely on the gene-ID index of `df` must run
# before this one.
if 'biotype_category' not in df.columns:
    df = df.merge(gene_df, how='left', left_index=True, right_on='gid')

# Number of detected genes per biotype category, as a two-column table
# (`biotype_category`, `counts`).
temp = (
    df[['gid', 'biotype_category']]
    .groupby('biotype_category')
    .count()
    .reset_index()
    .rename(columns={'gid': 'counts'})
)
temp

Unnamed: 0,biotype_category,counts
0,lncRNA,9766
1,miRNA,277
2,other,1509
3,protein_coding,18612
4,pseudogene,3782


What types of "other" genes do we detect?

In [11]:
# Break the detected genes in the 'other' category down by specific biotype,
# as a two-column table (`biotype`, `counts`).
temp = (
    df[df['biotype_category'] == 'other'][['gid', 'biotype']]
    .groupby('biotype')
    .count()
    .reset_index()
    .rename(columns={'gid': 'counts'})
)
temp

Unnamed: 0,biotype,counts
0,IG_C_gene,14
1,IG_C_pseudogene,2
2,IG_J_gene,4
3,IG_V_gene,27
4,IG_V_pseudogene,9
5,Mt_rRNA,2
6,Mt_tRNA,10
7,TEC,457
8,TR_C_gene,6
9,TR_J_gene,16
