In [3]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus
import subprocess
import re

# Put the directory three levels above the current working directory on sys.path
# (presumably the repo root holding the `scripts` package imported below — confirm)
p = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [4]:
# ---- analysis parameters ----
# gene subset and minimum TPM handed to get_tpm_table()
gene_subset, min_tpm = 'polya', 1

# ---- inputs used by the cells below ----
c_annot = 'cerberus_annot_triplets.h5'            # read with cerberus.read()
filt_ab = '../cerberus_filtered_abundance.tsv'    # tab-separated abundance table

# ---- other paths (not referenced in the visible cells) ----
h5 = 'gtex_cerberus.h5'
gtf = '../cerberus.gtf'
swan_file = '../swan/swan.p'
major_set = '../swan/isos_sample_gene_90.tsv'

In [7]:
def _add_feature_novelties(ca, source):
    """
    Subset the cerberus transcript map to one source and attach the novelty
    of each transcript's ic, tss, and tes.

    Parameters:
        ca: Object returned by cerberus.read(); its t_map, ic, tss, and tes
            tables are used
        source (str): Value of t_map.source to keep

    Returns:
        df (pandas DataFrame): Rows of ca.t_map from `source` with added
            ic_novelty, tss_novelty, and tes_novelty columns
    """
    df = ca.t_map.loc[ca.t_map.source == source].copy(deep=True)
    for feat in ('ic', 'tss', 'tes'):
        nov = getattr(ca, feat)[['Name', 'novelty']]
        # left merge keeps transcripts whose feature has no match in the table
        df = df.merge(nov, how='left', left_on=f'{feat}_id', right_on='Name')
        df = df.rename({'novelty': f'{feat}_novelty'}, axis=1)
        df = df.drop('Name', axis=1)
    return df


def get_transcript_novelties(c_annot,
                             filt_ab,
                             min_tpm,
                             gene_subset,
                             ofile):
    """
    Write a table of transcripts with the novelty of their ic, tss, and tes.

    The output holds the lapa transcripts that pass the get_tpm_table()
    filters, followed by any transcripts whose source is 'gtex'. Previously
    the lapa table was overwritten by the gtex subset before writing, so the
    lapa rows never reached the output file.

    Parameters:
        c_annot (str): Path to cerberus annotation, read with cerberus.read()
        filt_ab (str): Path to tab-separated abundance table
        min_tpm (float): Minimum TPM passed to get_tpm_table()
        gene_subset (str): Gene subset passed to get_tpm_table()
        ofile (str): Path of the tab-separated output file (written without
            the index column)
    """
    ca = cerberus.read(c_annot)

    # only the IDs of the transcripts passing the filters are needed here;
    # the TPM table itself is discarded
    ab_df = pd.read_csv(filt_ab, sep='\t')
    _, tids = get_tpm_table(ab_df,
                   how='iso',
                   min_tpm=min_tpm,
                   gene_subset=gene_subset)

    # observed lapa transcripts
    lapa_df = _add_feature_novelties(ca, 'lapa')
    lapa_df = lapa_df.loc[lapa_df.transcript_id.isin(tids)]

    # gtex stuff (not restricted to the observed transcript IDs)
    gtex_df = _add_feature_novelties(ca, 'gtex')

    # skip empty tables so concat does not have to guess dtypes from them;
    # if both are empty, still write the header via the (empty) lapa table
    frames = [f for f in (lapa_df, gtex_df) if not f.empty]
    df = pd.concat(frames, ignore_index=True) if frames else lapa_df

    # the positional index carries no information after merge / concat
    df.to_csv(ofile, sep='\t', index=False)

In [8]:
# write the novelty table using the notebook-level settings
get_transcript_novelties(
    c_annot=c_annot,
    filt_ab=filt_ab,
    min_tpm=min_tpm,
    gene_subset=gene_subset,
    ofile='test_novelties.tsv',
)

Calculating iso TPM values
Subsetting for polya genes
Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 206806


In [34]:
# load the cerberus annotation; the cells below use its t_map, ic, tss and tes tables
ca = cerberus.read(c_annot)

## Cerberus transcripts

In [22]:
# get observed lapa transcripts
# get observed lapa transcripts: TPM table plus the IDs passing the filters
df, tids = get_tpm_table(
    pd.read_csv(filt_ab, sep='\t'),
    how='iso',
    min_tpm=min_tpm,
    gene_subset=gene_subset,
)

Calculating iso TPM values
Subsetting for polya genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 206806


In [23]:
# start from the lapa rows of the transcript map
df = ca.t_map.loc[ca.t_map.source=='lapa'].copy(deep=True)

# attach <feature>_novelty for each of ic, tss and tes
for feat in ('ic', 'tss', 'tes'):
    feat_nov = getattr(ca, feat)[['Name', 'novelty']]
    df = (
        df.merge(feat_nov, how='left', left_on=f'{feat}_id', right_on='Name')
          .rename({'novelty': f'{feat}_novelty'}, axis=1)
          .drop('Name', axis=1)
    )

# keep only the transcripts that passed the TPM / gene subset filters
df = df.loc[df.transcript_id.isin(tids)]

In [24]:
# number of distinct transcript IDs left after filtering
len(df.transcript_id.unique())

206806

## GTEx transcripts

In [31]:
# list the distinct values of the transcript map's source column
ca.t_map.source.unique()

array(['v40', 'v29', 'lapa'], dtype=object)

In [32]:
# start from the gtex rows of the transcript map
# NOTE(review): the source listing above shows only 'v40', 'v29' and 'lapa',
# so this selection looks empty for this annotation — confirm the intended
# source / input file
df = ca.t_map.loc[ca.t_map.source=='gtex'].copy(deep=True)

# attach <feature>_novelty for each of ic, tss and tes
for feat in ('ic', 'tss', 'tes'):
    feat_nov = getattr(ca, feat)[['Name', 'novelty']]
    df = (
        df.merge(feat_nov, how='left', left_on=f'{feat}_id', right_on='Name')
          .rename({'novelty': f'{feat}_novelty'}, axis=1)
          .drop('Name', axis=1)
    )

In [33]:
# preview the table built in the previous cell
df.head()

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name,tss_first_sd_issue,tes_last_sa_issue,source,ic_novelty,tss_novelty,tes_novelty


In [None]:
# limit to polya
if gene_subset:
    gene_df, _, _ = get_gtf_info(how='gene',
                                 ver=ver,
                                 add_stable_gid=True)
    gene_df = gene_df[['gid_stable', 'biotype']]
    df = df.merge(gene_df, how='left',
                    left_on='gid', right_on='gid_stable')
    df = df.loc[df.biotype==gene_subset]

In [24]:
# save the current table as a tab-separated file, without the index column
df.to_csv('cerberus_transcript_novelty.tsv', sep='\t', index=False)

In [2]:
# read the saved novelty table back in and report its number of rows
import pandas as pd
df = pd.read_csv('cerberus_transcript_novelty.tsv',sep='\t')
len(df.index)

322461