In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import pdb
import copy
from scipy import sparse
import anndata
import cerberus
import subprocess
import re

# Put the directory two levels above the current working directory on the
# import path (presumably where the `scripts` package imported below lives).
p = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [3]:
# Input paths and settings used by the cells below.
major_set = '../swan/isos_sample_gene_90.tsv'  # not referenced in the cells shown here
# c_annot = 'cerberus_annot_triplets.h5'
swan_file = '../swan/swan.p'  # not referenced in the cells shown here
filt_ab = '../cerberus/cerberus_filtered_abundance.tsv'  # tab-separated table, read with pd.read_csv
h5 = 'gtex_cerberus.h5'  # loaded with cerberus.read
ver = 'v40_cerberus'  # passed to get_gtf_info as `ver`

gene_subset = 'polya'  # passed to get_tpm_table; a truthy value also enables the polyA filter on the gtex rows
min_tpm = 1  # passed to get_tpm_table as `min_tpm`

In [42]:
# Read the cerberus h5 file; its t_map / ic / tss / tes tables are used below.
ca = cerberus.read(h5)

## Cerberus transcripts

In [43]:
# Observed lapa transcripts: load the filtered abundance table and pass it
# through get_tpm_table with the TPM threshold and gene subset from the
# config cell. `tids` is what the following cell uses to filter transcripts.
df, tids = get_tpm_table(
    pd.read_csv(filt_ab, sep='\t'),
    how='iso',
    min_tpm=min_tpm,
    gene_subset=gene_subset,
)

Calculating iso TPM values
Subsetting for polya genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 206806


In [44]:
# Start from the transcript-map rows whose source is 'lapa'.
df = ca.t_map.loc[ca.t_map.source == 'lapa'].copy(deep=True)

# For each of ic / tss / tes, look up the `novelty` of the row's `<feat>_id`
# in the matching cerberus table and store it as `<feat>_novelty`.
for feat in ('ic', 'tss', 'tes'):
    df = (
        df.merge(getattr(ca, feat)[['Name', 'novelty']],
                 how='left', left_on=f'{feat}_id', right_on='Name')
          .rename({'novelty': f'{feat}_novelty'}, axis=1)
          .drop('Name', axis=1)
    )

# Keep only the transcripts returned by get_tpm_table.
df = df.loc[df.transcript_id.isin(tids)]

In [45]:
# number of distinct transcript IDs left after filtering on `tids`
len(df.transcript_id.unique())

206806

In [46]:
# Write the lapa rows (with header). This overwrites any existing file; the
# gtex rows are appended to this same file further down.
df.to_csv('cerberus_transcript_novelty.tsv', sep='\t', index=False)

## GTEx transcripts

In [47]:
# list the distinct `source` values present in the transcript map
ca.t_map.source.unique()

array(['v40', 'v29', 'lapa', 'gtex'], dtype=object)

In [48]:
# Start from the transcript-map rows whose source is 'gtex'.
df = ca.t_map.loc[ca.t_map.source == 'gtex'].copy(deep=True)

# For each of ic / tss / tes, look up the `novelty` of the row's `<feat>_id`
# in the matching cerberus table and store it as `<feat>_novelty`.
for feat in ('ic', 'tss', 'tes'):
    df = (
        df.merge(getattr(ca, feat)[['Name', 'novelty']],
                 how='left', left_on=f'{feat}_id', right_on='Name')
          .rename({'novelty': f'{feat}_novelty'}, axis=1)
          .drop('Name', axis=1)
    )

In [49]:
# preview the gtex rows with their ic / tss / tes novelty columns
df.head()

Unnamed: 0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name,tss_first_sd_issue,tes_last_sa_issue,source,ic_novelty,tss_novelty,tes_novelty
0,0016cd97-0137-4194-9821-910931c6e972_ENSG00000...,22,ENSG00000086015_22,ENSG00000086015_5,5,ENSG00000086015_1,1.0,ENSG00000086015,MAST2,0016cd97-0137-4194-9821-910931c6e972_ENSG00000...,"[5,22,1]","ENSG00000086015[5,22,1]","MAST2[5,22,1]",False,False,gtex,NIC,Known,Known
1,001eea3f-368c-444f-96c0-539cb2204403_ENSG00000...,28,ENSG00000162736_28,ENSG00000162736_1,1,ENSG00000162736_1,1.0,ENSG00000162736,NCSTN,001eea3f-368c-444f-96c0-539cb2204403_ENSG00000...,"[1,28,1]","ENSG00000162736[1,28,1]","NCSTN[1,28,1]",False,False,gtex,NIC,Known,Known
2,0039fbf5-7334-40e1-a1a9-2aa34b6e009c_ENSG00000...,23,ENSG00000084072_23,ENSG00000084072_1,1,ENSG00000084072_3,3.0,ENSG00000084072,PPIE,0039fbf5-7334-40e1-a1a9-2aa34b6e009c_ENSG00000...,"[1,23,3]","ENSG00000084072[1,23,3]","PPIE[1,23,3]",False,False,gtex,NIC,Known,Known
3,003cbbd1-0cb2-43d0-ba48-daabd4d7d518_ENSG00000...,13,ENSG00000137965_13,ENSG00000137965_1,1,ENSG00000137965_1,1.0,ENSG00000137965,IFI44,003cbbd1-0cb2-43d0-ba48-daabd4d7d518_ENSG00000...,"[1,13,1]","ENSG00000137965[1,13,1]","IFI44[1,13,1]",False,False,gtex,NIC,Known,Known
4,004a6304-0523-477e-ad32-90215c977dca_ENSG00000...,77,ENSG00000127603_77,ENSG00000127603_39,39,ENSG00000127603_36,36.0,ENSG00000127603,MACF1,004a6304-0523-477e-ad32-90215c977dca_ENSG00000...,"[39,77,36]","ENSG00000127603[39,77,36]","MACF1[39,77,36]",False,False,gtex,ISM,Known,Known


In [50]:
# limit to polya
if gene_subset:
    gene_df, _, _ = get_gtf_info(how='gene',
                                 ver=ver,
                                 add_stable_gid=True)
    gene_df = gene_df[['gid_stable', 'biotype']]
    df = df.merge(gene_df, how='left',
                    left_on='gene_id', right_on='gid_stable')
    df = df.loc[df.biotype.isin(get_polya_cats())]
    df.drop(['gid_stable', 'biotype'], axis=1, inplace=True)

In [51]:
# Append the gtex rows to the table that already holds the lapa rows.
# Appending is not idempotent: re-running this cell without first re-running
# the cell that (re)creates the file would duplicate every gtex row. So only
# append when the file does not contain gtex rows yet.
novelty_file = 'cerberus_transcript_novelty.tsv'
written_sources = pd.read_csv(novelty_file, sep='\t', usecols=['source']).source
if (written_sources == 'gtex').any():
    print(f'{novelty_file} already contains gtex rows; not appending again. '
          'Re-run the cell that writes the lapa rows first to rebuild the file.')
else:
    # The rows are written without a header, so line the columns up with the
    # header already in the file (raises KeyError if a column is missing).
    file_cols = pd.read_csv(novelty_file, sep='\t', nrows=0).columns
    df[file_cols].to_csv(novelty_file, sep='\t', index=False, mode='a', header=False)

## Make sure everything looks ok

In [4]:
# Read the combined (lapa + gtex) table back from disk for the checks below.
df = pd.read_csv('cerberus_transcript_novelty.tsv', sep='\t')

In [56]:
# number of distinct transcript IDs among the lapa rows
len(df.loc[df.source == 'lapa', 'transcript_id'].unique())

206806

In [57]:
# number of distinct transcript IDs among the gtex rows
len(df.loc[df.source == 'gtex', 'transcript_id'].unique())

87014

In [6]:
# number of distinct transcript IDs across both sources
len(df.transcript_id.unique())

267923