In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
# from proc_revisions.mane_utils import *

In [2]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
species = 'human'
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
lib_meta = od+expand(config['lr']['meta'], species=species)[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='human')[0]
gtf_file = od+expand(config['lr']['cerberus']['gtf'], species='human')[0]

gtex_gtf_file = od+expand(config['gtex']['gtf'], species='human')[0]

gene_subset = 'polya'
min_tpm = 1

species = 'mouse'
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species=species)[0]
m_gtf = od+expand(config['lr']['cerberus']['gtf'], species=species)[0]
m_lib_meta = od+expand(config['lr']['meta'], species=species)[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species=species, obs_col='sample')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species=species)[0]
m_ref_gtf = od+expand(config['ref']['new_gtf'], species=species)[0]


## Get novel transcripts in BED format

In [7]:
gtf_df = pr.read_gtf(m_gtf).df

In [8]:
# get expressed novel genes
df = pd.read_csv(m_filt_ab, sep='\t')
df, _ = get_tpm_table(df, 
                      how='iso',
                      min_tpm=1,
                      gene_subset=None,
                      species='mouse')

df.reset_index(inplace=True)
df['gid'] = df.annot_transcript_id.str.split('[', expand=True)[0]
df.head()
df.reset_index(inplace=True)
df.rename({'gid_stable':'gid'}, axis=1, inplace=True)
ab_df = pd.read_csv(m_filt_ab, sep='\t')
ab_df['gid'] = ab_df.annot_transcript_id.str.split('[', expand=True)[0]
ab_df = ab_df[['gid', 'gene_novelty']].drop_duplicates()
ab_df.loc[ab_df.gid.duplicated(keep=False)].sort_values(by='gid')
assert len(ab_df.loc[ab_df.gid.duplicated(keep=False)].index) == 0
df = df.merge(ab_df, how='left', on='gid')
df.loc[df.gene_novelty=='Fusion', 'gene_novelty'] = 'Readthrough'
df.head()

Calculating iso TPM values


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 153398
# isos >= 1 tpm: 142504
Number of isos reported: 142504


Unnamed: 0,index,annot_transcript_id,adrenal_14d_f_2,hippocampus_18-20mo_m_1,hippocampus_2mo_f_1,cortex_wt_m_2_2,adrenal_gland_2_2,adrenal_4d_f_1,adrenal_36d_m_1,cortex_14d_f_2,...,gastroc_2mo_f_1,gastroc_14d_f_2,hippocampus_wt_f_1_2,gastroc_25d_m_2,hippocampus_5x_f_1_1,heart_2mo_m_1,adrenal_4d_m_1,adrenal_18-20mo_m_1,gid,gene_novelty
0,0,"ENSMUSG00000051951[2,2,3]",0.0,0.0,0.0,0.451524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.629709,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000051951,Known
1,1,"ENSMUSG00000089699[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.123974,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000089699,Known
2,2,"ENSMUSG00000103161[1,1,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.629709,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000103161,Known
3,3,"ENSMUSG00000025902[1,2,1]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known
4,4,"ENSMUSG00000025902[1,2,3]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.123974,1.259419,0.0,0.0,0.0,0.0,0.0,ENSMUSG00000025902,Known


In [10]:
# get novelty types of each IC from cerberus
ca = cerberus.read(m_cerberus_h5)
temp = ca.ic.copy(deep=True)

In [13]:
df['ic_id'] = df['gid']+'_'+\
    df.annot_transcript_id.str.split('[',expand=True)[1].str.split(',', expand=True)[1]

In [21]:
df = df.merge(temp[['Name', 'novelty']],
              how='left',
              left_on='ic_id',
              right_on='Name')

In [24]:
df[['annot_transcript_id', 'novelty']].groupby('novelty').nunique()

Unnamed: 0_level_0,annot_transcript_id
novelty,Unnamed: 1_level_1
ISM,7275
Known,93866
NIC,22847
NNC,18516


In [25]:
df[['annot_transcript_id', 'gene_novelty']].groupby('gene_novelty').nunique()

Unnamed: 0_level_0,annot_transcript_id
gene_novelty,Unnamed: 1_level_1
Intergenic,29
Known,142366
Readthrough,109


In [26]:
df[['annot_transcript_id', 'gene_novelty', 'novelty']].groupby(['gene_novelty', 'novelty']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,annot_transcript_id
gene_novelty,novelty,Unnamed: 2_level_1
Intergenic,NNC,29
Known,ISM,7275
Known,Known,93866
Known,NIC,22847
Known,NNC,18378
Readthrough,NNC,109


In [30]:
# list of detected, novel transcripts
tids = df.loc[df.novelty!='Known'].annot_transcript_id.tolist()
print(len(tids))

48638


In [32]:
# filter gtf and save
gtf_df = gtf_df.loc[gtf_df.transcript_id.isin(tids)]
print(len(gtf_df.transcript_id.unique()))

pr.PyRanges(gtf_df).to_gtf('mouse_novel_transcripts.gtf')

48638


In [None]:
# convert to BED


## Liftover

In [6]:
# download the chain file
# !wget https://hgdownload.soe.ucsc.edu/goldenPath/mm39/liftOver/mm39ToMm10.over.chain.gz -O /Users/fairliereese/Documents/programming/mortazavi_lab/data/paper_rnawg/proc_revisions/ref/mouse/mm39ToMm10.over.chain.gz