In [3]:
import pandas as pd
import pyranges as pr
import sys
import os
from Bio import SearchIO
import yaml
from snakemake.io import expand
import pdb
import swan_vis as swan

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *
from scripts.mane_utils import *

In [4]:
# Parse the Snakemake workflow configuration; the path templates used by the
# following cell are looked up in this dictionary.
config_file = '../snakemake/config.yml'
with open(config_file, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

In [5]:
def _species_path(template, species, **wildcards):
    """Return the notebook-relative path for one config path template.

    Parameters
    ----------
    template : str
        Path template taken from `config`, containing a `{species}` wildcard
        (and possibly others supplied through `wildcards`).
    species : str
        Value substituted for the `species` wildcard.
    **wildcards
        Any additional wildcard values forwarded to snakemake's `expand`.

    Returns
    -------
    str
        The first expansion of the template, prefixed with '../'.
    """
    return '../' + expand(template, species=species, **wildcards)[0]


# ---- Human inputs ----
ab = _species_path(config['data']['ab'], 'human')
filt_ab = _species_path(config['data']['filt_ab'], 'human')
read_annot = _species_path(config['data']['read_annot'], 'human')
ref_t_metadata = _species_path(config['ref']['t_info'], 'human')
ref_g_metadata = _species_path(config['ref']['g_info'], 'human')
t_metadata = _species_path(config['ref']['cerberus']['t_info'], 'human')
lib_meta = _species_path(config['data']['meta'], 'human')
swan_file = _species_path(config['data']['sg'], 'human')
cerberus_h5 = _species_path(config['data']['cerb_annot'], 'human')
cerb_t_metadata = _species_path(config['data']['t_info'], 'human')
major_isos = _species_path(config['data']['major_isos'], 'human', obs_col='sample')
orf_fa = _species_path(config['data']['p_pred']['orf_fa'], 'human')
pp_bed = _species_path(config['data']['p_pred']['cds_bed'], 'human')

# Analysis settings shared by the human and mouse sections.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'

# ---- Mouse inputs (same config keys, `m_` prefix) ----
# `m_lib_meta` used to be assigned twice with the same value; it is set once here.
m_lib_meta = _species_path(config['data']['meta'], 'mouse')
m_ab = _species_path(config['data']['ab'], 'mouse')
m_filt_ab = _species_path(config['data']['filt_ab'], 'mouse')
m_read_annot = _species_path(config['data']['read_annot'], 'mouse')
m_ref_t_metadata = _species_path(config['ref']['t_info'], 'mouse')
m_ref_g_metadata = _species_path(config['ref']['g_info'], 'mouse')
m_t_metadata = _species_path(config['ref']['cerberus']['t_info'], 'mouse')
m_swan_file = _species_path(config['data']['sg'], 'mouse')
m_cerberus_h5 = _species_path(config['data']['cerb_annot'], 'mouse')
m_cerb_t_metadata = _species_path(config['data']['t_info'], 'mouse')
m_major_isos = _species_path(config['data']['major_isos'], 'mouse', obs_col='sample')
m_orf_fa = _species_path(config['data']['p_pred']['orf_fa'], 'mouse')
m_pp_bed = _species_path(config['data']['p_pred']['cds_bed'], 'mouse')

mouse_ver = 'vM25_cerberus'

# Human 

## What % of novel IC protein coding transcripts are predicted to be NMD?

In [20]:
# Cerberus annotation; its `t_map` table is consulted in a later cell to
# identify reference transcripts.
ca = cerberus.read(cerberus_h5)

# Protein predictions for the human transcripts.
df = read_pred(pp_bed)

# Filtered abundance table -> isoform-level TPM table plus the transcript ids
# that pass the TPM / gene-subset filters.
iso_df = pd.read_csv(filt_ab, sep='\t')
tpm_kwargs = dict(how='iso', min_tpm=min_tpm, gene_subset=gene_subset)
iso_df, tids = get_tpm_table(iso_df, **tpm_kwargs)

# Keep only the predictions whose transcript passed those filters, then show
# how many remain.
df = df[df['tid'].isin(tids)]
df.shape[0]

Calculating iso TPM values
Subsetting for protein_coding genes
Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 187945


187945

In [21]:
feat = 'ic'

# Add the `ic` feature column to the predictions (keyed on `tid`).
df = add_feat(df, col='tid', kind=feat)

# Build a two-column lookup (feature name -> novelty) whose columns are named
# so it can be joined on `feat` and land in an `ic_novelty` column.
feat_df = (
    get_ca_table(cerberus_h5, feat)[['Name', 'novelty']]
    .rename(columns={'Name': feat, 'novelty': f'{feat}_novelty'})
)
df = df.merge(feat_df, on=feat, how='left')

In [26]:
# Rows whose `ic_novelty` is anything other than 'Known', and the subset of
# those that are flagged NMD.
novel_ic = df['ic_novelty'] != 'Known'
novel_ic_nmd = novel_ic & (df['nmd'] == True)

n = df.loc[novel_ic].shape[0]
n_num = df.loc[novel_ic_nmd].shape[0]
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of protein coding transcripts with novel ICs are predicted NMD')

26.70% (19647/73582) of protein coding transcripts with novel ICs are predicted NMD


In [37]:
# number of novel transcripts (including 5' / 3' end)
refs = ['v40', 'v29']
ref = ca.t_map.loc[ca.t_map.source.isin(refs)]
ref_tids = ref.transcript_id.unique().tolist()

n = len(df.loc[~df.tid.isin(ref_tids)].index)
n_num = len(df.loc[~(df.tid.isin(ref_tids))&(df.nmd==True)].index)
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of novel protein coding transcripts are predicted NMD')

21.90% (25494/116395) of novel protein coding transcripts are predicted NMD


## How often does a non-MANE predominant isoform have the same ORF as the MANE isoform?

In [10]:
# SwanGraph and cerberus annotation for human.
sg = swan.read(swan_file)
ca = cerberus.read(cerberus_h5)

# Table pairing MANE and predominant transcripts; later cells read its
# `tid_princ`, `tid_mane` and `triplet_princ_is_mane` columns.
mp_args = (sg, ca, ref_t_metadata, ref_g_metadata, 'dataset', min_tpm)
mp_df = get_mp_df_table(*mp_args, feat='triplet')

# ORF sequences and protein predictions, both keyed by `tid`.
orf_df = read_orf_fa(orf_fa)
pp_df = read_pred(pp_bed)

Read in graph from ../data/human/swan_graph.p


In [11]:
# Attach ORF sequence/length twice: first for the predominant transcript,
# then for the MANE transcript. Colliding right-hand columns receive the
# given suffix, so after the second merge the un-suffixed `seq`/`len` come
# from the first merge and `seq_orf_mane`/`len_orf_mane` from the second
# (assuming `mp_df` has no `tid`/`seq`/`len` columns of its own - TODO confirm).
orf_cols = orf_df[['tid', 'seq', 'len']]
temp = mp_df
for tid_col, right_suffix in (('tid_princ', '_orf_princ'),
                              ('tid_mane', '_orf_mane')):
    temp = temp.merge(orf_cols,
                      how='left',
                      left_on=tid_col,
                      right_on='tid',
                      suffixes=('', right_suffix))

In [12]:
# Attach the NMD flag twice: first for the predominant transcript, then for
# the MANE transcript. Colliding right-hand columns receive the given suffix,
# so the second merge's flag lands in `nmd_pp_mane` while `nmd` keeps the
# value from the first merge (assuming `temp` had no `nmd` column beforehand
# - TODO confirm).
nmd_cols = pp_df[['tid', 'nmd']]
for tid_col, right_suffix in (('tid_princ', '_pp_princ'),
                              ('tid_mane', '_pp_mane')):
    temp = temp.merge(nmd_cols,
                      how='left',
                      left_on=tid_col,
                      right_on='tid',
                      suffixes=('', right_suffix))

In [13]:
# True where the `seq` column matches `seq_orf_mane` element-wise; rows with a
# missing sequence on either side compare as False.
temp['triplet_princ_orf_is_mane'] = temp['seq'].eq(temp['seq_orf_mane'])

In [14]:
# Count distinct predominant transcripts (not rows) so repeated rows for the
# same `tid_princ` are only counted once.
non_mane = temp['triplet_princ_is_mane'] == False
same_orf = temp['triplet_princ_orf_is_mane'] == True

n = len(temp.loc[non_mane, 'tid_princ'].unique())
n_num = len(temp.loc[non_mane & same_orf, 'tid_princ'].unique())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts have the same ORF as MANE')


37.47% (20114/53686) of non-MANE predominant transcripts have the same ORF as MANE


## How often is a non-MANE predominant transcript predicted to be NMD?

In [15]:
# Count distinct predominant transcripts (not rows); the `nmd` column is the
# un-suffixed one produced by the earlier merges.
non_mane = temp['triplet_princ_is_mane'] == False
flagged_nmd = temp['nmd'] == True

n = len(temp.loc[non_mane, 'tid_princ'].unique())
n_num = len(temp.loc[non_mane & flagged_nmd, 'tid_princ'].unique())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts are predicted NMD')


15.25% (8185/53686) of non-MANE predominant transcripts are predicted NMD


# Mouse

## What % of novel IC protein coding transcripts are predicted to be NMD?

In [38]:
# Mouse cerberus annotation; its `t_map` table is consulted in a later cell to
# identify reference transcripts.
ca = cerberus.read(m_cerberus_h5)

# Protein predictions for the mouse transcripts.
df = read_pred(m_pp_bed)

# Filtered abundance table -> isoform-level TPM table plus the transcript ids
# that pass the TPM / gene-subset filters.
iso_df = pd.read_csv(m_filt_ab, sep='\t')
tpm_kwargs = dict(how='iso',
                  min_tpm=min_tpm,
                  gene_subset=gene_subset,
                  species='mouse')
iso_df, tids = get_tpm_table(iso_df, **tpm_kwargs)

# Keep only the predictions whose transcript passed those filters, then show
# how many remain.
df = df[df['tid'].isin(tids)]
df.shape[0]

Calculating iso TPM values
Subsetting for protein_coding genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 149463
# isos >= 1 tpm: 138642
Applying gene type and novelty subset
Number of isos reported: 127084


127084

In [39]:
feat = 'ic'

# Add the `ic` feature column to the mouse predictions (keyed on `tid`).
df = add_feat(df, col='tid', kind=feat)

# Build a two-column lookup (feature name -> novelty) whose columns are named
# so it can be joined on `feat` and land in an `ic_novelty` column.
feat_df = (
    get_ca_table(m_cerberus_h5, feat)[['Name', 'novelty']]
    .rename(columns={'Name': feat, 'novelty': f'{feat}_novelty'})
)
df = df.merge(feat_df, on=feat, how='left')

In [40]:
# Rows whose `ic_novelty` is anything other than 'Known', and the subset of
# those that are flagged NMD.
novel_ic = df['ic_novelty'] != 'Known'
novel_ic_nmd = novel_ic & (df['nmd'] == True)

n = df.loc[novel_ic].shape[0]
n_num = df.loc[novel_ic_nmd].shape[0]
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of protein coding transcripts with novel ICs are predicted NMD')

28.86% (14271/49448) of protein coding transcripts with novel ICs are predicted NMD


In [41]:
# number of novel transcripts (including 5' / 3' end)
refs = ['vM21', 'vM25']
ref = ca.t_map.loc[ca.t_map.source.isin(refs)]
ref_tids = ref.transcript_id.unique().tolist()

n = len(df.loc[~df.tid.isin(ref_tids)].index)
n_num = len(df.loc[~(df.tid.isin(ref_tids))&(df.nmd==True)].index)
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of novel protein coding transcripts are predicted NMD')

22.42% (17752/79169) of novel protein coding transcripts are predicted NMD
