In [1]:
import pandas as pd
import pyranges as pr
import sys
import os
from Bio import SearchIO
import yaml
from snakemake.io import expand
import pdb
import swan_vis as swan

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *
from scripts.mane_utils import *

In [2]:
# Parse the YAML config; the cells below look up their file path
# templates in the resulting `config` dict
config_file = '../snakemake/config.yml'
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)

In [3]:
def _cfg_path(template, species, **wildcards):
    """
    Resolve one path template from the config for a given species.

    Parameters:
        template (str): Path template taken from `config`
        species (str): Value for the `species` wildcard
        **wildcards: Any further wildcards the template needs
            (e.g. obs_col)

    Returns:
        path (str): First expanded path, prefixed with '../'
    """
    return '../'+expand(template, species=species, **wildcards)[0]

# human input files
ab = _cfg_path(config['data']['ab'], 'human')
filt_ab = _cfg_path(config['data']['filt_ab'], 'human')
read_annot = _cfg_path(config['data']['read_annot'], 'human')
ref_t_metadata = _cfg_path(config['ref']['t_info'], 'human')
ref_g_metadata = _cfg_path(config['ref']['g_info'], 'human')
t_metadata = _cfg_path(config['ref']['cerberus']['t_info'], 'human')
lib_meta = _cfg_path(config['data']['meta'], 'human')
swan_file = _cfg_path(config['data']['sg'], 'human')
cerberus_h5 = _cfg_path(config['data']['cerb_annot'], 'human')
cerb_t_metadata = _cfg_path(config['data']['t_info'], 'human')
major_isos = _cfg_path(config['data']['major_isos'], 'human', obs_col='sample')
pp_summary = _cfg_path(config['data']['p_pred']['summary'], 'human')

# analysis settings used by the cells below
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'

# mouse input files (m_lib_meta used to be assigned twice with the same
# value; it is now assigned once)
m_lib_meta = _cfg_path(config['data']['meta'], 'mouse')
m_ab = _cfg_path(config['data']['ab'], 'mouse')
m_filt_ab = _cfg_path(config['data']['filt_ab'], 'mouse')
m_read_annot = _cfg_path(config['data']['read_annot'], 'mouse')
m_ref_t_metadata = _cfg_path(config['ref']['t_info'], 'mouse')
m_ref_g_metadata = _cfg_path(config['ref']['g_info'], 'mouse')
m_t_metadata = _cfg_path(config['ref']['cerberus']['t_info'], 'mouse')
m_swan_file = _cfg_path(config['data']['sg'], 'mouse')
m_cerberus_h5 = _cfg_path(config['data']['cerb_annot'], 'mouse')
m_cerb_t_metadata = _cfg_path(config['data']['t_info'], 'mouse')
m_major_isos = _cfg_path(config['data']['major_isos'], 'mouse', obs_col='sample')
m_pp_summary = _cfg_path(config['data']['p_pred']['summary'], 'mouse')


mouse_ver = 'vM25_cerberus'

# Human 

## What % of novel IC protein coding transcripts are predicted to be NMD or non-coding?

In [4]:
def _print_frac(n_num, n, desc):
    """
    Print `n_num` out of `n` as a percentage followed by `desc`.
    Reports nan% instead of raising ZeroDivisionError when `n` is 0.
    """
    perc = (n_num/n)*100 if n else float('nan')
    print(f'{perc:.2f}% ({n_num}/{n}) of {desc}')

def find_pc_tids(h5,
                 pp_file,
                 filt_ab,
                 species,
                 min_tpm,
                 gene_subset='protein_coding'):
    """
    Print how many detected transcripts with (a) a novel intron chain and
    (b) a transcript ID absent from the reference annotations have a full
    ORF and are not predicted to be NMD.

    Parameters:
        h5 (str): Path to file read with cerberus.read / get_ca_table
        pp_file (str): Path to tab-separated protein prediction summary
            with columns `tid`, `full_orf`, and `nmd`
        filt_ab (str): Path to tab-separated filtered abundance file
        species (str): {'human', 'mouse'}
        min_tpm (float): Minimum TPM forwarded to get_tpm_table
        gene_subset (str): Gene subset forwarded to get_tpm_table.
            Previously read from a notebook-level global of the same
            name; the default matches the value the notebook sets

    Raises:
        ValueError: If `species` is not 'human' or 'mouse'
    """
    # annotation sources whose transcripts count as already known;
    # checked before any file is read so a bad species fails fast
    ref_sources = {'human': ['v40', 'v29'],
                   'mouse': ['vM21', 'vM25']}
    if species not in ref_sources:
        raise ValueError(f'Unsupported species {species!r}; '
                         f'expected one of {sorted(ref_sources)}')
    refs = ref_sources[species]

    ca = cerberus.read(h5)
    df = pd.read_csv(pp_file, sep='\t')
    ab_df = pd.read_csv(filt_ab, sep='\t')
    _, tids = get_tpm_table(ab_df,
                            how='iso',
                            min_tpm=min_tpm,
                            gene_subset=gene_subset,
                            species=species)

    # limit protein predictions to the transcripts that passed filtering
    df = df.loc[df.tid.isin(tids)]

    # attach the novelty of each transcript's intron chain
    feat = 'ic'
    df = add_feat(df, col='tid', kind=feat)
    feat_df = get_ca_table(h5, feat)
    feat_df = feat_df[['Name', 'novelty']].rename(
        columns={'novelty': '{}_novelty'.format(feat),
                 'Name': feat})
    df = df.merge(feat_df, how='left', on=feat)

    # transcripts whose intron chain is not annotated as Known
    temp = df.loc[df.ic_novelty != 'Known']
    n = len(temp.index)
    n_num = len(temp.loc[(temp.full_orf == True)&(temp.nmd == False)].index)
    _print_frac(n_num, n,
                'protein coding transcripts with novel ICs have a full ORF and no NMD')

    # number of novel transcripts (including 5' / 3' end)
    ref = ca.t_map.loc[ca.t_map.source.isin(refs)]
    ref_tids = ref.transcript_id.unique().tolist()
    temp = df.loc[~df.tid.isin(ref_tids)]
    n = len(temp.index)
    n_num = len(temp.loc[(temp.nmd == False)&(temp.full_orf == True)].index)
    _print_frac(n_num, n,
                'novel protein coding transcripts have a full ORF and no NMD')

In [5]:
# Human: print the full ORF / no NMD percentages for novel transcripts
find_pc_tids(h5=cerberus_h5,
             pp_file=pp_summary,
             filt_ab=filt_ab,
             species='human',
             min_tpm=min_tpm)

Calculating iso TPM values
Subsetting for protein_coding genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 187945
72.59% (53412/73582) of protein coding transcripts with novel ICs have a full ORF and no NMD
77.57% (90283/116395) of novel protein coding transcripts have a full ORF and no NMD


## How often does a non-MANE predominant isoform have the same ORF as the MANE isoform?

In [6]:
def get_mp_orf_table(sg,
                    ca,
                    pp_summary,
                    ref_t_metadata,
                    ref_g_metadata,
                    obs_col,
                    min_feat_tpm):
    """
    Take the table from get_mp_df_table (feat='triplet') and left-join the
    protein prediction columns onto it twice: once via `tid_princ` and
    once via `tid_mane`.

    Parameters:
        sg, ca, ref_t_metadata, ref_g_metadata, obs_col, min_feat_tpm:
            Forwarded unchanged to get_mp_df_table
        pp_summary (str): Path to tab-separated protein prediction summary
            with columns `tid`, `seq`, `len`, `nmd`, and `full_orf`

    Returns:
        pandas DataFrame: get_mp_df_table output plus the joined columns
    """
    orf_cols = ['tid', 'seq', 'len', 'nmd', 'full_orf']

    princ_mane_df = get_mp_df_table(sg, ca,
                                    ref_t_metadata,
                                    ref_g_metadata,
                                    obs_col,
                                    min_feat_tpm,
                                    feat='triplet')

    orf_df = pd.read_csv(pp_summary, sep='\t')[orf_cols]

    # pandas only applies suffixes to column names present on both sides.
    # NOTE(review): presumably the get_mp_df_table output has no columns
    # named like `orf_cols`, so the first join adds them unsuffixed and
    # '_orf_princ' is never used -- confirm against get_mp_df_table
    with_princ_orf = princ_mane_df.merge(orf_df,
                                         how='left',
                                         left_on='tid_princ',
                                         right_on='tid',
                                         suffixes=('', '_orf_princ'))

    # columns from the second join that clash with existing ones get
    # '_orf_mane' appended
    with_both_orfs = with_princ_orf.merge(orf_df,
                                          how='left',
                                          left_on='tid_mane',
                                          right_on='tid',
                                          suffixes=('', '_orf_mane'))

    return with_both_orfs

In [7]:
# Load the Swan graph and the cerberus annotation, then build the
# predominant vs. MANE transcript table with ORF predictions attached
sg = swan.read(swan_file)
ca = cerberus.read(cerberus_h5)
temp = get_mp_orf_table(sg=sg,
                        ca=ca,
                        pp_summary=pp_summary,
                        ref_t_metadata=ref_t_metadata,
                        ref_g_metadata=ref_g_metadata,
                        obs_col='dataset',
                        min_feat_tpm=min_tpm)


Read in graph from ../data/human/swan_graph.p


In [8]:
# Flag rows where the two joined sequences match; a missing sequence on
# either side compares as not equal
temp['triplet_princ_orf_is_mane'] = temp['seq'].eq(temp['seq_orf_mane'])

In [9]:
# Denominator: distinct predominant transcripts that are not MANE;
# numerator: those among them whose ORF sequence equals the MANE one
not_mane = temp.triplet_princ_is_mane==False
same_orf = temp.triplet_princ_orf_is_mane==True
n = len(temp.loc[not_mane, 'tid_princ'].unique())
n_num = len(temp.loc[not_mane&same_orf, 'tid_princ'].unique())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts have the same ORF as MANE')


18.79% (10086/53686) of non-MANE predominant transcripts have the same ORF as MANE


## How often is the non-MANE predominant transcript predicted to be NMD?

In [10]:
# Keep only rows where the predominant transcript is not MANE, then
# count distinct transcripts with a full ORF that are not flagged NMD
temp2 = temp.loc[temp.triplet_princ_is_mane==False]
full_orf_no_nmd = (temp2.nmd==False)&(temp2.full_orf==True)
n = len(temp2['tid_princ'].unique())
n_num = len(temp2.loc[full_orf_no_nmd, 'tid_princ'].unique())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts have a full ORF and no NMD')


84.40% (45311/53686) of non-MANE predominant transcripts have a full ORF and no NMD


# Mouse

## What % of novel IC protein coding transcripts are predicted to be NMD or non-coding?

In [11]:
# Mouse: print the full ORF / no NMD percentages for novel transcripts
find_pc_tids(h5=m_cerberus_h5,
             pp_file=m_pp_summary,
             filt_ab=m_filt_ab,
             species='mouse',
             min_tpm=min_tpm)

Calculating iso TPM values
Subsetting for protein_coding genes
Enforcing minimum TPM
Total # isos detected: 149463
# isos >= 1 tpm: 138642
Applying gene type and novelty subset
Number of isos reported: 127084
70.58% (34900/49448) of protein coding transcripts with novel ICs have a full ORF and no NMD
77.13% (61066/79169) of novel protein coding transcripts have a full ORF and no NMD
