In [13]:
import pandas as pd
import pyranges as pr
import sys
import os
from Bio import SearchIO
import yaml
from snakemake.io import expand
import pdb
import swan_vis as swan

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *
from scripts.mane_utils import *

In [14]:
config_file = '../snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [17]:
ab = '../'+expand(config['data']['ab'], species='human')[0]
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
read_annot = '../'+expand(config['data']['read_annot'], species='human')[0]
ref_t_metadata = '../'+expand(config['ref']['t_info'], species='human')[0]
ref_g_metadata = '../'+expand(config['ref']['g_info'], species='human')[0]
t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='human')[0]
lib_meta = '../'+expand(config['data']['meta'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
cerb_t_metadata = '../'+expand(config['data']['t_info'], species='human')[0]
major_isos = '../'+expand(config['data']['major_isos'], species='human', obs_col='sample')[0]
orf_fa = '../'+expand(config['data']['p_pred']['orf_fa'], species='human')[0]
pp_bed = '../'+expand(config['data']['p_pred']['cds_bed'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'

m_lib_meta = '../'+expand(config['data']['meta'], species='mouse')[0]
m_ab = '../'+expand(config['data']['ab'], species='mouse')[0]
m_filt_ab = '../'+expand(config['data']['filt_ab'], species='mouse')[0]
m_read_annot = '../'+expand(config['data']['read_annot'], species='mouse')[0]
m_ref_t_metadata = '../'+expand(config['ref']['t_info'], species='mouse')[0]
m_ref_g_metadata = '../'+expand(config['ref']['g_info'], species='mouse')[0]
m_t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='mouse')[0]
m_lib_meta = '../'+expand(config['data']['meta'], species='mouse')[0]
m_swan_file = '../'+expand(config['data']['sg'], species='mouse')[0]
m_cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='mouse')[0]
m_cerb_t_metadata = '../'+expand(config['data']['t_info'], species='mouse')[0]
m_major_isos = '../'+expand(config['data']['major_isos'], species='mouse', obs_col='sample')[0]
m_orf_fa = '../'+expand(config['data']['p_pred']['orf_fa'], species='mouse')[0]
m_pp_bed = '../'+expand(config['data']['p_pred']['cds_bed'], species='mouse')[0]

mouse_ver = 'vM25_cerberus'

In [6]:
df = read_pred(pp_bed)
df = read_orf_fa(orf_fa)

df.head()

Unnamed: 0,id,tid,seq,len
0,">ENSG00000000460.16;ENSG00000000460[3,10,3];NA...","ENSG00000000460[3,10,3]",MSQEGAVPASAVPLEELSSWPEELCRRELPSVLPRLLSLSQHSDSW...,911
1,">ENSG00000000460.16;ENSG00000000460[3,10,3];NA...","ENSG00000000460[3,10,3]",SFLSHHAARLGVTSGCRSRNKSDPRSGDFWPSPPHLNVMFTVIQTW...,157
2,">ENSG00000000460.16;ENSG00000000460[3,10,3];NA...","ENSG00000000460[3,10,3]",MFTVIQTWGCLLPDVSWDRASEKAHLTTPPPDLNRAPVSRVLQGLE...,119
3,">ENSG00000000460.16;ENSG00000000460[3,10,3];NA...","ENSG00000000460[3,10,3]",MHRIMLTTDYFRKHSNCVVFLPTPFCTTLRNFFLSSLILAVLCTNC...,70
4,">ENSG00000000460.16;ENSG00000000460[1,10,3];NA...","ENSG00000000460[1,10,3]",PLYGFGAGGERARVGGTASFRSGLWRVWFEALLFDESMSQEGAVPA...,948


In [7]:
iso_df = pd.read_csv(filt_ab, sep='\t')
iso_df, tids = get_tpm_table(iso_df,
                           how='iso',
                           min_tpm=min_tpm,
                           gene_subset=gene_subset)
len(tids)

Calculating iso TPM values
Subsetting for protein_coding genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 187945


187945

In [8]:
# get tids for each protein pred
df = df.loc[df.tid.isin(tids)]
len(df.index)

770999

In [None]:
ca = cerberus.read(cerberus_h5)

In [None]:
sg = swan.read(swan_file)

In [None]:
# get the acta1 transcripts from heart
def get_isos(ca, filt_ab, gene, sample, species):
    df = pd.read_csv(filt_ab, sep='\t')
    df = get_det_table(df,
                   groupby='sample',
                   how='iso',
                   min_tpm=1,
                   gene_subset='polya',
                   species=species)
    df = df.loc[sample]
    df = df.to_frame()
    df = df.loc[df[sample]==True]
    gid = ca.triplets.loc[ca.triplets.gname==gene, 'gid'].values[0]
    df.reset_index(inplace=True)
    df['gid'] = df['index'].str.split('[', expand=True)[0]
    df = df.loc[df.gid == gid]
    tids = df['index'].tolist()
    return tids

def get_tpm_df(sg, tids, obs_col, obs_condition):
    # get tpm df
    tpm_df = swan.calc_tpm(sg.adata, obs_col=obs_col).sparse.to_dense()
    tpm_df = tpm_df.transpose()
    tpm_df = tpm_df.loc[tids, obs_condition].to_frame()
    return tpm_df


In [None]:
obs_condition = 'heart'
tids = get_isos(ca, filt_ab, 'ACTA1', obs_condition, 'human')
tpm_df = get_tpm_df(sg, tids, obs_col, obs_condition)
tpm_df = tpm_df.sort_values(by=obs_condition, ascending=False)


In [None]:
tpm_df.head()
tids = tpm_df.index.tolist()

In [None]:
# df.set_index('tid', inplace=True)
df.loc[tids][['CDS_Start', 'CDS_Stop', 'frame', 'nmd']]

In [None]:
a_df = df.loc[df.tid.isin(tids)]
a_df[['tid', 'CDS_Start', 'CDS_Stop', 'frame', 'nmd']].sort_values(['CDS_Start', 'frame'])


## Okay, what are the actual ORF sequences for all these ACTA transcripts?

In [None]:
df = read_orf_fa(orf_fa)
# df.set_index('tid', inplace=True)

In [None]:
len(tids)
tids

In [None]:
# df.loc[tids][['seq', 'len']]
temp = df.sort_values(by='len', ascending=False).drop_duplicates(subset='tid', keep='first')
temp.set_index('tid', inplace=True)
temp.loc[tids][['seq', 'len']]

In [None]:
mane_orf = temp.loc['ENSG00000143632[1,1,1]', 'seq']
temp['mane_orf'] = temp.seq==mane_orf

In [None]:
temp = temp.merge(a_df, how='left', left_index=True, right_on='tid')

In [None]:
temp.set_index('tid', inplace=True)

In [None]:
temp.loc[tids][['seq', 'len', 'mane_orf', 'CDS_Start', 'CDS_Stop', 'frame', 'nmd']]

In [None]:
# plot some isoforms using swan
tid = 'ENSG00000143632[1,1,1]' 
sg.plot_transcript_path(tid, indicate_novel=True)

In [None]:
tid = 'ENSG00000143632[1,7,1]'
sg.plot_transcript_path(tid, indicate_novel=True)

In [None]:
tid = 'ENSG00000143632[1,6,1]'
sg.plot_transcript_path(tid, indicate_novel=True)

## What % of novel IC transcripts are predicted to be NMD?

In [9]:
ca = cerberus.read(cerberus_h5)

In [4]:
df = read_pred(pp_bed)
iso_df = pd.read_csv(filt_ab, sep='\t')
iso_df, tids = get_tpm_table(iso_df,
                           how='iso',
                           min_tpm=min_tpm,
                           gene_subset=gene_subset)
# get tids for each protein pred
df = df.loc[df.tid.isin(tids)]
len(df.index)

Calculating iso TPM values
Subsetting for protein_coding genes


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


Enforcing minimum TPM
Total # isos detected: 231136
# isos >= 1 tpm: 210239
Applying gene type and novelty subset
Number of isos reported: 187945


187945

In [5]:
for feat in ['tss', 'ic', 'tes']:
    df = add_feat(df, col='tid', kind=feat)
    feat_df = get_ca_table(cerberus_h5, feat)
    feat_df = feat_df[['Name', 'novelty']]
    feat_df.rename({'novelty': '{}_novelty'.format(feat),
                    'Name': feat}, axis=1, inplace=True)
    df = df.merge(feat_df, how='left', on=feat)

In [7]:
n = len(df.loc[df.ic_novelty != 'Known'].index)
n_num = len(df.loc[(df.ic_novelty != 'Known')&(df.nmd == True)].index)
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of protein coding transcripts with novel ICs are predicted NMD')

26.70% (19647/73582) of transcripts with novel ICs are predicted NMD


In [10]:
# number of novel transcripts (including 5' / 3' end)
refs = ['v40', 'v29']
ref = ca.t_map.loc[ca.t_map.source.isin(refs)]
ref_tids = ref.transcript_id.unique().tolist()

n = len(df.loc[df.tid.isin(ref_tids)].index)
n_num = len(df.loc[(df.tid.isin(ref_tids))&(df.nmd==True)].index)
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of protein coding novel transcripts are predicted NMD')

18.83% (13475/71550) of novel transcripts are predicted NMD


## How often does a non-MANE predominant isoform have the same ORF as the MANE isoform?

In [11]:
sg = swan.read(swan_file)
ca = cerberus.read(cerberus_h5)
mp_df = get_mp_df_table(sg, ca, 
                       ref_t_metadata,
                       ref_g_metadata,
                       'dataset',
                       min_tpm,
                       feat='triplet')

orf_df = read_orf_fa(orf_fa)
pp_df = read_pred(pp_bed)

Read in graph from ../data/human/swan_graph.p


In [12]:
temp = mp_df.merge(orf_df[['tid', 'seq', 'len']],
                    how='left',
                    left_on='tid_princ',
                    right_on='tid',
                    suffixes=('', '_orf_princ'))
temp = temp.merge(orf_df[['tid', 'seq', 'len']],
                    how='left',
                    left_on='tid_mane',
                    right_on='tid',
                    suffixes=('', '_orf_mane'))

In [45]:
temp = temp.merge(pp_df[['tid', 'nmd']],
            how='left', 
            left_on='tid_princ',
            right_on='tid',
            suffixes=('', '_pp_princ'))
temp = temp.merge(pp_df[['tid', 'nmd']],
            how='left', 
            left_on='tid_mane',
            right_on='tid',
            suffixes=('', '_pp_mane'))

In [47]:
temp['triplet_princ_orf_is_mane'] = temp['seq'] == temp['seq_orf_mane']

In [49]:
temp.columns

Index(['tid_princ', 'dataset', 'triplet_tpm_princ', 'triplet_pi_princ', 'gid',
       'gname', 'triplet_rank_princ', 'tid_mane', 'triplet_tpm_mane',
       'triplet_pi_mane', 'triplet_rank_mane', 'tid_sec', 'triplet_tpm_sec',
       'triplet_pi_sec', 'triplet_rank_sec', 'triplet_princ_is_mane',
       'triplet_sec_is_mane', 'tid', 'seq', 'len', 'tid_orf_mane',
       'seq_orf_mane', 'len_orf_mane', 'tid_pp_princ', 'nmd', 'tid_pp_mane',
       'nmd_pp_mane', 'triplet_princ_orf_is_mane'],
      dtype='object')

In [51]:
n = len(temp.loc[temp.triplet_princ_is_mane==False, 'tid_princ'].unique().tolist())
n_num = len(temp.loc[(temp.triplet_princ_is_mane==False)&(temp.triplet_princ_orf_is_mane==True), 'tid_princ'].unique().tolist())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts have the same ORF as MANE')


37.47% (20114/53686) of non-MANE predominant transcripts have the same ORF as MANE


## How often is non-MANE predominant transcript NMD?

In [52]:
n = len(temp.loc[temp.triplet_princ_is_mane==False, 'tid_princ'].unique().tolist())
n_num = len(temp.loc[(temp.triplet_princ_is_mane==False)&(temp.nmd==True), 'tid_princ'].unique().tolist())
print(f'{(n_num/n)*100:.2f}% ({n_num}/{n}) of non-MANE predominant transcripts are predicted NMD')


15.25% (8185/53686) of non-MANE predominant transcripts are predicted NMD
