In [45]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Put the directory two levels above the current working directory on the
# import path so the `scripts.*` imports below resolve (presumably this is the
# repository root -- depends on where the notebook is launched from).
p = os.path.dirname(os.path.dirname(os.getcwd()))
if p not in sys.path:  # keep re-runs of this cell from stacking duplicate entries
    sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [46]:
# Parse the pipeline's YAML configuration; its entries are used as path
# templates in the following cell.
config_file = '../snakemake/config.yml'
with open(config_file, 'r') as config_handle:
    config = yaml.safe_load(config_handle)

In [52]:
def _rel_path(template, **wildcards):
    """Expand a config path template and return its first result prefixed with '../'."""
    return '../' + expand(template, **wildcards)[0]


_data_cfg = config['data']

# human input paths
ab = _rel_path(_data_cfg['ab'], species='human')
filt_ab = _rel_path(_data_cfg['filt_ab'], species='human')
read_annot = _rel_path(_data_cfg['read_annot'], species='human')
t_metadata = _rel_path(config['ref']['cerberus']['t_info'], species='human')
lib_meta = _rel_path(_data_cfg['meta'], species='human')
swan_file = _rel_path(_data_cfg['sg'], species='human')
cerberus_h5 = _rel_path(_data_cfg['cerb_annot'], species='human')
cerb_t_metadata = _rel_path(_data_cfg['t_info'], species='human')
major_isos = _rel_path(_data_cfg['major_isos'], species='human', obs_col='sample')
pi_tpm_table = _rel_path(_data_cfg['pi_tpm']['triplet'], species='human', obs_col='sample')


# analysis settings (not all of them are used in the cells visible here)
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

# mouse library metadata path
m_lib_meta = _rel_path(_data_cfg['meta'], species='mouse')

## ORF

In [56]:
# Path to the ORF table (tab-separated; loaded with pandas in the next cell).
# The path is relative, so it is resolved against the current working directory.
orfs = '03-orfs_called_encode_human.tsv'

In [57]:
# Load the ORF table and derive `tid` as the second ';'-delimited field of `ID`.
orf_df = pd.read_csv(orfs, sep='\t')
orf_df = orf_df.assign(tid=orf_df['ID'].str.split(';', expand=True)[1])

In [58]:
# Inspect which columns the ORF table provides (including the derived `tid`).
orf_df.columns

Index(['ID', 'len', 'orf_frame', 'orf_start', 'orf_end', 'orf_len', 'fickett',
       'hexamer', 'coding_score', 'orf_rank', 'seqname', 'strand',
       'gencode_atg', 'upstream_atgs', 'atg_rank', 'score_rank',
       'orf_calling_confidence', 'atg_score', 'orf_score', 'has_stop_codon',
       'tid'],
      dtype='object')

In [59]:
# df.loc[df.tid.isin(acta_ts)][['tid', 'orf_frame', 'orf_start', 'orf_end', 'orf_score', 'has_stop_codon', 'tid',
#                               'orf_calling_confidence']]

In [61]:
# Number of rows in the ORF table ...
print(orf_df.shape[0])

# ... versus number of distinct `tid` values; equal counts mean one row per tid.
print(orf_df['tid'].nunique(dropna=False))

229661
229661


In [62]:
# # get expressed transcripts
# ab_df = pd.read_csv(filt_ab, sep='\t')
# df, ids = get_tpm_table(ab_df,
#                how='iso',
#                min_tpm=min_tpm,
#                gene_subset=gene_subset)
# # merge with extra info
# df.reset_index(inplace=True)
# df = df.merge(ab_df[['annot_transcript_id', 'annot_transcript_name', 'annot_gene_id', 'gene_novelty', 'n_exons', 'annot_gene_name']],
#               how='left', 
#               on='annot_transcript_id')

In [63]:
# df.head()

In [64]:
# acta_t = df.loc[df.annot_gene_name=='ACTA1']
# acta_ts = acta_t.annot_transcript_id.tolist()

## How often does a non-MANE predominant transcript have a different ORF than the MANE one?

In [79]:
# Peek at the table stored at the `major_isos` path.
# NOTE(review): `df` is reassigned from `pi_tpm_table` in a later cell, so this
# frame does not survive a top-to-bottom run.
df = pd.read_table(major_isos)
df.head(5)

Unnamed: 0,tid,gname,gid,sample,pi,pi_rank
0,"ENSG00000237973[1,1,1]",MTCO1P12,ENSG00000237973.1,h9_neural_crest,100.0,1
1,"ENSG00000270019[1,1,1]",AC110769.2,ENSG00000270019.1,gm12878,100.0,1
2,"ENSG00000236682[1,3,1]",AC068282.1,ENSG00000236682.1,gm12878,100.0,1
3,"ENSG00000260163[1,1,1]",AC012508.1,ENSG00000260163.1,gm12878,100.0,1
4,"ENSG00000222467[1,1,1]",RF00019,ENSG00000222467.1,gm12878,100.0,1


In [72]:
# Load the tab-separated table at `pi_tpm_table`.
df = pd.read_table(pi_tpm_table)

In [73]:
# Number of distinct transcript ids in the table (a missing value would count once).
df['tid'].nunique(dropna=False)

230018

In [75]:
# Preview the first rows of the table loaded from `pi_tpm_table`.
df.head()

Unnamed: 0,tid,sample,triplet_tpm,triplet_pi,gid,gname,triplet_rank
0,"ENSG00000285991[1,2,2]",muscle,3.330414,100.0,ENSG00000285991,ENSG00000285991,1
1,"ENSG00000285991[1,2,3]",mcf10a,10.264354,69.047615,ENSG00000285991,ENSG00000285991,1
2,"ENSG00000285991[1,2,2]",mcf10a,2.830283,19.047619,ENSG00000285991,ENSG00000285991,2
3,"ENSG00000285991[1,2,4]",mcf10a,1.767401,11.904762,ENSG00000285991,ENSG00000285991,3
4,"ENSG00000285991[1,2,2]",lung,0.087118,100.0,ENSG00000285991,ENSG00000285991,1


In [77]:
# How many rows are rank 1 within their group AND have triplet_tpm of at least 1.
int(((df['triplet_rank'] == 1) & (df['triplet_tpm'] >= 1)).sum())

554138

In [78]:
# How many rows are rank 1, regardless of expression level.
int((df['triplet_rank'] == 1).sum())

697385

In [55]:
# keep only the predominant transcripts, i.e. rows with triplet_rank equal to 1
df = df[df['triplet_rank'] == 1]

In [69]:
# Which ORFs are used for which predominant transcripts.
# Left merge keeps every row of `df`; rows whose `tid` has no entry in `orf_df`
# get NaN in the ORF columns. `validate='many_to_one'` makes pandas raise a
# MergeError if `tid` is duplicated in `orf_df`, instead of silently
# multiplying rows of `df`.
# NOTE(review): this cell is not re-run safe -- executing it twice merges the
# ORF columns again and produces `_x` / `_y` suffixed duplicates.
df = df.merge(orf_df, how='left', on='tid', validate='many_to_one')

Unnamed: 0,tid,sample,triplet_tpm,triplet_pi,gid,gname,triplet_rank
3074558,"ENSG00000000003[1,1,7]",caco2,182.83484,38.483967,ENSG00000000003,TSPAN6,1
3074577,"ENSG00000000003[1,1,5]",brain,9.419128,39.772728,ENSG00000000003,TSPAN6,1
3074586,"ENSG00000000003[1,1,8]",adrenal gland,33.915565,44.585987,ENSG00000000003,TSPAN6,1
3074591,"ENSG00000000003[1,1,7]",adipose,72.80018,48.066296,ENSG00000000003,TSPAN6,1
3074599,"ENSG00000000003[1,1,7]",a673,42.102814,34.210526,ENSG00000000003,TSPAN6,1


In [70]:
# Rows whose triplet_tpm is below 1.
df.query('triplet_tpm < 1')

Unnamed: 0,tid,sample,triplet_tpm,triplet_pi,gid,gname,triplet_rank
4,"ENSG00000285991[1,2,2]",lung,0.087118,100.0,ENSG00000285991,ENSG00000285991,1
6,"ENSG00000285991[1,2,2]",huvec,0.284719,100.0,ENSG00000285991,ENSG00000285991,1
9,"ENSG00000285991[1,2,2]",hffc6,0.428080,100.0,ENSG00000285991,ENSG00000285991,1
10,"ENSG00000285991[1,2,2]",heart,0.174343,100.0,ENSG00000285991,ENSG00000285991,1
12,"ENSG00000285991[1,2,2]",brain,0.080766,100.0,ENSG00000285991,ENSG00000285991,1
...,...,...,...,...,...,...,...
3074164,"ENSG00000000005[3,1,1]",h9_osteocyte,0.716335,100.0,ENSG00000000005,TNMD,1
3074165,"ENSG00000000005[3,1,1]",brain,0.089855,100.0,ENSG00000000005,TNMD,1
3074166,"ENSG00000000005[2,2,2]",adipose,0.836784,100.0,ENSG00000000005,TNMD,1
3074303,"ENSG00000000003[1,1,5]",k562,0.204878,100.0,ENSG00000000003,TSPAN6,1


In [66]:
# Preview the first rows of the ORF table.
orf_df.head()

Unnamed: 0,ID,len,orf_frame,orf_start,orf_end,orf_len,fickett,hexamer,coding_score,orf_rank,...,strand,gencode_atg,upstream_atgs,atg_rank,score_rank,orf_calling_confidence,atg_score,orf_score,has_stop_codon,tid
0,"ENSG00000000457.13;ENSG00000000457[1,1,1];NA;N...",6471,3,273,2339,2067,0.9591,0.10134,1.0,1,...,-,"['ENST00000367771.11', 'ENST00000367770.5', 'E...",1,1.0,1.0,Clear Best ORF,0.989013,0.989013,True,"ENSG00000000457[1,1,1]"
1,"ENSG00000000457.13;ENSG00000000457[1,1,2];NA;N...",3028,3,273,2339,2067,0.9591,0.10134,1.0,1,...,-,"['ENST00000367771.11', 'ENST00000367770.5', 'E...",1,1.0,1.0,Clear Best ORF,0.989013,0.989013,True,"ENSG00000000457[1,1,2]"
2,"ENSG00000000457.13;ENSG00000000457[1,1,5];NA;N...",5028,3,273,2339,2067,0.9591,0.10134,1.0,1,...,-,"['ENST00000367771.11', 'ENST00000367770.5', 'E...",1,1.0,1.0,Clear Best ORF,0.989013,0.989013,True,"ENSG00000000457[1,1,5]"
3,"ENSG00000000457.13;ENSG00000000457[1,3,2];NA;N...",3190,3,273,2501,2229,0.882,0.101926,1.0,1,...,-,"['ENST00000367771.11', 'ENST00000367770.5', 'E...",1,1.0,1.0,Clear Best ORF,0.989013,0.989013,True,"ENSG00000000457[1,3,2]"
4,"ENSG00000000457.13;ENSG00000000457[1,4,4];NA;N...",1645,3,273,1040,768,0.8001,-0.014456,0.965905,1,...,-,"['ENST00000367771.11', 'ENST00000367770.5', 'E...",1,1.0,1.0,Clear Best ORF,0.989013,0.955293,True,"ENSG00000000457[1,4,4]"
