In [22]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
from pyfaidx import Fasta
import upsetplot
from pandarallel import pandarallel

# Put the parent of the current working directory on sys.path so that the
# `scripts.*` modules imported below can be resolved.
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [23]:
# Project configuration. `load_config` is not defined in this notebook;
# presumably it comes from one of the `scripts.*` star-imports -- TODO confirm.
config = load_config()
# Prefix handed to proc_cfg() as its `od` argument when resolving config paths.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path taken from the config.

    Every occurrence of '../../' is stripped from `entry`, and the
    remainder is prefixed with `od`.

    Parameters:
        entry (str): Path string as written in the config
        od (str): Prefix to prepend to the stripped path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [24]:
# Master table of transcripts (one row per isoform is assumed -- TODO confirm).
mt_df = pd.read_csv(
    '../data/05_mastertable/poder_master_table_fixed_genics.tsv',
    sep='\t',
)
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# True where population_sharing is exactly 1 and sample_sharing is above 1.
in_one_population = mt_df['population_sharing'].eq(1)
in_multiple_samples = mt_df['sample_sharing'].gt(1)
mt_df['pop_spec_t'] = in_one_population & in_multiple_samples

In [25]:
# Protein summary table, annotated with per-isoform columns from the master
# table (matched on tid == isoform), with gene id / sequence columns renamed.
p_df = (
    pd.read_csv(
        proc_cfg(config['lr']['poder_protein']['protein']['summary'], od),
        sep='\t',
    )
    .merge(
        mt_df[[
            'associated_gene_biotype',
            'structural_category',
            'associated_transcript',
            'isoform',
            'associated_gene',
            'length',
        ]],
        how='left',
        left_on='tid',
        right_on='isoform',
    )
    .rename(columns={'associated_gene': 'gid', 'protein_sequence': 'seq'})
)

# every tid must have found its isoform in the master table
assert p_df['isoform'].notnull().all()

## What % by structural category are high-confidence?
* have ORF
* no NMD
* ≥80% blastp identity

In [26]:
# Keep only entries from annotated protein-coding genes.
# The explicit .copy() makes p_df an independent frame: later cells assign
# into it (structural_category relabel, high_conf column), which would
# otherwise raise SettingWithCopyWarning on a .loc slice.
p_df = p_df.loc[p_df.associated_gene_biotype=='Protein Coding'].copy()
n = len(p_df.index)
print(n)

100666


In [27]:
# Load the annotated (reference) amino-acid sequences; they are used further
# down to tell FSMs with an annotated CDS from those without one.
fasta_file = proc_cfg(config['ref']['pc'], od)
fasta = Fasta(fasta_file)

# One (header, sequence) record per FASTA entry, collected in a single pass.
records = [(entry.name, str(entry)) for entry in fasta]
ref_orfs = pd.DataFrame(records, columns=['name', 'seq'])

# Headers are '|'-delimited: field 1 is used as the transcript id and
# field 2 as the gene id. Split once and reuse the result.
header_fields = ref_orfs['name'].str.split('|', expand=True)
ref_orfs['gid'] = header_fields[2]
ref_orfs['tid'] = header_fields[1]
ref_orfs.head()


Unnamed: 0,name,seq,gid,tid
0,ENSP00000493376.2|ENST00000641515.2|ENSG000001...,MKKVTAEAISWNESTSETNNSMVTEFIFLGLSDSQELQTFLFMLFF...,ENSG00000186092.7,ENST00000641515.2
1,ENSP00000409316.1|ENST00000426406.4|ENSG000002...,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...,ENSG00000284733.2,ENST00000426406.4
2,ENSP00000329982.2|ENST00000332831.5|ENSG000002...,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...,ENSG00000284662.2,ENST00000332831.5
3,ENSP00000478421.2|ENST00000616016.5|ENSG000001...,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...,ENSG00000187634.13,ENST00000616016.5
4,ENSP00000480678.2|ENST00000618323.5|ENSG000001...,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...,ENSG00000187634.13,ENST00000618323.5


In [28]:
# Relabel FSMs whose associated transcript has no entry in the reference
# protein table as 'FSM w/o CDS'.
has_ref_orf = p_df['associated_transcript'].isin(ref_orfs['tid'])
is_fsm = p_df['structural_category'] == 'FSM'
p_df.loc[is_fsm & ~has_ref_orf, 'structural_category'] = 'FSM w/o CDS'
# p_df.loc[(p_df.structural_category=='FSM')&\
#          (p_df.annot_cds==False), 'aa_seq_novelty'] = 'FSM w/o CDS'

In [29]:
# High-confidence ORF: blastp identity of at least 80 and not flagged as NMD.
# Missing values in either column evaluate to False.
passes_identity = p_df['blastp_identity'].ge(80)
not_nmd = p_df['protein_is_nmd'].eq(False)
p_df['high_conf'] = passes_identity & not_nmd

In [30]:
# Number of unique isoforms per structural category, split by high_conf.
temp = (
    p_df.groupby(['structural_category', 'high_conf'])['isoform']
    .nunique()
    .reset_index(name='n_t')
)

In [31]:
# Per-category totals (sum of n_t over both high_conf values) ...
temp2 = (
    temp.groupby('structural_category')['n_t']
    .sum()
    .reset_index(name='n_total_t')
)
# ... attached to every row so each count can be expressed as a percentage.
temp = pd.merge(temp, temp2, how='left', on='structural_category')
temp['perc'] = temp['n_t'].div(temp['n_total_t']).mul(100)

In [32]:
# Display the per-category counts and percentages split by high_conf.
temp

Unnamed: 0,structural_category,high_conf,n_t,n_total_t,perc
0,FSM,False,7366,47281,15.579197
1,FSM,True,39915,47281,84.420803
2,FSM w/o CDS,False,6577,20887,31.488486
3,FSM w/o CDS,True,14310,20887,68.511514
4,NIC,False,3548,15506,22.881465
5,NIC,True,11958,15506,77.118535
6,NNC,False,4033,16992,23.734699
7,NNC,True,12959,16992,76.265301


## Filter on high-conf and compute the sankey thing

In [33]:
# Independent snapshot of the table carrying the high_conf flag;
# DataFrame.copy() is deep by default.
hc_p_df = p_df.copy()

In [39]:
# NOTE: this rebinds p_df to a different table read from the working directory.
p_df = pd.read_csv('orf_thing.tsv', sep='\t')

# The two masks are mutually exclusive; rows matching neither stay 'Novel'.
is_known = p_df['annot_aa'] == True
is_truncation = (p_df['annot_aa'] == False) & (p_df['trunc_annot_aa'] == True)

p_df['aa_seq_novelty'] = 'Novel'
p_df.loc[is_known, 'aa_seq_novelty'] = 'Known'
p_df.loc[is_truncation, 'aa_seq_novelty'] = 'Truncation'

In [40]:
# tids flagged as high-confidence in the snapshot taken before p_df was reloaded
is_hc = hc_p_df['high_conf'] == True
hc_tid = hc_p_df.loc[is_hc, 'tid'].tolist()

# report the row count before and after restricting to those tids
print(p_df.shape[0])
p_df = p_df.loc[p_df['tid'].isin(hc_tid)]
print(p_df.shape[0])

86306
79142


In [41]:
# Unique isoforms per (AA-sequence novelty, structural category) ...
temp = (
    p_df.groupby(['aa_seq_novelty', 'structural_category'])['isoform']
    .nunique()
    .reset_index(name='n_t')
)

# ... and per structural category overall (kept indexed by structural_category).
temp2 = (
    p_df.groupby(['structural_category'])['isoform']
    .nunique()
    .to_frame('n_total_t')
)

# Express each novelty class as a percentage of its structural category.
temp = pd.merge(temp, temp2, how='left', on='structural_category')
temp['perc'] = temp['n_t'].div(temp['n_total_t']).mul(100)
temp

Unnamed: 0,aa_seq_novelty,structural_category,n_t,n_total_t,perc
0,Known,FSM,34936,54225,64.427847
1,Known,NIC,2647,11958,22.135809
2,Known,NNC,2816,12959,21.730072
3,Novel,FSM,10201,54225,18.812356
4,Novel,NIC,6267,11958,52.40843
5,Novel,NNC,7710,12959,59.495331
6,Truncation,FSM,9088,54225,16.759797
7,Truncation,NIC,3044,11958,25.455762
8,Truncation,NNC,2433,12959,18.774597


In [42]:
# Percentage of FSM isoforms whose AA sequence is 'Known' or a 'Truncation'.
# Read from `temp` instead of retyping the rounded values shown in the table,
# so the number stays in sync with the data.
fsm_rows = temp.loc[temp['structural_category'] == 'FSM']
float(fsm_rows.loc[fsm_rows['aa_seq_novelty'].isin(['Known', 'Truncation']), 'perc'].sum())

81.187644

In [44]:
# Percentage of NIC + NNC isoforms with a 'Novel' AA sequence.
# Read from `temp` instead of retyping the counts shown in the table.
nic_nnc_rows = temp.loc[temp['structural_category'].isin(['NIC', 'NNC'])]
n_novel = nic_nnc_rows.loc[nic_nnc_rows['aa_seq_novelty'] == 'Novel', 'n_t'].sum()
# n_total_t repeats on every novelty row, so count each category once
n_total = nic_nnc_rows.drop_duplicates('structural_category')['n_total_t'].sum()
float((n_novel / n_total) * 100)

56.0942328530722