In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
from pyfaidx import Fasta
import upsetplot
from pandarallel import pandarallel

# Append the parent of the current working directory to the module search
# path (presumably so the `scripts` package imported below resolves — this
# only works when the notebook is run from its own directory; TODO confirm).
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Load the project configuration. `load_config` is not defined in this
# notebook; presumably it comes from one of the `scripts.*` star imports
# (TODO confirm).
config = load_config()
# Prefix that `proc_cfg` prepends to paths read from the config.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path string taken from the config.

    Every occurrence of '../../' is removed from `entry` and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string as stored in the config
        od (str): Prefix to prepend to the cleaned path

    Returns:
        (str): `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

In [3]:
# read the master table
mt_df = pd.read_csv(
    '../data/05_mastertable/poder_master_table_fixed_genics.tsv',
    sep='\t',
)
# (restricting to `filter == 'pass'` is currently disabled)
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# flag rows with population_sharing == 1 and sample_sharing > 1
mt_df['pop_spec_t'] = mt_df['population_sharing'].eq(1) & mt_df['sample_sharing'].gt(1)

In [4]:
# read the protein summary, attach per-isoform columns from the master
# table, and shorten the gene id / protein sequence column names
p_df = (
    pd.read_csv(
        proc_cfg(config['lr']['poder_protein']['protein']['summary'], od),
        sep='\t',
    )
    .merge(
        mt_df[[
            'associated_gene_biotype',
            'structural_category',
            'associated_transcript',
            'isoform',
            'associated_gene',
            'length',
        ]],
        how='left',
        left_on='tid',
        right_on='isoform',
    )
    .rename(columns={'associated_gene': 'gid',
                     'protein_sequence': 'seq'})
)
# every protein entry must have matched an isoform in the master table
assert p_df['isoform'].notnull().all()

## Limit to protein coding genes

In [5]:
# restrict to entries whose associated gene biotype is 'Protein Coding'
p_df = p_df.loc[p_df['associated_gene_biotype'] == 'Protein Coding']
n = len(p_df.index)
print(n)

# then require a full ORF: both a start codon and a stop codon
p_df = p_df.loc[
    p_df['protein_has_start_codon'].eq(True)
    & p_df['protein_has_stop_codon'].eq(True)
]
print(len(p_df.index))

100666
100666


## Get reference AA sequences and merge on gid + sequence

In [19]:
# get annotated AA sequences
fasta_file = proc_cfg(config['ref']['pc'], od)
fasta = Fasta(fasta_file)

# Extract each entry's name and sequence in a single pass over the FASTA
ref_orfs = pd.DataFrame(
    [(entry.name, str(entry)) for entry in fasta],
    columns=['name', 'seq'],
)

# Entry names are '|'-delimited; the field at index 2 is used as the gene id
ref_orfs['gid'] = ref_orfs['name'].str.split('|', expand=True)[2]

# Keep one row per unique (seq, gid) pair and flag every one of them as an
# annotated AA sequence, so a left merge on (gid, seq) marks exact matches
ref_orfs = ref_orfs[['seq', 'gid']].drop_duplicates()
ref_orfs['annot_aa'] = True

In [7]:
# also record, per reference transcript, whether the GTF contains
# start_codon / stop_codon features for it (ORF completeness)
gtf_file = proc_cfg(config['ref']['gtf'], od)
gtf_df = pr.read_gtf(gtf_file).df

# transcript ids that own at least one start_codon / stop_codon feature
start_codon_tids = gtf_df.loc[gtf_df['Feature'] == 'start_codon', 'transcript_id'].unique().tolist()
stop_codon_tids = gtf_df.loc[gtf_df['Feature'] == 'stop_codon', 'transcript_id'].unique().tolist()

# collapse to one row per transcript id, discarding rows without one
gtf_df = gtf_df[['transcript_id']].drop_duplicates()
gtf_df = gtf_df.loc[gtf_df['transcript_id'].notnull()]

gtf_df = gtf_df.assign(
    has_start_codon=gtf_df['transcript_id'].isin(start_codon_tids),
    has_stop_codon=gtf_df['transcript_id'].isin(stop_codon_tids),
)
# an ORF counts as complete only when both codon features are present
gtf_df['has_complete_orf'] = gtf_df['has_start_codon'] & gtf_df['has_stop_codon']

assert gtf_df['transcript_id'].is_unique

In [20]:
# just going to call novel aas as those whose (gid, seq) pair is not in the
# pc translations gencode file
l1 = len(p_df.index)
p_df = p_df.merge(ref_orfs, how='left', on=['gid', 'seq'])
l2 = len(p_df.index)
# the left merge must not duplicate any rows
assert l1 == l2
# Rows without a reference match come back as NaN. Comparing against True
# yields a genuine boolean column (NaN -> False) without relying on
# fillna's object-dtype downcasting, which pandas has deprecated.
p_df['annot_aa'] = p_df['annot_aa'].eq(True)

100666
100666


In [24]:
# merge in information of if there was a complete annotated orf to begin with.
# `validate` makes pandas raise a MergeError if a transcript_id occurs more
# than once in gtf_df, which would otherwise silently duplicate p_df rows.
p_df = p_df.merge(gtf_df,
                  how='left',
                  left_on='associated_transcript',
                  right_on='transcript_id',
                  validate='many_to_one')

In [26]:
# number of distinct isoforms per (annot_aa, has_complete_orf, structural_category)
(
    p_df
    .groupby(['annot_aa', 'has_complete_orf', 'structural_category'])[['isoform']]
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,isoform
annot_aa,has_complete_orf,structural_category,Unnamed: 3_level_1
False,False,FSM,25834
False,True,FSM,3315
True,False,FSM,1315
True,True,FSM,37704


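So most FSMs whose associated annotated transcript lacks a complete ORF (start and stop codon) have no exact match among the annotated AA sequences (25,834 without a match vs. 1,315 with one), whereas most FSMs with a complete annotated ORF do match (37,704 vs. 3,315).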

In [None]:
# number of distinct isoforms per (annot_aa, structural_category)
(
    p_df
    .groupby(['annot_aa', 'structural_category'])[['isoform']]
    .nunique()
)

## Get our own protein sequence novelty categories
1. exact AA match
2. truncated AA match
3. novel, non-truncation AA seq

In [None]:
# 2. truncated AA match
def get_aa_seq_truncations(x, ref_orfs):
    """
    Check whether a sequence occurs inside any reference sequence of the
    same gene.

    Parameters:
        x: Row-like object exposing `gid` and `seq`
        ref_orfs (pandas DataFrame): Reference table with `gid` and
            `seq` columns

    Returns:
        (bool): True if `x.seq` is a substring of at least one reference
            `seq` whose `gid` equals `x.gid` (an identical sequence also
            counts); False otherwise, including when the gene has no
            reference entries
    """
    # only consider reference sequences from the same gene
    same_gene_seqs = ref_orfs.loc[ref_orfs.gid == x.gid, 'seq']
    return any(x.seq in ref_seq for ref_seq in same_gene_seqs)

# evaluate every row of p_df against the reference sequences of its gene
p_df['trunc_annot_aa'] = p_df.apply(get_aa_seq_truncations, axis=1, ref_orfs=ref_orfs)

In [None]:
# Write the annotated protein table to `temp.tsv` (tab-separated) in the
# current working directory; the index is written too since `index=False`
# is not passed.
p_df.to_csv('temp.tsv', sep='\t')