## Goal: What % of novel AA sequences from novel transcripts represent elongations of annotated ORFs?

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
from pyfaidx import Fasta
import upsetplot
from pandarallel import pandarallel

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Project configuration. load_config is not defined in this notebook
# (presumably it comes from one of the scripts.* star-imports -- TODO confirm).
config = load_config()
# Path prefix that proc_cfg prepends to config entries.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a config path.

    Every occurrence of '../../' is stripped from `entry`, and the
    result is prefixed with `od`.

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to prepend to the stripped path

    Returns:
        str: `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

In [7]:
# Load the isoform table; the columns used in this cell are
# isoform, aa_seq_novelty and structural_category.
df = pd.read_csv('241124_long_struct_cat_aa_cat.tsv', sep='\t')

# Number of unique isoforms per (aa_seq_novelty, structural_category).
# NOTE(review): this is not the last expression of the cell, so Jupyter does
# not render it; move it to its own cell if the table should be shown.
df[['isoform', 'aa_seq_novelty', 'structural_category']].groupby(['aa_seq_novelty', 'structural_category']).nunique().rename({'isoform':'n_t'}, axis=1)

# limit to novel transcript --> novel aa
# .copy() makes the subset an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = df.loc[(df.structural_category.isin(['NIC', 'NNC']))&(df.aa_seq_novelty=='Novel')].copy()

In [9]:
# get annotated AA sequences
fasta_file = proc_cfg(config['ref']['pc'], od)
fasta = Fasta(fasta_file)

# Collect (name, sequence) for every FASTA entry in one pass
fasta_records = [(entry.name, str(entry)) for entry in fasta]
ref_orfs = pd.DataFrame(fasta_records, columns=['name', 'seq'])

# The entry name is '|'-delimited; field 2 is stored as gid and field 1 as tid
# (presumably gene / transcript IDs of a GENCODE-style header -- TODO confirm).
name_fields = ref_orfs['name'].str.split('|', expand=True)
ref_orfs['gid'] = name_fields[2]
ref_orfs['tid'] = name_fields[1]

In [10]:
# annotated AA sequence contained within the novel AA sequence (elongation)
def get_aa_seq_elongations(x, ref_orfs):
    """
    Check whether an AA sequence contains a reference AA sequence
    from the same gene.

    Parameters:
        x (pandas Series): Row with a `gid` and a `seq` entry
        ref_orfs (pandas DataFrame): Reference sequences with `gid`
            and `seq` columns

    Returns:
        bool: True if any non-empty reference sequence with the same
            `gid` is a substring of `x.seq`, False otherwise (including
            when `x.seq` is missing or empty)
    """
    query_seq = x.seq

    # a missing sequence (NaN) or empty string cannot contain a reference ORF
    if not isinstance(query_seq, str) or not query_seq:
        return False

    # limit to just references from same gene
    # NOTE(review): this is an exact match on gid, so both tables must use
    # the same ID format -- verify against the inputs
    same_gene_seqs = ref_orfs.loc[ref_orfs['gid'] == x.gid, 'seq']

    # skip empty / missing reference sequences: '' is a substring of every
    # string and would otherwise be reported as a match
    return any(isinstance(ref_seq, str) and bool(ref_seq) and ref_seq in query_seq
               for ref_seq in same_gene_seqs)

# Flag each row whose AA sequence contains a same-gene reference sequence;
# ref_orfs is forwarded to the function as its second positional argument.
df['elong_annot_aa'] = df.apply(get_aa_seq_elongations, axis=1, args=(ref_orfs,))

In [11]:
# Unique isoform count for each value of the elongation flag
print('Number of novel isoforms w/ novel aa chains that are elongations:')
df.groupby('elong_annot_aa')[['isoform']].nunique().reset_index()

Number of novel isoforms w/ novel aa chains that are elongations:


Unnamed: 0,elong_annot_aa,isoform
0,False,13401
1,True,3793
