## Goal: What's up w/ FSMs that have an annotated CDS but aren't predicting their annotated CDS?

Check whether these predictions at least represent an extension of an annotated protein, i.e. whether any annotated AA sequence from the same gene is a substring of the predicted AA sequence.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
from pyfaidx import Fasta
import upsetplot
from pandarallel import pandarallel

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# project configuration; load_config is not defined in this notebook
# (presumably it comes from one of the scripts.* star-imports -- TODO confirm)
config = load_config()
# prefix that proc_cfg prepends to paths taken from the config
od = '../'

def proc_cfg(entry, od):
    """
    Rebase a path from the config onto a different prefix.

    Parameters:
        entry (str): path as written in the config
        od (str): prefix to put in front of the path

    Returns:
        str: `od` followed by `entry` with every '../../' occurrence removed
    """
    return od + entry.replace('../../', '')

In [25]:
df = pd.read_csv('241124_long_struct_cat_aa_cat.tsv', sep='\t')

# number of unique isoforms per (AA novelty, structural category) combination;
# display() is required because this is not the last expression in the cell,
# so a bare expression here would be computed and silently discarded
display(
    df[['isoform', 'aa_seq_novelty', 'structural_category']]
    .groupby(['aa_seq_novelty', 'structural_category'])
    .nunique()
    .rename({'isoform': 'n_t'}, axis=1)
)

# limit to FSMs whose predicted AA sequence is novel;
# .copy() makes the subset an independent frame so that columns added to it
# later do not trigger SettingWithCopyWarning
df = df.loc[(df.structural_category=='FSM')&(df.aa_seq_novelty=='Novel')].copy()

In [14]:
# get annotated AA sequences
fasta_file = proc_cfg(config['ref']['pc'], od)
fasta = Fasta(fasta_file)

# collect (header, sequence) for every FASTA entry
fasta_records = [(entry.name, str(entry)) for entry in fasta]
ref_orfs = pd.DataFrame(fasta_records, columns=['name', 'seq'])

# headers are '|'-delimited: field 1 is the transcript ID, field 2 the gene ID
header_fields = ref_orfs['name'].str.split('|', expand=True)
ref_orfs['gid'] = header_fields[2]
ref_orfs['tid'] = header_fields[1]
ref_orfs['tid'] = ref_orfs.name.str.split('|', expand=True)[1]

In [26]:
# preview the FSM / novel-AA subset
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Chromosome,Start,Stop,Strand,Source,CDS_Source,CDS_Start,CDS_Stop,...,seq,associated_gene_biotype,structural_category,associated_transcript,isoform,gid,length,annot_aa,trunc_annot_aa,aa_seq_novelty
7,7,7,chr6,70426527,70561171,+,HAVANA,ORFanage,70475720,70559918,...,MEIILHASLVALHQPLISFPRPVKTTWLNRNAPAQNKDSVIPTLES...,Protein Coding,FSM,ENST00000194672.11,ENST00000194672.11,ENSG00000082269.17,5198,False,False,Novel
21,21,21,chr12,12475896,12562383,-,ENSEMBL,ORFanage,12476835,12519948,...,MLPLSLQTVFSLYFWVNWRRASTLFTCLQELMQQNGIGYVLNASNT...,Protein Coding,FSM,ENST00000228862.3,ENST00000228862.3,ENSG00000111266.9,3402,False,False,Novel
120,120,120,chr2,240586870,240599084,+,HAVANA,ORFanage,240590914,240598677,...,MTACRALQGDSVSPAARGRMCSGSPYWKRVHGSYEHLWAGQVADAL...,Protein Coding,FSM,ENST00000270361.15,ENST00000270361.15,ENSG00000142330.20,2453,False,False,Novel
168,168,168,chr5,140401813,140539418,+,ENSEMBL,ORFanage,140401967,140539415,...,MLTDSGGGGTSFEEDLDSVAPRSAPAGASEPPPPGGVGLGIRTVRL...,Protein Coding,FSM,ENST00000297183.10,ENST00000297183.10,ENSG00000131503.21,7606,False,False,Novel
200,200,200,chr7,103074949,103098645,+,HAVANA,ORFanage,103084035,103098550,...,MKLVTGITFAIIRELGGIPIVANKINHSNQSIKEKALNALNNLSVN...,Protein Coding,FSM,ENST00000306450.5,ENST00000306450.5,ENSG00000170632.14,1668,False,False,Novel


In [16]:
# preview the reference ORF table (FASTA header, AA sequence, gene ID, transcript ID)
ref_orfs.head()

Unnamed: 0,name,seq,gid,tid
0,ENSP00000493376.2|ENST00000641515.2|ENSG000001...,MKKVTAEAISWNESTSETNNSMVTEFIFLGLSDSQELQTFLFMLFF...,ENSG00000186092.7,ENST00000641515.2
1,ENSP00000409316.1|ENST00000426406.4|ENSG000002...,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...,ENSG00000284733.2,ENST00000426406.4
2,ENSP00000329982.2|ENST00000332831.5|ENSG000002...,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...,ENSG00000284662.2,ENST00000332831.5
3,ENSP00000478421.2|ENST00000616016.5|ENSG000001...,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...,ENSG00000187634.13,ENST00000616016.5
4,ENSP00000480678.2|ENST00000618323.5|ENSG000001...,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...,ENSG00000187634.13,ENST00000618323.5


In [28]:
# elongation check: does the predicted AA sequence contain an annotated one?
def get_aa_seq_elongations(x, ref_orfs):
    """
    Report whether a predicted protein contains an annotated protein
    from the same gene as a contiguous substring.

    Parameters:
        x: row-like object exposing `gid` (gene ID) and `seq`
            (predicted AA sequence) as attributes
        ref_orfs (pandas DataFrame): reference ORFs with `gid` and `seq` columns

    Returns:
        bool: True if any non-empty reference sequence of the same gene is
            a substring of `x.seq`; False otherwise, including when `x.seq`
            is missing or empty
    """
    pred_seq = x.seq
    # a missing prediction (e.g. NaN) cannot contain anything; without this
    # guard the `in` test below would raise TypeError on a float
    if not isinstance(pred_seq, str) or not pred_seq:
        return False

    # limit to just references from same gene
    gene_seqs = ref_orfs.loc[ref_orfs.gid == x.gid, 'seq']

    # skip missing / empty references: '' is a substring of every string and
    # would otherwise be reported as a match
    return any(
        isinstance(ref_seq, str) and bool(ref_seq) and ref_seq in pred_seq
        for ref_seq in gene_seqs
    )

# flag, row by row, predictions that contain an annotated AA sequence of the same gene
df['elong_annot_aa'] = df.apply(get_aa_seq_elongations, axis=1, args=(ref_orfs,))

In [29]:
# preview the table including the elong_annot_aa column
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Chromosome,Start,Stop,Strand,Source,CDS_Source,CDS_Start,CDS_Stop,...,associated_gene_biotype,structural_category,associated_transcript,isoform,gid,length,annot_aa,trunc_annot_aa,aa_seq_novelty,elong_annot_aa
7,7,7,chr6,70426527,70561171,+,HAVANA,ORFanage,70475720,70559918,...,Protein Coding,FSM,ENST00000194672.11,ENST00000194672.11,ENSG00000082269.17,5198,False,False,Novel,False
21,21,21,chr12,12475896,12562383,-,ENSEMBL,ORFanage,12476835,12519948,...,Protein Coding,FSM,ENST00000228862.3,ENST00000228862.3,ENSG00000111266.9,3402,False,False,Novel,False
120,120,120,chr2,240586870,240599084,+,HAVANA,ORFanage,240590914,240598677,...,Protein Coding,FSM,ENST00000270361.15,ENST00000270361.15,ENSG00000142330.20,2453,False,False,Novel,False
168,168,168,chr5,140401813,140539418,+,ENSEMBL,ORFanage,140401967,140539415,...,Protein Coding,FSM,ENST00000297183.10,ENST00000297183.10,ENSG00000131503.21,7606,False,False,Novel,True
200,200,200,chr7,103074949,103098645,+,HAVANA,ORFanage,103084035,103098550,...,Protein Coding,FSM,ENST00000306450.5,ENST00000306450.5,ENSG00000170632.14,1668,False,False,Novel,False


In [30]:
# total number of rows in the table
print(df.shape[0])

3039


In [31]:
# number of unique isoforms for each value of the elong_annot_aa flag
df.groupby('elong_annot_aa')['isoform'].nunique().reset_index()

Unnamed: 0,elong_annot_aa,isoform
0,False,2311
1,True,728
