In [8]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
from pyfaidx import Fasta
import upsetplot
from pandarallel import pandarallel

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Load the project configuration; `load_config` arrives via one of the star
# imports above (presumably scripts.utils — TODO confirm).
config = load_config()
# Path prefix handed to proc_cfg() (defined below) as its `od` argument; it is
# prepended to every config path.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path taken from the config under a different prefix.

    Every occurrence of '../../' is removed from `entry` and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string as stored in the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        (str): `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

In [3]:
# Read the master table (tab-separated).
mt_df = pd.read_csv(
    '../data/05_mastertable/poder_master_table_fixed_genics.tsv',
    sep='\t',
)
# mt_df = mt_df.loc[mt_df['filter']=='pass']

# Flag rows with population_sharing == 1 and sample_sharing > 1
# (presumably: seen in a single population but in more than one sample — confirm).
mt_df['pop_spec_t'] = mt_df['population_sharing'].eq(1) & mt_df['sample_sharing'].gt(1)

In [4]:
# Protein summary table, left-joined (tid -> isoform) with a handful of
# master-table columns, then renamed to the short names used downstream
# (`gid` for the associated gene, `seq` for the protein sequence).
p_df = (
    pd.read_csv(
        proc_cfg(config['lr']['poder_protein']['protein']['summary'], od),
        sep='\t',
    )
    .merge(
        mt_df[[
            'associated_gene_biotype',
            'structural_category',
            'associated_transcript',
            'isoform',
            'associated_gene',
            'length',
        ]],
        how='left',
        left_on='tid',
        right_on='isoform',
    )
    .rename(columns={'associated_gene': 'gid', 'protein_sequence': 'seq'})
)

# Sanity check: every `tid` must have found a matching `isoform` in the master table.
assert not p_df['isoform'].isnull().any()

## First, filter the predicted proteins

In [5]:
# Apply the filters one after another:
#   1. remove NMD things
#   2. keep only things w/ full ORFs (start and stop codon both present)
#   3. keep only things from annotated protein coding genes
p_df = (
    p_df
    .loc[lambda df: df['protein_is_nmd'] == False]
    .loc[lambda df: (df['protein_has_start_codon'] == True)
                    & (df['protein_has_stop_codon'] == True)]
    .loc[lambda df: df['associated_gene_biotype'] == 'Protein Coding']
)

# TODO: decide whether a minimum protein length is needed (none applied for now)
print(len(p_df.index))

86306


## Merge reference AA sequences on gid + sequence

In [6]:
# Get annotated AA sequences from the reference protein-coding translations FASTA.
fasta_file = proc_cfg(config['ref']['pc'], od)
fasta = Fasta(fasta_file)

# Extract each entry's name and sequence in a single pass over the FASTA
# (previously the file was iterated twice, once per column).
ref_orfs = pd.DataFrame(
    [(entry.name, str(entry)) for entry in fasta],
    columns=['name', 'seq'],
)

# The gene ID is the third '|'-separated field of the FASTA record name.
# Building the frame in one chain (instead of assigning onto a column subset and
# de-duplicating in place) avoids pandas' chained-assignment warnings.
ref_orfs = (
    ref_orfs
    .assign(gid=lambda df: df['name'].str.split('|').str[2],
            annot_aa=True)
    .loc[:, ['seq', 'gid', 'annot_aa']]
    .drop_duplicates()
)

# Just going to call novel AAs those that are not in the
# pc translations gencode file.
# Drop any `annot_aa` left over from a previous run of this cell so that
# re-running does not produce `annot_aa_x` / `annot_aa_y` columns.
p_df = p_df.drop(columns='annot_aa', errors='ignore')

# The two printed lengths should match: (gid, seq) is unique in ref_orfs after
# de-duplication, so the left merge must not add rows.
print(len(p_df))
p_df = p_df.merge(ref_orfs, how='left', on=['gid', 'seq'])
print(len(p_df))

# Matched rows carry True, unmatched rows carry NaN; `notna()` turns that into a
# proper boolean column without relying on fillna's deprecated object downcasting.
p_df['annot_aa'] = p_df['annot_aa'].notna()
p_df.head()

86306
86306


Unnamed: 0,Chromosome,Start,Stop,Strand,Source,CDS_Source,CDS_Start,CDS_Stop,tid,pid,...,protein_has_stop_codon,protein_has_start_codon,seq,associated_gene_biotype,structural_category,associated_transcript,isoform,gid,length,annot_aa
0,chr12,4590319,4613888,+,ENSEMBL,ORFanage,4591180,4613750,ENST00000010132.6,ENSP00000010132.5,...,True,True,MPASELKASEIPFHPSIKTQDPKAEEKSPKKQKVTLTAAEALKLFK...,Protein Coding,FSM,ENST00000010132.6,ENST00000010132.6,ENSG00000010219.14,1840,True
1,chr12,7919229,7936187,-,HAVANA,ORFanage,7921415,7936034,ENST00000075120.12,ENSP00000516774.1,...,True,True,MGTQKVTPALIFAITVATIGSFQFGYNTGVINAPEKIIKEFINKTL...,Protein Coding,FSM,ENST00000075120.12,ENST00000075120.12,ENSG00000059804.18,3827,True
2,chrY,2935380,2982506,+,HAVANA,ORFanage,2953936,2979990,ENST00000155093.8,ENSP00000155093.3,...,True,True,MDEDEFELQPQEPNSFFDGIGADATHMDGDQIVVEIQEAVFVSNIV...,Protein Coding,FSM,ENST00000155093.8,ENST00000155093.8,ENSG00000067646.12,5336,True
3,chrX,48922210,48957631,-,HAVANA,ORFanage,48923176,48957570,ENST00000156084.8,ENSP00000156084.4,...,True,True,MTILPKKKPPPPDADPANEPPPPGPMPPAPRRGGGVGVGGGGTGVG...,Protein Coding,FSM,ENST00000156084.8,ENST00000156084.8,ENSG00000068308.15,2740,True
4,chr6,155395367,155455839,-,HAVANA,ORFanage,155396838,155455800,ENST00000159060.3,ENSP00000159060.2,...,True,True,MMGCWILNEGLSTILVLSWLGINFYLFIDTFYWYEEEESFHYTRVI...,Protein Coding,FSM,ENST00000159060.3,ENST00000159060.3,ENSG00000074771.4,1980,True


## Get our own protein sequence novelty categories
1. exact AA match
2. truncated AA match
3. novel, non-truncation AA seq

In [18]:
# 2. truncated AA match
def get_aa_seq_truncations(x, ref_orfs):
    """
    Check whether a protein sequence is contained in any reference sequence
    of the same gene.

    Parameters:
        x: Row-like object exposing `gid` and `seq` attributes
        ref_orfs (pandas DataFrame): Reference table with `gid` and `seq` columns

    Returns:
        (bool): True if `x.seq` is a substring of (or equal to) at least one
            reference `seq` whose `gid` equals `x.gid`; False otherwise,
            including when the gene has no reference entries
    """
    # limit to just references from same gene
    same_gene_seqs = ref_orfs.loc[ref_orfs.gid == x.gid, 'seq']
    return any(x.seq in ref_seq for ref_seq in same_gene_seqs)

# Trial run on a small subset: the first 20 proteins without an exact reference
# AA match. `.copy()` makes the subset an independent frame, so adding a column
# to it below does not trigger pandas' SettingWithCopyWarning.
smol_p_df = p_df.loc[p_df.annot_aa==False].head(20).copy()
smol_p_df['trunc_annot_aa'] = smol_p_df.apply(get_aa_seq_truncations, args=(ref_orfs,), axis=1)

# TODO: run on the full table in parallel once the subset looks right.
# NOTE: `args` must be a 1-tuple, i.e. `(ref_orfs,)`, and the result belongs in a
# new column rather than overwriting `p_df` itself:
# pandarallel.initialize(nb_workers=8, verbose=1)
# p_df['trunc_annot_aa'] = p_df.parallel_apply(get_aa_seq_truncations, args=(ref_orfs,), axis=1)

In [17]:
# Preview the first rows of the (up to 20-row) trial subset, including the new
# `trunc_annot_aa` column.
smol_p_df.head()

Unnamed: 0,Chromosome,Start,Stop,Strand,Source,CDS_Source,CDS_Start,CDS_Stop,tid,pid,...,protein_has_start_codon,seq,associated_gene_biotype,structural_category,associated_transcript,isoform,gid,length,annot_aa,trunc_annot_aa
0,chr12,4590319,4613888,+,ENSEMBL,ORFanage,4591180,4613750,ENST00000010132.6,ENSP00000010132.5,...,True,MPASELKASEIPFHPSIKTQDPKAEEKSPKKQKVTLTAAEALKLFK...,Protein Coding,FSM,ENST00000010132.6,ENST00000010132.6,ENSG00000010219.14,1840,True,True
1,chr12,7919229,7936187,-,HAVANA,ORFanage,7921415,7936034,ENST00000075120.12,ENSP00000516774.1,...,True,MGTQKVTPALIFAITVATIGSFQFGYNTGVINAPEKIIKEFINKTL...,Protein Coding,FSM,ENST00000075120.12,ENST00000075120.12,ENSG00000059804.18,3827,True,True
2,chrY,2935380,2982506,+,HAVANA,ORFanage,2953936,2979990,ENST00000155093.8,ENSP00000155093.3,...,True,MDEDEFELQPQEPNSFFDGIGADATHMDGDQIVVEIQEAVFVSNIV...,Protein Coding,FSM,ENST00000155093.8,ENST00000155093.8,ENSG00000067646.12,5336,True,True
3,chrX,48922210,48957631,-,HAVANA,ORFanage,48923176,48957570,ENST00000156084.8,ENSP00000156084.4,...,True,MTILPKKKPPPPDADPANEPPPPGPMPPAPRRGGGVGVGGGGTGVG...,Protein Coding,FSM,ENST00000156084.8,ENST00000156084.8,ENSG00000068308.15,2740,True,True
4,chr6,155395367,155455839,-,HAVANA,ORFanage,155396838,155455800,ENST00000159060.3,ENSP00000159060.2,...,True,MMGCWILNEGLSTILVLSWLGINFYLFIDTFYWYEEEESFHYTRVI...,Protein Coding,FSM,ENST00000159060.3,ENST00000159060.3,ENSG00000074771.4,1980,True,True
