# Let's look for a highly expressed NNC transcript that also has a protein change

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Load the project configuration. load_config is not defined in this notebook;
# presumably it comes from one of the star-imports in the first cell.
config = load_config()
# Path prefix that proc_cfg prepends to config entries.
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a config path string.

    Every occurrence of '../../' is removed from `entry`, and `od` is
    then prepended to what remains.

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to put in front of the cleaned path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [3]:
def clean_figure(ax):
    """
    Tidy an axes object in place.

    Hides the right and top spines and applies a 45-degree rotation to
    the x-axis ticks via `tick_params`.

    Parameters:
        ax: Object exposing `spines` (indexable by side name) and
            `tick_params`, e.g. a matplotlib Axes
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [4]:
# Load the sample metadata and keep only the rows flagged as merged_run_mode.
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]

# Map cell_line_id -> sample label; used below to relabel the matrix columns.
# Built column-wise instead of iterating over rows.
sample_d = dict(zip(meta.cell_line_id, meta['sample']))

# Read the merged kallisto matrix; its path comes from the config entry
# 'merge_matrix_tpm_tsv', re-rooted with proc_cfg.
f = expand(proc_cfg(config['lr']['kallisto']['quant']['merge_matrix_tpm_tsv'],od))[0]
df = pd.read_csv(f, sep='\t')

# Keep only the part of each column name before the first '_' (the
# 'transcript_id' column is left alone), then rename the ID column to 'tid'
# and translate the remaining column names through sample_d.
df.columns = [d if d == 'transcript_id' else d.split('_')[0] for d in df.columns]
df = df.rename({'transcript_id':'tid'}, axis=1).rename(sample_d, axis=1)

In [5]:
# Index by transcript ID and record each transcript's maximum value across
# all columns as 'max_cpm'.
# NOTE(review): the matrix is read from the config key 'merge_matrix_tpm_tsv',
# so these values look like TPM despite the 'cpm' name — confirm.
df = df.set_index('tid').assign(max_cpm=lambda expr: expr.max(axis=1))

In [6]:
# Read the master table (tab-separated).
mt_df = pd.read_csv('../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')

# Flag rows whose population_sharing equals 1 while sample_sharing exceeds 1.
mt_df['pop_spec_t'] = mt_df.population_sharing.eq(1) & mt_df.sample_sharing.gt(1)

In [7]:
# Protein-level table (tab-separated), restricted to rows whose
# aa_seq_novelty is 'Novel'.
p_df = pd.read_csv('241120_long_struct_cat_aa_cat.tsv', sep='\t')
is_novel_aa = p_df.aa_seq_novelty.eq('Novel')
p_df = p_df.loc[is_novel_aa]

In [8]:
# Expression floor: keep transcripts whose max_cpm reaches the threshold.
min_cpm = 5
passes_floor = df.max_cpm.ge(min_cpm)
df = df.loc[passes_floor]

In [9]:
# Keep only transcripts whose structural_category in the master table is
# 'NNC'; the row count is printed before and after the filter.
df = df.reset_index()
print(df.shape[0])
is_nnc = mt_df.structural_category.eq('NNC')
nnc_isoforms = mt_df.loc[is_nnc, 'isoform'].tolist()
df = df.loc[df.tid.isin(nnc_isoforms)]
print(df.shape[0])

14647
1885


In [10]:
# Keep only transcripts that also appear in the novel-protein table;
# the row count is printed before and after the filter.
print(df.shape[0])
novel_protein_isoforms = p_df.isoform.tolist()
df = df.loc[df.tid.isin(novel_protein_isoforms)]
print(df.shape[0])

1885
844


In [11]:
# Attach gene IDs from the master table so the genes can be inspected.
# Left join: every row of df is kept; the 'isoform' key column from the
# master table is carried along next to 'tid'.
gene_lookup = mt_df[['isoform', 'geneid.v']]
df = df.merge(gene_lookup, how='left', left_on='tid', right_on='isoform')

In [15]:
# Ten rows with the largest max_cpm, highest first.
df.sort_values(by='max_cpm', ascending=False).iloc[:10]

Unnamed: 0,tid,YRI5,YRI6,YRI7,HAC1,HAC2,HAC3,HAC4,HAC5,HAC6,...,CEU3,CEU4,CEU5,MPC4,YRI1,YRI2,YRI3,max_cpm,isoform,geneid.v
100,transcript_102081,56.4351,87.2719,80.3607,97.4697,103.175,89.3429,64.4039,64.1183,102.883,...,103.168,96.4696,122.12,413.258,98.1334,107.775,40.7796,509.266,transcript_102081,ENSG00000197756.10
525,transcript_229410,194.057,15.5056,88.5941,90.6736,191.883,161.904,60.5593,162.968,86.538,...,115.288,130.312,147.645,16.0518,100.355,116.536,115.216,299.215,transcript_229410,ENSG00000150991.16
304,transcript_346879,1.89236,0.167479,1.81594,81.2511,0.00561,194.834,1.76788,1.29289,127.617,...,23.8188,1.95665,2.32538e-09,0.498969,1.29124,0.037405,0.857945,292.013,transcript_346879,ENSG00000196126.12
737,transcript_282857,188.651,188.076,181.17,160.731,187.546,130.206,140.292,186.387,146.438,...,214.366,157.565,200.279,237.394,101.755,201.705,101.022,265.157,transcript_282857,ENSG00000105193.9
542,transcript_239500,62.097,62.6318,234.88,191.692,38.4134,61.5826,69.3924,63.1722,63.6049,...,81.237,86.8512,99.0115,88.8089,215.233,58.2204,91.3379,234.88,transcript_239500,ENSG00000133112.17
101,transcript_102083,27.2318,38.4006,35.2529,41.4823,42.3924,42.4683,30.1631,32.1695,44.7523,...,54.5053,43.3113,55.0955,192.145,44.4811,47.1027,19.3335,231.366,transcript_102083,ENSG00000197756.10
138,transcript_134953,117.34,158.468,122.405,114.907,117.046,123.172,138.98,127.63,168.141,...,155.997,123.236,125.932,221.789,127.115,161.305,142.404,221.789,transcript_134953,ENSG00000232112.3
535,transcript_239361,37.5069,38.4891,174.973,116.69,28.5812,44.8938,53.4945,33.6503,44.3048,...,41.8343,61.2821,65.3081,63.7531,104.799,35.4848,51.6783,174.973,transcript_239361,ENSG00000133112.17
46,transcript_207293,2.96905,6.90695,153.15,170.177,3.45322,5.01338,31.3627,3.66373,7.30984,...,55.1088,52.0772,72.9264,146.927,112.846,2.12232,35.7163,173.801,transcript_207293,ENSG00000116251.11
461,transcript_33803,110.923,92.6796,85.5163,146.766,31.9265,115.859,96.5316,149.99,89.3016,...,154.674,120.068,124.83,41.3705,118.821,116.032,76.3958,166.055,transcript_33803,ENSG00000177600.10


In [13]:
# Display the filtered table; the rendered output below elides the middle
# rows and columns with '...'.
df

Unnamed: 0,tid,YRI5,YRI6,YRI7,HAC1,HAC2,HAC3,HAC4,HAC5,HAC6,...,CEU3,CEU4,CEU5,MPC4,YRI1,YRI2,YRI3,max_cpm,isoform,geneid.v
0,transcript_11288,0.824280,0.722806,0.000000,0.000000,0.946322,0.514791,0.849513,0.905785,0.839436,...,1.407110,2.633860,1.448400,1.940460,1.205510,0.674467,1.316290,5.13914,transcript_11288,ENSG00000054282.16
1,transcript_1667,0.000000,0.279376,0.107144,0.144927,0.222129,0.165249,0.401112,0.094033,0.000000,...,0.000000,0.122660,0.084622,0.000000,0.328226,0.076182,0.394974,18.32370,transcript_1667,ENSG00000064886.14
2,transcript_6058,2.379530,2.018300,2.058910,1.139050,3.823520,4.694190,3.405580,3.851690,4.166660,...,3.255720,2.789130,3.823130,0.739017,2.710830,2.695930,3.487930,6.42632,transcript_6058,ENSG00000072694.22
3,transcript_12208,1.244680,3.632200,8.091200,2.507410,1.327680,5.088190,5.646510,2.260810,3.796290,...,3.526090,3.435090,3.154800,4.946250,2.617120,2.788730,11.057600,11.05760,transcript_12208,ENSG00000090273.14
4,transcript_12212,0.638518,2.283400,2.161360,1.893690,1.179290,1.512120,1.887090,1.301280,1.191090,...,0.519564,2.866770,1.578080,3.078970,1.210020,1.345540,4.326680,5.45017,transcript_12212,ENSG00000090273.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,transcript_380569,1.317280,2.470770,0.439373,0.842221,2.866520,1.581960,2.869110,1.830730,2.046370,...,3.168480,2.477630,2.515660,6.885170,1.951650,1.317370,2.256330,6.88517,transcript_380569,ENSG00000198157.11
840,transcript_189933,1.727640,0.000000,0.000000,0.000000,4.123090,0.000000,4.152280,0.000000,3.106910,...,3.236730,1.774490,0.000000,3.443970,2.322330,0.000000,0.000000,5.76161,transcript_189933,ENSG00000198692.10
841,transcript_189954,1.812510,0.000000,0.000000,0.000000,2.655900,0.000000,2.016160,0.000000,3.242370,...,3.233580,1.849100,0.000000,2.508110,2.186710,0.000000,0.000000,6.53964,transcript_189954,ENSG00000198692.10
842,transcript_190157,1.715910,30.997400,0.149749,10.424600,23.465500,7.545030,19.807700,29.633300,7.200820,...,4.987470,7.932460,8.338280,0.000000,24.001700,8.907420,7.349950,42.84740,transcript_190157,ENSG00000277400.1
