## How lowly expressed are population-specific transcripts?

In [47]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Put the parent of the current working directory on the import path;
# presumably this is where the `scripts` package imported below lives.
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [48]:
# Project configuration returned by load_config (a helper that is not defined
# in this notebook; presumably it comes from the star-imports above)
config = load_config()
# Prefix handed to proc_cfg when re-rooting config paths for this notebook
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path string taken from the config.

    Every occurrence of '../../' in `entry` is removed and `od` is
    prepended to what remains.

    Parameters:
        entry (str): Path string from the config
        od (str): Prefix to put in front of the stripped path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

## How many population-specific transcripts are there?

In [53]:
# Master table with one `isoform` column plus sharing statistics
mt_df = pd.read_csv('../data/05_mastertable/29102024_PODER_mastertable.tsv', sep='\t')

# Flag population-specific transcripts: population_sharing equal to 1 and
# sample_sharing greater than 1 (presumably "seen in a single population but
# in more than one sample" -- confirm against the master table definition)
mt_df['pop_spec_t'] = mt_df['population_sharing'].eq(1) & mt_df['sample_sharing'].gt(1)

# Number of distinct isoforms carrying the flag
print(len(mt_df.loc[mt_df['pop_spec_t'], 'isoform'].unique()))

2267


## How many population-specific transcripts per population?

In [54]:
# Re-read the master table (same file as the cell under the previous heading)
# so that this section starts from the unfiltered table
mt_df = pd.read_csv('../data/05_mastertable/29102024_PODER_mastertable.tsv', sep='\t')

# Population-specific flag: population_sharing equal to 1 and
# sample_sharing greater than 1
mt_df['pop_spec_t'] = mt_df['population_sharing'].eq(1) & mt_df['sample_sharing'].gt(1)

# Number of distinct isoforms carrying the flag
print(len(mt_df.loc[mt_df['pop_spec_t'], 'isoform'].unique()))

2267


In [58]:
# limit to pop-spec
# .copy() makes the filtered table an independent frame, so the column that a
# later cell adds with mt_df[...] = ... is written to this frame itself and
# does not trip pandas' SettingWithCopyWarning
mt_df = mt_df.loc[mt_df.pop_spec_t==True].copy()

In [59]:
# get_population_colors is a helper not defined in this notebook; going by the
# names, c_dict is presumably a population -> colour mapping. `order` is the
# list of population codes shown below.
c_dict, order = get_population_colors()
order

['AJI', 'CEU', 'HAC', 'ITU', 'LWK', 'MPC', 'PEL', 'YRI']

In [61]:
# Label each row with the population column holding its largest value.
# For population-specific rows only one of these columns is expected to be
# non-zero (as in the preview below), so the arg-max names that population
# -- TODO confirm that this holds for every row.
mt_df['pop_spec_t_pop'] = mt_df[order].idxmax(axis='columns')
mt_df[[*order, 'pop_spec_t_pop']].head()

Unnamed: 0,AJI,CEU,HAC,ITU,LWK,MPC,PEL,YRI,pop_spec_t_pop
14,2,0,0,0,0,0,0,0,AJI
15,2,0,0,0,0,0,0,0,AJI
109,0,0,2,0,0,0,0,0,HAC
179,0,2,0,0,0,0,0,0,CEU
215,0,0,0,2,0,0,0,0,ITU


In [63]:
# Number of distinct isoforms assigned to each population label
temp = (
    mt_df.groupby('pop_spec_t_pop')['isoform']
    .nunique()
    .reset_index(name='n_pop_spec_t')
)
temp

Unnamed: 0,pop_spec_t_pop,n_pop_spec_t
0,AJI,345
1,CEU,247
2,HAC,422
3,ITU,199
4,LWK,197
5,MPC,224
6,PEL,360
7,YRI,273


In [64]:
# Median of the per-population isoform counts in the table above
temp['n_pop_spec_t'].median()

260.0

In [49]:
# Sample metadata, restricted to the rows flagged merged_run_mode
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]

# cell_line_id -> sample lookup used below to relabel the matrix columns
# (bracket access for 'sample' because DataFrame.sample is a method)
sample_d = dict(zip(meta['cell_line_id'], meta['sample']))

# Path of the merged matrix comes from the config entry
# lr > kallisto > quant > merge_matrix_tpm_tsv, re-rooted with proc_cfg
f = expand(proc_cfg(config['lr']['kallisto']['quant']['merge_matrix_tpm_tsv'],od))[0]
df = pd.read_csv(f, sep='\t')

# Keep only the part of each column name before the first '_'
# (the transcript_id column is left untouched) ...
df.columns = [d if d == 'transcript_id' else d.split('_')[0] for d in df.columns]

# ... then rename the id column to 'tid' and map the remaining column names
# through sample_d; the two renames stay sequential, as in the original
df = df.rename(columns={'transcript_id': 'tid'}).rename(columns=sample_d)

In [29]:
# melt: wide (one column per sample) -> long (one row per tid / sample pair),
# with the matrix value stored in the 'counts' column
df = pd.melt(df, id_vars=['tid'], var_name='sample', value_name='counts')

In [30]:
# remove all unexpressed
# .copy() makes the filtered table an independent frame, so the 'decile'
# column assigned in a later cell is written to this frame itself and does
# not trip pandas' SettingWithCopyWarning
df = df.loc[df.counts>0].copy()

In [31]:
# get decile bins for overall transcripts
# The quantiles are computed over every row currently in df; labels=False
# yields integer bin codes and retbins=True also returns the bin boundaries.
decile_codes, bin_edges = pd.qcut(df['counts'], q=10, labels=False, retbins=True)
df['decile'] = decile_codes

In [32]:
# preview the long-format expression table with its decile labels
df.head()

Unnamed: 0,tid,sample,counts,decile
1,transcript_6675,YRI5,0.393218,5
2,transcript_6676,YRI5,0.95606,7
3,transcript_6684,YRI5,0.196609,3
6,transcript_11363,YRI5,0.489169,5
7,transcript_11364,YRI5,0.366877,5


In [33]:
# boundaries of the quantile bins returned by pd.qcut (q=10)
bin_edges

array([4.476850e-10, 7.754400e-02, 1.132000e-01, 1.630210e-01,
       2.372280e-01, 3.477610e-01, 5.332138e-01, 8.884990e-01,
       1.760984e+00, 5.228652e+00, 1.344310e+05])

In [52]:
# get the population-specific transcripts
# NOTE(review): this cell reads poder_master_table_fixed_genics.tsv, whereas
# the cells at the top of the notebook read 29102024_PODER_mastertable.tsv --
# confirm which master table is the intended one.
mt_df = pd.read_csv('../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')

# Population-specific flag: population_sharing equal to 1 and
# sample_sharing greater than 1
mt_df['pop_spec_t'] = mt_df['population_sharing'].eq(1) & mt_df['sample_sharing'].gt(1)
print(len(mt_df.loc[mt_df['pop_spec_t'], 'isoform'].unique()))

# Attach the flag to every expression row; with a left join, transcripts
# absent from the master table keep their rows and get NaN in the new columns
df = pd.merge(
    df,
    mt_df[['isoform', 'pop_spec_t']],
    how='left',
    left_on='tid',
    right_on='isoform',
)

2267


In [42]:
# get max expression of each transcript
# Keep only rows whose flag equals True (rows left as NaN by the merge
# compare unequal and are dropped), ordered from highest to lowest counts
df = (
    df[df['pop_spec_t'].eq(True)]
    .sort_values('counts', ascending=False)
)
# The first row per tid in that ordering is its highest-counts row
temp = df.drop_duplicates(subset=['tid'])

In [43]:
# preview the highest-counts row kept for each transcript
temp.head()

Unnamed: 0,tid,sample,counts,decile,isoform,pop_spec_t
1104714,transcript_120938,AJI2,42091.8,9,transcript_120938,True
2482232,transcript_185237,CEU4,15534.4,9,transcript_185237,True
165693,transcript_241802,YRI7,243.185,9,transcript_241802,True
1707919,transcript_121711,PEL5,162.143,9,transcript_121711,True
368607,transcript_243565,HAC3,63.1865,9,transcript_243565,True


In [45]:
# look at the individual per-sample rows of one transcript
# (df is sorted by descending counts at this point)
df.loc[df.tid=='transcript_120938'].head()

Unnamed: 0,tid,sample,counts,decile,isoform,pop_spec_t
1104714,transcript_120938,AJI2,42091.8,9,transcript_120938,True
2291903,transcript_120938,CEU1,1439.72,9,transcript_120938,True
980833,transcript_120938,LWK5,1318.83,9,transcript_120938,True
1503405,transcript_120938,PEL2,942.82,9,transcript_120938,True
1966127,transcript_120938,ITU3,153.732,9,transcript_120938,True


In [46]:
# get count of pop spec t in each decile
# (temp holds one row per tid, so this counts transcripts per decile of
# their highest-counts row)
temp.groupby('decile')['tid'].count().to_frame()

Unnamed: 0_level_0,tid
decile,Unnamed: 1_level_1
0,174
1,204
2,244
3,269
4,342
5,295
6,210
7,133
8,104
9,40


In [37]:
# number of distinct transcripts left in the filtered expression table
len(pd.unique(df['tid']))

2015

In [38]:
# number of rows (tid / sample pairs) left in the filtered expression table
df.shape[0]

23562