## Are there any transcripts that are expressed in all samples of one population and no samples of any other population?

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# project configuration; load_config is provided by one of the star-imported
# scripts.* modules above
config = load_config()
# prefix that proc_cfg prepends to a config path after removing '../../' from it
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path taken from the config.

    Parameters:
        entry (str): Path string from the config
        od (str): Prefix to put in front of the path

    Returns:
        entry (str): `entry` with every '../../' removed and `od` prepended
    """
    return od + entry.replace('../../', '')

In [69]:
# path of the merged kallisto TPM matrix named in the config, re-rooted with proc_cfg
tpm_cfg_entry = config['lr']['kallisto']['quant']['merge_matrix_tpm_tsv']
f = expand(proc_cfg(tpm_cfg_entry, od))[0]

# keep only the merged-run metadata rows and map cell line id -> sample name
meta = load_meta()
meta = meta.loc[meta.merged_run_mode==True]
sample_d = dict(zip(meta['cell_line_id'], meta['sample']))

df = pd.read_csv(f, sep='\t')
# every column except the transcript id keeps only the text before its first '_'
df.columns = [col if col == 'transcript_id' else col.split('_')[0]
              for col in df.columns]
# rename the id column, translate the remaining column names through sample_d,
# and index the matrix by transcript id
df = (df.rename(columns={'transcript_id':'tid'})
        .rename(columns=sample_d)
        .set_index('tid'))
# constant key used further down to merge the per-population sample counts
# onto every row
df['temp_merge'] = 1
df.head()

Unnamed: 0_level_0,YRI5,YRI6,YRI7,HAC1,HAC2,HAC3,HAC4,HAC5,HAC6,LWK1,...,CEU1,CEU2,CEU3,CEU4,CEU5,MPC4,YRI1,YRI2,YRI3,temp_merge
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000413811.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
transcript_6675,0.393218,0.0,0.552231,0.127078,0.386482,0.313976,0.498281,0.328355,0.437621,0.38532,...,0.269962,0.108892,0.172414,0.405509,0.073628,0.83542,0.214743,0.477583,0.59838,1
transcript_6676,0.95606,0.438777,1.18418,0.75706,1.551,0.26602,0.195867,0.268813,0.351186,0.504766,...,0.519386,0.566595,0.495574,1.00455,0.540054,0.39701,0.998557,0.344694,0.18226,1
transcript_6684,0.196609,1.2452,0.631121,0.0,0.552185,0.444888,0.39433,0.492533,0.475874,0.481651,...,0.539923,0.0,0.822254,0.354531,0.441766,0.858465,0.214743,0.477583,0.29919,1
transcript_6687,0.0,0.0,0.0,0.055554,0.0,0.088617,0.0,0.0,0.056812,0.0,...,0.0,0.247031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [70]:
# per population, add
#   n_exp_<pop>:  number of that population's samples with a value > 0
#   exp_in_<pop>: whether at least one of those samples has a value > 0
exp_in_pop_cols = []
for pop in meta['population'].unique():
    pop_samples = meta.loc[meta.population==pop, 'sample'].tolist()
    # only samples that are columns of the matrix can be counted
    pop_samples = [s for s in pop_samples if s in df.columns]
    n_expressed = (df[pop_samples]>0).sum(axis=1)
    df[f'n_exp_{pop}'] = n_expressed
    df[f'exp_in_{pop}'] = n_expressed>0
    exp_in_pop_cols.append(f'exp_in_{pop}')

# one-row table holding the number of unique samples per population
# (n_samples_<pop>), carrying the same constant temp_merge key as df
temp = (meta.groupby('population')['sample']
            .nunique()
            .to_frame('n_samples')
            .transpose())
temp.columns = [f'n_samples_{c}' for c in temp.columns]
temp['temp_merge'] = 1

In [71]:
# move tid back into a regular column, attach the per-population sample counts
# to every row through the constant temp_merge key, then discard the key
df = (df.reset_index()
        .merge(temp, how='left', on='temp_merge')
        .drop(columns='temp_merge'))

In [72]:
# call things as expressed in all samples of pop or not
exp_in_all_cols = []
for p in meta['population'].unique():
    samps = meta.loc[meta.population==p, 'sample'].tolist()
    samps = [s for s in samps if s in df.columns]
    df[f'exp_in_all_{p}'] = df[f'n_exp_{p}']==df[f'n_samples_{p}']
    exp_in_all_cols.append(f'exp_in_all_{p}')


In [73]:
# write the annotated table (row index included) to a scratch TSV in the
# working directory
df.to_csv('test.tsv', sep='\t', index=True)

In [77]:
# NOTE: the original author marked this cell "WRONG" without recording the
# reason -- treat the calls below with caution until that is resolved.
# number of populations with any expression / with expression in every sample
n_pops_exp = df[exp_in_pop_cols].sum(axis=1)
n_pops_exp_all = df[exp_in_all_cols].sum(axis=1)
# ultra population-specific: exactly one population is expressed in all of its
# samples and exactly one population shows any expression
df['ultra_pop_spec'] = (n_pops_exp_all==1)&(n_pops_exp==1)
# population-specific: expression is detected in exactly one population
df['pop_spec'] = n_pops_exp==1

In [78]:
# overwrite the scratch TSV now that ultra_pop_spec / pop_spec have been added
# (row index included)
df.to_csv('test.tsv', sep='\t', index=True)

In [58]:
# spot-check the row of a single transcript
df[df['tid'] == 'transcript_8852']

Unnamed: 0,tid,YRI5,YRI6,YRI7,HAC1,HAC2,HAC3,HAC4,HAC5,HAC6,...,n_samples_YRI,exp_in_all_ITU,exp_in_all_PEL,exp_in_all_HAC,exp_in_all_AJI,exp_in_all_LWK,exp_in_all_YRI,exp_in_all_CEU,exp_in_all_MPC,ultra_pop_spec
49,transcript_8852,0.450956,0.0,0.0,0.201096,0.340745,0.0,0.150552,0.347292,0.200877,...,7,False,False,False,True,False,False,False,False,True


In [76]:
# preview the per-population "expressed in all samples" calls of the rows
# flagged as ultra population-specific
ups_preview_cols = exp_in_all_cols+['ultra_pop_spec']
df.loc[df['ultra_pop_spec']==True, ups_preview_cols].head()

Unnamed: 0,exp_in_all_ITU,exp_in_all_PEL,exp_in_all_HAC,exp_in_all_AJI,exp_in_all_LWK,exp_in_all_YRI,exp_in_all_CEU,exp_in_all_MPC,ultra_pop_spec


In [84]:
# rows flagged as ultra population-specific
ups_df = df[df['ultra_pop_spec']==True]

In [85]:
# number of ultra population-specific transcripts
ups_df.shape[0]

0

In [86]:
# Result: ups_df is empty -- no transcript passes the ultra-population-specific (UPS) filter above

## Which populations have ultra-population-specific (UPS) transcripts?

In [79]:
# ups_df['ups_pop'] = df[exp_in_all_cols].idxmax(axis=1).str.split('_',expand=True)[3]

In [81]:
# ups_df[['tid', 'ups_pop']].groupby('ups_pop').nunique().rename({'tid':'n_t'}, axis=1)

## What does their expression look like?

In [82]:
# temp = ups_df.copy(deep=True)
# temp.drop([c for c in temp.columns if 'n_exp' in c], axis=1, inplace=True)
# temp.drop([c for c in temp.columns if 'n_samples' in c], axis=1, inplace=True)
# temp.drop([c for c in temp.columns if 'exp_in_all' in c], axis=1, inplace=True)
# temp.drop('ultra_pop_spec', axis=1, inplace=True)
# temp.head()

In [83]:
# # add population info
# df = df.transpose()
# df = df.merge(meta[['sample', 'population']], 
#               how='left',
#               left_index=True,
#               right_on='sample')
# df.drop('sample', axis=1, inplace=True)

## What biotypes are they?

## What structural categories are they?