In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# put the parent of the current working directory on the import path so the
# `scripts.*` imports below can be resolved (presumably that is where the
# `scripts/` package lives — confirm against the repo layout)
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# project configuration; `load_config` is not defined in this notebook, so it
# presumably comes from one of the star-imported `scripts` modules
config = load_config()
# prefix that `proc_cfg` prepends to paths taken from the config
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a path taken from the config.

    Every occurrence of '../../' is removed from `entry` and the result
    is prefixed with `od`.

    Parameters:
        entry (str): Path string from the config
        od (str): Prefix to prepend

    Returns:
        entry (str): `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [3]:
def compute_tau(df):
    """
    Compute tissue-specificity index (tau) for an abundance matrix output
    from `get_tpm_table`.

    For each row, with x = log2(value + 1) and n the number of columns:

        tau = sum_i(1 - x_i / max(x)) / (n - 1)

    (Yanai et al. 2005). Tau is 0 when a row has the same value in every
    column and 1 when it is non-zero in exactly one column.

    Parameters:
        df (pandas DataFrame): Abundance matrix, one row per transcript
            and one column per sample / tissue. Values are assumed to be
            non-negative (e.g. TPM).

    Returns:
        tau_df (pandas DataFrame): DataFrame with the same index as `df`
            and a single column 'tau'. Tau is NaN for rows whose maximum
            is 0 (no expression anywhere, so the ratio is undefined) and
            for every row when `df` has fewer than 2 columns.
    """
    # number of samples
    n = len(df.columns)
    print(f'Found {n} samples')

    # log-transform data and add 1 pseudocount to data
    log_df = np.log2(df + 1)

    # get max of TPM for transcript across samples
    row_max = log_df.max(axis=1)

    # Rows with no signal at all would give 0/0 = NaN for every sample; the
    # NaN-skipping row sum would then turn that into 0 and report the
    # transcript as perfectly ubiquitous. Mask them so they stay NaN.
    expressed = row_max > 0

    # compute x-hat (TPM / max TPM across tissues), dividing row-wise
    x_hat = log_df.div(row_max.where(expressed), axis=0)

    # compute sum of (1 - x-hat) across tissues
    total = (1 - x_hat).sum(axis=1)

    # the maximum always contributes 0 to the sum, so at most n-1 terms are
    # non-zero; dividing by n-1 lets tau span the full [0, 1] range
    denom = n - 1 if n > 1 else np.nan
    tau = (total / denom).where(expressed)

    tau_df = tau.to_frame(name='tau')
    # keep the column-axis name of the input on the result
    tau_df.columns.name = df.columns.name

    return tau_df

In [26]:
# path to the merged kallisto TPM matrix, taken from the config and re-rooted
# relative to `od`
f = expand(proc_cfg(config['gtex']['kallisto']['quant']['merge_matrix_tpm_tsv'], od))[0]

# meta = load_meta()
# meta = meta.loc[meta.merged_run_mode==True]
# sample_d = dict([(entry.cell_line_id, entry['sample']) \
#                  for ind, entry in meta.iterrows()])

df = pd.read_csv(f, sep='\t')

# reduce each quantification column name to what sits between the
# 'GTEx_v9_ONT_data_sequence_data_' prefix and the first '.'; the
# transcript id column is left untouched
df.columns = [
    col if col == 'transcript_id'
    else col.split('GTEx_v9_ONT_data_sequence_data_')[1].split('.')[0]
    for col in df.columns
]
# # df.rename(sample_d, axis=1, inplace=True)

# index by transcript id, then flip so that each row is one column of the
# input matrix and each column is one transcript
df = (
    df.rename(columns={'transcript_id': 'tid'})
      .set_index('tid')
      .transpose()
)

# add sample information so I can groupby different tissues
# NOTE(review): absolute, machine-specific path — consider moving it into the
# config so the notebook runs elsewhere
meta_file = '/Users/fairliereese/Documents/programming/mele_lab/projects/240903_pt/snakemake/gtex_lr-kallisto/GTEx_v9_metadata_with_ancestry.txt'
meta = pd.read_csv(meta_file, sep='\t')

# left join: every row of `df` is kept, matched on its index against the
# metadata's `sample_id` column
df = df.merge(meta,
              left_index=True,
              right_on='sample_id',
              how='left')

In [27]:
# move the index into a regular column, call that column `sample` (this
# relies on reset_index() naming it `index`), and keep only that column
temp = (
    df.reset_index()
      .rename(columns={'index': 'sample'})
      [['sample']]
)
temp.head()

tid,sample
0,GTEX-1192X-0011-R10a-SM-4RXXZ
1,GTEX-11H98-0011-R11b-SM-4SFLZ
2,GTEX-11TTK-0011-R7b-SM-4TVFS
3,GTEX-1211K-0826-SM-7LDFQ
4,GTEX-1313W-0011-R7b-SM-4ZL3U


In [30]:
# show the rows of `temp` that have no `sample_id`
# NOTE(review): the `temp` built in the cell above only has a `sample` column,
# and the execution counts jump (26, 27, 30) — this cell presumably ran against
# a different `temp`; confirm it still works after Restart & Run All
temp[temp.sample_id.isna()]

Unnamed: 0,sample,sample_id,date_of_sequencing,sample_name,tissue,protocol,mrna_rin,flush_buffer,amount_loaded_ng,run_time,...,aligned_reads,median_read_length_align,median_read_quality_aligned,WGS,data_center,RNA_extraction_method,3_prime_bias_median,3_prime_bias_sd,inferred_ancestry,RACE
