In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [10]:
# Build a lookup from each tissue ID to the list of sample IDs recorded for it
# in the sample/tissue table.
sample_tiss = pd.read_csv('./data/sr_sample_tissue.csv')
tiss_dict = {
    tid: list(sample_tiss.loc[sample_tiss.TISSUE_ID == tid, "SAMPLE_ID"])
    for tid in set(sample_tiss.TISSUE_ID)
}

In [17]:
# Split gct file into separate tissues: one CSV per tissue, holding the two ID
# columns plus that tissue's sample columns.
# Runtime ~3hrs -- the full GCT file is re-parsed once per tissue, with
# `usecols` restricting each pass to the columns that tissue needs.
gct_path = "./data/short_read_transcript_counts.gct"
split_dir = "./data/SR_transcript_counts_by_tissue"

# Create the output directory before the long loop starts; otherwise a fresh
# checkout only fails at the first `to_csv`, after a full parse of the GCT.
os.makedirs(split_dir, exist_ok=True)

for tid in tqdm(set(sample_tiss.TISSUE_ID)):
    tiss_df = pd.read_csv(
        gct_path, sep="\t", low_memory=True,
        skiprows=2,  # skip the two lines preceding the column header row
        usecols=["transcript_id", "gene_id"] + tiss_dict[tid],
    )

    tiss_df.to_csv(f"{split_dir}/{tid}.csv", index=False)

100%|██████████| 54/54 [2:45:18<00:00, 183.67s/it]  


In [4]:
# Read the sample/tissue table and collect the distinct tissue IDs to process.
sample_tiss = pd.read_csv('./data/sr_sample_tissue.csv')
tids = set(sample_tiss["TISSUE_ID"])

In [None]:
# A transcript is kept only if it carries at least this fraction of the summed
# counts of its gene (within one tissue).
MIN_COUNT_SHARE = 0.05
FILTERED_DIR = "./data/SR_filtered_transcripts_by_tissue"


def _filter_low_share_transcripts(t_cts, min_share=MIN_COUNT_SHARE):
    """Drop transcripts whose share of their gene's total counts is below `min_share`.

    Parameters
    ----------
    t_cts : pandas.DataFrame
        Per-tissue count table with string columns ``gene_id`` and
        ``transcript_id`` plus one column per sample. Sample columns are
        recognised by having "GTEX" in their name.
    min_share : float
        Minimum fraction of the gene's summed counts a transcript must reach.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_id``, ``transcript_id``, ``agg_counts``, ``count_share``;
        rows ordered by ``gene_id``, keeping the input order within each gene.
        Rows with a missing ``gene_id`` are dropped, and so are all transcripts
        of a gene whose total count is zero (their share is 0/0 = NaN, which
        fails the threshold comparison).
    """
    # Only take sample columns to sum over
    samp_cols = [col for col in t_cts.columns if "GTEX" in col]

    # Keep only the part of each ID before the first "."
    transc_df = pd.DataFrame({
        "gene_id": t_cts["gene_id"].str.split(".").str[0],
        "transcript_id": t_cts["transcript_id"].str.split(".").str[0],
    })
    transc_df["agg_counts"] = t_cts[samp_cols].sum(axis=1)

    # Rows without a gene cannot be assigned to any group.
    transc_df = transc_df.dropna(subset=["gene_id"])

    # Per-row gene total in one vectorised pass instead of a Python loop that
    # assigns a new column onto every group slice.
    gene_totals = transc_df.groupby("gene_id")["agg_counts"].transform("sum")
    with_share = transc_df.assign(count_share=transc_df["agg_counts"] / gene_totals)

    kept = with_share[with_share["count_share"] >= min_share]

    # Stable sort: genes in sorted order, original row order inside each gene.
    return kept.sort_values("gene_id", kind="mergesort")


# Make sure the destination exists before any tissue is processed.
os.makedirs(FILTERED_DIR, exist_ok=True)

counter = 0
for tid in tids:
    # Obtain and preprocess tissue-specific counts
    t_cts = pd.read_csv(f"./data/SR_transcript_counts_by_tissue/{tid}.csv")

    # Remove transcripts with less than 5% count share
    filtered_df = _filter_low_share_transcripts(t_cts)
    filtered_df.to_csv(f"{FILTERED_DIR}/{tid}.csv", index=False)

    print(f"Finished {tid}: {counter}")
    counter += 1