In [1]:
import pandas as pd
import numpy as np
import itertools
import os

In [18]:
def _collect_condition_samples(dfs, srcs, gene_cols, level):
    """Stack the sample columns of ``dfs`` side by side and describe each sample.

    Returns a ``(samples, col_data)`` pair: ``samples`` is the column-wise
    concatenation of every frame in ``dfs`` with ``gene_cols`` dropped, and
    ``col_data`` has one row per sample column holding its name, ``level`` as
    the condition, and the entry of ``srcs`` for the frame it came from.
    """
    sample_frames = [df.drop(columns=gene_cols) for df in dfs]
    samples = pd.concat(sample_frames, axis=1)

    # Count labels from the frames that are actually concatenated so the
    # source list always lines up one-to-one with the sample columns.
    sources = []
    for i, frame in enumerate(sample_frames):
        sources.extend([srcs[i]] * frame.shape[1])

    col_data = pd.DataFrame(
        {
            "sample_name": list(samples.columns),
            "condition": [level] * samples.shape[1],
            "data_source": sources,
        },
        columns=["sample_name", "condition", "data_source"],
    )
    return samples, col_data


def compile_data_for_DESeq(gene_cols, healthy_dfs, healthy_srcs, tumor_dfs, tumor_srcs, healthy_level, tumor_level):
    """Merge healthy and tumor count tables into one counts table plus column metadata.

    Parameters
    ----------
    gene_cols : list
        Labels of the gene-identifier columns present in every dataframe.
    healthy_dfs, tumor_dfs : list of pandas.DataFrame
        Tables containing ``gene_cols`` plus one column per sample.
    healthy_srcs, tumor_srcs : list
        One data-source label per dataframe, in the same order as the
        corresponding dataframe list.
    healthy_level, tumor_level
        Value written to the ``condition`` column for healthy / tumor samples.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The counts table (gene columns of the first healthy dataframe, then
        all healthy sample columns, then all tumor sample columns) and the
        column-data table with columns ``sample_name``, ``condition`` and
        ``data_source`` (healthy rows first, index renumbered).

    Raises
    ------
    AssertionError
        If the gene columns of any dataframe differ from those of the first
        healthy dataframe (values, dtypes or index).
    """
    gene_col_df = healthy_dfs[0][gene_cols]

    # `all(df_a == df_b)` iterates over column *labels* and is therefore always
    # truthy; DataFrame.equals really compares values, dtypes and the index.
    # The index matters as well because the column-wise concat aligns on it.
    # An explicit raise (rather than `assert`) keeps the check under `python -O`.
    for label, frames in (("healthy", healthy_dfs), ("tumor", tumor_dfs)):
        for i, frame in enumerate(frames):
            if not gene_col_df.equals(frame[gene_cols]):
                raise AssertionError(
                    f"Row ordering (order of gene symbols) violated by {label} dataframe {i}"
                )

    all_healthy_df, col_data_healthy_df = _collect_condition_samples(
        healthy_dfs, healthy_srcs, gene_cols, healthy_level
    )
    all_tumor_df, col_data_tumor_df = _collect_condition_samples(
        tumor_dfs, tumor_srcs, gene_cols, tumor_level
    )

    counts_df = pd.concat([gene_col_df, all_healthy_df, all_tumor_df], axis=1)
    col_data_df = pd.concat([col_data_healthy_df, col_data_tumor_df], axis=0, ignore_index=True)
    return counts_df, col_data_df


def _infer_data_source(file_name):
    """Return the data-source label ("TCGA" or "GTEx") named in ``file_name``.

    Matching is case-insensitive and "tcga" is checked first. Raises
    ``ValueError`` when neither marker occurs in the name.
    """
    lowered = file_name.lower()
    if "tcga" in lowered:
        return "TCGA"
    if "gtex" in lowered:
        return "GTEx"
    raise ValueError(
        f"Cannot infer data source from file name {file_name!r}: expected it to contain 'tcga' or 'gtex'"
    )


def save_for_DESeq(data_dir, healthy_files, tumor_files, dset_name, gene_cols, healthy_level, tumor_level):
    """Read per-source count files, combine them and write DESeq input tables.

    Each name in ``healthy_files`` / ``tumor_files`` is read from ``data_dir``
    as a gzip-compressed, tab-separated table. The tables are combined with
    ``compile_data_for_DESeq`` and the results are written, tab-separated and
    without the index, to ``{data_dir}/{dset_name}/counts.tsv`` and
    ``{data_dir}/{dset_name}/coldata.tsv``. The output directory is created
    if it does not exist.

    Parameters
    ----------
    data_dir : str
        Directory holding the input files; also the parent of the output directory.
    healthy_files, tumor_files : list of str
        Input file names. Each must contain "tcga" or "gtex" (any case) so
        that its data source can be labelled.
    dset_name : str
        Name of the output sub-directory.
    gene_cols, healthy_level, tumor_level
        Forwarded unchanged to ``compile_data_for_DESeq``.

    Raises
    ------
    ValueError
        If a file name contains neither "tcga" nor "gtex".
    """
    # Resolve sources before touching the disk. A name that matched neither
    # marker used to be skipped, leaving the source list shorter than the
    # frame list and shifting labels onto the wrong files.
    healthy_srcs = [_infer_data_source(fn) for fn in healthy_files]
    tumor_srcs = [_infer_data_source(fn) for fn in tumor_files]

    healthy_dfs = [pd.read_csv(f"{data_dir}/{fn}", sep="\t", compression="gzip") for fn in healthy_files]
    tumor_dfs = [pd.read_csv(f"{data_dir}/{fn}", sep="\t", compression="gzip") for fn in tumor_files]

    counts_df, coldata_df = compile_data_for_DESeq(
        gene_cols, healthy_dfs, healthy_srcs, tumor_dfs, tumor_srcs, healthy_level, tumor_level
    )

    out_dir = f"{data_dir}/{dset_name}"
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(out_dir, exist_ok=True)

    counts_df.to_csv(f"{out_dir}/counts.tsv", sep="\t", index=False)
    coldata_df.to_csv(f"{out_dir}/coldata.tsv", sep="\t", index=False)

In [19]:
# The first line of ../dev_paths.txt is taken as the root data directory.
with open('../dev_paths.txt') as f:
    data_dir = f.readline().strip()

# Input file names for the cervical dataset, grouped by condition.
cvx_healthy = [
    "cervixrsemcountgtex.txt.gz",
    "cescrsemcounttcga.txt.gz",
]
cvx_tumor = [
    "cescrsemcounttcgat.txt.gz",
]
# ut_healthy = ["uterusrsemcountgtex.txt.gz", "ucecrsemcounttcga.txt.gz"]
# ut_tumor = ["ucecrsemcounttcgat.txt.gz", "ucsrsemcounttcgat.txt.gz"]

# Gene-identifier columns shared by every input table.
gene_cols = [
    "Hugo_Symbol",
    "Entrez_Gene_Id",
]

In [20]:
# Build and write the unified cervical dataset (healthy vs. tumor samples).
save_for_DESeq(
    data_dir=data_dir,
    healthy_files=cvx_healthy,
    tumor_files=cvx_tumor,
    dset_name="unified_cervical_data",
    gene_cols=gene_cols,
    healthy_level="healthy",
    tumor_level="tumor",
)
# save_for_DESeq(data_dir, ut_healthy, ut_tumor, "unified_uterine_data", gene_cols, "healthy", "tumor")