In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [5]:
# Cohort code; used below both as the sub-directory name and as the file-name prefix.
cancer = "KIRC"
# NOTE(review): machine-specific absolute path -- adjust (or make configurable) before
# running this notebook on another machine.
base_path = "/home/colombelli/Documents/datasets/graph-omics/"

In [40]:
def load_data(base_path):
    """Read the gene, miRNA and methylation tables and return them transposed.

    Each input is a tab-separated file named ``base_path`` plus a fixed suffix,
    read with its first column as the row index. Every table is transposed so
    that the original column labels become the row index. For the gene and
    methylation tables the first column of the transposed frame is dropped
    (presumably a non-numeric annotation row in the raw file -- TODO confirm).
    All remaining columns are converted with ``pd.to_numeric``.

    Parameters
    ----------
    base_path : str
        Common path prefix of the three input files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gene, mirna, meth)`` in that order.
    """
    def _read_transposed(suffix):
        # All three inputs share one layout: tab-separated, identifiers in column 0.
        return pd.read_csv(base_path + suffix, sep='\t', index_col=0).T

    gene_t = _read_transposed('.uncv2.mRNAseq_RSEM_all.txt')
    mirna_t = _read_transposed('.miRseq_mature_RPM.txt')
    meth_t = _read_transposed('.meth.by_mean.data.txt')

    gene_num = gene_t.iloc[:, 1:].apply(pd.to_numeric)
    mirna_num = mirna_t.apply(pd.to_numeric)
    meth_num = meth_t.iloc[:, 1:].apply(pd.to_numeric)
    return gene_num, mirna_num, meth_num

def log2(gene, mirna):
    """Apply the transform ``log2(x + 1)`` to both inputs.

    Parameters
    ----------
    gene, mirna : array-like or pandas.DataFrame
        Numeric tables; each one is shifted by 1 and passed through ``np.log2``.

    Returns
    -------
    tuple
        ``(log2(gene + 1), log2(mirna + 1))``.
    """
    return tuple(np.log2(table + 1) for table in (gene, mirna))


# The mean (not the median) of the vial rows is used as the sample value.
def process_vial(df, idx):
    """Collapse vial-level rows into one row per sample by averaging.

    Each label in ``idx`` is a barcode already truncated to
    ``Project-TSS-Participant-Sample[Vial]``. When the last field is longer
    than two characters, its final character (the vial letter) is removed;
    labels that carry no vial letter are kept unchanged. Rows of ``df`` that
    end up sharing a label are averaged column-wise.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per entry of ``idx``, in the same order.
    idx : list of str
        Truncated barcodes, positionally aligned with the rows of ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sample-level label; the index is named ``'index'``.

    Raises
    ------
    ValueError
        If ``idx`` and ``df`` do not have the same number of entries.
    """
    if len(idx) != len(df):
        raise ValueError("idx must contain exactly one label per row of df")

    # Strip the vial letter only where one is present. Labels without a vial
    # must still be kept, otherwise the labels no longer line up with the rows.
    new_idx = [i[:-1] if len(i.split('-')[-1]) > 2 else i for i in idx]

    # Group by an external key instead of adding a helper column, so the
    # caller's frame is left untouched. Possibly repeated labels => mean.
    sample_keys = pd.Index(new_idx, name='index')
    return df.groupby(sample_keys).mean()
    

def process_indexes(df):
    """Shorten the barcodes in ``df.index`` to their first four fields.

    The kept fields are Project-TSS-Participant-Sample_Vial (vial if present,
    otherwise only the sample). If any shortened barcode still carries vial
    information (last field longer than two characters), the frame is handed
    to ``process_vial``; otherwise the index of ``df`` is replaced in place
    and ``df`` itself is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds '-'-separated barcode strings.

    Returns
    -------
    pandas.DataFrame
        Either the result of ``process_vial`` or ``df`` with its new index.

    Raises
    ------
    Exception
        If two rows map to the same shortened barcode.
    """
    short_codes = []
    for barcode in df.index:
        fields = barcode.split('-')
        short_codes.append('-'.join(fields[:4]))

    # Shortening must not merge two distinct rows into one label.
    if len(short_codes) > len(set(short_codes)):
        raise Exception("Indexes processing resulted in colliding indexes! Aborting...")

    # A last field longer than two characters means a vial letter is attached.
    has_vial = any(len(code.split('-')[-1]) > 2 for code in short_codes)
    if has_vial:
        return process_vial(df, short_codes)

    df.index = short_codes
    return df


def get_classes(df):
    """Split the barcodes in ``df.index`` into normal and tumoral samples.

    The last '-'-separated field of each barcode is read as the sample type.
    A single trailing letter (a vial letter, e.g. ``01A``) is ignored, so
    barcodes with and without vial information are both accepted.

    Codes up to 9 are tumoral samples, codes from 10 to 19 are normal
    samples, and anything above (control samples) is left out of both lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds '-'-separated barcode strings.

    Returns
    -------
    tuple of list
        ``(normal, tumoral)``: the barcodes of each class, in index order.

    Raises
    ------
    ValueError
        If the sample-type field is not numeric after removing the vial letter.
    """
    normal = []
    tumoral = []

    for barcode in df.index:
        sample_type = barcode.split('-')[-1]

        # Drop a trailing vial letter so that e.g. "01A" is read as 1.
        if sample_type[-1:].isalpha():
            sample_type = sample_type[:-1]
        sample_code = int(sample_type)

        if sample_code <= 9:  # Tumoral sample
            tumoral.append(barcode)
        elif sample_code <= 19:  # Normal sample
            normal.append(barcode)
        # else: control sample -> ignore

    return normal, tumoral

    

def build_class_columns(df, normal, tumoral):
    """Add a ``'class'`` column to ``df``: ``'0'`` for normal rows, ``'1'`` otherwise.

    The column is added to ``df`` in place and the same object is returned.

    NOTE(review): ``tumoral`` is not consulted. Every row whose label is not
    in ``normal`` receives ``'1'``, including labels that appear in neither
    list. The parameter is kept so existing callers keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are looked up in ``normal``.
    normal : iterable of str
        Labels of the normal samples.
    tumoral : iterable of str
        Labels of the tumoral samples (currently unused).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the extra ``'class'`` column.
    """
    # A set turns the per-row membership test into a hash lookup instead of
    # scanning the whole list for every row.
    normal_set = set(normal)
    df['class'] = ['0' if idx in normal_set else '1' for idx in df.index]
    return df


def get_final_dfs(gene_df, mirna_df, meth_df):
    """Run the full preprocessing on the three omics tables.

    Steps, applied to deep copies so the inputs are left untouched:

    1. ``log2(x + 1)`` on the gene and miRNA tables (methylation is not transformed).
    2. Barcode shortening / vial merging via ``process_indexes`` on all three.
    3. A ``'class'`` column on each table, derived from that table's own barcodes.

    Parameters
    ----------
    gene_df, mirna_df, meth_df : pandas.DataFrame
        Tables whose index holds the sample barcodes.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gene, mirna, meth)`` after processing.
    """
    gene = deepcopy(gene_df)
    mirna = deepcopy(mirna_df)
    meth = deepcopy(meth_df)

    gene, mirna = log2(gene, mirna)
    print("Processing gene expression...")
    gene = process_indexes(gene)
    print("\nProcessing miRNA expression...")
    mirna = process_indexes(mirna)
    print("\nProcessing methylation...")
    meth = process_indexes(meth)

    print()

    # Classify every table from its own barcodes. Reusing the lists built from
    # the gene table would label a normal sample that is only present in the
    # miRNA or methylation table as '1', because build_class_columns treats
    # everything outside ``normal`` as tumoral.
    gene = build_class_columns(gene, *get_classes(gene))
    mirna = build_class_columns(mirna, *get_classes(mirna))
    meth = build_class_columns(meth, *get_classes(meth))
    return gene, mirna, meth
    

def save_df(df, base_path, name):
    """Write ``df`` as CSV to ``base_path + name + ".csv"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save (the index is written as well).
    base_path : str
        Prefix of the output path; it is concatenated as-is, so it must end
        with a path separator when it names a directory.
    name : str
        File name without extension.
    """
    target = base_path + name + ".csv"
    df.to_csv(target)

In [42]:
# Files of one cohort live in <base_path><cancer>/ and each file name starts with <cancer>.
path = base_path + cancer + "/"
gene, mirna, meth = load_data(path + cancer)

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [43]:
# Preprocess the three tables, then store each one as <path><name>.csv.
f_gene, f_mirna, f_meth = get_final_dfs(gene, mirna, meth)

print("\n\nSaving dataframes...")
save_df(f_gene, path, "gene_proc")
save_df(f_mirna, path, "mirna_proc")
save_df(f_meth, path, "meth_proc")

Processing gene expression...

Processing miRNA expression...

Processing methylation...



Saving dataframes...


In [49]:
def process_clin_df(base_path, save_name):
    """Load the clinical table, upper-case its row labels and save it as CSV.

    The tab-separated file ``base_path + '.clin.merged.picked.txt'`` is read
    with its first column as index and then transposed. Every label of the
    transposed index is upper-cased, and the result is written to
    ``base_path + save_name + ".csv"``.

    Parameters
    ----------
    base_path : str
        Path prefix used for both the input and the output file.
    save_name : str
        Output file name without extension, appended directly to ``base_path``.

    Returns
    -------
    pandas.DataFrame
        The transposed clinical table with upper-cased index labels.
    """
    raw = pd.read_csv(base_path + '.clin.merged.picked.txt', sep='\t', index_col=0)
    clin = raw.T
    clin.index = [label.upper() for label in clin.index]

    out_file = base_path + save_name + ".csv"
    clin.to_csv(out_file)
    return clin

# NOTE(review): process_clin_df appends the save name directly to its prefix, so this
# writes <path><cancer>clin_proc.csv, whereas the other *_proc.csv files are saved as
# <path><name>.csv -- confirm the file name is intended.
clin = process_clin_df(path + cancer, "clin_proc")