In [84]:
import pandas as pd
import numpy as np
from scipy.stats import iqr

In [106]:
# Folder that holds the input tables read by load_data. It has to end with a
# path separator because load_data builds file paths by string concatenation.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
base = "/home/colombelli/Documents/datasets/pankidney firebrowse/"

def load_data(base):
    """Read the four tab-separated input tables and return them transposed.

    Parameters
    ----------
    base : str
        Folder prefix. File names are appended by string concatenation, so it
        has to end with a path separator.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gene, mirna, meth, clin)``, each transposed relative to its file.
        After transposing, ``meth`` loses its first column and ``clin`` is
        reduced to the single column at position 6 (presumably the pathologic
        stage that get_classes reads — confirm against the clinical file).
    """
    def read_table(file_name):
        # First column of every file is used as the row index.
        return pd.read_csv(base + file_name, sep='\t', index_col=0)

    gene_raw = read_table('gene.txt')
    mirna_raw = read_table('mirna.txt')
    meth_raw = read_table('meth.txt')
    clin_raw = read_table('clin.txt')

    meth_trimmed = meth_raw.T.iloc[:, 1:]
    clin_trimmed = clin_raw.T.iloc[:, [6]]
    return gene_raw.T, mirna_raw.T, meth_trimmed, clin_trimmed

def log2(gene, mirna):
    """Apply a ``log2(x + 1)`` transform to both expression tables.

    The inputs are not modified; the two transformed objects are returned as
    ``(gene, mirna)`` in that order. Because of the ``+ 1`` offset, an entry of
    0 maps to 0.
    """
    gene_log = np.log2(gene + 1)
    mirna_log = np.log2(mirna + 1)
    return gene_log, mirna_log

def process_nan_values(df, max_nan_fraction=0.05):
    """Drop columns with too many missing values and zero-fill the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    max_nan_fraction : float, optional
        A column is dropped when its NaN count is strictly greater than
        ``len(df) * max_nan_fraction``. Defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained columns with remaining NaNs set to 0.
    """
    max_nan_values = len(df) * max_nan_fraction

    # Count NaNs for all columns in one pass and select with a single boolean
    # mask; dropping columns one at a time copies the whole frame per drop.
    # The positional mask (to_numpy) also works with duplicate column names.
    nan_counts = df.isnull().sum()
    keep_mask = (nan_counts <= max_nan_values).to_numpy()
    processed_df = df.loc[:, keep_mask]

    print("Columns before nan processing: ", len(df.columns))
    print("Columns after nan processing: ", len(processed_df.columns))
    return processed_df.fillna(0)


# Currently the mean of the vial values is used as the sample value
def process_vial(df, idx):
    """Collapse vial-level rows into one row per sample by averaging.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per entry of ``idx`` (same order). Not modified.
    idx : sequence of str
        Dash-separated labels. A label whose last field is longer than two
        characters is treated as carrying a trailing vial character, which is
        stripped; any other label is kept unchanged.

    Returns
    -------
    pandas.DataFrame
        Column-wise mean of the rows sharing the same stripped label. The
        resulting index is named ``'index'``.
    """
    # Labels without a vial character must be kept as they are; skipping them
    # would make the label list shorter than the frame.
    new_idx = [
        label[:-1] if len(label.split('-')[-1]) > 2 else label
        for label in idx
    ]

    # Group by an external label array instead of writing a helper column into
    # the caller's frame. Stripping can produce repeated labels => mean.
    group_keys = np.asarray(new_idx, dtype=object)
    return df.groupby(group_keys).mean().rename_axis('index')
    

def process_indexes(df):
    """Shorten the row labels of ``df`` to their first four dash-separated fields.

    The kept fields are Project-TSS-Participant-Sample_Vial (vial if present,
    otherwise only the sample).

    If any shortened label still ends in a field longer than two characters
    (vial information present), the work is delegated to ``process_vial`` and
    its result is returned. Otherwise the index of ``df`` is replaced in place
    and ``df`` itself is returned.

    Raises
    ------
    Exception
        When two rows end up with the same shortened label.
    """
    shortened = []
    for barcode in df.index:
        fields = barcode.split('-')
        shortened.append('-'.join(fields[:4]))

    # Shortening must not merge distinct rows.
    if len(shortened) != len(set(shortened)):
        raise Exception("Indexes processing resulted in colliding indexes! Aborting...")

    # A last field longer than two characters means a vial letter is attached.
    has_vial = any(len(label.split('-')[-1]) > 2 for label in shortened)
    if has_vial:
        return process_vial(df, shortened)

    df.index = shortened
    return df
    
    

def get_classes(df, clin):
    """Sort the row labels of ``df`` into normal / stage I-IV lists.

    Each label is split on ``'-'``: the last field is the numeric sample type,
    the remaining fields (lower-cased and re-joined) form the patient id used
    to look up ``clin.loc[patient_id, 'pathologic_stage']``.

    * sample type <= 9: tumoral sample, filed under its pathologic stage;
      a stage other than 'stage i' .. 'stage iv' is reported and skipped.
    * sample type 10..19: normal sample.
    * anything above: control sample, ignored.

    Returns
    -------
    tuple of list
        ``(normal, stage1, stage2, stage3, stage4)`` with the matching labels.
    """
    normal, stage1, stage2, stage3, stage4 = [], [], [], [], []
    stage_targets = (
        ('stage i', stage1),
        ('stage ii', stage2),
        ('stage iii', stage3),
        ('stage iv', stage4),
    )

    for barcode in df.index:
        *patient_fields, sample_type = barcode.split('-')
        patient_id = '-'.join(patient_fields).lower()
        type_code = int(sample_type)

        if type_code > 19:
            # Control sample -> ignore
            continue
        if type_code > 9:
            # Normal sample
            normal.append(barcode)
            continue

        # Tumoral sample: file it under the stage recorded for the patient.
        stage = clin.loc[patient_id, 'pathologic_stage']
        for label, bucket in stage_targets:
            if stage == label:
                bucket.append(barcode)
                break
        else:
            print("Unexpected stage ("+str(stage)+") for patient: ", patient_id, "\nIgnoring...")

    return normal, stage1, stage2, stage3, stage4


def save_splitted_df(base_path, df, normal, stage1, stage2, stage3, stage4):
    """Write one CSV per class containing the rows of ``df`` in that class.

    ``base_path`` is used as a plain string prefix for the file names
    ``normal.csv`` and ``stage1.csv`` .. ``stage4.csv``; each file holds the
    rows whose index label appears in the corresponding list.
    """
    outputs = (
        ("normal.csv", normal),
        ("stage1.csv", stage1),
        ("stage2.csv", stage2),
        ("stage3.csv", stage3),
        ("stage4.csv", stage4),
    )
    for file_name, members in outputs:
        in_class = df.index.isin(members)
        df[in_class].to_csv(base_path + file_name)
    return


def save_processed_dfs(base_path, gene, mirna, meth):
    """Write the three processed tables as CSV files.

    ``base_path`` is used as a plain string prefix for ``gene_proc.csv``,
    ``mirna_proc.csv`` and ``meth_proc.csv``, written in that order.
    """
    targets = (
        (gene, "gene_proc.csv"),
        (mirna, "mirna_proc.csv"),
        (meth, "meth_proc.csv"),
    )
    for table, file_name in targets:
        table.to_csv(base_path + file_name)
    return
    

def save_dfs(base_path, dataframes, stages, only_common=True):
    """Filter the omics tables to labelled samples and write them to disk.

    Parameters
    ----------
    base_path : str
        String prefix for every output file; it has to end with a path
        separator. Per-class files go to ``base_path + "split_class/"``, which
        is created if it does not exist yet.
    dataframes : sequence
        ``[gene, mirna, meth]`` tables indexed by sample label.
    stages : sequence of list
        ``[normal, stage1, stage2, stage3, stage4]`` sample labels per class.
    only_common : bool, optional
        If True, the saved tables contain only the samples present in all
        three tables, in the order they appear in the gene table.

    Notes
    -----
    Only the gene table is split by class (via ``save_splitted_df``); all
    three filtered tables are saved via ``save_processed_dfs``.
    """
    import os  # local import so the block does not depend on the notebook's import cell

    gene, mirna, meth = dataframes
    normal, stage1, stage2, stage3, stage4 = stages

    # Eliminate samples without label
    all_possible_indexes = [item for sublist in stages for item in sublist]
    gene = gene[gene.index.isin(all_possible_indexes)]
    mirna = mirna[mirna.index.isin(all_possible_indexes)]
    meth = meth[meth.index.isin(all_possible_indexes)]

    if only_common:
        cs = set(gene.index) & set(mirna.index) & set(meth.index)
        # Iterating a set of strings yields a run-dependent order (string
        # hashing is randomised per process), which would shuffle the rows of
        # the saved files between runs. Follow the gene table's order instead.
        common_samples = [sample for sample in gene.index.unique() if sample in cs]

        print("Number of common samples (inter-omics) by class:")
        print("normal: ", len(cs&set(normal)))
        print("stage1: ", len(cs&set(stage1)))
        print("stage2: ", len(cs&set(stage2)))
        print("stage3: ", len(cs&set(stage3)))
        print("stage4: ", len(cs&set(stage4)))
        print("\nTotal samples: ", len(cs))

        gene = gene.loc[common_samples, :]
        mirna = mirna.loc[common_samples, :]
        meth = meth.loc[common_samples, :]

    split_dir = base_path + "split_class/"
    # Writing would fail if the sub-folder were missing.
    os.makedirs(split_dir, exist_ok=True)

    save_splitted_df(split_dir, gene, normal, stage1, stage2, stage3, stage4)
    save_processed_dfs(base_path, gene, mirna, meth)
    return

In [99]:
# Read the four input tables from the dataset folder (see load_data).
gene, mirna, meth, clin = load_data(base)

  if (await self.run_code(code, result,  async_=asy)):


In [100]:
# NOTE(review): this cell rebinds gene/mirna/meth, so it is not idempotent —
# running it again without re-running the loading cell applies log2 twice.
# log2(x + 1) is applied to the two expression tables only; meth is left as is.
gene, mirna = log2(gene, mirna)

# For each table: drop NaN-heavy columns and zero-fill the rest
# (process_nan_values), then shorten the row labels (process_indexes).
print("Gene expression")
gene = process_indexes(process_nan_values(gene))
print("\nmiRNA expression")
mirna = process_indexes(process_nan_values(mirna))
print("\nMethylation")
meth = process_indexes(process_nan_values(meth))

print()
# Class lists are built from the row labels of the gene table only.
normal, stage1, stage2, stage3, stage4 = get_classes(gene, clin)

Gene expression
Columns before nan processing:  20531
Columns after nan processing:  20531

miRNA expression
Columns before nan processing:  2588
Columns after nan processing:  341

Methylation
Columns before nan processing:  20116
Columns after nan processing:  20116

Unexpected stage (nan) for patient:  tcga-bp-4798 
Ignoring...
Unexpected stage (nan) for patient:  tcga-mm-a563 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jv 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jw 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jy 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jz 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k0 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k2 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k3 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k4 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k6 
Ignoring...
Unexpected stage (nan) for patient:  

In [101]:
# Report the size of each class list returned by get_classes, then their sum.
print("Normal samples: ", len(normal))
print("Stage 1 samples: ", len(stage1))
print("Stage 2 samples: ", len(stage2))
print("Stage 3 samples: ", len(stage3))
print("Stage 4 samples: ", len(stage4))

print("\nTotal samples: ", len(normal)+len(stage1)+len(stage2)+len(stage3)+len(stage4))

Normal samples:  129
Stage 1 samples:  461
Stage 2 samples:  104
Stage 3 samples:  189
Stage 4 samples:  105

Total samples:  988


In [107]:
# NOTE(review): same absolute, machine-specific folder as `base` above.
base_path = "/home/colombelli/Documents/datasets/pankidney firebrowse/"
# Argument order expected by save_dfs: [gene, mirna, meth] and
# [normal, stage1, stage2, stage3, stage4].
dataframes=[gene,mirna,meth]
stages=[normal, stage1, stage2, stage3, stage4]
# only_common=True keeps just the samples present in all three tables.
save_dfs(base_path, dataframes, stages, only_common=True)

Number of common samples (inter-omics) by class:
normal:  47
stage1:  284
stage2:  65
stage3:  98
stage4:  53

Total samples:  547


# Second processing part

### Feature selection by interquartile range (IQR)

In [112]:
def select_k_highest_iqr(k, df):
    """Return the ``k`` columns of ``df`` with the largest interquartile range.

    The IQR is computed per column with ``scipy.stats.iqr``. The returned
    columns are ordered by decreasing IQR; if ``k`` exceeds the number of
    columns, all columns are returned.
    """
    spreads = np.array([iqr(df[name]) for name in df.columns])
    # Sorting the negated values ascending ranks the largest IQR first.
    ranking = np.argsort(-spreads)
    top_positions = ranking[:k]
    return df.iloc[:, top_positions]


# k_iqrs: [gene_expr, mirna_expr, meth_expr]
def build_iqr_features_df(base_path, k_iqrs, drop_samples_with_missing_features=True):
    """Build and save a feature table from the top-IQR columns of each omic.

    Reads ``gene_proc.csv``, ``mirna_proc.csv`` and ``meth_proc.csv`` from
    ``base_path`` (plain string prefix), keeps the ``k_iqrs[i]`` highest-IQR
    columns of the i-th table, and concatenates the selections column-wise.

    Parameters
    ----------
    base_path : str
        Prefix for the input files and for ``stellargraph/features.csv``.
    k_iqrs : sequence of int
        Number of columns to keep for gene, miRNA and methylation, in that order.
    drop_samples_with_missing_features : bool, optional
        If True, rows containing any NaN after concatenation are removed.

    Returns
    -------
    pandas.DataFrame
        The feature table that was written to disk.
    """
    source_files = ("gene_proc.csv", "mirna_proc.csv", "meth_proc.csv")
    tables = [pd.read_csv(base_path + file_name, index_col=0) for file_name in source_files]

    selected = [
        select_k_highest_iqr(k_iqrs[position], table)
        for position, table in enumerate(tables)
    ]

    features_df = pd.concat(selected, axis=1)
    if drop_samples_with_missing_features:
        features_df = features_df.dropna(axis=0)

    features_df.to_csv(base_path + "stellargraph/features.csv")
    return features_df

In [113]:
# Number of highest-IQR columns to keep from each table.
gene_k_iqr = 500
mirna_k_iqr = 100
meth_k_iqr = 100

# NOTE(review): absolute, machine-specific folder.
base_path="/home/colombelli/Documents/datasets/pankidney firebrowse/"
# The trailing False is drop_samples_with_missing_features: rows with NaNs are kept.
fdf = build_iqr_features_df(base_path, [gene_k_iqr, mirna_k_iqr, meth_k_iqr], False)

In [114]:
# Show the assembled feature table (the notebook renders a truncated view).
fdf

Unnamed: 0_level_0,RPS4Y1|6192,XIST|7503,CA9|768,DDX3Y|8653,PTGER3|5733,KDM5D|8284,GSTA2|2939,SLC6A3|6531,EIF1AY|9086,RAB25|57111,...,OR9K2,OR56B1,OTOL1,C14orf72,DEFB126,EMR4P,OR4C16,PRO0611,RXFP4,OR6P1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-B0-5400-01,5.138925,11.887151,11.928962,3.690160,8.401677,2.990265,10.354646,7.980428,0.000000,2.313014,...,0.589344,0.920016,0.629488,0.326077,0.689465,0.479498,0.813038,0.733196,0.632240,0.417707
TCGA-G7-A8LE-01,8.443256,2.986848,2.030936,7.856714,4.473047,7.145596,0.000000,1.268674,5.274359,10.021545,...,0.822448,0.800152,0.891864,0.177494,0.507953,0.476607,0.909109,0.894191,0.299492,0.641600
TCGA-P4-A5EA-01,0.000000,11.559032,0.000000,0.000000,6.852226,0.000000,6.241711,1.207955,0.000000,10.169780,...,0.362346,0.560984,0.337567,0.234217,0.477036,0.314490,0.519789,0.680685,0.692522,0.348567
TCGA-SX-A7SO-01,12.474486,0.000000,2.543199,10.862856,8.857511,10.304225,5.121082,3.868726,9.138974,0.000000,...,0.257150,0.743057,0.471950,0.521039,0.453159,0.331517,0.562262,0.578927,0.496706,0.138835
TCGA-B0-5694-01,12.820641,3.516885,12.952463,10.314900,6.801565,9.505929,13.488926,10.030239,8.954390,5.495347,...,0.485916,0.887604,0.650508,0.268488,0.669350,0.714459,0.876791,0.791309,0.583663,0.748776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-CJ-5677-01,1.086920,12.105577,12.180472,1.425620,8.373600,0.000000,12.240359,7.080499,0.000000,0.000000,...,0.414378,0.803543,0.645572,0.390250,0.459866,0.400397,0.600525,0.729499,0.619339,0.377630
TCGA-BQ-7045-01,8.257415,1.734785,9.034842,6.238504,1.362666,4.530988,2.499935,0.000000,4.294716,10.711983,...,0.756071,0.519462,0.791488,0.351588,0.333179,0.510279,0.822644,0.900117,0.546166,0.406740
TCGA-MH-A855-01,0.000000,11.519647,4.335526,0.000000,5.048485,0.000000,3.193236,0.492930,0.000000,3.483274,...,0.743566,0.912884,0.594592,0.421526,0.468252,0.448820,0.747629,0.797487,0.747395,0.620825
TCGA-MH-A854-01,0.000000,7.188755,7.149929,0.000000,3.984334,1.332966,1.332966,0.815248,0.000000,10.587224,...,0.692744,0.926121,0.754101,0.255937,0.563892,0.499358,0.886419,0.912725,0.281007,0.872019
