In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import iqr

In [9]:
# Prefix of the input tables. load_data() appends the file names to it verbatim,
# so the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
base = "/home/colombelli/Documents/datasets/pankidney firebrowse/"

def load_data(base):
    """Read the four tab-separated input tables and return them transposed.

    Parameters
    ----------
    base : str
        Path prefix the file names are appended to verbatim (a directory
        path therefore needs its trailing separator).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gene, mirna, meth, clin)``, each transposed relative to the file
        layout. The methylation frame has its first column dropped after
        transposing, and the clinical frame keeps only the column found at
        position 6.
    """
    def read_transposed(filename):
        # The first column of every file becomes the row index before transposing.
        return pd.read_csv(base + filename, sep='\t', index_col=0).T

    gene = read_transposed('gene.txt')
    mirna = read_transposed('mirna.txt')
    meth = read_transposed('meth.txt')
    clin = read_transposed('clin.txt')

    # presumably position 6 of the clinical table is 'pathologic_stage' (the
    # only clinical column read elsewhere in this notebook) -- TODO confirm.
    return gene, mirna, meth.iloc[:, 1:], clin.iloc[:, [6]]

def log2(gene, mirna):
    """Apply a ``log2(x + 1)`` transform to both expression tables.

    Parameters
    ----------
    gene, mirna : array-like or pandas.DataFrame
        Values to transform; the ``+ 1`` keeps zero entries finite.

    Returns
    -------
    tuple
        ``(gene_log, mirna_log)`` in the same order as the arguments.
    """
    gene_log = np.log2(gene + 1)
    mirna_log = np.log2(mirna + 1)
    return gene_log, mirna_log

def process_nan_values(df, max_nan_fraction=0.05):
    """Drop columns with too many missing values and zero-fill the rest.

    A column is removed when its NaN count is strictly greater than
    ``len(df) * max_nan_fraction``. The NaN counts are computed for all
    columns at once, so no intermediate frame is copied per dropped column
    and duplicated column labels are handled position by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified.
    max_nan_fraction : float, optional
        Largest tolerated fraction of NaN rows per column (default 0.05).

    Returns
    -------
    pandas.DataFrame
        New frame containing only the kept columns, with remaining NaNs
        replaced by 0.
    """
    max_nan_values = len(df) * max_nan_fraction

    # One pass over the whole frame instead of one drop() per offending column.
    nan_counts = df.isnull().sum(axis=0)
    # Positional boolean mask, so repeated column labels cannot cause ambiguity.
    keep_mask = (nan_counts <= max_nan_values).to_numpy()
    processed_df = df.loc[:, keep_mask]

    print("Columns before nan processing: ", len(df.columns))
    print("Columns after nan processing: ", len(processed_df.columns))
    return processed_df.fillna(0)


# Currently the mean of the vial values is used as the sample value
def process_vial(df, idx):
    """Collapse vial-level rows into one row per sample by averaging.

    A barcode whose last dash-separated field is longer than two characters
    is taken to carry a trailing vial letter, which is stripped. Barcodes
    without a vial letter are kept unchanged, so inputs mixing both forms
    are supported. Rows that end up with the same label are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per entry of ``idx``. It is not modified.
    idx : sequence of str
        Barcode for each row of ``df``, in row order.

    Returns
    -------
    pandas.DataFrame
        Column-wise means per vial-less barcode; the index is named
        ``'index'``.
    """
    new_idx = [
        barcode[:-1] if len(barcode.split('-')[-1]) > 2 else barcode
        for barcode in idx
    ]

    # Group by an array of labels rather than by a helper column, so the
    # caller's frame is left untouched.
    group_keys = np.asarray(new_idx, dtype=object)
    return df.groupby(group_keys).mean().rename_axis('index')
    

def process_indexes(df):
    """Shorten the row labels of ``df`` to their first four barcode fields.

    The kept fields are Project-TSS-Participant-Sample, where the sample
    field may still carry a vial letter. If any shortened label has a last
    field longer than two characters, the frame is handed to
    ``process_vial``; otherwise the shortened labels are written onto
    ``df`` in place and ``df`` itself is returned.

    Raises
    ------
    Exception
        If two rows end up with the same shortened label.
    """
    short_barcodes = ['-'.join(label.split('-')[:4]) for label in df.index]

    # Shortening must not merge two distinct rows.
    if len(short_barcodes) != len(set(short_barcodes)):
        raise Exception("Indexes processing resulted in colliding indexes! Aborting...")

    # A last field longer than two characters means a vial letter is present.
    has_vial = any(len(code.split('-')[-1]) > 2 for code in short_barcodes)
    if has_vial:
        return process_vial(df, short_barcodes)

    df.index = short_barcodes
    return df
    
    

def get_classes(df, clin):
    """Assign every barcode in ``df.index`` to a class.

    The last dash-separated field of a barcode is read as a numeric sample
    type: values up to 9 are tumour samples, 10 to 19 are normal samples,
    and anything above is a control sample and is ignored. Tumour samples
    are classified by the patient's ``'pathologic_stage'`` entry in
    ``clin``, looked up by the lower-cased barcode without its last field.

    Tumour samples whose patient has no row in ``clin``, or whose stage is
    not one of the four expected values, are reported on stdout and left
    out of every list.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds the barcodes to classify.
    clin : pandas.DataFrame
        Clinical table indexed by lower-case patient id, with a
        ``'pathologic_stage'`` column.

    Returns
    -------
    tuple of list
        ``(normal, stage1, stage2, stage3, stage4)`` lists of barcodes.
    """
    normal = []
    stage1 = []
    stage2 = []
    stage3 = []
    stage4 = []
    stage_bins = {
        'stage i': stage1,
        'stage ii': stage2,
        'stage iii': stage3,
        'stage iv': stage4,
    }

    for barcode in df.index:
        splt = barcode.split('-')
        patient_id = '-'.join(splt[:-1]).lower()
        sample_type = int(splt[-1])

        if sample_type <= 9:  # Tumoral sample
            # A missing clinical record must not abort the whole run.
            if patient_id not in clin.index:
                print("No clinical record for patient: ", patient_id, "\nIgnoring...")
                continue

            stage = clin.loc[patient_id, 'pathologic_stage']
            # Missing stages come back as NaN, which never matches a key.
            target = stage_bins.get(stage) if isinstance(stage, str) else None
            if target is None:
                print("Unexpected stage ("+str(stage)+") for patient: ", patient_id, "\nIgnoring...")
            else:
                target.append(barcode)

        elif sample_type <= 19:  # Normal sample
            normal.append(barcode)

        # else: control sample -> ignore

    return normal, stage1, stage2, stage3, stage4


def save_splitted_df(base_path, df, normal, stage1, stage2, stage3, stage4):
    """Write one CSV per class containing the rows of ``df`` in that class.

    Parameters
    ----------
    base_path : str
        Prefix the file names are appended to verbatim.
    df : pandas.DataFrame
        Table whose index holds sample identifiers.
    normal, stage1, stage2, stage3, stage4 : list
        Sample identifiers belonging to each class.
    """
    targets = (
        ("normal.csv", normal),
        ("stage1.csv", stage1),
        ("stage2.csv", stage2),
        ("stage3.csv", stage3),
        ("stage4.csv", stage4),
    )
    for filename, members in targets:
        # Rows are selected by index membership; a file is written even if empty.
        in_class = df.index.isin(members)
        df[in_class].to_csv(base_path + filename)
    return


def save_processed_dfs(base_path, gene, mirna, meth):
    """Write the processed omics tables as CSV files under ``base_path``.

    The gene table is always written. The miRNA and methylation tables are
    written only when they are DataFrames; any other value (such as
    ``None``) is skipped.
    """
    gene.to_csv(base_path + "gene_proc.csv")

    optional_tables = (
        ("mirna_proc.csv", mirna),
        ("meth_proc.csv", meth),
    )
    for filename, table in optional_tables:
        if isinstance(table, pd.DataFrame):
            table.to_csv(base_path + filename)
    return
    

def save_classes(base_path, df, stages):
    """Write the sample-to-class table to ``stellargraph/classes.csv``.

    Parameters
    ----------
    base_path : str
        Prefix under which the ``stellargraph/`` directory lives; the
        directory is created if it does not exist yet.
    df : pandas.DataFrame
        Only samples present in ``df.index`` are written.
    stages : sequence of list
        Exactly five lists of sample ids, in the order
        ``[normal, stage1, stage2, stage3, stage4]``.

    Raises
    ------
    ValueError
        If ``stages`` does not contain exactly five lists.
    """
    class_names = ['normal', 'stage1', 'stage2', 'stage3', 'stage4']
    if len(stages) != len(class_names):
        raise ValueError(
            "Expected %d class lists (normal, stage1..stage4), got %d"
            % (len(class_names), len(stages))
        )

    # Build ids and labels together so they cannot get out of step.
    ids = []
    labels = []
    for class_name, members in zip(class_names, stages):
        ids.extend(members)
        labels.extend([class_name] * len(members))

    classes = pd.DataFrame({'id': ids, 'class': labels})
    classes = classes.loc[classes['id'].isin(df.index)]

    # to_csv does not create missing directories, so do it explicitly.
    out_dir = base_path + "stellargraph/"
    os.makedirs(out_dir, exist_ok=True)
    classes.to_csv(out_dir + "classes.csv", index=False)
    return
    
    

# dataframes: [gene, mirna, meth]
# stages: [normal, stage1, stage2, stage3, stage4]
# only_common: if the dataframes to be saved are supposed to have only common samples
def save_dfs(base_path, dataframes, stages, only_common=True):
    """Filter the omics tables to labelled samples and save all outputs.

    Parameters
    ----------
    base_path : str
        Prefix under which every output file is written.
    dataframes : list of pandas.DataFrame
        Either ``[gene, mirna, meth]`` or ``[gene]`` (no integration).
    stages : list of list
        ``[normal, stage1, stage2, stage3, stage4]`` sample-id lists.
    only_common : bool, optional
        When True, keep only samples present in every supplied table and
        print the per-class counts. The kept samples follow the row order
        of the gene table, so the saved files are identical between runs.
    """
    normal, stage1, stage2, stage3, stage4 = stages

    # Eliminate samples without label
    all_possible_indexes = [item for sublist in stages for item in sublist]

    no_integration = len(dataframes) == 1
    if no_integration:
        print("No omics integration will be considered!\n")
        gene = dataframes[0]
        gene = gene[gene.index.isin(all_possible_indexes)]
        mirna = None
        meth = None
    else:
        gene, mirna, meth = dataframes
        gene = gene[gene.index.isin(all_possible_indexes)]
        mirna = mirna[mirna.index.isin(all_possible_indexes)]
        meth = meth[meth.index.isin(all_possible_indexes)]

    if only_common:
        if no_integration:
            common_samples = list(gene.index)
        else:
            shared_elsewhere = set(mirna.index) & set(meth.index)
            # Walk the gene index instead of a set so the order is stable.
            common_samples = [s for s in gene.index if s in shared_elsewhere]
            gene = gene.loc[common_samples, :]
            mirna = mirna.loc[common_samples, :]
            meth = meth.loc[common_samples, :]

        cs = set(common_samples)

        print("Number of common samples (inter-omics) by class:")
        print("normal: ", len(cs&set(normal)))
        print("stage1: ", len(cs&set(stage1)))
        print("stage2: ", len(cs&set(stage2)))
        print("stage3: ", len(cs&set(stage3)))
        print("stage4: ", len(cs&set(stage4)))
        print("\nTotal samples: ", len(cs))

    # to_csv does not create missing directories, so make sure this one exists.
    split_dir = base_path + "split_class/"
    os.makedirs(split_dir, exist_ok=True)

    save_classes(base_path, gene, stages)
    save_splitted_df(split_dir, gene, normal, stage1, stage2, stage3, stage4)
    save_processed_dfs(base_path, gene, mirna, meth)
    return

In [3]:
# Load the gene, miRNA, methylation and clinical tables (load_data returns them transposed).
gene, mirna, meth, clin = load_data(base)

  if (await self.run_code(code, result,  async_=asy)):


In [4]:
# Log-transform the two expression tables with log2(x + 1); methylation is left as is.
# NOTE(review): this cell rebinds gene/mirna/meth, so running it twice applies
# the log transform twice -- re-run the loading cell first.
gene, mirna = log2(gene, mirna)

# For each omics table: drop columns with too many NaNs, zero-fill the rest,
# then shorten the barcodes used as row labels.
print("Gene expression")
gene = process_indexes(process_nan_values(gene))
print("\nmiRNA expression")
mirna = process_indexes(process_nan_values(mirna))
print("\nMethylation")
meth = process_indexes(process_nan_values(meth))

print()
# Class membership is derived from the gene-expression barcodes and the clinical table.
normal, stage1, stage2, stage3, stage4 = get_classes(gene, clin)

Gene expression
Columns before nan processing:  20531
Columns after nan processing:  20531

miRNA expression
Columns before nan processing:  2588
Columns after nan processing:  341

Methylation
Columns before nan processing:  20116
Columns after nan processing:  20116

Unexpected stage (nan) for patient:  tcga-bp-4798 
Ignoring...
Unexpected stage (nan) for patient:  tcga-mm-a563 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jv 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jw 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jy 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9jz 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k0 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k2 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k3 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k4 
Ignoring...
Unexpected stage (nan) for patient:  tcga-5p-a9k6 
Ignoring...
Unexpected stage (nan) for patient:  

In [5]:
# Report the size of every class, followed by the overall number of labelled samples.
for caption, members in (
    ("Normal samples: ", normal),
    ("Stage 1 samples: ", stage1),
    ("Stage 2 samples: ", stage2),
    ("Stage 3 samples: ", stage3),
    ("Stage 4 samples: ", stage4),
):
    print(caption, len(members))

print("\nTotal samples: ", sum(len(group) for group in (normal, stage1, stage2, stage3, stage4)))

Normal samples:  129
Stage 1 samples:  461
Stage 2 samples:  104
Stage 3 samples:  189
Stage 4 samples:  105

Total samples:  988


In [10]:
# Output prefix for the processed tables; here it is the same directory the inputs were read from.
# NOTE(review): absolute, machine-specific path repeated from the loading cell.
base_path = "/home/colombelli/Documents/datasets/pankidney firebrowse/"
dataframes=[gene,mirna,meth]  #for no integration = [gene]
# Order matters: save_dfs unpacks this list as normal, stage1, stage2, stage3, stage4.
stages=[normal, stage1, stage2, stage3, stage4]
save_dfs(base_path, dataframes, stages, only_common=True)

Number of common samples (inter-omics) by class:
normal:  47
stage1:  284
stage2:  65
stage3:  98
stage4:  53

Total samples:  547


# Second processing part

### Features definition by IQR selection 

In [7]:
def select_k_highest_iqr(k, df):
    """Return the ``k`` columns of ``df`` with the largest interquartile range.

    Parameters
    ----------
    k : int
        Number of columns to keep.
    df : pandas.DataFrame
        Table whose columns are ranked.

    Returns
    -------
    pandas.DataFrame
        The selected columns, ordered from largest to smallest IQR.
    """
    spreads = np.array([iqr(df[name]) for name in df.columns])
    # Sorting the negated values ascending ranks the columns by descending IQR.
    ranking = (-spreads).argsort()
    return df.iloc[:, ranking[:k]]


# k_iqrs: [gene_expr, mirna_expr, meth_expr]
def build_iqr_features_df(base_path, k_iqrs, integration=True, drop_samples_with_missing_features=True):
    """Build and save the feature table from the highest-IQR columns.

    Parameters
    ----------
    base_path : str
        Prefix of the ``*_proc.csv`` inputs and of every output file.
    k_iqrs : sequence of int
        Number of columns to keep per table, in the order gene, miRNA,
        methylation.
    integration : bool, optional
        When True the miRNA and methylation tables are read as well;
        otherwise only the gene table is used.
    drop_samples_with_missing_features : bool, optional
        When True, rows containing any NaN after concatenation are dropped.

    Returns
    -------
    pandas.DataFrame
        The concatenated feature table, as written to
        ``stellargraph/features.csv``.
    """
    filenames = ["gene_proc.csv"]
    if integration:
        filenames += ["mirna_proc.csv", "meth_proc.csv"]

    tables = [pd.read_csv(base_path + filename, index_col=0) for filename in filenames]
    selected = [
        select_k_highest_iqr(k_iqrs[position], table)
        for position, table in enumerate(tables)
    ]

    # The selected gene columns are also saved on their own, transposed.
    selected[0].T.to_csv(base_path + "gene_iqr.csv")

    features_df = pd.concat(selected, axis=1)
    if drop_samples_with_missing_features:
        features_df = features_df.dropna(axis=0)

    features_df.to_csv(base_path + "stellargraph/features.csv")
    return features_df

In [8]:
# Number of highest-IQR columns kept from each omics table.
gene_k_iqr = 500
mirna_k_iqr = 100
meth_k_iqr = 100

# NOTE(review): absolute, machine-specific path repeated from earlier cells.
base_path="/home/colombelli/Documents/datasets/pankidney firebrowse/"
integration=True #for no integration = False
drop_missing_features=True
# The list order must be gene, miRNA, methylation: build_iqr_features_df indexes it by table position.
fdf = build_iqr_features_df(base_path, [gene_k_iqr, mirna_k_iqr, meth_k_iqr], integration, drop_missing_features)

# At this point, the `coexp_net_build.r` script must be executed before continuing

# Third processing part

### Transform correlation data into loadable stellargraph data

In [4]:
def corr_to_edges(base_path, networks):
    """Convert each network's ``corr.txt`` into an ``edges.csv`` file.

    For every name in ``networks`` the space-separated, header-less file
    ``<base_path><name>/corr.txt`` is read as three columns (source, target,
    weight). Dots in the source and target ids are replaced by dashes, and
    the result is written next to the input as ``edges.csv``.

    Parameters
    ----------
    base_path : str
        Prefix the network directory names are appended to verbatim.
    networks : iterable of str
        Names of the network sub-directories to process.
    """
    for net in networks:
        net_dir = base_path + net + "/"
        edges = pd.read_csv(net_dir + "corr.txt", sep=" ", header=None)
        edges.columns = ["source", "target", "weight"]

        # Both endpoint columns get the same '.' -> '-' id rewrite.
        for endpoint in ("source", "target"):
            edges[endpoint] = [node.replace('.', '-') for node in edges[endpoint].values]

        edges.to_csv(net_dir + "edges.csv", index=False)

In [5]:
# NOTE(review): base_path is rebound here to the stellargraph sub-directory;
# earlier cells expect it to be the dataset root, so re-running them after
# this cell would write to the wrong place.
base_path = "/home/colombelli/Documents/datasets/pankidney firebrowse/stellargraph/"
# One sub-directory per network; corr_to_edges reads corr.txt from each of them.
networks = ["N1", "N2", "N3", "N4", "N5"]
corr_to_edges(base_path, networks)