In [1]:
import pandas as pd
import numpy as np
import math

# 1. Data pre-processing

## 1.1. GSE2034 
Raw data are preprocessed using the RMA (Robust Multi-array Average) method. Two criteria are used to create
the lists of differentially expressed genes (DEGs): simple fold change and p-value (calculated using a simple t-test).

In [2]:
# Preprocess data from file and return new DataFrame and dictionary of genes with their corresponding KEGG-IDs
def data_preprocessing(file):
    """Load a zipped, tab-separated expression table and reshape it to samples x genes.

    The table must contain a ``Class`` column whose entries have the form
    ``Gene#KEGG-ID``; every other column is treated as one sample. Sample
    columns whose header starts with ``ERpos`` or ``ERneg`` are relabelled to
    exactly that prefix (estrogen-receptor status).

    Parameters
    ----------
    file : str or path-like
        Path of the zip archive, read with ``pd.read_csv(..., compression='zip')``.

    Returns
    -------
    new_df : pandas.DataFrame
        One row per sample (index = sample label) and one column per gene,
        columns sorted by gene name. Input rows that share a gene name (a gene
        can be mapped to more than one KEGG-ID) are averaged.
    gene_dict : dict
        Gene name -> list of its KEGG-IDs in file order; keys are sorted.

    Raises
    ------
    ValueError
        If splitting ``Class`` on ``#`` does not yield exactly two parts.
    """
    raw_df = pd.read_csv(file, compression='zip', header=0, sep='\t', quotechar='"')

    # Split "Gene#KEGG-ID" into two columns; naming them raises on any other shape.
    gene_info = raw_df['Class'].str.split('#', expand=True)
    gene_info.columns = ['Gene', 'KEGG-ID']

    expression = raw_df.drop(columns='Class')

    # read_csv de-duplicates repeated headers (ERpos, ERpos.1, ...), so collapse
    # every sample header back to its class prefix. The labels are built as a new
    # list instead of writing into Index.values, which depends on a shared buffer
    # and fails when that array is read-only.
    sample_labels = []
    for column in expression.columns:
        if column.startswith('ERpos'):
            sample_labels.append('ERpos')
        elif column.startswith('ERneg'):
            sample_labels.append('ERneg')
        else:
            sample_labels.append(column)

    # Gene -> KEGG-IDs lookup, built in a single pass over the rows.
    ids_by_gene = {}
    for gene, kegg_id in zip(gene_info['Gene'], gene_info['KEGG-ID']):
        ids_by_gene.setdefault(gene, []).append(kegg_id)
    gene_dict = {gene: ids_by_gene[gene] for gene in sorted(ids_by_gene)}

    # Average the rows that belong to the same gene, then transpose so that rows
    # are samples and columns are genes. Grouping on the row index avoids the
    # deprecated groupby(axis=1).
    expression.index = gene_info['Gene'].to_numpy()
    new_df = expression.groupby(level=0).mean().T
    new_df.index = sample_labels

    return new_df, gene_dict

In [3]:
# Load GSE2034: expression matrix (rows = samples, columns = genes) plus the gene -> KEGG-IDs lookup
gse2034_df,gene_dict=data_preprocessing('Data/GSE2034.zip')
#gse2034_df

In [9]:
from BaselineRemoval import BaselineRemoval
from sklearn.linear_model import LinearRegression

# RMA is an algorithm used to create an expression matrix from Affymetrix data. The raw intensity values are background 
# corrected, log2 transformed and then quantile normalized. Next a linear model is fit to the normalized data to obtain an 
# expression measure for each probe set on each array.

# https://cmdlinetips.com/2020/06/computing-quantile-normalization-in-python/
def quantile_norm(df):
    """Quantile-normalise the columns of ``df`` and return the result transposed.

    Each column is mapped onto a common reference distribution: a value holding
    rank ``k`` within its column is replaced by the mean of the ``k``-th smallest
    values of all columns. Tied values share the lowest rank (``method="min"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table whose columns are normalised against each other. Column
        labels may repeat. The frame is not modified.
        NOTE(review): the stack/unstack round trip presumably needs a unique
        row index -- confirm before calling with duplicated row labels.

    Returns
    -------
    pandas.DataFrame
        The normalised values, transposed: the rows carry the original column
        labels and the columns carry the original row index.
    """
    original_columns = df.columns

    # Rank on a copy with positional column labels 1..n, so that repeated labels
    # cannot collide in stack/unstack and the caller's frame keeps its labels.
    relabelled = df.copy()
    relabelled.columns = np.arange(1, relabelled.shape[1] + 1)

    # 1. Sort the values of every column independently.
    df_sorted = pd.DataFrame(np.sort(df.values, axis=0), index=df.index, columns=original_columns)

    # 2. Mean of each sorted row = reference value of that rank (ranked from low to high).
    rank_means = df_sorted.mean(axis=1)
    rank_means.index = np.arange(1, len(rank_means) + 1)

    # 3. Put the reference values back in the original order of every column.
    df_qn = relabelled.rank(method="min").stack().astype(int).map(rank_means).unstack()
    df_qn.columns = original_columns

    return df_qn.transpose()

def rma_discretization(df):
    """Run the three RMA-style steps on ``df`` and return the normalised table.

    Steps, applied in order:

    1. Background correction: every row of ``df`` is passed to
       ``BaselineRemoval(...).ZhangFit()``.
    2. Signed log2 transform: ``sign(x) * log2(|x| + 1)``, which keeps the sign
       of negative corrected values and is defined at zero.
    3. Quantile normalisation via ``quantile_norm`` on the transposed table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are corrected one at a time.

    Returns
    -------
    pandas.DataFrame
        Whatever ``quantile_norm`` returns for the transposed, log-transformed
        table.
    """
    # 1. Background correction, one row at a time
    corrected_rows = [
        BaselineRemoval(df.iloc[position]).ZhangFit()
        for position in range(df.shape[0])
    ]

    # 2. Signed log2 transformation
    magnitudes = np.array([abs(profile) for profile in corrected_rows])
    signed_log = np.sign(corrected_rows) * np.log2(magnitudes + 1)
    log2_df = pd.DataFrame(signed_log, columns=df.columns, index=df.index)

    # 3. Quantile normalization
    normalised = quantile_norm(log2_df.transpose())
    return normalised
    
# Apply the RMA-style pipeline (baseline correction, signed log2, quantile normalisation) to the GSE2034 matrix
gse2034_dis_df=rma_discretization(gse2034_df)
#gse2034_dis_df

In [10]:
# DEG lists: simple fold change and p-value

## 1.2. Selected sub-paths
The pathways retrieved from the KEGG database are represented in the form of a graph.

In [11]:
# The two types of relation between nodes, keyed by name; the values are the
# textual markers that separate nodes inside a sub-path string
relations_dict={'Activation':'-->','Inhibition':'--|'}
#relations_dict

In [12]:
from itertools import chain

# Make the selected dataset easier to understand, by splitting each pathway based on their nodes and edges
def get_pathway(data):
    """Tokenise a sub-path string into a flat list of nodes and relation markers.

    The string is first cut at every Activation marker and each resulting piece
    is then cut at every Inhibition marker (both markers come from
    ``relations_dict``). The markers are kept as separate list items, e.g.
    ``'A-->B--|C'`` becomes ``['A', '-->', 'B', '--|', 'C']``.

    Empty pieces (produced by a leading, trailing or doubled marker) are
    dropped. In that case a marker is also emitted after the last non-empty
    piece, so ``'A-->'`` and ``'-->A'`` both give ``['A', '-->']``.

    Parameters
    ----------
    data : str
        Sub-path description.

    Returns
    -------
    list of str
        Nodes and relation markers in order of appearance.
    """
    def split_keeping_marker(text, marker):
        # Cut text at marker and re-insert the marker after each non-empty piece.
        pieces = text.split(marker)
        non_empty = [piece for piece in pieces if piece]
        tokens = []
        for piece in non_empty:
            tokens.extend((piece, marker))
        # The marker after the final piece is removed only when no piece was empty.
        if tokens and len(non_empty) == len(pieces):
            tokens.pop()
        return tokens

    flattened = []
    for chunk in split_keeping_marker(data, relations_dict['Activation']):
        # Activation markers pass through unchanged: they contain no Inhibition marker.
        flattened.extend(split_keeping_marker(chunk, relations_dict['Inhibition']))
    return flattened

In [13]:
def subpaths_preprocessing(file):
    """Read the selected sub-paths and tokenise each one into a table row.

    Parameters
    ----------
    file : str or path-like
        Zipped, tab-separated file with a ``SubPathID`` column.

    Returns
    -------
    pandas.DataFrame
        One row per sub-path; the columns hold the items returned by
        ``get_pathway`` in order. Shorter sub-paths are padded with NaN.
    """
    subpath_ids = pd.read_csv(file, compression='zip', header=0, sep='\t', quotechar='"')['SubPathID']

    tokenised = [get_pathway(subpath) for subpath in subpath_ids]

    # Rows: pathways, Cols: edges and nodes; missing trailing cells become NaN
    return pd.DataFrame(tokenised).fillna(value=np.nan)

# Tokenise every selected sub-path into one row of nodes and relation markers
selected_df=subpaths_preprocessing('Data/Selected.zip')
#selected_df

In [14]:
import statistics

def _node_expression_profile(node, expression_df, kegg_to_gene):
    """Return the per-sample expression of one pathway node, or None for an empty node.

    ``node`` is a space-separated list of gene tokens, normally ``Gene#KEGG-ID``.
    Each token is resolved to a column of ``expression_df``:

    * by gene name, unless the name is the ``noProbe`` placeholder;
    * otherwise by KEGG-ID through ``kegg_to_gene`` (a token without ``#`` is
      tried both as a gene name and as a KEGG-ID);
    * tokens that cannot be resolved contribute an all-zero profile.

    The node profile is the element-wise maximum over its genes.
    """
    n_samples = expression_df.shape[0]
    gene_profiles = []

    for token in filter(None, node.split(' ')):
        parts = token.split('#')
        gene_name = parts[0]
        kegg_id = parts[1] if len(parts) > 1 else parts[0]

        if gene_name != 'noProbe' and gene_name in expression_df.columns:
            column = gene_name
        else:
            column = kegg_to_gene.get(kegg_id)

        if column is None:
            # Gene not measured in the dataset: zero expression for every sample
            gene_profiles.append(np.zeros(n_samples).astype(int))
        else:
            gene_profiles.append(expression_df[column].to_numpy())

    if not gene_profiles:
        return None
    return np.max(np.array(gene_profiles), axis=0)


gene_expression_profiles_df = selected_df.copy()

# Reverse lookup KEGG-ID -> gene. gene_dict maps each gene to a *list* of IDs, so
# testing `kegg_id in gene_dict.values()` compares a string with whole lists and
# never matches; the IDs have to be flattened first. If one ID is listed under
# several genes, the first gene in gene_dict order is kept. A 'noProbe' entry is
# skipped so the placeholder never resolves to itself.
kegg_to_gene = {}
for gene, kegg_ids in gene_dict.items():
    if gene == 'noProbe':
        continue
    for kegg_id in kegg_ids:
        kegg_to_gene.setdefault(kegg_id, gene)

# expr_prof_tmp[row] is the list of node profiles (one array of per-sample
# values per node) of the sub-path stored in that row of selected_df.
expr_prof_tmp = {}
for row in range(gene_expression_profiles_df.shape[0]):
    path_tmp = gene_expression_profiles_df.iloc[row].dropna()

    # Items alternate node, relation, node, ...: every second item is a node
    node_profiles = (
        _node_expression_profile(node, gse2034_dis_df, kegg_to_gene)
        for node in path_tmp.iloc[::2]
    )
    expr_prof_tmp[row] = [profile for profile in node_profiles if profile is not None]
              
#expr_prof_tmp