### Erase previous outputs from command prompt
1. Go to file's directory
2. "pip install nbstripout"
3. "nbstripout mynotebook.ipynb"

In [None]:
import pandas as pd
import numpy as np
import math

## 1. Preprocess datasets

In [None]:
# Edge symbols used in the sub-path strings, keyed by the relation type they denote
relations_dict = dict(Activation='-->', Inhibition='--|')
#relations_dict

### 1.1. GSE2034

In [None]:
# Load the raw GSE2034 table (tab-separated, stored inside a zip archive)
gse2034_raw = pd.read_csv(
    'Data/GSE2034.zip',
    sep='\t',
    header=0,
    quotechar='"',
    compression='zip',
)
gse2034 = gse2034_raw.copy()

# The 'Class' column is split on '#' into a gene part and a KEGG-ID part
gene_and_id = gse2034['Class'].str.split('#', expand=True)
gse2034[['Gene', 'KEGG-ID']] = gene_and_id
gse2034 = gse2034.drop(columns='Class')

# Move the two columns that were just appended to the front of the table
column_order = gse2034.columns.tolist()
gse2034 = gse2034[column_order[-2:] + column_order[:-2]]

#gse2034

In [None]:
# Change column names: every sample column is renamed to its class label
# ('ERpos' or 'ERneg'); the first two columns and any column with another
# prefix keep their names.
# A new column list is assigned instead of writing into `Index.values`:
# the Index buffer is meant to be immutable (and is read-only under pandas
# Copy-on-Write), so in-place edits either fail or leave stale lookup caches.
renamed_columns = list(gse2034.columns[:2])
for column_name in gse2034.columns[2:]:
    if column_name.startswith('ERpos'):
        renamed_columns.append('ERpos')
    elif column_name.startswith('ERneg'):
        renamed_columns.append('ERneg')
    else:
        renamed_columns.append(column_name)
gse2034.columns = renamed_columns

# Class label of every sample column, in column order
labels = gse2034.columns[2:]

In [None]:
# Collect the distinct genes of the GSE2034 dataset (sorted) and map each gene
# to the list of KEGG-IDs found on its rows.
# A gene can be mapped to more than one identifier, hence the list values.
gene_list = sorted(set(gse2034['Gene'].tolist()))
gene_dict = {
    gene: gse2034.loc[gse2034['Gene'] == gene, 'KEGG-ID'].tolist()
    for gene in gene_list
}
#print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [None]:
# Reshape the table so that rows are samples (labelled by class: ERpos or ERneg)
# and columns are genes
genes = gse2034['Gene']
expression_only = gse2034.drop(columns='KEGG-ID').iloc[:, 1:]  # drop 'KEGG-ID', then skip the leading 'Gene' column
gse2034 = expression_only.T
gse2034.columns = genes.values.tolist()
#gse2034.columns

In [None]:
# Because one gene might correspond to more than one KEGG-ID, the table can
# hold several columns with the same gene label. Average them (mean) so that
# each gene appears exactly once.
# `groupby(..., axis=1)` is deprecated in recent pandas, so the frame is
# transposed, grouped on its index, and transposed back.
gse2034 = gse2034.T.groupby(level=0).mean().T
#gse2034

### 1.2. Selected

In [None]:
# Load the selected sub-paths table and keep only its 'SubPathID' column
selected_table = pd.read_csv(
    'Data/Selected.zip',
    sep='\t',
    header=0,
    quotechar='"',
    compression='zip',
)
selected_raw = selected_table[['SubPathID']]
#selected_raw

## 2. Discretization process
Transform gene expression values into high (expressed / up-regulated) or low (not-expressed / down-regulated) gene expression binary equivalents

### 2.1. The expression values of a gene over the total number of input samples are sorted in descending order;

In [None]:
# Per-gene value: sum of the gene's expression values / total number of samples,
# sorted from highest to lowest
mean_expression = gse2034.sum() / gse2034.shape[0]
genes_dis = mean_expression.to_frame(name='Expression value')
genes_dis = genes_dis.sort_values(by=['Expression value'], ascending=False)
#genes_dis

### 2.2. The midpoints between each two consecutive values are calculated;

In [None]:
def midpoint(num1, num2):
    """Return the value halfway between num1 and num2."""
    total = num1 + num2
    return total / 2

# Midpoint of every pair of consecutive rows of genes_dis.
# Key i is the position of the first gene of the pair; the second gene is at i + 1.
expression_values = genes_dis['Expression value']
midpoints_dict = {
    i: midpoint(expression_values.iloc[i], expression_values.iloc[i + 1])
    for i in range(genes_dis.shape[0] - 1)
}

#midpoints_dict

### 2.3. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S,μi) denote the IG of the system for midpoint μi.

In [None]:
samples = gse2034.index  # class label of every sample (one entry per row of gse2034)
classes = sorted(set(samples))  # the distinct classes to which a sample may belong

# Calculate the proportion of samples in S that belong in Class C
def P(C, S):
    """Return the fraction of the entries of S that are equal to C.

    Raises ZeroDivisionError when S is empty.
    """
    return list(S).count(C) / len(S)


def E(S, m=1):
    """Return the entropy of the class labels in S, divided by m.

    The sum runs over the module-level `classes` list and uses the natural
    logarithm. With the default m=1 this is the plain entropy of S.

    A class that does not occur in S contributes nothing to the sum
    (the usual 0 * log(0) = 0 convention); without this guard math.log(0)
    raises ValueError.

    NOTE(review): m only rescales the entropy here; S is not partitioned
    around a midpoint. Confirm this is the intended "entropy given midpoint".
    """
    total = 0
    for c in classes:
        proportion = P(c, S)  # evaluated once per class
        if proportion > 0:
            total += proportion * math.log(proportion) / m
    return -total

# Entropy of the class labels over all samples; E uses math.log, so the value is in natural-log units
entropy=E(samples)
#print('Dataset Entropy: %.3f bits' % entropy)

In [None]:
# Information Gain (IG) of the system for a midpoint m
def IG(S, m):
    """Return E(S) - E(S, m) for the sample labels S and midpoint m.

    NOTE(review): because E(S, m) is E(S) scaled by 1/m, this value depends
    on m only through that scaling; confirm this matches the intended IG.
    """
    prior_entropy = E(S)
    midpoint_entropy = E(S, m)
    return prior_entropy - midpoint_entropy

# IG of every midpoint, in the order the midpoints were stored
information_gain = [IG(samples, m) for m in midpoints_dict.values()]
#print('Information Gain: '+str(information_gain))

In [None]:
# The discretization point is the midpoint with the highest information gain
# (the first such midpoint when several share the maximum)
max_value = max(information_gain)
max_mid_pos = information_gain.index(max_value)
dis_point = midpoints_dict.get(max_mid_pos)
#print('Discretization point: %.3f' %dis_point)

### 2.4. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values greater than or equal to the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [None]:
# Binarize the expression matrix around the discretization point:
#   value <  dis_point -> 0 (under-expressed)
#   value >= dis_point -> 1 (over-expressed)
# Both outcomes come from one comparison on the ORIGINAL values. Zeroing the
# low values first and then testing `>= dis_point` on the modified frame would
# turn those zeros into ones whenever dis_point <= 0.
# NOTE(review): a single global dis_point is applied to every gene here.
gse2034_dis = (gse2034 >= dis_point).astype('int')
#gse2034_dis

## 3. Matching sub-paths with gene expression profiles

In [None]:
from itertools import chain

# Make the selected dataset easier to understand, by splitting each pathway based on their nodes and edges
def _split_keeping_separator(text, separator):
    """Split text on separator, keeping every separator as its own token and dropping empty pieces."""
    tokens = []
    for position, piece in enumerate(text.split(separator)):
        if position:
            # One separator sat between the previous piece and this one
            tokens.append(separator)
        if piece:
            tokens.append(piece)
    return tokens


def calc_pathway(data):
    """Tokenize a sub-path string into its nodes and relation symbols.

    data is split on the 'Activation' and 'Inhibition' symbols taken from
    relations_dict. The returned flat list holds the node strings and the
    relation symbols in the order they appear in data, e.g.
    'A-->B--|C' -> ['A', '-->', 'B', '--|', 'C'].

    Each relation symbol is emitted at the position where it occurs, so an
    empty piece (leading or repeated symbol) no longer shifts the symbols
    that follow it.
    """
    activation = relations_dict['Activation']
    inhibition = relations_dict['Inhibition']

    tokens = []
    for chunk in _split_keeping_separator(data, activation):
        if chunk == activation:
            tokens.append(chunk)
        else:
            # A node chunk may still contain inhibition relations
            tokens.extend(_split_keeping_separator(chunk, inhibition))
    return tokens

# Tokenize every sub-path string of the selected dataset
data = [calc_pathway(row) for row in selected_raw['SubPathID']]

#data

In [None]:
# Tabulate the tokenized pathways. Rows: pathways, Cols: nodes and edges;
# missing trailing cells of shorter pathways are NaN
selected = pd.DataFrame(data).fillna(value=np.nan)

# Label the rows P0, P1, ...
selected.index = ['P%d' % n for n in range(selected.shape[0])]

#selected

In [None]:
# Build, for every pathway row, the list of per-node binary expression profiles.
# expr_prof_tmp maps row position -> list with one array per node, where each
# array holds one value per sample (row of gse2034_dis).
# NOTE(review): `statistics` and `row_tmp` are not used anywhere in this cell.
import statistics

gene_expression_profiles=selected.copy()
expr_prof_tmp={}
for row in range(gene_expression_profiles.shape[0]):
    row_tmp=[]
    # Keep only the filled cells of this pathway (drops the NaN padding)
    path_tmp=gene_expression_profiles.iloc[row][~gene_expression_profiles.iloc[row].isnull()]
    expr_prof_tmp[row]=[]
    
    # Every second cell, starting at the first one, is a node; the cells in between are relations
    for i in path_tmp[::2]:
        # A node is a space-separated list of entries; each entry is split on '#'
        node_genes=[(g.split('#')) for g in list(filter(None,i.split(' ')))] # Get genes of node
        node_genes=list(filter(None, node_genes))
        tmp_expr_vals=[]
        
        for n in node_genes:
            # Check if gene exists in gse2034 dataset or KEGG-ID with noProbe corresponds to specific gene from gse2034 dataset
            # NOTE(review): gene_dict maps each gene to a LIST of KEGG-IDs, so
            # `n[1] in gene_dict.values()` compares a string with lists and can
            # never be true; every 'noProbe' entry therefore gets an all-zero profile.
            if(n[0]=='noProbe'):
                if(not(n[1] in gene_dict.values())):
                    tmp_expr_vals.append((np.zeros(gse2034_dis.shape[0])).astype(int))
                    continue
            # Entry without a '#': n is a one-element list, which matches only a
            # gene whose KEGG-ID list is exactly that single element.
            # NOTE(review): confirm whether such an entry is meant to be a gene name or a KEGG-ID.
            if(len(n)==1):
                if(not(n in gene_dict.values())):
                    tmp_expr_vals.append((np.zeros(gse2034_dis.shape[0])).astype(int))
                    continue
            # Column lookup by the first part of the entry
            # NOTE(review): raises KeyError when that name is not a column of gse2034_dis.
            tmp_expr_vals.append(list(gse2034_dis[n[0]]))
    
        # Combine the genes of the node: per sample, take the maximum over its genes
        if(len(tmp_expr_vals)>0):
            expr_prof_tmp[row].append((np.transpose(tmp_expr_vals)).max(axis=1))
              
#expr_prof_tmp

In [None]:
# Replace genes in pathway with their samples (max value in case of multiple genes in node)
# The row count comes from gene_expression_profiles itself, which is the frame
# being filled (the previously used `tmp_selected` is not defined anywhere).
for row in range(gene_expression_profiles.shape[0]):
    # Number of filled (non-NaN) cells of this pathway: nodes and relations together
    path_length = gene_expression_profiles.iloc[row].notnull().sum()
    for column in range(0, path_length, 2):
        # Nodes sit in the even columns 0, 2, 4, ... while expr_prof_tmp[row]
        # stores them consecutively, so column c corresponds to position c // 2
        gene_expression_profiles.iat[row, column] = expr_prof_tmp[row][column // 2]
        
#gene_expression_profiles

In [None]:
def and_boolean_op(num1, num2):
    """Return the element-wise product of num1 and num2 as a list.

    The result has one entry per element of num1; num2 must be at least as long.
    """
    return [num1[k] * num2[k] for k in range(len(num1))]
    
def xor_boolean_op(num1, num2):
    """Return the element-wise exclusive-or of num1 and num2 as a list of 0/1.

    An entry is 1 when exactly one of the two corresponding values is truthy.
    The result has one entry per element of num1.
    """
    return [int(bool(num1[k]) != bool(num2[k])) for k in range(len(num1))]

# Boolean operation applied for each relation type
operations_dict = dict(Activation=and_boolean_op, Inhibition=xor_boolean_op)

In [None]:
# Calculate the pathway expression with boolean operations
def calc_pathway_expression(path,prev_result):
    """Fold a pathway into a single expression profile.

    path is a pandas Series of alternating entries: a relation symbol (a value
    of relations_dict) followed by the profile of the next node. Starting from
    prev_result, each (relation, node) pair is applied in order with the
    matching function from operations_dict, and the final accumulated profile
    is returned. An empty path returns prev_result unchanged.

    The accumulated value is carried through every step; the earlier recursive
    form discarded the result of the recursive call and therefore always
    returned its initial prev_result.

    Raises ValueError for a symbol that is not in relations_dict and
    IndexError when a relation has no node after it.
    """
    # Reverse lookup symbol -> relation name (first name wins on duplicates)
    symbol_to_relation = {}
    for name, symbol in relations_dict.items():
        symbol_to_relation.setdefault(symbol, name)

    result = prev_result
    for pos in range(0, len(path), 2):
        symbol = path.iloc[pos]
        if symbol not in symbol_to_relation:
            raise ValueError('%r is not a known relation symbol' % (symbol,))
        next_node = path.iloc[pos + 1]
        result = operations_dict[symbol_to_relation[symbol]](result, next_node)
    return result

# Evaluate every pathway: start from the profile of its first node and fold in
# the remaining (relation, node) pairs.
# No lookup is done before the loop, so an empty matrix simply yields no results.
results = []
for row in range(gene_expression_profiles.shape[0]):
    profile = gene_expression_profiles.iloc[row]
    tmp_path = profile[profile.notnull()]  # drop the NaN padding of shorter pathways
    remaining = tmp_path.iloc[1:].reset_index(drop=True)
    results.append(calc_pathway_expression(remaining, tmp_path.iloc[0]))

In [None]:
# Binary sub-path expression matrix: one row per sub-path, one column per entry of labels
subpath_ids = list(gene_expression_profiles.index)
funct_subpaths = pd.DataFrame(results, index=subpath_ids, columns=labels)
#funct_subpaths