In [None]:
import pandas as pd
import numpy as np
import math

# 1. Read data and make them easier to understand

## 1.1. GSE2034

In [None]:
# Load the GSE2034 (breast cancer) table: zipped, tab-separated, first row is the header.
raw_gse2034_df = pd.read_csv(
    'Data/GSE2034.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)

# Preprocess: 'Class' is split on '#' into a 'Gene' and a 'KEGG-ID' column,
# then dropped, and the two new (trailing) columns are moved to the front.
gene_and_kegg = raw_gse2034_df['Class'].str.split('#', expand=True)
gse2034_df = raw_gse2034_df.copy()
gse2034_df[['Gene', 'KEGG-ID']] = gene_and_kegg
gse2034_df = gse2034_df.drop('Class', axis=1)
ordered = list(gse2034_df.columns)
cols = ordered[-2:] + ordered[:-2]
gse2034_df = gse2034_df[cols]

#gse2034_df

In [None]:
# Change column names (estrogen receptor): every sample column (all columns after
# the first two) whose name starts with 'ERpos' / 'ERneg' is collapsed to exactly
# that class label; any other name is kept as-is.
# The new names are assigned through `DataFrame.columns` rather than written into
# `Index.values`: a pandas Index is meant to be immutable, and writing into its
# backing array can fail (read-only array) or leave cached lookups stale.
labels = pd.Index([
    'ERpos' if name.startswith('ERpos')
    else 'ERneg' if name.startswith('ERneg')
    else name
    for name in gse2034_df.columns[2:]
])
gse2034_df.columns = gse2034_df.columns[:2].tolist() + labels.tolist()

In [None]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary
# where the keys are the genes and the values are the corresponding KEGG-IDs
# 'a gene can be mapped to more than one Entrez identifier'
# Built in a single pass over the rows (instead of one full-table boolean mask
# per gene); IDs keep their row order and keys are sorted by gene name.
gene_dict={}
for gene, kegg_id in zip(gse2034_df['Gene'], gse2034_df['KEGG-ID']):
    gene_dict.setdefault(gene, []).append(kegg_id)
gene_dict=dict(sorted(gene_dict.items(), key=lambda item: item[0]))
gene_list=list(gene_dict)
#print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [None]:
# Transpose the dataframe so that columns are genes and rows are samples
# (row label = class: ERpos or ERneg).
# NOTE: this reassigns gse2034_df, so the cell cannot be re-run on its own output.
genes = gse2034_df['Gene']
gse2034_df = gse2034_df.drop(columns='KEGG-ID').iloc[:, 1:].T
gse2034_df.columns = genes.tolist()
#gse2034_df

In [None]:
# Because one gene might correspond to more than one KEGG-ID, columns that share a
# gene name are averaged, giving one column per gene.
# Grouping is done on the transposed frame because `groupby(..., axis=1)` is
# deprecated in recent pandas releases; the result is transposed back.
gse2034_df=gse2034_df.T.groupby(level=0).mean().T
gse2034_df

## 1.2. Selected
Cellular processes (15), Signal transduction (Environmental information process) (24), Cancer overview (8).

In [None]:
# Load the selected sub-pathways (zipped, tab-separated) and keep only 'SubPathID'.
raw_selected_df = pd.read_csv(
    'Data/Selected.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)
raw_selected_df = raw_selected_df[['SubPathID']]
#raw_selected_df

In [None]:
# The two kinds of node relations, mapped to the arrow that marks each in a sub-path string
relations_dict = dict(Activation='-->', Inhibition='--|')
#relations_dict

In [None]:
from itertools import chain

# Split a pathway string on the arrow of one relation type
def split_path(data,relation):
    """Tokenize `data` around the arrow stored in `relations_dict[relation]`.

    Returns the non-empty pieces of `data`, in order, with the arrow re-inserted
    after the k-th non-empty piece as long as k is smaller than the total number
    of pieces produced by the split (empty pieces included in that total).
    """
    arrow = relations_dict[relation]
    pieces = data.split(arrow)
    total = len(pieces)
    tokens = []
    emitted = 0  # non-empty pieces appended so far
    for piece in pieces:
        if not piece:
            continue
        tokens.append(piece)
        emitted += 1
        # No arrow after the last piece
        if emitted < total:
            tokens.append(arrow)
    return tokens

# Make the selected dataset easier to read: split each pathway into its nodes and edges
def get_pathway(data):
    """Return `data` as a flat list alternating node strings and edge arrows.

    The string is first split on activation arrows; every resulting token longer
    than one character is then replaced by its split on inhibition arrows, and
    the pieces are flattened into a single list.
    """
    tokens = split_path(data, 'Activation')
    expanded = []
    for token in tokens:
        pieces = split_path(token, 'Inhibition')
        # Single-character tokens are kept as plain strings (flattening yields them unchanged)
        expanded.append(pieces if len(token) > 1 else token)
    return list(chain.from_iterable(expanded))

subpaths_list = [get_pathway(subpath) for subpath in raw_selected_df['SubPathID']]
#subpaths_list

In [None]:
# Rows: pathways, Cols: alternating nodes and edges; shorter pathways are padded with NaN
selected_df = pd.DataFrame(subpaths_list)
selected_df = selected_df.fillna(value=np.nan)
selected_df

## 1.3. Important values

### 1.3.1. Node genes- all genes of each node

In [None]:
# Each node in a pathway represents a discrete function mapping to one or more transcripts.
def Node_genes(df):
    """Map each pathway (row position) to the gene names of each of its nodes.

    Nodes are read from the even-numbered columns among the non-null cells of a
    row. A node cell is split on single spaces into entries; each entry is split
    on '#' and its first non-empty part is taken as the gene name.

    Returns a dict {row position: [[gene, ...] per node]}.
    """
    genes_by_path = {}
    for path in range(df.shape[0]):
        filled = int(df.iloc[path].notnull().sum())  # number of non-null cells in the row
        nodes = []
        for col in range(0, filled, 2):
            entries = [token.strip() for token in df.iloc[path, col].split(' ')]
            nodes.append([
                [part.strip() for part in entry.split('#') if part.strip()][0]
                for entry in entries if entry
            ])
        genes_by_path[path] = nodes
    return genes_by_path

# Gene names of every node of every selected sub-path, keyed by the row position in selected_df
node_genes=Node_genes(selected_df)
node_genes

### 1.3.2. Expression value

In [None]:
from statistics import mean

# Calculate the mean expression value of each gene
def expression_value(df):
    """Return the mean expression of every column (gene) of `df`.

    Parameters
    ----------
    df : DataFrame with one column per gene and one row per sample.

    Returns
    -------
    dict {gene: column mean}, in column order, followed by a 'noProbe' entry
    holding the mean of all the per-gene means (used as the fallback value for
    genes without a probe).

    Raises
    ------
    statistics.StatisticsError if `df` has no columns.
    """
    # Use the frame that was passed in, not the module-level gse2034_df
    expr_val_dict = {gene: df[gene].mean() for gene in df.columns}
    expr_val_dict['noProbe'] = mean(list(expr_val_dict.values()))
    return expr_val_dict

# genes_df: one row per gene (plus a final 'noProbe' row), starting with the mean expression column
expr_val=expression_value(gse2034_df)
genes_df=pd.DataFrame(expr_val.values(),index=expr_val.keys(),columns=['Expression Value'])
genes_df

In [None]:
# For each node of a sub-path, average the expression values of the node's genes.
def path_expression_value(path_no):
    """Return one mean 'Expression Value' per node of pathway `path_no`.

    Reads the module-level `node_genes`, `gene_dict` and `genes_df`. A gene named
    'noProbe', or one that is not a key of `gene_dict`, contributes the 'noProbe'
    row of `genes_df`.
    """
    node_means = []
    for node in node_genes[path_no]:
        values = []
        for gene in node:
            known = gene != 'noProbe' and gene in gene_dict.keys()
            row_label = gene if known else 'noProbe'
            values.append(genes_df.loc[row_label]['Expression Value'])
        node_means.append(mean(values))
    return node_means
         
def get_expression_values(node_genes):
    """Return {pathway key: per-node mean expression values} for every key of `node_genes`."""
    return {path: path_expression_value(path) for path in node_genes}

expression_values_dict = get_expression_values(node_genes)
expression_values_dict

### 1.3.3. P-value and threshold <= 0.05 (gene is on)

In [None]:
from numpy import sqrt, abs, round
from scipy.stats import norm

# Two-sample z-test
def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):
    """Two-sided two-sample z-test for a difference in means.

    X1, X2   : sample means
    mudiff   : hypothesised difference between the means
    sd1, sd2 : sample standard deviations
    n1, n2   : sample sizes

    Returns (z rounded to 3 decimals, two-sided p-value).
    """
    variance_of_difference = sd1**2/n1 + sd2**2/n2
    pooledSE = sqrt(variance_of_difference)
    z = ((X1 - X2) - mudiff)/pooledSE
    upper_tail = 1 - norm.cdf(abs(z))
    return round(z,3), 2*upper_tail

def get_genes_pvalue(df):
    """Run `twoSampZ` row-wise, comparing the 'ERpos' columns against the 'ERneg' columns.

    `df['ERpos']` / `df['ERneg']` are aggregated with axis=1, so each label is
    expected to select several columns (the notebook passes the transposed
    expression matrix, whose columns are the repeated class labels).

    Returns (z, p) as produced by `twoSampZ`, one value per row of `df`.
    """
    positives = df['ERpos']
    negatives = df['ERneg']
    z,p = twoSampZ(
        positives.mean(axis=1),
        negatives.mean(axis=1),
        0,
        positives.std(axis=1),
        negatives.std(axis=1),
        positives.count(axis=1),
        negatives.count(axis=1),
    )
    return z,p
    
z,pvalue = get_genes_pvalue(gse2034_df.T)
# The extra trailing value is the average p-value, used for the 'noProbe' row (missing genes).
# The column is filled positionally: genes in gse2034_df column order, then 'noProbe'.
noprobe_pvalue = mean(pvalue)
genes_df['P-value'] = np.append(pvalue, noprobe_pvalue)
genes_df

In [None]:
# For each node of a sub-path, average the p-values of the node's genes.
def path_pvalue(path_no):
    """Return one mean 'P-value' per node of pathway `path_no`.

    Reads the module-level `node_genes`, `gene_dict` and `genes_df`. A gene named
    'noProbe', or one that is not a key of `gene_dict`, contributes the 'noProbe'
    row of `genes_df`.
    """
    node_means = []
    for node in node_genes[path_no]:
        values = []
        for gene in node:
            known = gene != 'noProbe' and gene in gene_dict.keys()
            row_label = gene if known else 'noProbe'
            values.append(genes_df.loc[row_label]['P-value'])
        node_means.append(mean(values))
    return node_means
         
def get_pvalues(node_genes):
    """Return {pathway key: per-node mean p-values} for every key of `node_genes`."""
    return {path: path_pvalue(path) for path in node_genes}

# Significance cut-off used for the differential-expression tests below
pvalue_threshold = 0.05
pvalues_dict = get_pvalues(node_genes)
pvalues_dict

### 1.3.4. Fold Change and Log Fold Change
Add fold change column to genes_df

In [None]:
# Calculate fold change for each gene
def fold_change(df):
    """Return the ERneg/ERpos fold change of every column (gene) of `df`.

    Parameters
    ----------
    df : DataFrame with one column per gene; rows are samples whose index label
         is the class, 'ERpos' or 'ERneg'.

    Returns
    -------
    dict {gene: mean(ERneg rows) / mean(ERpos rows)}, in column order, followed
    by a 'noProbe' entry holding the mean of all the per-gene fold changes.

    Raises
    ------
    statistics.StatisticsError if `df` has no columns.
    """
    fc_dict = {}
    for gene in df.columns:
        # Use the frame that was passed in, not the module-level gse2034_df
        samples = df[gene]
        erneg_av = samples['ERneg'].mean()
        erpos_av = samples['ERpos'].mean()

        # Fold change (B/A)
        fc_dict[gene] = erneg_av/erpos_av

    fc_dict['noProbe'] = mean(list(fc_dict.values()))
    return fc_dict

# Fold changes are computed once and reused; the column is filled positionally
# (genes in gse2034_df column order, then 'noProbe'), matching genes_df's rows.
fc=fold_change(gse2034_df)
genes_df['Fold Change']=list(fc.values())
# Log fold change: the fold changes are shifted so the smallest one maps to log(1) = 0.
# The minimum is taken once (NaN entries are skipped) instead of once per row.
min_fold_change=genes_df['Fold Change'].min()
genes_df['Log FC']=[math.log(value+1-min_fold_change) for value in genes_df['Fold Change']]
genes_df

In [None]:
# For each node of a sub-path, average the fold change (and log fold change) of the node's genes.
def path_fc(path_no):
    """Return (per-node mean 'Fold Change', per-node mean 'Log FC') for pathway `path_no`.

    Reads the module-level `node_genes`, `gene_dict` and `genes_df`. A gene named
    'noProbe', or one that is not a key of `gene_dict`, contributes the 'noProbe'
    row of `genes_df`.
    """
    fc = []
    log_fc = []
    for node in node_genes[path_no]:
        node_fc = []
        node_log_fc = []
        for gene in node:
            known = gene != 'noProbe' and gene in gene_dict.keys()
            row_label = gene if known else 'noProbe'
            node_fc.append(genes_df.loc[row_label]['Fold Change'])
            node_log_fc.append(genes_df.loc[row_label]['Log FC'])
        fc.append(mean(node_fc))
        log_fc.append(mean(node_log_fc))
    return fc,log_fc
            

def get_fc(genes_df,node_genes):
    """Return ({path: per-node fold changes}, {path: per-node log fold changes}).

    `genes_df` is accepted but not read here; `path_fc` uses the module-level frame.
    """
    fc = {}
    log_fc = {}
    for path in node_genes:
        fc[path], log_fc[path] = path_fc(path)
    return fc,log_fc

# Nodes whose log fold change is below this value are treated as not expressed (see 2.1.1.2)
log_fc_threshold = 1.5
# Fold change and log fold change per node, in dictionary form
fc_dict,log_fc_dict = get_fc(genes_df,node_genes)

### 1.3.5. Differentially Expressed Genes (DEG)

In [None]:
# DEG flag per gene: 1 when the p-value is at or below the threshold, otherwise 0
genes_df['DEG'] = [int(is_significant) for is_significant in genes_df['P-value'] <= pvalue_threshold]
genes_df

In [None]:
# Get the differentially expressed nodes of one pathway
def path_de_genes(path_no):
    """Return a 0/1 flag per node of pathway `path_no`: 1 when the node's
    p-value in `pvalues_dict` is at or below `pvalue_threshold`."""
    return [1 if p <= pvalue_threshold else 0 for p in pvalues_dict[path_no]]

def get_de_genes(pvalues_dict):
    """Return {pathway key: per-node 0/1 DEG flags} for every key of `pvalues_dict`."""
    return {path: path_de_genes(path) for path in pvalues_dict}

de_genes_dict = get_de_genes(pvalues_dict)
de_genes_dict

# 2. Methods

## 2.1. PRS
Each node in a pathway has three attributes: Node_genes, Node_value (NV), Node_weight (NW)
### 2.1.1.  Development of the PRS algorithm

#### 2.1.1.1. Node_genes

In [None]:
# Already calculated in section 1.3.1 (Node genes); displayed again here for reference.
node_genes

#### 2.1.1.2. Node_value (NV)

In [None]:
# Each node is assigned a value derived from expression data: 0 if the node is not
# expressed, 1 if it is expressed but non-significant, or a fold-change value if it
# is expressed and significant.
def expressed_genes(path_no):
    """Return the Node_value of every node of pathway `path_no`.

    Per node: 0 when its log fold change is below `log_fc_threshold`; otherwise 1
    when its p-value is above `pvalue_threshold`; otherwise the maximum entry of
    `fc_dict[path_no]`.
    NOTE(review): that maximum is taken over all nodes of the pathway, not over
    the genes of the node itself - confirm this is the intended value.
    """
    node_values = []
    for n, log_fc in enumerate(log_fc_dict[path_no]):
        if log_fc < log_fc_threshold:  # not expressed
            value = 0
        elif pvalues_dict[path_no][n] > pvalue_threshold:  # expressed, non-significant
            value = 1
        else:  # expressed and significant
            value = max(fc_dict[path_no])
        node_values.append(value)
    return node_values

def Node_value(node_genes):
    """Return {pathway key: per-node Node_value list} for every key of `node_genes`."""
    return {path: expressed_genes(path) for path in node_genes}

node_value = Node_value(node_genes)
node_value

#### 2.1.1.3. Node_weight (NW)

In [None]:
# Generate each sub-path in the form of a graph: a list of [start_node, next_node] pairs.
def get_graph(node_value):
    """Return {pathway key: [[value_k, value_k+1], ...]} pairing each entry of a
    pathway's value list with its successor (empty for fewer than two entries)."""
    path_graph = {}
    for path, values in node_value.items():
        path_graph[path] = [[start, following] for start, following in zip(values, values[1:])]
    return path_graph
            
# Consecutive [start, next] Node_value pairs of every sub-path
graphs=get_graph(node_value)
graphs

In [None]:
# All significant (above-threshold) nodes were assigned a weighting that reflected
# their topological strength (i.e., the number of significant downstream nodes that are pointed to, either
# directly or via other significant nodes).
# An initiating child node, n_i, was ignored if non-significant, and the algorithm proceeds to the next child.
# Otherwise, we increase the weight counter by 1 and look for children of this node
# and so on. All non-significant nodes have NW = 0.

def sign_children(graph,weight=0):
    """Return `weight` plus the number of entries of `graph` that are not below
    the module-level `threshold` (read at call time)."""
    # An entry counts unless it compares as strictly below the threshold
    return weight + sum(1 for value in graph if not value < threshold)


# Node values at or above this threshold are counted by sign_children
threshold=1.5
# Node weight: for each node, the count over the node itself and everything after it in the path
node_weight={}
for path in node_value:
    values = node_value[path]
    node_weight[path] = [sign_children(values[start:]) for start in range(len(values))]
node_weight

#### 2.1.1.4. Node_score (NS)
NV and NW values are combined to calculate a Node_Score (NS)

In [None]:
def Node_score(NV,NW):
    """Combine node values and node weights into node scores.

    NV, NW : dicts {pathway key: list of per-node numbers}.
    Returns {pathway key: [NV*NW where NV > 1, else 0]}.
    """
    node_score = {}
    for path, values in NV.items():
        node_score[path] = [
            values[n]*NW[path][n] if values[n] > 1 else 0
            for n in range(len(values))
        ]
    return node_score
# Per-node scores (NV * NW for nodes with NV > 1, otherwise 0) of every sub-path
node_score=Node_score(node_value,node_weight)
node_score

#### 2.1.1.5. PRS

In [None]:
def PRS(NS):
    """Return {pathway key: sum of that pathway's node scores}."""
    return {path: sum(scores) for path, scores in NS.items()}

# One PRS value per sub-path, in node_score order
prs_df=pd.DataFrame(list(PRS(node_score).values()), columns=['PRS'])
prs_df

### 2.1.2. Normalizing pathway scores
A normalization step is required to control for two key features: (i)
pathway size and (ii) statistical bias contributed by pathway-specific PRS score null distributions.

#### 2.1.2.1.  Pathway size 
Multiply each PRS score by the ratio of the number of DEGs (NDEGs) in a pathway to the total number of expressed genes (NEGs)

In [None]:
def path_size_PRS(prs):
    """Scale each pathway's PRS by NDEGs/NEGs.

    prs : DataFrame with a 'PRS' column, one row per pathway (row position = key
          into the module-level `node_value`).
    NEGs  = number of node values > 0 across all pathways.
    NDEGs = number of node values > 1 in the pathway.
    Returns {row position: scaled PRS}.
    """
    # total number of expressed genes (NEGs)
    NEGs = sum(1 for value in chain.from_iterable(node_value.values()) if value > 0)
    scaled = {}
    for path in range(prs.shape[0]):
        # number of DEGs (NDEGs) in this pathway
        NDEGs = sum(1 for value in node_value[path] if value > 1)
        scaled[path] = prs.iloc[path]['PRS']*(NDEGs/NEGs)
    return scaled

new_prs_df = pd.DataFrame(list(path_size_PRS(prs_df).values()), columns=['PRS'])
new_prs_df

#### 2.1.2.2. Statistical bias contributed by pathway-specific PRS score null distributions
Not Ready ...

## 2.2. MinePath

### 2.2.1. Discretization of gene expression values
Transform gene expression values into high (expressed / up-regulated) or low (not-expressed / down-regulated) gene expression binary equivalents

#### 2.2.1.1. The expression values of a gene over the total number of input samples are sorted in descending order;

In [None]:
# The expression values were calculated in 1.3.2; sort genes from highest to lowest.
genes_dis_df = genes_df.sort_values(by='Expression Value', ascending=False)
genes_dis_df

#### 2.2.1.2. The midpoints between each two consecutive values are calculated;

In [None]:
def midpoint(num1,num2):
    """Return the value halfway between `num1` and `num2`."""
    total = num1 + num2
    return total/2

# Key i is the position of the first gene of a pair; the second gene is at position i+1.
sorted_expression = genes_dis_df['Expression Value']
midpoints_dict = {
    i: midpoint(sorted_expression.iloc[i], sorted_expression.iloc[i+1])
    for i in range(genes_dis_df.shape[0]-1)
}

midpoints_dict

#### 2.2.1.3. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S,μi) denote the IG of the system for midpoint μi.

In [None]:
samples = gse2034_df.index          # the class label of every sample
classes = sorted(set(samples))      # the distinct classes a sample may belong to

# Proportion of the samples in S that belong to class C
def P(C,S):
    """Return (number of items of `S` equal to `C`) / len(S)."""
    members = list(S)
    return members.count(C)/len(S)

def E(S,m=1):
    """Entropy of the class labels in `S`, divided by `m`.

    For every class c of the module-level `classes` with p = P(c, S) > 0 the term
    p*log(p)/m is accumulated (natural logarithm, so the result is in nats) and
    the negated total is returned. Classes that do not occur in `S` contribute 0
    (the 0*log(0) = 0 convention) instead of raising a math domain error.

    m not given: entropy of the system for the prior assignment of samples to classes.
    m given: the same entropy scaled by 1/m.
    NOTE(review): dividing by the midpoint value does not partition the samples
    around it - confirm this is the intended "entropy after the split".
    """
    tmp=0
    for c in classes:
        p=P(c,S)
        if p>0:
            tmp+=p*math.log(p)/m
    return -(tmp)

entropy=E(samples)
# E() uses the natural logarithm (math.log), so the value is reported in nats, not bits.
print('Dataset Entropy: %.3f nats' % entropy)

In [None]:
# Information Gain (IG) of the system for one midpoint
def IG(S,m):
    """Return E(S) - E(S, m)."""
    prior_entropy = E(S)
    return prior_entropy - E(S,m)

# One information-gain value per midpoint, in midpoints_dict order
information_gain = [IG(samples, mid_value) for mid_value in midpoints_dict.values()]
print('Information Gain: '+str(information_gain))

In [None]:
# The midpoint with the highest information gain (first one on ties) is the discretization point
max_value = max(information_gain)
max_mid_pos = information_gain.index(max_value)
dis_point = midpoints_dict.get(max_mid_pos)
print('Discretization point: %.3f' %dis_point)

#### 2.2.1.4. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values greater than or equal to the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [None]:
# Binarize in a single comparison: 1 (over-expressed) where the value is at or above
# the discretization point, 0 (under-expressed) otherwise.
# Two successive in-place masks would re-test the zeros written by the first mask, so
# with a discretization point <= 0 every cell would end up as 1.
gse2034_dis_df=(gse2034_df>=dis_point).astype('int')
gse2034_dis_df

### 2.2.2. Functional sub-paths: Matching sub-paths with gene expression profiles

In [None]:
import statistics

# Get the nodes of each sub-path in binary form.
# For every row of selected_df, expr_prof_tmp[row] collects one array per node: the
# element-wise maximum, over the node's genes, of the genes' discretized sample values.
gene_expression_profile_df=selected_df.copy()
expr_prof_tmp={}
for row in range(gene_expression_profile_df.shape[0]):
    row_tmp=[]  # never read afterwards
    # Non-null cells of the row; nodes are the even positions, edges the odd ones
    path_tmp=gene_expression_profile_df.iloc[row][~gene_expression_profile_df.iloc[row].isnull()]
    expr_prof_tmp[row]=[]
    
    for i in path_tmp[::2]:
        # Each gene entry becomes the list of its '#'-separated parts
        tmp_node_genes=[(g.split('#')) for g in list(filter(None,i.split(' ')))] # Get genes of node
        tmp_node_genes=list(filter(None, tmp_node_genes))
        tmp_expr_vals=[]
        
        for n in tmp_node_genes:
            # Check whether the gene exists in the gse2034 dataset, or whether the KEGG-ID of a
            # 'noProbe' entry corresponds to a specific gene of the gse2034 dataset.
            # NOTE(review): gene_dict values are lists of KEGG-IDs, so the two membership tests
            # below compare a string / a list of split parts against those lists and look like
            # they can only match by accident - confirm the intended lookup.
            if(n[0]=='noProbe'):
                if(not(n[1] in gene_dict.values())):
                    # No match: the gene contributes an all-zero vector (one entry per sample)
                    tmp_expr_vals.append((np.zeros(gse2034_dis_df.shape[0])).astype(int))
                    continue
            if(len(n)==1):
                if(not(n in gene_dict.values())):
                    tmp_expr_vals.append((np.zeros(gse2034_dis_df.shape[0])).astype(int))
                    continue
            # Otherwise use the gene's discretized column (raises KeyError if the column is missing)
            tmp_expr_vals.append(list(gse2034_dis_df[n[0]]))
    
        # Node value per sample = maximum over the node's genes; nodes without genes add nothing
        if(len(tmp_expr_vals)>0):
            expr_prof_tmp[row].append((np.transpose(tmp_expr_vals)).max(axis=1))
              
expr_prof_tmp

In [None]:
# Replace the genes of each pathway node with the node's per-sample binary values
# (max value in case of multiple genes in the node).
for row in range(gene_expression_profile_df.shape[0]):
    row_values = gene_expression_profile_df.iloc[row]
    filled = row_values[~row_values.isnull()].shape[0]
    for column in range(0, filled, 2):
        # Nodes sit in the even columns, so the node in column c is entry c/2 of expr_prof_tmp[row]
        gene_expression_profile_df.iat[row,column] = expr_prof_tmp[row][column//2]

gene_expression_profile_df

In [None]:
# The following functions compute the 'and' and 'xor' boolean operations
def and_boolean_op(num1,num2):
    """Element-wise AND of two 0/1 sequences, computed as a product.

    Returns a list with one entry per element of `num1`.
    """
    return [num1[n]*num2[n] for n in range(len(num1))]
    
def xor_boolean_op(num1,num2):
    """Element-wise XOR of two sequences, judged by truthiness.

    Returns a list of 0/1 with one entry per element of `num1`: 1 where exactly
    one of the two elements is truthy.
    """
    return [int(bool(num1[n]) != bool(num2[n])) for n in range(len(num1))]

# Boolean operation applied along each edge type: activation -> AND, inhibition -> XOR
operations_dict={'Activation':and_boolean_op,'Inhibition':xor_boolean_op}

In [None]:
# Calculate the pathway expression with boolean operations
def calc_pathway_expression(path,prev_result,relations=None,operations=None):
    """Fold a sub-path into one per-sample vector by applying its edge operations.

    Parameters
    ----------
    path : pandas Series alternating edge symbol, node vector, edge symbol, ...
           (the sub-path without its first node).
    prev_result : vector of the first node; returned unchanged for an empty path.
    relations : optional {relation name: edge symbol}; defaults to `relations_dict`.
    operations : optional {relation name: function(left, right)}; defaults to
                 `operations_dict`.

    Returns
    -------
    The result of combining `prev_result` with every node in turn, left to right.
    (The previous recursive version discarded the result of each step and always
    returned `prev_result`.)

    Raises
    ------
    ValueError if an edge symbol is not one of the known relations.
    IndexError if the path ends with an edge that has no node after it.
    """
    relations = relations_dict if relations is None else relations
    operations = operations_dict if operations is None else operations
    relation_of_symbol = {symbol: name for name, symbol in relations.items()}

    result = prev_result
    for position in range(0, len(path), 2):
        edge = path.iloc[position]
        if edge not in relation_of_symbol:
            raise ValueError('Unknown edge type: %r' % (edge,))
        next_node = path.iloc[position+1]
        result = operations[relation_of_symbol[edge]](result, next_node)
    return result

# One combined vector per sub-path: start from the first node and fold in the remaining edge/node pairs
results=[]
for row in range(gene_expression_profile_df.shape[0]):
    row_values = gene_expression_profile_df.iloc[row]
    tmp_path = row_values[~row_values.isnull()]
    remaining = tmp_path.iloc[1:].reset_index(drop=True)
    results.append(calc_pathway_expression(remaining, tmp_path[0]))

In [None]:
# Binary sub-path expression matrix: one row per sub-path, one column per sample (class labels)
binary_expression_df=pd.DataFrame(results,index=list(gene_expression_profile_df.index),columns=labels)
binary_expression_df

In [None]:
# For convenience, the resulting DataFrame is copied to a new one named after the method (MinePath).
minepath_df=binary_expression_df.copy()
minepath_df

## 2.3. TAPPA

### 2.3.1. Pathway connectivity index
The molecular connectivity index is a widely used topological descriptor of chemical compounds and has been successfully used in many other fields, including protein structure and drug discovery.

#### 2.3.1.1. Adjacency matrix

In [None]:
# The adjacency matrix is defined as A=(a_ij), where a_ij=1 if i=j or (g_i, g_j) belongs to E
# and a_ij=0 otherwise.
def adjacency_matrix(nodes):
    """Return the adjacency matrix of a linear path over `nodes` as a list of lists.

    Entry [i][j] is 1 on the diagonal and for neighbouring positions (|i-j| == 1),
    since the sub-paths are linear chains; every other entry is 0.
    """
    size = len(nodes)
    return [
        [1 if -1 <= row - col <= 1 else 0 for col in range(size)]
        for row in range(size)
    ]
    
# One adjacency matrix per sub-path, keyed like node_genes
adjacency_matrices = {}
for path, nodes in node_genes.items():
    adjacency_matrices[path] = adjacency_matrix(nodes)
adjacency_matrices

#### 2.3.1.2. Define PCI
Assuming that x_is is the normalized log expression measurement for gene i in sample s.

In [None]:
# Standardize the per-gene mean expression values: subtract their mean, divide by their standard deviation.
mean_expression = genes_df['Expression Value']
norm_gse2034_df = (mean_expression - mean_expression.mean())/mean_expression.std()

# Further normalize to (-0.5,0.5) with the sigmoid function (sigmoid(x_is) - 0.5, the
# subtraction being done by the caller) to lower the effect of extremely large/small values.
def sigmoid(df):
    """Return the logistic function 1/(1+exp(-v)) of every entry of `df` as a list.

    `df` is indexed positionally with `.iloc` and sized with `.shape[0]`
    (the notebook passes a pandas Series).
    """
    return [1 / (1 + math.exp(-df.iloc[position])) for position in range(df.shape[0])]

# Sigmoid of the standardized values, shifted by -0.5 so the results lie in (-0.5, 0.5)
gene_expression_df = pd.DataFrame(
    sigmoid(norm_gse2034_df),
    columns=['Normalized log expression'],
    index=norm_gse2034_df.index,
)
gene_expression_df = gene_expression_df - 0.5
gene_expression_df

In [None]:
from statistics import mean

# Each node consists of one or more genes, so each node gets the average value.
def get_x(node_genes):
    """Return {pathway key: per-node mean 'Normalized log expression'}.

    Values come from the module-level `gene_expression_df`; a gene that is not in
    its index contributes the 'noProbe' row instead.
    """
    column = 'Normalized log expression'
    x = {}
    for path, nodes in node_genes.items():
        node_means = []
        for genes_of_node in nodes:
            values = [
                gene_expression_df.loc[gene if gene in gene_expression_df.index else 'noProbe'][column]
                for gene in genes_of_node
            ]
            node_means.append(mean(values))
        x[path] = node_means
    return x

# node_genes was built in section 1.3.1
x = get_x(node_genes)
x

In [None]:
def PCI(df,x,a):
    """Pathway connectivity index of every row of `df`.

    df : DataFrame of pathways (alternating node / edge cells, null-padded).
    x  : {row position: per-node normalized expression values}.
    a  : {row position: adjacency matrix (list of lists)}.

    For each row, with N = number of node cells (even positions among the
    non-null cells), sums over all i, j < N:
        sign(x_i + x_j) * |x_i|**0.5 * a_ij * |x_j|**0.5
    Returns {row position: sum}.
    """
    pci_dict = {}
    for path in range(df.shape[0]):
        row = df.iloc[path]
        # Number of nodes (edges are ignored)
        N = len(row[~row.isnull()].iloc[::2])

        total = 0
        for i in range(N):
            x_i = x[path][i]
            root_i = abs(x_i)**0.5
            for j in range(N):
                x_j = x[path][j]
                total += np.sign(x_i+x_j)*root_i*a[path][i][j]*(abs(x_j)**0.5)

        pci_dict[path] = total
    return pci_dict

# PCI of every selected sub-path, as a dictionary and as a one-column DataFrame
pci=PCI(selected_df,x,adjacency_matrices)
pci_df=pd.DataFrame(pci.values(),columns=['PCI'])
pci_df

#### 2.3.1.3. Normalize PCI (divided by the gene number in pathway)

In [None]:
def normalize_PCI(node_genes,pci_df):
    """Divide each pathway's PCI by its number of nodes.

    node_genes : {row position: list of nodes}.
    pci_df     : DataFrame with a 'PCI' column, one row per pathway.
    Returns {row position: PCI / len(node_genes[row position])}.
    """
    return {
        path: pci_df.iloc[path]['PCI']/len(node_genes[path])
        for path in range(pci_df.shape[0])
    }

# Normalized PCI of every sub-path as a one-column DataFrame
norm_pci_df=pd.DataFrame(normalize_PCI(node_genes,pci_df).values(),columns=['Normalized PCI'])

# For convenience, the resulting DataFrame is copied to a new one named after the method (TAPPA).
tappa_df=norm_pci_df.copy()
tappa_df

## 2.4. HiPathia

### 2.4.1. Normalize the gene expression values

In [None]:
# The normalization process is not defined, so the normalized gene expression values
# calculated for the TAPPA method (gene_expression_df) are reused here.
hipathia_norm_gse2034_df=gene_expression_df.copy()
hipathia_norm_gse2034_df

### 2.4.2. The Hipathia mechanistic model

In [None]:
# Normalized gene expression value of every node
def get_u(node_genes):
    """Return {pathway key: per-node mean 'Normalized log expression'}.

    Values come from the module-level `hipathia_norm_gse2034_df`; a gene that is
    not in its index contributes the 'noProbe' row instead.
    """
    column = 'Normalized log expression'
    u = {}
    for path, nodes in node_genes.items():
        node_means = []
        for genes_of_node in nodes:
            values = [
                hipathia_norm_gse2034_df.loc[gene if gene in hipathia_norm_gse2034_df.index else 'noProbe'][column]
                for gene in genes_of_node
            ]
            node_means.append(mean(values))
        u[path] = node_means
    return u

u = get_u(node_genes)
u

In [None]:
# Signal intensity of each node
def mechanistic_model(u,pathways_df):
    """Propagate the signal along every sub-path and return the value at its last node.

    Parameters
    ----------
    u : {row position: per-node normalized expression values}.
    pathways_df : DataFrame of pathways (alternating node / edge cells, null-padded).

    For node k >= 1 the signal is  u_k * (1 - prod_a) * prod_i,  where prod_a is the
    product of (1 - s_j) over earlier positions j whose edge is an activation and
    prod_i the same product over the remaining (inhibition) positions. The first
    node's signal is its own value.
    NOTE(review): edge j (the edge leaving node j) is applied to every later node,
    not only to node j+1 - confirm this is the intended propagation.

    Returns
    -------
    {row position: signal of the last node}. The last signal is read with
    `cur_s[-1]`, so single-node paths return their own value instead of relying
    on a loop variable that is unset (or left over from the previous path).
    """
    s={}

    for path in range(pathways_df.shape[0]):
        row=pathways_df.iloc[path]
        cur_path=list(row[~row.isnull()])
        edges=cur_path[1::2] # Edges are at the odd positions

        cur_s=[u[path][0]]

        for node in range(1,len(u[path])):
            cur_s_a=1
            cur_s_i=1

            for prev_node in range(node):
                # Activation signals
                if(edges[prev_node]==relations_dict['Activation']):
                    cur_s_a=cur_s_a*(1-cur_s[prev_node])
                # Inhibition signals
                else:
                    cur_s_i=cur_s_i*(1-cur_s[prev_node])

            cur_s.append(u[path][node]*(1-cur_s_a)*cur_s_i)

        # Changes in the activity of the nodes are reflected (or remain unnoticed) in the last effector node
        s[path]=cur_s[-1]
    return s

# Signal reaching the last node of every selected sub-path
s=mechanistic_model(u,selected_df)
s

In [None]:
# HiPathia result: one signal value 'S' per sub-path
hipathia_df=pd.DataFrame(list(s.values()),columns=['S'])
hipathia_df

## 2.5. SPIA

Two types of evidence: (i) the over-representation of DE genes in a given pathway and (ii) the abnormal perturbation of that pathway, as measured by propagating measured expression changes across the pathway topology (P_NDE,P_PERT).

### 2.5.1. P_NDE = P(X >= N_DE | H0)
- Captures the significance of the given pathway Pi as provided by an over-representation analysis of the number of DE genes (N_DE) observed on the pathway.
- N_DE: number of DE genes on the pathway analyzed
- H0: the genes that appear as DE on a given pathway are completely random (the pathway is not relevant to the condition under study)

#### Because of its long computation time, the value of P_NDE has been calculated in advance and saved in CSV form.

In [None]:
# Number of DE nodes on each pathway analyzed. A node counts as DE when its p-value is
# at or below the threshold, the same "<=" rule used for the DEG flags in section 1.3.5.
n_de={
    path: sum(1 for p in pvalues_dict[path] if p<=pvalue_threshold)
    for path in pvalues_dict
}

# Already calculated (the result is loaded from P_NDE.csv below)
def get_p_nde(node_value,n_de):
    """Normal-CDF value of each pathway's DE count among all pathways' DE counts.

    Parameters
    ----------
    node_value : mapping whose keys are the pathways to score.
    n_de : {pathway: number of DE genes}.

    Returns
    -------
    {pathway: 0.5 * (1 + erf((n_de[pathway] - mean) / sqrt(2 * stdev**2)))}, where
    mean and stdev are taken over all values of `n_de`. They are computed once
    rather than once per pathway. When the counts have no spread (fewer than two
    pathways, or all counts equal) every pathway gets 0.5, the value of the CDF
    at its mean, instead of dividing by zero.
    """
    # Local import: the notebook only imports `mean` from statistics at file level
    from statistics import mean, stdev

    counts = list(n_de.values())
    spread = stdev(counts) if len(counts) > 1 else 0
    if spread == 0:
        return {path: 0.5 for path in node_value}

    center = mean(counts)
    scale = math.sqrt(2 * spread ** 2)
    # Probability of a random variable in a normal distribution (CDF)
    return {path: 0.5 * (1 + math.erf((n_de[path] - center)/scale)) for path in node_value}

#p_nde=get_p_nde(node_value,n_de) # It takes a lot of time to compute
#p_nde_df=pd.DataFrame(p_nde.values(),index=p_nde.keys(),columns=['P_NDE'])
#p_nde_df.to_csv(r'C:\Users\user\Desktop\ΤΕΙ\Πτυχιακή\Project\P_NDE.csv',index = False, header=True)

In [None]:
import csv

# P_NDE was computed ahead of time and saved in CSV form because of its computation time
p_nde_df = pd.read_csv ('P_NDE.csv')

# The same values in dictionary form, keyed by row label 0..n-1
p_nde_dict = {}
for p in range(p_nde_df.shape[0]):
    p_nde_dict[p] = p_nde_df.loc[p]['P_NDE']

p_nde_df

### 2.5.2. P_PERT 
Calculated based on the amount of perturbation measured in each pathway.

#### 2.5.2.1. Gene perturbation factor (PF)

In [None]:
# Represents the signed normalized measured expression change of the gene g_i (log fold-change if two conditions are compared)
log_fc_dict # Calculated in section 1.3.4 (Fold Change and Log Fold Change).

In [None]:
# The sign of β reflects the type of interaction: +1 for induction (activation), −1 for repression and inhibition.
def get_b(edges):
    """Return +1 for every edge equal to the activation arrow of `relations_dict`, -1 for any other edge."""
    return [1 if relations_dict['Activation'] == edge else -1 for edge in edges]

def path_pf(path,edges,log_fc):
    """Perturbation factor of every node of one sub-path.

    path   : the sub-path as an alternating node / edge sequence (only its node count is used).
    edges  : the edge symbols of the sub-path.
    log_fc : per-node log fold changes.

    The first node's factor is its log fold change. For node i >= 1 it is
    log_fc[i] plus the sum over every earlier node j of  b_j * PF_j / N_ds(j),
    with b_j from `get_b(edges)` and N_ds(j) = (number of nodes) - j - 1.
    Returns the list of factors.
    """
    # β_j of every edge
    b = get_b(edges)
    node_count = len(path[::2])

    # First node
    cur_pf = [log_fc[0]]

    for i in range(1, node_count):
        upstream = 0
        for j in range(i):
            # N_ds(g_j): number of nodes after position j
            upstream += b[j]*(cur_pf[j]/(node_count-j-1))
        cur_pf.append(log_fc[i]+upstream)

    return cur_pf

# Gene perturbation factor of every sub-path
def get_pf(log_fc_dict,pathways_df):
    """Return {pathway key: per-node perturbation factors}.

    log_fc_dict : {row position: per-node log fold changes}, i.e. the term ΔE(g_i).
    pathways_df : DataFrame of pathways (alternating node / edge cells, null-padded).
    """
    pf = {}
    for path, cur_log_fc in log_fc_dict.items():
        row = pathways_df.iloc[path]
        cur_path = list(row[~row.isnull()])
        edges = list(cur_path[1::2])  # Edges are at the odd positions
        pf[path] = path_pf(cur_path, edges, cur_log_fc)
    return pf

pf = get_pf(log_fc_dict,selected_df)
pf

#### 2.5.2.2. Net perturbation accumulation at the level of each gene, Acc_g
This subtraction is needed to ensure that DE genes not connected with any other genes will not contribute to the second type of evidence since such genes are already taken into consideration in the ORA and captured by P_NDE

In [None]:
def path_accumulation(pf,log_fc):
    """Return, per node, the perturbation factor minus the node's log fold change."""
    return [pf[node]-log_fc[node] for node in range(len(pf))]
        

def get_acc(pf,log_fc_dict):
    """Return {pathway key: per-node net accumulation} for every key of `pf`."""
    return {path: path_accumulation(factors, log_fc_dict[path]) for path, factors in pf.items()}

acc = get_acc(pf,log_fc_dict)
acc

#### 2.5.2.3. Total net accumulated perturbation in the pathway, t_A

In [None]:
def get_t_A(acc):
    """Return {pathway key: sum of that pathway's per-node accumulations}."""
    return {path: sum(values) for path, values in acc.items()}

# Total net accumulated perturbation of every sub-path
t_A=get_t_A(acc)
t_A

#### 2.5.2.4. Probability P_PERT
The probability to observe a total accumulated perturbation of the pathway, T_A, more extreme than t_A just by chance

In [None]:
# NOTE(review): this whole cell is disabled - the code below sits inside a bare
# string literal, so none of it is defined or executed.
# Intended behaviour, as written: for every pathway, recompute the total net
# accumulated perturbation for each ordering of its log fold changes and return
# the fraction of orderings whose total exceeds the observed t_A (0 when
# n_de[path_no] == 0).
# Points to resolve before re-enabling:
#  - list(itertools.permutations(...)) materialises every ordering, i.e. k!
#    tuples for k values, which is only feasible for very short lists.
#  - path_pf and path_t_A are called but are not defined in this part of the
#    notebook (only get_t_A is) - confirm they exist upstream.
#  - the comparison `t>t_A[path_no]` is one-sided, while the section text speaks
#    of "more extreme" values - confirm which one is intended.
"""
import itertools

def path_p_pert(path_no):
    T_A=[]
    
    # Required for perturbation factor calculations
    cur_path=list(selected_df.iloc[path_no][~selected_df.iloc[path_no].isnull()])
    edges=cur_path[1::2]
    
    if(n_de[path_no]==0):
        return 0
    
    permutations=list(itertools.permutations(log_fc_dict[path_no])) # Calculates all possible combinations of list items
    
    # Compute for each permutation the total net accumulated perturbation
    for p in range(len(permutations)):
        
        # First, calulate the perturbation factor
        cur_pf=path_pf(cur_path,edges,permutations[p])
        
        # Then, compute the perturbation accumulation
        cur_acc=path_accumulation(cur_pf,permutations[p])
        
        # Subsqequently, compute the total net accumulated perturbation
        cur_t_A=path_t_A(cur_acc)
        T_A.append(cur_t_A)
        
    # Finally, find the probability to observe a total accumulated perturbation of the pathway, T_A, more extreme than t_A
    return len([t for t in T_A if t>t_A[path_no]])/len(T_A)

def get_p_pert(log_fc_dict):
    p_pert={}
    
    for path in log_fc_dict:
        cur_p_pert=path_p_pert(path)
        p_pert.update({path:cur_p_pert})
        
    return p_pert

p_pert=get_p_pert(log_fc_dict)
p_pert
"""

#### 2.5.3. Global probability value, P_G

In [None]:
# NOTE(review): this cell is disabled (wrapped in a bare string literal).
# p_g() implements c - c*ln(c) with c = p_nde * p_pert.
# As written it would not run if re-enabled:
#  - the call at the bottom uses get_pg, but the function is named get_p_g;
#  - inside get_p_g the local dict p_g shadows the function p_g, so p_g(...)
#    would try to call the dict;
#  - math.log(c) raises ValueError for c == 0, and the (also disabled)
#    path_p_pert returns 0 when n_de[path_no] == 0.
"""
def p_g(p_nde,p_pert):
    c=p_nde*p_pert
    return c-c*math.log(c)

def get_p_g(p_nde,p_pert):
    p_g={}
    for path in p_nde:
        p_g.update({path:p_g(p_nde[path],p_pert[path])})
    return p_g

p_g=get_pg(p_nde,p_pert)
p_g
"""

## 2.6. SubSPIA

### 2.6.1. The statistical significance of subpathways
Two types of evidence: the overrepresentation of DEGs and the abnormal perturbation in a given subpathway.

In [None]:
from scipy.stats import hypergeom

def path_hypergeom(path_no):
    """Hypergeometric probability mass over 0..t for one pathway.

    ``t`` is the number of distinct genes found across all nodes of
    ``node_genes[path_no]``. The distribution is frozen as
    ``hypergeom(m, t, n)``; in scipy's ``(M, n, N)`` convention that is
    population size ``m``, ``t`` success states and ``n`` draws. ``m``, ``n``
    and ``node_genes`` are read from module level.

    Parameters
    ----------
    path_no : hashable
        Key of the pathway in ``node_genes``.

    Returns
    -------
    numpy.ndarray
        ``pmf(x)`` for ``x = 0, 1, ..., t`` (``t + 1`` values).
    """
    # A set keeps each gene once, however many nodes list it.
    pathway_genes = {gene for node in node_genes[path_no] for gene in node}
    t = len(pathway_genes)
    support = np.arange(0, t+1)
    return hypergeom(m, t, n).pmf(support)

# The p-value can be calculated to evaluate enrichment significance for each pathway
def get_pvalue(node_genes):
    """Return ``1 - sum(pmf)`` for every pathway key in ``node_genes``.

    Only the keys of the argument are used here; ``path_hypergeom`` looks the
    pathway up in the module-level ``node_genes`` itself.

    NOTE(review): ``path_hypergeom`` returns the pmf for every x in 0..t, so the
    sum covers the whole support and the result is ~0 for every pathway. An
    enrichment p-value would normally stop the sum just below the observed
    number of submitted genes in the pathway - confirm and supply that count.

    Parameters
    ----------
    node_genes : dict
        Mapping whose keys identify the pathways to evaluate.

    Returns
    -------
    dict
        Pathway key -> ``1 - sum(path_hypergeom(key))``.
    """
    return {path_id: (1 - sum(path_hypergeom(path_id))) for path_id in node_genes}
    
# Inputs of the hypergeometric test; path_hypergeom reads m and n as module-level globals.
m=30000 # Total genes of human genome in the current analysis (population size)
n=gse2034_df.shape[1] # Set of genes submitted for analysis (number of columns of gse2034_df)

# Provisional per-pathway p-values, keyed like node_genes.
tmp_pvalue_dict=get_pvalue(node_genes)
tmp_pvalue_dict

## 2.7. DEAP

In [None]:
# The indicator of whether a gene is ‘on’ or ‘off’
def get_d(pvalues,fc):
    """Build the on/off direction indicator for each gene.

    A gene whose p-value is greater than the module-level ``pvalue_threshold``
    is 'off' and gets 0. Every other gene is 'on' and gets 1 when its fold
    change is strictly positive, otherwise -1 (a fold change of exactly 0 is
    therefore reported as -1).

    Parameters
    ----------
    pvalues : sequence
        P-value of each gene.
    fc : sequence
        Fold change of each gene, indexed like ``pvalues``.

    Returns
    -------
    list of int
        One of 0, 1 or -1 per entry of ``pvalues``.
    """
    indicators = []
    for idx in range(len(pvalues)):
        if pvalues[idx] > pvalue_threshold:
            state = 0      # gene is off
        elif fc[idx] > 0:
            state = 1      # on, up-regulated
        else:
            state = -1     # on, down-regulated
        indicators.append(state)
    return indicators

In [None]:
# The mean of the absolute value of expression for ‘on’ genes (pathway effect).
def get_m(path_no):
    """Mean absolute expression value of the 'on' genes of a pathway.

    A node counts as 'on' when its p-value in ``pvalues_dict[path_no]`` is less
    than or equal to ``pvalue_threshold``. ``node_genes``, ``pvalues_dict``,
    ``expression_values_dict`` and ``pvalue_threshold`` are read from module
    level.

    The average is computed as ``sum / len`` (the same way ``variance`` derives
    its mean), so the function does not rely on a separately imported ``mean``
    helper. The former whole-dataset mean was computed on every call but never
    used, and has been removed.

    Parameters
    ----------
    path_no : hashable
        Key of the pathway in the module-level dictionaries.

    Returns
    -------
    float or int
        Mean of the absolute expression values of the 'on' nodes, or 0 when no
        node of the pathway is 'on'.
    """
    on_values = [
        abs(expression_values_dict[path_no][node])
        for node in range(len(node_genes[path_no]))
        if pvalues_dict[path_no][node] <= pvalue_threshold
    ]
    if not on_values:
        return 0  # no 'on' genes in this pathway
    return sum(on_values) / len(on_values)

In [None]:
def variance(data): # σ²
    """Population variance of ``data``.

    The squared deviations from the arithmetic mean are averaged over
    ``len(data)`` (not ``len(data) - 1``). An empty ``data`` raises
    ``ZeroDivisionError``.

    Parameters
    ----------
    data : sequence of numbers

    Returns
    -------
    float
        Mean squared deviation from the mean.
    """
    count = len(data)
    centre = sum(data) / count
    squared_deviations = [(value - centre) ** 2 for value in data]
    return sum(squared_deviations) / count

# Normal distribution
def normal_dist(x , mean , sd):
    """Probability density of the normal distribution N(mean, sd**2) at ``x``.

    Evaluates ``1 / (sd * sqrt(2*pi)) * exp(-0.5 * ((x - mean) / sd)**2)``.
    The previous version multiplied the exponential by ``pi * sd`` instead of
    dividing by ``sd * sqrt(2*pi)``, so its result was not a density.

    Parameters
    ----------
    x : number or numpy array
        Point(s) at which the density is evaluated.
    mean : number
        Mean of the distribution.
    sd : number
        Standard deviation (not the variance); must be non-zero.

    Returns
    -------
    Density value(s), with the shape of ``x``.
    """
    coefficient = 1.0 / (sd * np.sqrt(2.0 * np.pi))
    prob_density = coefficient * np.exp(-0.5*((x-mean)/sd)**2)
    return prob_density

# The variable g is assumed to come from a normal distribution with mean 0 and variance σ
def get_g(path_no,m):
    """Evaluate the zero-mean normal density at each expression value of a pathway.

    The spread is taken from the pathway's own expression values:
    ``variance(...)`` yields the variance, and its square root is handed to
    ``normal_dist``, whose third parameter is a standard deviation. Previously
    the variance itself was passed as ``sd``.

    Parameters
    ----------
    path_no : hashable
        Key of the pathway in the module-level ``expression_values_dict``.
    m : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    list
        One value per expression value of the pathway. All zeros when the
        variance is 0, because the density cannot be evaluated with sd == 0.
    """
    values = expression_values_dict[path_no]
    spread = variance(values)
    if spread == 0:
        return len(values)*[0]
    sd = math.sqrt(spread)  # normal_dist expects a standard deviation
    return [normal_dist(value, 0, sd) for value in values]

# TODO: the noise term e of E = d*(m+g)+e is not computed yet; get_E below only evaluates d*(m+g).

In [None]:
# Expression data (presumably on a log scale) for each gene in a pathway was simulated using a multivariate normal distribution
def get_E(pvalues_dict,fc_dict):
    """Compute ``d * (m + g)`` for every node of every pathway.

    For each pathway key, ``get_d`` supplies the per-node on/off direction,
    ``get_m`` the pathway effect and ``get_g`` the per-node g term. The
    additive term ``e`` of ``E = d*(m+g)+e`` is not included.

    Parameters
    ----------
    pvalues_dict : dict
        Pathway key -> p-values of the pathway nodes.
    fc_dict : dict
        Pathway key -> fold changes of the pathway nodes.

    Returns
    -------
    dict
        Pathway key -> list with one value per node.
    """
    simulated = {}
    for path_id in pvalues_dict:
        path_pvalues = pvalues_dict[path_id]
        direction = get_d(path_pvalues, fc_dict[path_id])
        effect = get_m(path_id)
        g_term = get_g(path_id, effect)
        simulated[path_id] = [
            direction[i] * (effect + g_term[i]) for i in range(len(path_pvalues))
        ]
    return simulated

# Per-pathway result of get_E, i.e. d*(m+g) for every node.
# NOTE(review): the name `e` is also used for the additive term in the comment
# "E=d*(m+g)+e"; this variable holds the get_E output, not that term.
e=get_E(pvalues_dict,fc_dict)
e