In [1]:
import pandas as pd
import numpy as np
import math

# 1. Read data and make them easier to understand

## 1.1. GSE2034

In [2]:
raw_gse2034_df = pd.read_csv('Data/GSE2034.zip', compression='zip', header=0, sep='\t', quotechar='"') # Breast cancer
gse2034_df=raw_gse2034_df.copy()

# preprocess dataset
gse2034_df[['Gene','KEGG-ID']] = gse2034_df['Class'].str.split('#',expand=True)
gse2034_df.drop('Class', inplace=True, axis=1)
cols = gse2034_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
gse2034_df=gse2034_df[cols]

#gse2034_df

In [3]:
# Change column names (estrogen receptor)
labels=gse2034_df.columns[2:]
for x in range(len(labels)):
    if(labels[x].startswith('ERpos')):
        labels.values[x]="ERpos"
    elif(labels[x].startswith('ERneg')):
        labels.values[x]="ERneg"

In [4]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary 
# where the keys are the genes and the values are the corresponding KEGG-IDs
# 'a gene can be mapped to more than one Entrez identifier'
gene_list=sorted(set(gse2034_df['Gene'].tolist()))
gene_dict={}
for i in gene_list:
    tmp=gse2034_df.loc[gse2034_df['Gene'] == i]['KEGG-ID']
    tmp_list=[]
    for t in tmp:
        tmp_list.append(t)
    gene_dict.update({i:tmp_list})
#print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [5]:
# transpose dataframe so that the columns indicate the genes
# and rows correspond to samples (class: ERpos or ERneg)
genes=gse2034_df['Gene']
gse2034_df.drop('KEGG-ID', inplace=True, axis=1)
gse2034_df=np.transpose(gse2034_df.iloc[:,1:])
gse2034_df.columns=genes.values.tolist()
#gse2034_df

In [6]:
# Because one gene might correspond to more than one KEGG-IDs, we calculate the average (or max)
# value and get the following simplified dataframe
gse2034_df=gse2034_df.groupby(level=0,axis=1).mean()
gse2034_df

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,6520.9,112.5,189.8,2083.3,145.8,802.8,278.4,28.3,449.0,122.9,...,211.8,1413.2,19474.9,408.7,2021.0,6087.7,16.1,79.3,274.5,247.8
ERpos,5285.7,178.4,269.7,1203.4,42.5,557.5,183.3,56.4,101.9,85.9,...,3713.7,11882.3,26784.7,9109.6,20412.1,4264.5,96.3,837.3,937.3,2297.5
ERneg,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,90.7,...,16474.5,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1
ERpos,4263.6,417.7,327.1,1043.3,69.2,653.2,185.8,21.8,3629.3,96.0,...,9506.9,30323.1,29313.6,8671.0,22684.5,3952.3,36.6,969.1,656.3,141.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,3066.9,265.5,347.9,1127.4,47.0,583.9,138.2,16.1,577.3,44.7,...,2939.6,12251.5,24463.5,5378.0,14403.8,3991.3,46.7,399.2,329.5,413.1
ERpos,2773.0,209.8,226.7,1071.8,45.1,859.9,121.0,24.8,935.6,78.8,...,2621.4,13415.3,14976.3,1593.7,5722.3,1361.4,11.1,143.1,281.9,231.2
ERpos,2984.3,160.0,252.9,1178.5,146.3,664.2,183.3,49.6,443.5,74.0,...,2330.7,11946.3,22092.0,3503.0,9549.3,1917.7,33.1,119.7,295.9,269.2
ERneg,3540.0,285.7,135.1,1256.7,75.9,603.1,125.0,72.9,73.5,126.1,...,9934.4,22759.9,15283.2,6157.2,12799.3,1880.2,8.4,533.8,491.1,113.0


## 1.2. Selected
Cellular processes (15), Signal transduction (Environmental information process) (24), Cancer overview (8).

In [7]:
raw_selected_df = pd.read_csv('Data/Selected.zip', compression='zip', header=0, sep='\t', quotechar='"')[['SubPathID']]
#raw_selected_df

In [8]:
# Two types of nodes relations
relations_dict={'Activation':'-->','Inhibition':'--|'}
#relations_dict

In [9]:
from itertools import chain

# Split each pathway based on the relation
def split_path(data,relation):
    s=[]
    cnt=len(data.split(relations_dict[relation]))
    cnt_tmp=1
    # If there is at least one relation, then split data
    if(cnt>0):
        for e in data.split(relations_dict[relation]):
            if e:
                s.append(e)
                # Remove the final relation
                if(cnt_tmp<cnt):
                    s.append(relations_dict[relation])
                cnt_tmp+=1
    return s

# Make the selected dataset easier to understand, by splitting each pathway based on their nodes and edges
def get_pathway(data):
    s=split_path(data,'Activation')

    for i in range(len(s)):
        tmp_s=split_path(s[i],'Inhibition')
        if(len(s[i])>1):
            s[i]=tmp_s
            
    return list(chain.from_iterable(s))

subpaths_list=[get_pathway(row) for row in raw_selected_df['SubPathID']]
#subpaths_list

In [10]:
selected_df=pd.DataFrame(subpaths_list).fillna(value=np.nan) # Rows: pathways, Cols: edges and nodes
selected_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,208200_at#hsa:3552 210118_s_at#hsa:3552 205067...,-->,202948_at#hsa:3554 215561_s_at#hsa:3554 205403...,-->,203901_at#hsa:10454,-->,206853_s_at#hsa:6885 206854_s_at#hsa:6885 2115...,-->,209666_s_at#hsa:1147 209341_s_at#hsa:3551 2093...,,...,,,,,,,,,,
1,208438_s_at#hsa:2268 208018_s_at#hsa:3055 2026...,-->,207821_s_at#hsa:5747 208820_at#hsa:5747,,,,,,,,...,,,,,,,,,,
2,207072_at#hsa:8807 206618_at#hsa:8809,-->,209239_at#hsa:4790 201783_s_at#hsa:5970 209878...,,,,,,,,...,,,,,,,,,,
3,206109_at#hsa:2523 208505_s_at#hsa:2524 210608...,-->,214088_s_at#hsa:2525 216010_x_at#hsa:2525,,,,,,,,...,,,,,,,,,,
4,213464_at#hsa:25759 noProbe#hsa:399694 206330_...,-->,215075_s_at#hsa:2885,-->,207112_s_at#hsa:2549 214987_at#hsa:2549,-->,220566_at#hsa:23533 204369_at#hsa:5290 212688_...,-->,212607_at#hsa:10000 212609_s_at#hsa:10000 2193...,--|,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41603,206890_at#hsa:3594,-->,206118_at#hsa:6775,-->,207849_at#hsa:3558,,,,,,...,,,,,,,,,,
41604,221271_at#hsa:59067,-->,219971_at#hsa:50615 221658_s_at#hsa:50615,-->,208991_at#hsa:6774 208992_s_at#hsa:6774,-->,210426_x_at#hsa:6095 210479_s_at#hsa:6095,-->,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,...,,,,,,,,,,
41605,219971_at#hsa:50615 221658_s_at#hsa:50615,-->,208991_at#hsa:6774 208992_s_at#hsa:6774,-->,210426_x_at#hsa:6095 210479_s_at#hsa:6095,-->,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,,,...,,,,,,,,,,
41606,216857_at#hsa:51561 217326_x_at#hsa:51561 2173...,-->,206890_at#hsa:3594,-->,208991_at#hsa:6774 208992_s_at#hsa:6774,-->,221271_at#hsa:59067,-->,219971_at#hsa:50615 221658_s_at#hsa:50615,,...,,,,,,,,,,


# 2. Methods

## 2.1. PRS
Each node in a pathway has three attributes: Node_genes, Node_value (NV), Node_weight (NW)
### 2.1.1.  Development of the PRS algorithm

#### 2.1.1.1. Node_genes

In [11]:
# Each node in a pathway represents a discrete function mapping to one or more transcript.
# Returns a dictionary corresponding each node of each pathway to its gene or genes.
def Node_genes(df):
    Node_genes={}
    for path in range(df.shape[0]):
        tmp_node=[]
        for node in range(0,len(df.iloc[path][~df.iloc[path].isnull()]),2):
            genes=list(filter(None,[x.strip() for x in df.iloc[path,node].split(' ')]))
            tmp_genes=[]
            for g in genes:
                tmp_genes.append(list(filter(None,[x.strip() for x in g.split('#')]))[0])
            tmp_node.append(tmp_genes)
        Node_genes.update({path:tmp_node})
    return Node_genes

node_genes=Node_genes(selected_df)
node_genes

{0: [['208200_at', '210118_s_at', '205067_at', '39402_at'],
  ['202948_at', '215561_s_at', '205403_at', '211372_s_at'],
  ['203901_at'],
  ['206853_s_at', '206854_s_at', '211536_x_at', '211537_x_at'],
  ['209666_s_at',
   '209341_s_at',
   '209342_s_at',
   '211027_s_at',
   '209929_s_at',
   '36004_at']],
 1: [['208438_s_at',
   '208018_s_at',
   '202625_at',
   '202626_s_at',
   '210754_s_at',
   '213324_at',
   '221281_at',
   '221284_s_at'],
  ['207821_s_at', '208820_at']],
 2: [['207072_at', '206618_at'], ['209239_at', '201783_s_at', '209878_s_at']],
 3: [['206109_at', '208505_s_at', '210608_s_at'],
  ['214088_s_at', '216010_x_at']],
 4: [['213464_at', 'noProbe', '206330_s_at', '201469_s_at', '214853_s_at'],
  ['215075_s_at'],
  ['207112_s_at', '214987_at'],
  ['220566_at',
   '204369_at',
   '212688_at',
   '217620_s_at',
   '203879_at',
   '211230_s_at',
   '206369_s_at',
   '206370_at',
   '212239_at',
   '212240_s_at',
   '212249_at',
   '207105_s_at',
   '202743_at',
   '2115

#### 2.1.1.2. Node_value (NV)

In [12]:
# Calculate fold change for each gene
def fold_change(df):
    fc_dict={}
    
    for gene in df.columns:
        # Get average value for each case
        erneg_av=gse2034_df[gene]['ERneg'].mean()
        erpos_av=gse2034_df[gene]['ERpos'].mean()
        
        # Calculate fold change (B/A)
        cur_fc=erneg_av/erpos_av
        fc_dict.update({gene:cur_fc})
        
    return fc_dict        
    
fc_dict=fold_change(gse2034_df)

In [13]:
gene_df=pd.DataFrame.from_dict(fc_dict,orient='index',columns=['Fold Change'])
gene_df=gene_df.sort_values(['Fold Change'])
gene_df

Unnamed: 0,Fold Change
205509_at,0.039829
220922_s_at,0.066133
205358_at,0.104891
205009_at,0.112513
205225_at,0.112553
...,...
213711_at,11.271291
205029_s_at,12.215422
209720_s_at,13.942993
215729_s_at,14.203339


In [14]:
# Each node is assigned a value derived from expression data. The following values are assigned to the node: 0 if the 
# corresponding gene or genes are not expressed, 1 if they are expressed but remain unchanged (non-significant), or the
# maximum fold-change value if one or more of the mapped transcripts is above threshold.
def expressed_genes(fc_list):
    state={'not expressed':0,'expressed':0,'significant':0}
    for x in fc_list:
        if(x<=0):
            state['not expressed']+=1
        elif(x>0 and x<threshold):
            state['expressed']+=1
        else:
            state['significant']+=1
            
    if(state['significant']>0):
        return max(fc_list)
    else:
        if(max(state, key=state.get)=='not expressed'):
            return 0
        else:
            return 1

def Node_value(node_genes):
    Node_value={}
    
    for path in node_genes:
        cur_path=[]
        
        for node in node_genes[path]:
            cur_node=[]
            
            for gene in node:
                # Check if gene is 'noProbe' ot not in genes' list
                if(gene=='noProbe'):
                    cur_node.append(0)
                    continue
                elif(not(gene in gene_dict.keys())):
                    cur_node.append(0)
                    continue
                
                cur_node.append(gene_df.loc[gene]['Fold Change'])
            
            # Call function and calculate new value of node
            cur_path.append(expressed_genes(cur_node))
            
        Node_value.update({path:cur_path})
            
    return Node_value
        
threshold=1.5   
node_value=Node_value(node_genes)
node_value

{0: [1, 3.4701014265402987, 1, 1, 1],
 1: [2.0255223098296264, 1],
 2: [1, 1],
 3: [1, 2.1523184421579074],
 4: [1, 1, 1, 1.6996846097881093, 1, 1],
 5: [1.64446003519234, 1.5502775822972865],
 6: [1, 1, 1.5233155903608468, 1, 1],
 7: [1, 0],
 8: [1, 1, 1, 2.432881019230585, 1.6521001183821733, 1, 1],
 9: [1, 1],
 10: [1, 1],
 11: [1, 1, 1, 1, 1],
 12: [1.6732888523505396, 1],
 13: [1, 1, 1, 1],
 14: [1, 1],
 15: [1, 1, 1, 1, 1],
 16: [1, 1],
 17: [1, 1],
 18: [1, 1, 1, 1],
 19: [1, 1],
 20: [1, 1, 1, 1],
 21: [1, 1, 1, 1.7350271357115432],
 22: [1, 1, 1],
 23: [1, 1],
 24: [1.6029637020551817, 1, 1, 1.5233155903608468],
 25: [1, 1.6996846097881093, 1],
 26: [2.176067863326125, 1, 1],
 27: [1, 2.0210994881841473, 3.1540668393481206],
 28: [1, 1],
 29: [2.270107226265169, 1],
 30: [1, 1, 1, 2.432881019230585],
 31: [1, 1],
 32: [1, 1],
 33: [1.7558593707214387, 1],
 34: [1, 2.71316200441204, 1, 1.925558615892236, 1],
 35: [2.353781807093476, 1, 1.5233155903608468],
 36: [1, 1, 1, 1, 1],

#### 2.1.1.3. Node_weight (NW)

In [61]:
# Generate each sub-path in the form of a graph (start_node,next_node).
def get_graph(node_value):
    path_graph={}
    for path in node_value:
        graph=[]
        for n in range(len(node_value[path])-1):
            graph.append([node_value[path][n],node_value[path][n+1]])
        path_graph.update({path:graph})
    return path_graph
            
graphs=get_graph(node_value)
graphs

{0: [[1, 3.4701014265402987], [3.4701014265402987, 1], [1, 1], [1, 1]],
 1: [[2.0255223098296264, 1]],
 2: [[1, 1]],
 3: [[1, 2.1523184421579074]],
 4: [[1, 1], [1, 1], [1, 1.6996846097881093], [1.6996846097881093, 1], [1, 1]],
 5: [[1.64446003519234, 1.5502775822972865]],
 6: [[1, 1], [1, 1.5233155903608468], [1.5233155903608468, 1], [1, 1]],
 7: [[1, 0]],
 8: [[1, 1],
  [1, 1],
  [1, 2.432881019230585],
  [2.432881019230585, 1.6521001183821733],
  [1.6521001183821733, 1],
  [1, 1]],
 9: [[1, 1]],
 10: [[1, 1]],
 11: [[1, 1], [1, 1], [1, 1], [1, 1]],
 12: [[1.6732888523505396, 1]],
 13: [[1, 1], [1, 1], [1, 1]],
 14: [[1, 1]],
 15: [[1, 1], [1, 1], [1, 1], [1, 1]],
 16: [[1, 1]],
 17: [[1, 1]],
 18: [[1, 1], [1, 1], [1, 1]],
 19: [[1, 1]],
 20: [[1, 1], [1, 1], [1, 1]],
 21: [[1, 1], [1, 1], [1, 1.7350271357115432]],
 22: [[1, 1], [1, 1]],
 23: [[1, 1]],
 24: [[1.6029637020551817, 1], [1, 1], [1, 1.5233155903608468]],
 25: [[1, 1.6996846097881093], [1.6996846097881093, 1]],
 26: [[2.1

In [16]:
# All significant (above-threshold) nodes were assigned a weighting that reflected
# their topological strength (i.e., the number of significant downstream nodes that are pointed to, either
# directly or via other significant nodes).
# An initiating child node, ni, was ignored if non-significant, and the algorithm proceeds to the next child.
# Otherwise, we increase the weight counter by 1 and look for children of this node
# and so on. All non-significant nodes have NW = 0.

def sign_children(graph,weight=0):
    if(len(graph)==0):
        return weight
    if(graph[0]<threshold):
        return sign_children(graph[1:],weight)
    else:
        return sign_children(graph[1:],weight+1) # Increase weight, if the current node/child is significant

node_weight={}
for path in node_value:
    cur_node=[]
    cur_weight=0
    for n in range(len(node_value[path])):
        cur_weight=sign_children(node_value[path][n:])
        cur_node.append(cur_weight)  
    node_weight.update({path:cur_node})
node_weight

{0: [1, 1, 0, 0, 0],
 1: [1, 0],
 2: [0, 0],
 3: [1, 1],
 4: [1, 1, 1, 1, 0, 0],
 5: [2, 1],
 6: [1, 1, 1, 0, 0],
 7: [0, 0],
 8: [2, 2, 2, 2, 1, 0, 0],
 9: [0, 0],
 10: [0, 0],
 11: [0, 0, 0, 0, 0],
 12: [1, 0],
 13: [0, 0, 0, 0],
 14: [0, 0],
 15: [0, 0, 0, 0, 0],
 16: [0, 0],
 17: [0, 0],
 18: [0, 0, 0, 0],
 19: [0, 0],
 20: [0, 0, 0, 0],
 21: [1, 1, 1, 1],
 22: [0, 0, 0],
 23: [0, 0],
 24: [2, 1, 1, 1],
 25: [1, 1, 0],
 26: [1, 0, 0],
 27: [2, 2, 1],
 28: [0, 0],
 29: [1, 0],
 30: [1, 1, 1, 1],
 31: [0, 0],
 32: [0, 0],
 33: [1, 0],
 34: [2, 2, 1, 1, 0],
 35: [2, 1, 1],
 36: [0, 0, 0, 0, 0],
 37: [0, 0],
 38: [0, 0, 0, 0],
 39: [1, 1, 1],
 40: [0, 0],
 41: [1, 0, 0, 0],
 42: [0, 0],
 43: [1, 0, 0],
 44: [0, 0, 0, 0, 0, 0],
 45: [0, 0, 0],
 46: [0, 0, 0],
 47: [0, 0],
 48: [1, 1, 1],
 49: [2, 1, 0],
 50: [0, 0, 0],
 51: [0, 0],
 52: [0, 0, 0],
 53: [1, 1, 0],
 54: [0, 0, 0],
 55: [0, 0],
 56: [0, 0, 0, 0, 0, 0],
 57: [0, 0],
 58: [1, 1, 1, 1, 1],
 59: [1, 1],
 60: [0, 0, 0, 0],
 61:

#### 2.1.1.4. Node_score (NS)
NV and NW values are combined to calculate a Node_Score (NS)

In [17]:
def Node_score(NV,NW):
    node_score={}
    for path in NV:
        cur_path=[]
        for n in range(len(NV[path])):
            if(NV[path][n]>1):
                cur_path.append(NV[path][n]*NW[path][n])
            else:
                cur_path.append(0)
        node_score.update({path:cur_path})
    
    return node_score
node_score=Node_score(node_value,node_weight)
node_score

{0: [0, 3.4701014265402987, 0, 0, 0],
 1: [2.0255223098296264, 0],
 2: [0, 0],
 3: [0, 2.1523184421579074],
 4: [0, 0, 0, 1.6996846097881093, 0, 0],
 5: [3.28892007038468, 1.5502775822972865],
 6: [0, 0, 1.5233155903608468, 0, 0],
 7: [0, 0],
 8: [0, 0, 0, 4.86576203846117, 1.6521001183821733, 0, 0],
 9: [0, 0],
 10: [0, 0],
 11: [0, 0, 0, 0, 0],
 12: [1.6732888523505396, 0],
 13: [0, 0, 0, 0],
 14: [0, 0],
 15: [0, 0, 0, 0, 0],
 16: [0, 0],
 17: [0, 0],
 18: [0, 0, 0, 0],
 19: [0, 0],
 20: [0, 0, 0, 0],
 21: [0, 0, 0, 1.7350271357115432],
 22: [0, 0, 0],
 23: [0, 0],
 24: [3.2059274041103634, 0, 0, 1.5233155903608468],
 25: [0, 1.6996846097881093, 0],
 26: [2.176067863326125, 0, 0],
 27: [0, 4.042198976368295, 3.1540668393481206],
 28: [0, 0],
 29: [2.270107226265169, 0],
 30: [0, 0, 0, 2.432881019230585],
 31: [0, 0],
 32: [0, 0],
 33: [1.7558593707214387, 0],
 34: [0, 5.42632400882408, 0, 1.925558615892236, 0],
 35: [4.707563614186952, 0, 1.5233155903608468],
 36: [0, 0, 0, 0, 0],
 

#### 2.1.1.5. PRS

In [18]:
def PRS(NS):
    prs={}
    for path in NS:
        cur_sum=0
        for n in range(len(NS[path])):
            cur_sum+=NS[path][n]
        prs.update({path:cur_sum})
    return prs

prs_df=pd.DataFrame(list(PRS(node_score).values()), columns=['PRS'])
prs_df

Unnamed: 0,PRS
0,3.470101
1,2.025522
2,0.000000
3,2.152318
4,1.699685
...,...
41603,0.000000
41604,1.631178
41605,1.631178
41606,1.631178


### 2.1.2. Normalizing pathway scores
A normalization step is required to control for two key features: (i)
pathway size and (ii) statistical bias contributed by pathway-specific PRS score null distributions.

#### 2.1.2.1.  Pathway size 
Multiply each PRS score by the ratio of the number of DEGs (NDEGs) in a pathway to the total number of expressed genes (NEGs)

In [19]:
def path_size_PRS(prs):
    new_prs={}
    for path in range(prs.shape[0]):
        NDEGs=len([element for element in node_value[path] if element > threshold])
        NEGs=len([element for element in node_value[path] if element > 0])
        if(NEGs==0):
            cur_prs=0
        else:
            cur_prs=prs.iloc[path]['PRS']*(NDEGs/NEGs)
        new_prs.update({path:cur_prs})
    return new_prs

new_prs_df=pd.DataFrame(list(path_size_PRS(prs_df).values()),columns=['PRS'])
new_prs_df

Unnamed: 0,PRS
0,0.694020
1,1.012761
2,0.000000
3,1.076159
4,0.283281
...,...
41603,0.000000
41604,0.326236
41605,0.407795
41606,0.326236


#### 2.1.2.2. Statistical bias contributed by pathway-specific PRS score null distributions
....

## 2.2. MinePath

### 2.2.1. Discretization of gene expression values
Transform gene expression values into high (expressed / up-regulated) or low (not-expressed / down-regulated) gene expression binary equivalents

#### 2.2.1.1. The expression values of a gene over the total number of input samples are sorted in descending order;

In [20]:
genes_dis_df = pd.DataFrame(np.nan, index=gse2034_df.columns, columns=['Expression value'])
genes_dis_df['Expression value']=gse2034_df.sum()/gse2034_df.shape[0] # sum of expression values of a gene / total number of samples
genes_dis_df=genes_dis_df.sort_values(by=['Expression value'],ascending=False)
genes_dis_df

Unnamed: 0,Expression value
212869_x_at,51450.534615
208834_x_at,50993.236364
207783_x_at,50961.720979
200095_x_at,50559.692657
208825_x_at,50135.258392
...,...
221721_s_at,4.913636
207397_s_at,4.841958
208088_s_at,4.773427
220919_s_at,3.655944


#### 2.2.1.2. The midpoints between each two consecutive values are calculated;

In [21]:
def midpoint(num1,num2):
    return (num1+num2)/2

midpoints_dict={} # Key corresponds to the position of the first gene. The second gene is in the next position (i+1).
for i in range(genes_dis_df.shape[0]-1):
    midpoints_dict.update({i:midpoint(genes_dis_df.iloc[i]['Expression value'],genes_dis_df.iloc[i+1]['Expression value'])})

midpoints_dict

{0: 51221.88548951049,
 1: 50977.478671328674,
 2: 50760.70681818182,
 3: 50347.47552447552,
 4: 49790.438111888114,
 5: 49361.35856643357,
 6: 49115.87062937062,
 7: 47758.42954545454,
 8: 45414.589685314684,
 9: 43913.81625874126,
 10: 43452.01695804196,
 11: 43070.28391608392,
 12: 42796.815734265736,
 13: 42735.23758741259,
 14: 42635.55367132867,
 15: 42141.14807692307,
 16: 41521.927622377625,
 17: 41129.12132867133,
 18: 40866.69300699301,
 19: 40791.69667832168,
 20: 40085.94807692307,
 21: 39246.8472027972,
 22: 39022.19230769231,
 23: 38931.84055944056,
 24: 38832.02919580419,
 25: 38527.90786713287,
 26: 38209.13723776224,
 27: 37872.88129370629,
 28: 37237.05786713286,
 29: 36603.90419580419,
 30: 36144.67517482517,
 31: 35936.633566433564,
 32: 35626.45472027972,
 33: 35139.13251748252,
 34: 34742.13059440559,
 35: 34463.339685314684,
 36: 34318.382342657336,
 37: 34162.81538461538,
 38: 34083.87062937063,
 39: 33940.811888111886,
 40: 33791.63409090908,
 41: 33772.9674825

#### 2.2.1.3. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S,μi) to denote the IG of the system for midpoint μi.

In [22]:
classes=sorted(set(gse2034_df.index)) # Τhe classes to which a sample may belong
samples=gse2034_df.index # the samples class

# Calculate the proportion of samples in S that belong in Class C
def P(C,S):
    return list(S).count(C)/len(S)

def E(S,m=1):
    # m not given: calculate the entropy of the system taking into account the prior assignment of sample cases into phenotype classes
    # m given: calculate the respective entropy of the system taking into account its division into subgroups around midpoint μi
    tmp=0
    for c in classes:
        # P(c,S) must be greater than zero
        tmp+=P(c,S)*math.log(P(c,S))/m
    return -(tmp)

entropy=E(samples)
print('Dataset Entropy: %.3f bits' % entropy)

Dataset Entropy: 0.582 bits


In [23]:
# Calculate the Information Gain (IG) of the system
def IG(S,m):
    return E(S)-E(S,m)

information_gain=[]
for m in list(midpoints_dict.values()):
    information_gain.append(IG(samples,m))
print('Information Gain: '+str(information_gain))

Information Gain: [0.5824808719442691, 0.5824808174225486, 0.5824807686262339, 0.5824806744420477, 0.582480545007305, 0.5824804433134041, 0.5824803843325099, 0.5824800472476458, 0.5824794177811043, 0.582478979443527, 0.5824788384717101, 0.5824787196591852, 0.5824786332404793, 0.5824786136286013, 0.5824785817604252, 0.5824784214745501, 0.5824782153396294, 0.5824780813591134, 0.582477990413465, 0.5824779642081802, 0.5824777128018296, 0.5824774021260575, 0.5824773166804792, 0.5824772820379089, 0.5824772435809344, 0.5824771251755332, 0.582476999043386, 0.5824768636914291, 0.5824766010743506, 0.5824763304938433, 0.5824761283094001, 0.5824760350143435, 0.5824758938931227, 0.5824756671455206, 0.5824754777214028, 0.5824753420916576, 0.5824752707002087, 0.5824751934095437, 0.5824751539173855, 0.5824750818840688, 0.5824750061200652, 0.5824749965926049, 0.5824749818422327, 0.5824749070618054, 0.5824747341911618, 0.5824745380936865, 0.5824744332821313, 0.5824743587205016, 0.5824742357334811, 0.582

In [24]:
# The midpoint with the highest information gain is selected as the discretization point
max_value = max(information_gain)
max_mid_pos = information_gain.index(max(information_gain))
dis_point=midpoints_dict.get(max_mid_pos)
print('Discretization point: %.3f' %dis_point)

Discretization point: 51221.885


#### 2.2.1.4. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values bigger that the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [25]:
gse2034_dis_df=gse2034_df.copy()
gse2034_dis_df[gse2034_dis_df<dis_point]=0 # under-expressed
gse2034_dis_df[gse2034_dis_df>=dis_point]=1 # over-expressed
gse2034_dis_df=gse2034_dis_df.astype('int')
gse2034_dis_df

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERneg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERneg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2.2.2. Functional sub-paths: Matching sub-paths with gene expression profiles

In [26]:
import statistics

# Get the nodes of each sub-path in binary form
gene_expression_profile_df=selected_df.copy()
expr_prof_tmp={}
for row in range(gene_expression_profile_df.shape[0]):
    row_tmp=[]
    path_tmp=gene_expression_profile_df.iloc[row][~gene_expression_profile_df.iloc[row].isnull()]
    expr_prof_tmp[row]=[]
    
    for i in path_tmp[::2]:
        tmp_node_genes=[(g.split('#')) for g in list(filter(None,i.split(' ')))] # Get genes of node
        tmp_node_genes=list(filter(None, tmp_node_genes))
        tmp_expr_vals=[]
        
        for n in tmp_node_genes:
            # Check if gene exists in gse2034 dataset or KEGG-ID with noProbe correspondes to specific gene from gse2034 dataset
            if(n[0]=='noProbe'):
                if(not(n[1] in gene_dict.values())):
                    tmp_expr_vals.append((np.zeros(gse2034_dis_df.shape[0])).astype(int))
                    continue
            if(len(n)==1):
                if(not(n in gene_dict.values())):
                    tmp_expr_vals.append((np.zeros(gse2034_dis_df.shape[0])).astype(int))
                    continue
            tmp_expr_vals.append(list(gse2034_dis_df[n[0]]))
    
        if(len(tmp_expr_vals)>0):
            expr_prof_tmp[row].append((np.transpose(tmp_expr_vals)).max(axis=1))
              
expr_prof_tmp

{0: [array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
  array([0, 0, 0, 0,

In [27]:
# Replace genes in pathway with their samples binary value (max value in case of multiple genes in node)
for row in range(gene_expression_profile_df.shape[0]):
    for column in range(0,gene_expression_profile_df.iloc[row][~gene_expression_profile_df.iloc[row].isnull()].shape[0],2):
        # Since nodes are in the even columns, the correspondence with the positions of the table expr_prof_tmp of consecutive 
        # positions is calculated as follows: for each column c, we get c/2 (even number/2= even number)
        gene_expression_profile_df.iat[row,column]=expr_prof_tmp[row][int(column/2)]
        
gene_expression_profile_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,...,,,,,,,,,,
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,,,...,,,,,,,,,,
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,,,...,,,,,,,,,,
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,,,...,,,,,,,,,,
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",--|,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41603,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,...,,,,,,,,,,
41604,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,...,,,,,,,,,,
41605,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,...,,,,,,,,,,
41606,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-->,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,...,,,,,,,,,,


In [28]:
# The following functions compute the 'and' and 'xor' boolean operations
def and_boolean_op(num1,num2):
    result=[]
    for n in range(len(num1)):
        result.append(num1[n]*num2[n])
    return result
    
def xor_boolean_op(num1,num2):
    result=[]
    for n in range(len(num1)):
        result.append(1 if(num1[n] and not num2[n]) or (not num1[n] and num2[n]) else 0)
    return result

operations_dict={'Activation':and_boolean_op,'Inhibition':xor_boolean_op}

In [29]:
# Calculate the pathway expression with boolean operations
def calc_pathway_expression(path,prev_result):
    if(len(path)>0):
        t=0
        relation=list(relations_dict.keys())[list(relations_dict.values()).index(path.iloc[0])] # Get the current edge type
        next_node=path.iloc[1]
        result=operations_dict[relation](prev_result,next_node)
        calc_pathway_expression(path.iloc[2:].reset_index(drop=True),result)
    return prev_result

results=[]
for row in range(gene_expression_profile_df.shape[0]):
    tmp_path=gene_expression_profile_df.iloc[row][~gene_expression_profile_df.iloc[row].isnull()]
    results.append(calc_pathway_expression(tmp_path.iloc[1:].reset_index(drop=True),tmp_path[0])) 

In [30]:
# Binary sub-path expression matrix
binary_expression_df=pd.DataFrame(results,index=list(gene_expression_profile_df.index),columns=labels)
binary_expression_df

Unnamed: 0,ERpos,ERpos.1,ERpos.2,ERneg,ERpos.3,ERpos.4,ERpos.5,ERpos.6,ERpos.7,ERpos.8,...,ERneg.1,ERneg.2,ERneg.3,ERneg.4,ERpos.9,ERpos.10,ERpos.11,ERpos.12,ERneg.5,ERpos.13
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# To make it easier to use, the resulting DataFrame is copied to a new one with the method name.
minepath_df=binary_expression_df.copy()
minepath_df

Unnamed: 0,ERpos,ERpos.1,ERpos.2,ERneg,ERpos.3,ERpos.4,ERpos.5,ERpos.6,ERpos.7,ERpos.8,...,ERneg.1,ERneg.2,ERneg.3,ERneg.4,ERpos.9,ERpos.10,ERpos.11,ERpos.12,ERneg.5,ERpos.13
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41603,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2.3. TAPPA

### 2.3.1. Pathway connectivity index
The molecular connectivity index is a widely used topological descriptor of chemical compounds and has been successfully used in many other fields, including protein structure and drug discovery.

#### 2.3.1.1. Adjacency matrix

In [32]:
# The adjacency matrix is defined as A=(a_ij), where a_ij=1 if i=j or (g_i, g_j) belongs to E and a_ij=0 if (g_i, g_j) does 
# not belong to E.
def adjacency_matrix(nodes):
    tmp_adj=[]
    
    #i=j -> a_ij=1
    for i in range(len(nodes)):
        tmp_adj.append([0]*len(nodes))
        for j in range(len(nodes)):
            if(i==j):
                tmp_adj[i][j]=1
                continue
                
    # (g_i,g_j) belongs to E (current sub-paths are linear) -> a_ij=1          
    for i in range(len(nodes)-1):
        tmp_adj[i][i+1]=1
        tmp_adj[i+1][i]=1
            
    return tmp_adj
    
adjacency_matrices={}
for path in node_genes:
    adjacency_matrices.update({path:adjacency_matrix(node_genes[path])})
adjacency_matrices

{0: [[1, 1, 0, 0, 0],
  [1, 1, 1, 0, 0],
  [0, 1, 1, 1, 0],
  [0, 0, 1, 1, 1],
  [0, 0, 0, 1, 1]],
 1: [[1, 1], [1, 1]],
 2: [[1, 1], [1, 1]],
 3: [[1, 1], [1, 1]],
 4: [[1, 1, 0, 0, 0, 0],
  [1, 1, 1, 0, 0, 0],
  [0, 1, 1, 1, 0, 0],
  [0, 0, 1, 1, 1, 0],
  [0, 0, 0, 1, 1, 1],
  [0, 0, 0, 0, 1, 1]],
 5: [[1, 1], [1, 1]],
 6: [[1, 1, 0, 0, 0],
  [1, 1, 1, 0, 0],
  [0, 1, 1, 1, 0],
  [0, 0, 1, 1, 1],
  [0, 0, 0, 1, 1]],
 7: [[1, 1], [1, 1]],
 8: [[1, 1, 0, 0, 0, 0, 0],
  [1, 1, 1, 0, 0, 0, 0],
  [0, 1, 1, 1, 0, 0, 0],
  [0, 0, 1, 1, 1, 0, 0],
  [0, 0, 0, 1, 1, 1, 0],
  [0, 0, 0, 0, 1, 1, 1],
  [0, 0, 0, 0, 0, 1, 1]],
 9: [[1, 1], [1, 1]],
 10: [[1, 1], [1, 1]],
 11: [[1, 1, 0, 0, 0],
  [1, 1, 1, 0, 0],
  [0, 1, 1, 1, 0],
  [0, 0, 1, 1, 1],
  [0, 0, 0, 1, 1]],
 12: [[1, 1], [1, 1]],
 13: [[1, 1, 0, 0], [1, 1, 1, 0], [0, 1, 1, 1], [0, 0, 1, 1]],
 14: [[1, 1], [1, 1]],
 15: [[1, 1, 0, 0, 0],
  [1, 1, 1, 0, 0],
  [0, 1, 1, 1, 0],
  [0, 0, 1, 1, 1],
  [0, 0, 0, 1, 1]],
 16: [[1, 1], [1, 1]],


#### 2.3.1.2. Define PCI
Assuming that xis is the normalized log expression measurement for gene i in sample s.

In [33]:
# Each column expression values are normalized to zero mean.
norm_gse2034_df=(gse2034_df-gse2034_df.mean())/gse2034_df.std()

# Further normalize to (-0.5,0.5) with Sigmoid function (Sigmoid (x_is) - 0.5) to lower the effects of extremely large/small 
# values for gene i in sample s.
def sigmoid(df):
    x=[]
    for sample in range(df.shape[0]):
        x.append([])
        for gene in range(df.shape[1]):
            x[sample].append(1 / (1 + math.exp(-df.iloc[sample,gene])))
    return x

sig_gse2034_df=pd.DataFrame(sigmoid(norm_gse2034_df),columns=norm_gse2034_df.columns,index=norm_gse2034_df.index)-0.5
sig_gse2034_df

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,0.026481,-0.010711,-0.060899,-0.091904,-0.220023,-0.143070,-0.062577,-0.211593,-0.129022,-0.241819,...,0.084611,0.034529,-0.075119,0.149744,0.108078,-0.016905,-0.095245,-0.027978,-0.024182,0.168053
ERpos,0.410582,-0.253873,-0.090359,0.456061,0.429842,0.101784,0.293155,-0.113271,-0.107576,0.235619,...,-0.250255,-0.347822,-0.073515,-0.221092,-0.294594,0.367303,-0.151869,-0.198418,-0.107568,-0.174789
ERpos,0.285372,-0.124783,0.012059,0.062604,-0.148707,-0.137563,-0.036951,0.073547,-0.159924,-0.007574,...,-0.095417,-0.133610,0.136511,0.228619,0.300936,0.181749,0.187165,0.202588,0.320156,0.489560
ERneg,0.066627,0.321647,0.066928,-0.016614,0.305397,-0.127372,-0.019749,-0.022263,-0.036013,0.026572,...,0.401483,0.422221,0.039165,0.242894,0.250805,0.072469,0.084809,0.282225,0.338985,0.170677
ERpos,0.110714,0.345961,0.085511,-0.065343,0.053083,-0.045713,-0.027185,-0.153913,0.336438,0.063975,...,0.188700,0.319501,0.201940,0.208882,0.349582,0.138949,-0.066059,0.260449,0.165694,-0.231863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,-0.132114,0.073378,0.111384,0.001862,-0.116345,-0.112914,-0.201760,-0.187815,-0.087505,-0.269150,...,-0.132775,-0.123980,0.071577,0.039089,0.121372,0.144435,-0.021910,-0.030603,-0.068299,-0.075418
ERpos,-0.186720,-0.054655,-0.043488,-0.042712,-0.130154,0.154864,-0.255019,-0.135393,-0.030102,-0.057859,...,-0.147708,-0.093011,-0.194934,-0.165079,-0.190060,-0.233880,-0.171600,-0.167464,-0.102353,-0.184105
ERpos,-0.147882,-0.163755,-0.009684,0.042820,0.430845,-0.034824,-0.036951,0.028232,-0.108429,-0.091257,...,-0.161103,-0.131948,0.002607,-0.064845,-0.057049,-0.162406,-0.081152,-0.179005,-0.092424,-0.162574
ERneg,-0.037250,0.118330,-0.156710,0.104215,0.103610,-0.094609,-0.243226,0.177233,-0.164058,0.252957,...,0.207063,0.161011,-0.187258,0.081805,0.063626,-0.167518,-0.182013,0.044682,0.050235,-0.245978


In [34]:
# Get mean value of samples for each gene (x_is)
gene_expression_values=sig_gse2034_df.mean()
gene_expression_df=pd.DataFrame(gene_expression_values.values,index=sig_gse2034_df.columns,columns=['Normalized log expression'])
gene_expression_df

Unnamed: 0,Normalized log expression
1007_s_at,-0.009237
1053_at,-0.008840
117_at,-0.014179
121_at,-0.013235
1255_g_at,-0.006472
...,...
AFFX-HUMISGF3A/M97935_3_at,-0.012377
AFFX-HUMISGF3A/M97935_5_at,-0.019718
AFFX-HUMISGF3A/M97935_MA_at,-0.018688
AFFX-HUMISGF3A/M97935_MB_at,-0.018433


In [35]:
from statistics import mean

# Each node consists of one or more genes, so each node gets the average value.
def get_x(node_genes):
    x={}
    for path in node_genes:
        cur_path=[]
        for node in range(len(node_genes[path])):
            cur_node=[]
            for gene in node_genes[path][node]:
                if(not(gene in gene_expression_df.index)):
                    cur_node.append(0)
                else:
                    cur_node.append(gene_expression_df.loc[gene]['Normalized log expression'])
            cur_path.append(mean(cur_node))
        x.update({path:cur_path})
    return x
        
# Node_genes was initialized on a previous method (PRS)
x=get_x(node_genes)
x

{0: [-0.010911315781879101,
  -0.015934529354636153,
  -0.006924901606123658,
  -0.00921778602462716,
  -0.012076600037593647],
 1: [-0.017946410254766447, -0.015150417618157568],
 2: [-0.009359964320065564, -0.00705131415095385],
 3: [-0.012064266647183838, -0.01800968764236645],
 4: [-0.012095777388089595,
  -0.013897238340624217,
  -0.009010901516007572,
  -0.012209172138024443,
  -0.013244086456672606,
  -0.008822094809217232],
 5: [-0.017523240641817997, -0.011577085542465611],
 6: [-0.00874878487148111,
  -0.01965366550071844,
  -0.014384564687311424,
  -0.009928669442826018,
  -0.015702326627747478],
 7: [-0.00901370289098233, -0.004283883431443533],
 8: [-0.009536422120133229,
  -0.016184406715669935,
  -0.02446313127166314,
  -0.009646647393773683,
  -0.016334131868323512,
  -0.012713746771469525,
  -0.018586967965086562],
 9: [-0.016207697014710958, -0.004352894424758916],
 10: [-0.009843469710336967, -0.01193664318367791],
 11: [-0.02020059614577441,
  -0.012858790935402292,

In [50]:
def PCI(df,x,a):
    pci_dict={}
    for path in range(df.shape[0]):
        cur_path=df.iloc[path][~df.iloc[path].isnull()]
        
        # Number of gene (ignore the edges)
        N=len(cur_path.iloc[::2])  
        
        cur_sum=0
        for i in range(N):
            for j in range(N):
                cur_sum+=np.sign(x[path][i]+x[path][j])*(abs(x[path][i])**0.5)*a[path][i][j]*(abs(x[path][j])**0.5)
        
        pci_dict.update({path:cur_sum})
    return pci_dict

pci=PCI(selected_df,x,adjacency_matrices)
pci_df=pd.DataFrame(pci.values(),columns=['PCI'])
pci_df

{0: -0.13952651781633577,
 1: -0.0660753427034313,
 2: -0.03265936129484838,
 3: -0.059554366359261965,
 4: -0.1856191721114377,
 5: -0.0575866777156073,
 6: -0.1771452254680838,
 7: -0.025725563224662826,
 8: -0.28750306717713126,
 9: -0.03735944782596441,
 10: -0.04345941044920644,
 11: -0.1821401533409876,
 12: -0.0488633170277999,
 13: -0.10089466239864793,
 14: -0.045197099830444565,
 15: -0.18234470146640225,
 16: -0.06755161972838555,
 17: -0.05007443533294818,
 18: -0.11166820045360776,
 19: -0.05898201561984574,
 20: -0.108622067081906,
 21: -0.16241122463508695,
 22: -0.09139830690853207,
 23: -0.03571777384480942,
 24: -0.15314535470915697,
 25: -0.0900972433594974,
 26: -0.10092426100358956,
 27: -0.11888620018511525,
 28: -0.04114472118270591,
 29: -0.06838889858879789,
 30: -0.15519673569058967,
 31: -0.04151484121847469,
 32: -0.04391052228465254,
 33: -0.06810651433536358,
 34: -0.14584402959858322,
 35: -0.09705550135951013,
 36: -0.14923953574434404,
 37: -0.062045470

#### 2.3.1.3. Normalize PCI (divided by the gene number in pathway)

In [58]:
def normalize_PCI(node_genes,pci_df):
    norm_dict={}
    for path in range(pci_df.shape[0]):
        cur_len=len(node_genes[path])
        norm_dict.update({path:pci_df.iloc[path]['PCI']/cur_len})
    return norm_dict

norm_pci_df=pd.DataFrame(normalize_PCI(node_genes,pci_df).values(),columns=['Normalized PCI'])

# To make it easier to use, the resulting DataFrame is copied to a new one with the method name.
tappa_df=norm_pci_df.copy()
tappa_df

Unnamed: 0,Normalized PCI
0,-0.027905
1,-0.033038
2,-0.016330
3,-0.029777
4,-0.030937
...,...
41603,-0.034247
41604,-0.027075
41605,-0.028844
41606,-0.030776


## 2.4. HiPathia