In [1]:
import pandas as pd
import numpy as np
import math

## 1. Preprocess datasets

In [74]:
# Two types of nodes relations
relation_dict={'Activation':'-->','Inhibition':'--|'}
relation_dict

{'Activation': '-->', 'Inhibition': '--|'}

### 1.1. GSE2034

In [132]:
gse2034_raw = pd.read_csv('Data/GSE2034.zip', compression='zip', header=0, sep='\t', quotechar='"')
gse2034=gse2034_raw.copy()

# preprocess dataset
gse2034[['Gene','KEGG-ID']] = gse2034['Class'].str.split('#',expand=True)
gse2034.drop('Class', inplace=True, axis=1)
cols = gse2034.columns.tolist()
cols = cols[-2:] + cols[:-2]
gse2034=gse2034[cols]

gse2034

Unnamed: 0,Gene,KEGG-ID,ERpos,ERpos.1,ERpos.2,ERneg,ERpos.3,ERpos.4,ERpos.5,ERpos.6,...,ERneg.72,ERneg.73,ERneg.74,ERneg.75,ERpos.204,ERpos.205,ERpos.206,ERpos.207,ERneg.76,ERpos.208
0,1007_s_at,hsa:100616237,3848.1,6520.9,5285.7,4043.7,4263.6,2949.8,5498.9,3863.1,...,4058.2,4017.6,2841.0,2914.2,3681.0,3066.9,2773.0,2984.3,3540.0,2620.0
1,1007_s_at,hsa:780,3848.1,6520.9,5285.7,4043.7,4263.6,2949.8,5498.9,3863.1,...,4058.2,4017.6,2841.0,2914.2,3681.0,3066.9,2773.0,2984.3,3540.0,2620.0
2,1053_at,hsa:5982,228.9,112.5,178.4,398.7,417.7,221.2,280.4,198.2,...,183.4,356.1,234.6,169.4,94.5,265.5,209.8,160.0,285.7,180.5
3,117_at,hsa:3310,213.1,189.8,269.7,312.4,327.1,225.0,243.5,244.4,...,326.6,234.9,369.6,149.5,236.4,347.9,226.7,252.9,135.1,191.8
4,121_at,hsa:7849,1009.4,2083.3,1203.4,1104.4,1043.3,1117.6,1085.4,1423.1,...,1041.3,1195.6,751.5,1117.8,1022.4,1127.4,1071.8,1178.5,1256.7,1284.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22640,AFFX-HUMISGF3A/M97935_3_at,hsa:6772,2908.8,6087.7,4264.5,3496.5,3952.3,1585.9,1396.4,2782.3,...,1074.5,2120.3,1548.9,1460.8,2515.1,3991.3,1361.4,1917.7,1880.2,2040.0
22641,AFFX-HUMISGF3A/M97935_5_at,hsa:6772,29.8,16.1,96.3,71.1,36.6,26.1,8.8,64.7,...,26.9,108.7,43.5,23.4,53.7,46.7,11.1,33.1,8.4,12.0
22642,AFFX-HUMISGF3A/M97935_MA_at,hsa:6772,403.9,79.3,837.3,1024.2,969.1,205.4,388.2,423.0,...,40.6,263.3,136.2,27.5,92.1,399.2,143.1,119.7,533.8,172.4
22643,AFFX-HUMISGF3A/M97935_MB_at,hsa:6772,389.9,274.5,937.3,982.5,656.3,371.9,283.1,765.2,...,71.9,293.9,363.9,124.8,186.6,329.5,281.9,295.9,491.1,150.0


In [133]:
# Change column names
labels=gse2034.columns[2:]
for x in range(len(labels)):
    if(labels[x].startswith('ERpos')):
        labels.values[x]="ERpos"
    elif(labels[x].startswith('ERneg')):
        labels.values[x]="ERneg"

In [5]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary 
# where the keys are the genes and the values are the corresponding KEGG-IDs
# 'a gene can be mapped to more than one Entrez identifier'
gene_list=sorted(set(gse2034['Gene'].tolist()))
gene_dict={}
for i in gene_list:
    tmp=gse2034.loc[gse2034['Gene'] == i]['KEGG-ID']
    tmp_list=[]
    for t in tmp:
        tmp_list.append(t)
    gene_dict.update({i:tmp_list})
print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

Gene dictionary (key: Genes, values: KEGG-IDs): {'1007_s_at': ['hsa:100616237', 'hsa:780'], '1053_at': ['hsa:5982'], '117_at': ['hsa:3310'], '121_at': ['hsa:7849'], '1255_g_at': ['hsa:2978'], '1294_at': ['hsa:7318'], '1316_at': ['hsa:7067'], '1320_at': ['hsa:11099'], '1405_i_at': ['hsa:6352'], '1431_at': ['hsa:1571'], '1438_at': ['hsa:2049'], '1487_at': ['hsa:2101'], '1494_f_at': ['hsa:1548'], '1598_g_at': ['hsa:2621'], '160020_at': ['hsa:4323'], '1729_at': ['hsa:8717'], '1773_at': ['hsa:100529261', 'hsa:2342'], '177_at': ['hsa:5337'], '179_at': ['hsa:441263'], '1861_at': ['hsa:572'], '200000_s_at': ['hsa:10594'], '200001_at': ['hsa:826'], '200002_at': ['hsa:11224'], '200003_s_at': ['hsa:6158'], '200004_at': ['hsa:1982'], '200005_at': ['hsa:8664'], '200006_at': ['hsa:11315'], '200007_at': ['hsa:6727'], '200008_s_at': ['hsa:2665'], '200009_at': ['hsa:2665'], '200010_at': ['hsa:6135'], '200011_s_at': ['hsa:377'], '200012_x_at': ['hsa:6144', 'hsa:619499', 'hsa:26771', 'hsa:100131205'], '2

In [134]:
# transpose dataframe so that the columns indicate the genes' KEGG-IDs 
# and rows correspond to samples (class: ERpos or ERneg)
genes=gse2034['Gene']
gse2034.drop('KEGG-ID', inplace=True, axis=1)
gse2034=np.transpose(gse2034.iloc[:,1:])
gse2034.columns=genes.values.tolist()
gse2034

Unnamed: 0,1007_s_at,1007_s_at.1,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERneg,4043.7,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,...,16474.5,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERneg,4043.7,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,...,16474.5,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1


In [135]:
# Because one gene might correspond to more than one KEGG-IDs, we calculate the average (or max)
# value and get the following simplified dataframe
gse2034=gse2034.groupby(level=0,axis=1).mean()
gse2034

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERneg,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,90.7,...,16474.5,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,7287.1,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5
ERneg,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,90.7,...,16474.5,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1


### 1.2. All-subpaths

In [8]:
all_subpaths_raw = pd.read_csv('Data/All-subpaths.zip')
all_subpaths_raw

Unnamed: 0,SubPathID
0,208200_at#hsa:3552 210118_s_at#hsa:3552 205067...
1,208438_s_at#hsa:2268 208018_s_at#hsa:3055 2026...
2,207072_at#hsa:8807 206618_at#hsa:8809 -->20923...
3,206109_at#hsa:2523 208505_s_at#hsa:2524 210608...
4,213464_at#hsa:25759 noProbe#hsa:399694 206330_...
...,...
41604,206890_at#hsa:3594-->206118_at#hsa:6775-->2078...
41605,221271_at#hsa:59067-->219971_at#hsa:50615 2216...
41606,219971_at#hsa:50615 221658_s_at#hsa:50615-->20...
41607,216857_at#hsa:51561 217326_x_at#hsa:51561 2173...


In [9]:
# Split each subpath, get the max length of the paths and create the dataframe with NaN values for pathways with 
# length smaller that the max length.
max_len=0
all_subpaths_list=[]
for x in all_subpaths_raw['SubPathID']:
    # Subpaths are splitted based on the activation symbol (-->). Inhibition is handled next. 
    tmp=x.split(relation_dict['Activation']) 
    all_subpaths_list.append(tmp)
    if(len(tmp)>max_len):
        max_len=len(tmp)
        
all_subpaths = pd.DataFrame(np.nan, index=np.arange(all_subpaths_raw.size), columns=list(range(max_len)))
for x in range(len(all_subpaths_list)):
    for y in range(len(all_subpaths_list[x])):
        all_subpaths.loc[x,y]=all_subpaths_list[x][y]

In [10]:
all_subpaths # rows are the subpaths and columns are the edges (containing one or more genes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,208200_at#hsa:3552 210118_s_at#hsa:3552 205067...,202948_at#hsa:3554 215561_s_at#hsa:3554 205403...,203901_at#hsa:10454,206853_s_at#hsa:6885 206854_s_at#hsa:6885 2115...,209666_s_at#hsa:1147 209341_s_at#hsa:3551 2093...,,,,,,...,,,,,,,,,,
1,208438_s_at#hsa:2268 208018_s_at#hsa:3055 2026...,207821_s_at#hsa:5747 208820_at#hsa:5747,,,,,,,,,...,,,,,,,,,,
2,207072_at#hsa:8807 206618_at#hsa:8809,209239_at#hsa:4790 201783_s_at#hsa:5970 209878...,,,,,,,,,...,,,,,,,,,,
3,206109_at#hsa:2523 208505_s_at#hsa:2524 210608...,214088_s_at#hsa:2525 216010_x_at#hsa:2525,,,,,,,,,...,,,,,,,,,,
4,213464_at#hsa:25759 noProbe#hsa:399694 206330_...,215075_s_at#hsa:2885,207112_s_at#hsa:2549 214987_at#hsa:2549,220566_at#hsa:23533 204369_at#hsa:5290 212688_...,212607_at#hsa:10000 212609_s_at#hsa:10000 2193...,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41604,206890_at#hsa:3594,206118_at#hsa:6775,207849_at#hsa:3558,,,,,,,,...,,,,,,,,,,
41605,221271_at#hsa:59067,219971_at#hsa:50615 221658_s_at#hsa:50615,208991_at#hsa:6774 208992_s_at#hsa:6774,210426_x_at#hsa:6095 210479_s_at#hsa:6095,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,,,,,...,,,,,,,,,,
41606,219971_at#hsa:50615 221658_s_at#hsa:50615,208991_at#hsa:6774 208992_s_at#hsa:6774,210426_x_at#hsa:6095 210479_s_at#hsa:6095,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,,,,,,...,,,,,,,,,,
41607,216857_at#hsa:51561 217326_x_at#hsa:51561 2173...,206890_at#hsa:3594,208991_at#hsa:6774 208992_s_at#hsa:6774,221271_at#hsa:59067,219971_at#hsa:50615 221658_s_at#hsa:50615,,,,,,...,,,,,,,,,,


### 1.3. Selected

In [11]:
selected_raw = pd.read_csv('Data/Selected.zip', compression='zip', header=0, sep='\t', quotechar='"')[['SubPathID']]
selected_raw

Unnamed: 0,SubPathID
0,208200_at#hsa:3552 210118_s_at#hsa:3552 205067...
1,208438_s_at#hsa:2268 208018_s_at#hsa:3055 2026...
2,207072_at#hsa:8807 206618_at#hsa:8809 -->20923...
3,206109_at#hsa:2523 208505_s_at#hsa:2524 210608...
4,213464_at#hsa:25759 noProbe#hsa:399694 206330_...
...,...
41603,206890_at#hsa:3594-->206118_at#hsa:6775-->2078...
41604,221271_at#hsa:59067-->219971_at#hsa:50615 2216...
41605,219971_at#hsa:50615 221658_s_at#hsa:50615-->20...
41606,216857_at#hsa:51561 217326_x_at#hsa:51561 2173...


In [12]:
# Split each subpath, get the max length of the paths and create the dataframe with NaN values for pathways with 
# length smaller that the max length
max_len=0
selected_list=[]
for x in selected_raw['SubPathID']:
    # Subpaths are splitted based on the activation symbol (-->). Inhibition is handled next.
    tmp=x.split(relation_dict['Activation'])
    selected_list.append(tmp)
    if(len(tmp)>max_len):
        max_len=len(tmp)
        
selected = pd.DataFrame(np.nan, index=np.arange(selected_raw.size), columns=list(range(max_len)))
for x in range(len(selected_list)):
    for y in range(len(selected_list[x])):
        selected.loc[x,y]=selected_list[x][y]

In [13]:
selected # rows are the subpaths and columns are the nodes (containing one or more genes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,208200_at#hsa:3552 210118_s_at#hsa:3552 205067...,202948_at#hsa:3554 215561_s_at#hsa:3554 205403...,203901_at#hsa:10454,206853_s_at#hsa:6885 206854_s_at#hsa:6885 2115...,209666_s_at#hsa:1147 209341_s_at#hsa:3551 2093...,,,,,,...,,,,,,,,,,
1,208438_s_at#hsa:2268 208018_s_at#hsa:3055 2026...,207821_s_at#hsa:5747 208820_at#hsa:5747,,,,,,,,,...,,,,,,,,,,
2,207072_at#hsa:8807 206618_at#hsa:8809,209239_at#hsa:4790 201783_s_at#hsa:5970 209878...,,,,,,,,,...,,,,,,,,,,
3,206109_at#hsa:2523 208505_s_at#hsa:2524 210608...,214088_s_at#hsa:2525 216010_x_at#hsa:2525,,,,,,,,,...,,,,,,,,,,
4,213464_at#hsa:25759 noProbe#hsa:399694 206330_...,215075_s_at#hsa:2885,207112_s_at#hsa:2549 214987_at#hsa:2549,220566_at#hsa:23533 204369_at#hsa:5290 212688_...,212607_at#hsa:10000 212609_s_at#hsa:10000 2193...,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41603,206890_at#hsa:3594,206118_at#hsa:6775,207849_at#hsa:3558,,,,,,,,...,,,,,,,,,,
41604,221271_at#hsa:59067,219971_at#hsa:50615 221658_s_at#hsa:50615,208991_at#hsa:6774 208992_s_at#hsa:6774,210426_x_at#hsa:6095 210479_s_at#hsa:6095,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,,,,,...,,,,,,,,,,
41605,219971_at#hsa:50615 221658_s_at#hsa:50615,208991_at#hsa:6774 208992_s_at#hsa:6774,210426_x_at#hsa:6095 210479_s_at#hsa:6095,noProbe#hsa:112744 208402_at#hsa:3605 216876_s...,,,,,,,...,,,,,,,,,,
41606,216857_at#hsa:51561 217326_x_at#hsa:51561 2173...,206890_at#hsa:3594,208991_at#hsa:6774 208992_s_at#hsa:6774,221271_at#hsa:59067,219971_at#hsa:50615 221658_s_at#hsa:50615,,,,,,...,,,,,,,,,,


## 2. Discretization process
Transform gene expression values into high (expressed / up-regulated) or low (not-expressed / down-regulated) gene expression binary equivalents

### 2.1. The expression values of a gene over the total number of input samples are sorted in descending order;

In [14]:
gse2034_dis = pd.DataFrame(np.nan, index=gse2034.columns, columns=['Expression value'])
gse2034_dis['Expression value']=gse2034.sum()/gse2034.shape[0] # sum of expression values of a gene / total number of samples
gse2034_dis=gse2034_dis.sort_values(by=['Expression value'],ascending=False)
gse2034_dis

Unnamed: 0,Expression value
210646_x_at,48931.611538
207783_x_at,47760.203846
212869_x_at,45788.103846
208825_x_at,43967.907692
201492_s_at,43252.146154
...,...
211213_at,1.573077
215309_at,1.188462
205421_at,1.130769
204704_s_at,1.030769


### 2.2. The midpoints between each two consecutive values are calculated;

In [15]:
def midpoint(num1,num2):
    return (num1+num2)/2

midpoints_dict={} # Key corresponds to the position of the first gene. The second gene is in the next position (i+1).
for i in range(gse2034_dis.shape[0]-1):
    midpoints_dict.update({i:midpoint(gse2034_dis.iloc[i]['Expression value'],gse2034_dis.iloc[i+1]['Expression value'])})

midpoints_dict

{0: 48345.90769230768,
 1: 46774.15384615384,
 2: 44878.00576923076,
 3: 43610.02692307692,
 4: 43160.05,
 5: 42642.30769230769,
 6: 41937.725,
 7: 41548.7,
 8: 41379.399999999994,
 9: 41246.6673076923,
 10: 40503.536538461536,
 11: 39740.715384615374,
 12: 39501.851923076916,
 13: 39023.592307692306,
 14: 38444.06538461537,
 15: 38107.65192307692,
 16: 37854.41346153847,
 17: 37012.73653846154,
 18: 36193.81153846154,
 19: 35968.015384615384,
 20: 35793.876923076925,
 21: 35633.09423076923,
 22: 35558.99038461539,
 23: 35507.43653846154,
 24: 35439.03269230769,
 25: 35412.56153846154,
 26: 35263.99038461539,
 27: 34946.3673076923,
 28: 34727.14423076922,
 29: 34663.49999999999,
 30: 34424.76346153845,
 31: 34122.18076923076,
 32: 34011.855769230766,
 33: 33818.613461538465,
 34: 33216.94807692308,
 35: 32724.57884615385,
 36: 32571.01346153846,
 37: 32356.69038461538,
 38: 32210.365384615383,
 39: 31987.77115384616,
 40: 31720.915384615386,
 41: 31599.007692307692,
 42: 31435.93653846

### 2.3. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S,μi) to denote the IG of the system for midpoint μi.

In [136]:
classes=sorted(set(gse2034.index)) # Τhe classes to which a sample may belong
samples=gse2034.index # the samples class

# Calculate the proportion of samples in S that belong in Class C
def P(C,S):
    return list(S).count(C)/len(S)

def E(S,m=1):
    # m not given: calculate the entropy of the system taking into account the prior assignment of sample cases into phenotype classes
    # m given: calculate the respective entropy of the system taking into account its division into subgroups around midpoint μi
    tmp=0
    for c in classes:
        # P(c,S) must be greater than zero
        tmp+=P(c,S)*math.log(P(c,S))/m
    return -(tmp)

entropy=E(samples)
print('Dataset Entropy: %.3f bits' % entropy)

Dataset Entropy: 0.582 bits


In [137]:
# Calculate the Information Gain (IG) of the system
def IG(S,m):
    return E(S)-E(S,m)

information_gain=[]
for m in list(midpoints_dict.values()):
    information_gain.append(IG(samples,m))
print('Information Gain: '+str(information_gain))

Information Gain: [0.582480195455803, 0.5824797905919346, 0.5824792644257821, 0.5824788870428537, 0.5824787477874378, 0.582478583924339, 0.5824783544275832, 0.5824782243790821, 0.5824781670195665, 0.5824781217199007, 0.5824778626162204, 0.5824775865684465, 0.5824774979372257, 0.5824773172160036, 0.5824770922032104, 0.5824769584445176, 0.5824768561879756, 0.5824765062687137, 0.5824761501877475, 0.5824760491564591, 0.5824759703685393, 0.5824758969396149, 0.5824758628730817, 0.5824758390892143, 0.5824758074249435, 0.5824757951385671, 0.5824757258381503, 0.5824755757077289, 0.5824754704860304, 0.5824754396890806, 0.5824753231515897, 0.5824751731048768, 0.5824751177320143, 0.5824750198717836, 0.5824747078897048, 0.5824744440456829, 0.5824743601232355, 0.5824742416654206, 0.5824741598850762, 0.5824740340434218, 0.5824738808510813, 0.5824738100072401, 0.5824737143831219, 0.5824736308465888, 0.5824734728316598, 0.5824733161094879, 0.5824732837809045, 0.5824732337597643, 0.5824731982262168, 0.5

In [138]:
# The midpoint with the highest information gain is selected as the discretization point
max_value = max(information_gain)
max_mid_pos = information_gain.index(max(information_gain))
dis_point=midpoints_dict.get(max_mid_pos)
print('Discretization point: %.3f' %dis_point)

Discretization point: 48345.908


### 2.4. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values bigger that the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [139]:
gse2034_dis=gse2034.copy()
gse2034_dis[gse2034_dis<dis_point]=0 # under-expressed
gse2034_dis[gse2034_dis>=dis_point]=1 # over-expressed
gse2034_dis=gse2034_dis.astype('int')
gse2034_dis

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_5_at,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERneg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERpos,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERneg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. Matching sub-paths with gene expression profiles

In [173]:
class Subpath:
    def __init__(self, id, sub_id, graph):
        self.id=id
        self.sub_id=sub_id
        self.graph=graph
    #def calc_expr_profile(self):

class Graph:
    def __init__(self, num_of_nodes, directed=True):
        self.m_num_of_nodes = num_of_nodes
        self.m_nodes = range(self.m_num_of_nodes)

        # Define the type of a graph
        self.m_directed = directed

        self.m_adj_list = {node: set() for node in self.m_nodes}      

    def add_edge(self, node1, node2, edge, weight=1):
        self.m_adj_list[node1].add((node2, weight, edge))
        
        if not self.m_directed:
            self.m_adj_list[node2].add((node1, weight, edge))

    def print_adj_list(self):
        for key in self.m_adj_list.keys():
            print("node", key, ": ", self.m_adj_list[key])
            
subpath_len=selected.iloc[0].count()
my_graph=Graph(subpath_len)
for node in selected.iloc[0,:subpath_len]:
    tmp_node=node.split('--|')
    print(tmp_node)

#Graph(selected.)            
            
#tmp_subpath=Subpath(0,0)

#def functional_subpaths():
    
#for subpath in range(selected.shape[0]):
#    tmp_subpath=[]
#    for edge in range(selected.shape[1]):
#        if(pd.isna(selected.iloc[subpath,edge])): # Check if subpath reached its end
#            break;
#        genes=selected.iloc[subpath,edge].split(' ')
#        genes = [i for i in genes if i]
#        print('Genes set')
#        for g in genes:
#            print(g.split('#'))

['208200_at#hsa:3552 210118_s_at#hsa:3552 205067_at#hsa:3553 39402_at#hsa:3553 ']
['202948_at#hsa:3554 215561_s_at#hsa:3554 205403_at#hsa:7850 211372_s_at#hsa:7850 ']
['203901_at#hsa:10454']
['206853_s_at#hsa:6885 206854_s_at#hsa:6885 211536_x_at#hsa:6885 211537_x_at#hsa:6885']
['209666_s_at#hsa:1147 209341_s_at#hsa:3551 209342_s_at#hsa:3551 211027_s_at#hsa:3551 209929_s_at#hsa:8517 36004_at#hsa:8517 ']


In [218]:
tmp_subpath=selected_raw.iloc[0]['SubPathID']
tmp_subpath=tmp_subpath.split('-->')
for i in tmp_subpath:
    tmp_s1=i.split('--|')
    
    # Only activation relation
    if(len(tmp_s1)==1):
        tmp_s2=[x for x in tmp_s1[0].split(' ') if x]
        print(tmp_s2)
        for j in tmp_s2:
            print(j)
    else:
        print('There was inhibition')

['208200_at#hsa:3552', '210118_s_at#hsa:3552', '205067_at#hsa:3553', '39402_at#hsa:3553']
208200_at#hsa:3552
210118_s_at#hsa:3552
205067_at#hsa:3553
39402_at#hsa:3553
['202948_at#hsa:3554', '215561_s_at#hsa:3554', '205403_at#hsa:7850', '211372_s_at#hsa:7850']
202948_at#hsa:3554
215561_s_at#hsa:3554
205403_at#hsa:7850
211372_s_at#hsa:7850
['203901_at#hsa:10454']
203901_at#hsa:10454
['206853_s_at#hsa:6885', '206854_s_at#hsa:6885', '211536_x_at#hsa:6885', '211537_x_at#hsa:6885']
206853_s_at#hsa:6885
206854_s_at#hsa:6885
211536_x_at#hsa:6885
211537_x_at#hsa:6885
['209666_s_at#hsa:1147', '209341_s_at#hsa:3551', '209342_s_at#hsa:3551', '211027_s_at#hsa:3551', '209929_s_at#hsa:8517', '36004_at#hsa:8517']
209666_s_at#hsa:1147
209341_s_at#hsa:3551
209342_s_at#hsa:3551
211027_s_at#hsa:3551
209929_s_at#hsa:8517
36004_at#hsa:8517


In [208]:
selected_raw.iloc[0]['SubPathID']

'208200_at#hsa:3552 210118_s_at#hsa:3552 205067_at#hsa:3553 39402_at#hsa:3553 -->202948_at#hsa:3554 215561_s_at#hsa:3554 205403_at#hsa:7850 211372_s_at#hsa:7850 -->203901_at#hsa:10454-->206853_s_at#hsa:6885 206854_s_at#hsa:6885 211536_x_at#hsa:6885 211537_x_at#hsa:6885-->209666_s_at#hsa:1147 209341_s_at#hsa:3551 209342_s_at#hsa:3551 211027_s_at#hsa:3551 209929_s_at#hsa:8517 36004_at#hsa:8517 '

In [222]:
selected_raw.iloc[0]['SubPathID']
def getGraph(subpath_id, nodes):
    subpath_id=0
    tmp_subpath=selected_raw.iloc[subpath_id]['SubPathID']
    tmp_subpath=tmp_subpath.split('-->') # Activation
    for i in tmp_subpath:
        tmp_s1=i.split('--|') # Inhibition
    
        # Only activation relation
        if(len(tmp_s1)==1):
            tmp_s2=[x for x in tmp_s1[0].split(' ') if x]
            print(tmp_s2)
            for j in tmp_s2:
                print(j)
        else:
            print('There was inhibition')
        print(tmp_subpath)

getGraph(0,[])

208200_at#hsa:3552 210118_s_at#hsa:3552 205067_at#hsa:3553 39402_at#hsa:3553 
202948_at#hsa:3554 215561_s_at#hsa:3554 205403_at#hsa:7850 211372_s_at#hsa:7850 
203901_at#hsa:10454
206853_s_at#hsa:6885 206854_s_at#hsa:6885 211536_x_at#hsa:6885 211537_x_at#hsa:6885
209666_s_at#hsa:1147 209341_s_at#hsa:3551 209342_s_at#hsa:3551 211027_s_at#hsa:3551 209929_s_at#hsa:8517 36004_at#hsa:8517 
['208200_at#hsa:3552 210118_s_at#hsa:3552 205067_at#hsa:3553 39402_at#hsa:3553 ', '202948_at#hsa:3554 215561_s_at#hsa:3554 205403_at#hsa:7850 211372_s_at#hsa:7850 ', '203901_at#hsa:10454', '206853_s_at#hsa:6885 206854_s_at#hsa:6885 211536_x_at#hsa:6885 211537_x_at#hsa:6885', '209666_s_at#hsa:1147 209341_s_at#hsa:3551 209342_s_at#hsa:3551 211027_s_at#hsa:3551 209929_s_at#hsa:8517 36004_at#hsa:8517 ']
