In [None]:
import pandas as pd
import numpy as np

# 1. Read data and make them easier to understand

## 1.1. GSE2034

In [None]:
# Load the breast-cancer expression table (zipped, tab-separated).
raw_gse2034_df = pd.read_csv(
    'Data/GSE2034.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)

# Split the 'Class' column on '#' into a gene name and a KEGG-ID, working on a
# fresh frame so the raw table stays untouched.
gene_and_id = raw_gse2034_df['Class'].str.split('#', expand=True)
gse2034_df = raw_gse2034_df.drop(columns='Class')
gse2034_df[['Gene', 'KEGG-ID']] = gene_and_id

# The two identifier columns were appended last; move them to the front.
cols = gse2034_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
gse2034_df = gse2034_df[cols]

#gse2034_df

In [None]:
# Change column names (estrogen receptor): every sample column whose name
# starts with 'ERpos' / 'ERneg' is collapsed to that bare class label.
# The new names are assigned as a fresh column list instead of being written
# into `Index.values`, because a pandas Index is meant to be immutable and
# mutating its backing array bypasses any lookup state the Index has cached.
renamed = []
for label in gse2034_df.columns[2:]:
    if label.startswith('ERpos'):
        renamed.append('ERpos')
    elif label.startswith('ERneg'):
        renamed.append('ERneg')
    else:
        # Leave any column that is neither class untouched.
        renamed.append(label)

# The first two columns (identifier columns) keep their names.
gse2034_df.columns = gse2034_df.columns[:2].tolist() + renamed
labels = gse2034_df.columns[2:]

In [None]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary
# where the keys are the genes and the values are the corresponding KEGG-IDs
# ('a gene can be mapped to more than one Entrez identifier').
# A single pass over the rows collects the IDs per gene; the previous version
# re-filtered the whole frame once for every distinct gene.
kegg_ids_by_gene = {}
for gene, kegg_id in zip(gse2034_df['Gene'], gse2034_df['KEGG-ID']):
    kegg_ids_by_gene.setdefault(gene, []).append(kegg_id)

# Keep the sorted gene list and a dictionary whose keys follow that order.
gene_list = sorted(kegg_ids_by_gene)
gene_dict = {gene: kegg_ids_by_gene[gene] for gene in gene_list}
#print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [None]:
# Reshape the table so that each column is a gene and each row is a sample
# (row label = class: ERpos or ERneg).
genes = gse2034_df['Gene']

# Drop the KEGG-ID column, then skip the first remaining column by position
# (expected to be 'Gene') so only the sample columns are transposed.
sample_columns = gse2034_df.drop(columns='KEGG-ID').iloc[:, 1:]
gse2034_df = sample_columns.T
gse2034_df.columns = genes.tolist()
#gse2034_df

In [None]:
# Because one gene might correspond to more than one KEGG-ID, columns that share
# a gene name are averaged (max would be an alternative) to get one column per gene.
# `groupby(..., axis=1)` is deprecated in recent pandas, so the frame is
# transposed, grouped on its (gene) index, and transposed back.
gse2034_df = gse2034_df.T.groupby(level=0).mean().T
gse2034_df

## 1.2. Selected
Cellular processes (15), Signal transduction (Environmental information process) (24), Cancer overview (8).

In [None]:
# Read the selected-pathways table (zipped, tab-separated) and keep only the
# 'SubPathID' column, as a one-column DataFrame.
raw_selected_df = pd.read_csv(
    'Data/Selected.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)[['SubPathID']]
#raw_selected_df

In [None]:
# The two kinds of relation between nodes, each mapped to the arrow token
# that represents it inside a pathway string.
relations_dict = dict(Activation='-->', Inhibition='--|')
#relations_dict

In [None]:
from itertools import chain

# Split each pathway based on the relation
def split_path(data, relation, relations=None):
    """Split a pathway string on one relation arrow, keeping the arrows.

    Parameters
    ----------
    data : str
        Pathway (or pathway fragment) to split.
    relation : str
        Key into the relation mapping, e.g. 'Activation' or 'Inhibition'.
    relations : dict, optional
        Mapping from relation name to arrow token. Defaults to the
        notebook-level ``relations_dict``.

    Returns
    -------
    list of str
        The non-empty segments of ``data`` with the arrow token inserted
        between consecutive segments. There is never a leading or trailing
        arrow, and an empty input yields an empty list.
    """
    if relations is None:
        relations = relations_dict
    arrow = relations[relation]

    # Drop empty segments first (they come from a leading, trailing or doubled
    # arrow) so that arrows are only ever placed *between* real segments.
    segments = [segment for segment in data.split(arrow) if segment]

    tokens = []
    for position, segment in enumerate(segments):
        if position:
            tokens.append(arrow)
        tokens.append(segment)
    return tokens

# Make the selected dataset easier to understand, by splitting each pathway based on their nodes and edges
def get_pathway(data):
    """Break a pathway string into a flat list of node and edge tokens.

    The string is first split on the activation arrow; every resulting token
    is then split on the inhibition arrow, and the pieces are flattened into
    one list.
    """
    pieces = []
    for token in split_path(data, 'Activation'):
        inhibition_parts = split_path(token, 'Inhibition')
        # Tokens of length 1 are kept as the raw string; flattening a
        # one-character string yields that single character.
        pieces.append(inhibition_parts if len(token) > 1 else token)

    return list(chain.from_iterable(pieces))

# One token list per pathway in the selected dataset.
subpaths_list = list(map(get_pathway, raw_selected_df['SubPathID']))
#subpaths_list

In [None]:
# Rows: pathways, Cols: edges and nodes. Pathways shorter than the longest one
# leave empty cells, which are filled with NaN.
selected_df = pd.DataFrame(subpaths_list)
selected_df = selected_df.fillna(value=np.nan)
selected_df

# 2. Methods

## 2.1. PRS
Each node in a pathway has three attributes: Node_genes, Node_value (NV), Node_weight (NW)
### 2.1.1.  Development of the PRS algorithm

#### 2.1.1.1. Node_genes

In [None]:
# Each node in a pathway represents a discrete function mapping to one or more transcript.
def Node_genes(df):
    """Map every pathway row to the gene names of each of its nodes.

    Cells at even positions of a row are read as nodes (odd positions are
    skipped). A node cell is a space-separated list of entries; for each
    entry the text before the first '#' is kept.

    Returns a dict ``{row position: [[gene, ...] for each node]}``.
    """
    genes_by_path = {}
    for row_pos in range(df.shape[0]):
        # Only the filled (non-null) part of the row is walked.
        filled = int(df.iloc[row_pos].notna().sum())

        nodes = []
        for col_pos in range(0, filled, 2):
            entries = [entry.strip() for entry in df.iloc[row_pos, col_pos].split(' ')]
            node = []
            for entry in entries:
                if not entry:
                    continue
                parts = [part.strip() for part in entry.split('#')]
                node.append([part for part in parts if part][0])
            nodes.append(node)

        genes_by_path[row_pos] = nodes
    return genes_by_path

# Build the pathway -> nodes -> gene names mapping from the parsed pathway
# table; the bare name on the last line displays it as the cell output.
node_genes=Node_genes(selected_df)
node_genes

#### 2.1.1.2. Node_value (NV)

In [None]:
# Calculate fold change for each gene
def fold_change(df):
    """Return ``{gene: mean(ERneg) / mean(ERpos)}`` for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per gene; rows are samples whose index label is the class
        ('ERneg' or 'ERpos').

    Returns
    -------
    dict
        Fold change (B/A = ERneg average over ERpos average) keyed by gene,
        in column order.

    Raises
    ------
    KeyError
        If either class label is missing from the index.
    """
    # Work on the frame that was passed in rather than on a notebook global.
    # Selecting with a list of labels always yields a DataFrame (even when a
    # class has a single sample), so .mean() is a per-gene average.
    erneg_av = df.loc[['ERneg']].mean()
    erpos_av = df.loc[['ERpos']].mean()

    # Calculate fold change (B/A) for all genes at once.
    return (erneg_av / erpos_av).to_dict()
    
# Per-gene fold change (ERneg average / ERpos average) for the expression table.
fc_dict=fold_change(gse2034_df)

In [None]:
# One row per gene with its fold change, ordered from lowest to highest.
gene_df = (
    pd.DataFrame.from_dict(fc_dict, orient='index', columns=['Fold Change'])
    .sort_values(by='Fold Change')
)
gene_df

In [None]:
# Each node is assigned a value derived from expression data: 0 if the corresponding gene or genes are not
# expressed, 1 if they are expressed but remain unchanged (non-significant), or the maximum fold-change
# value if one or more of the mapped transcripts is above threshold.
def expressed_genes(fc_list):
    """Collapse the fold changes of one node's genes into a single node value.

    Each value is classed as not expressed (<= 0), expressed (between 0 and
    the global ``threshold``) or significant (anything else). If at least one
    value is significant the maximum of ``fc_list`` is returned; otherwise the
    result is 0 when the not-expressed values are at least as many as the
    expressed ones (this includes an empty list), and 1 when they are fewer.
    """
    not_expressed = 0
    expressed = 0
    significant = 0
    for fc in fc_list:
        if fc <= 0:
            not_expressed += 1
        elif 0 < fc < threshold:
            expressed += 1
        else:
            significant += 1

    if significant > 0:
        return max(fc_list)
    # Ties go to 'not expressed'.
    return 0 if not_expressed >= expressed else 1

def Node_value(node_genes):
    """Compute the node value (NV) of every node of every pathway.

    For each node, the fold change of each gene is looked up in ``gene_df``;
    a gene that is 'noProbe' or is not a key of ``gene_dict`` contributes 0.
    The list of fold changes is then reduced to one value by
    ``expressed_genes``.

    Returns a dict ``{path: [node value, ...]}``.
    """
    values_by_path = {}

    for path, nodes in node_genes.items():
        path_values = []

        for node in nodes:
            fold_changes = []
            for gene in node:
                # 'noProbe' or a gene missing from the dataset counts as 0.
                known = gene != 'noProbe' and gene in gene_dict
                fold_changes.append(gene_df.loc[gene, 'Fold Change'] if known else 0)

            # Reduce the gene-level fold changes to the value of this node.
            path_values.append(expressed_genes(fold_changes))

        values_by_path[path] = path_values

    return values_by_path
        
threshold=1.5   # module-level cutoff; read as a global by expressed_genes (reached through Node_value below)
# Node value (NV) of every node of every pathway; displayed as the cell output.
node_value=Node_value(node_genes)
node_value

#### 2.1.1.3. Node_weight (NW)

In [None]:
# Represent each sub-path as a list of edges, where an edge is the pair
# [value of node n, value of node n+1] for consecutive nodes.
path_graph = {
    path: [[source, target] for source, target in zip(values, values[1:])]
    for path, values in node_value.items()
}

graphs = path_graph
graphs

In [None]:
# All significant (above-threshold) nodes were assigned a weighting that reflected
# their topological strength (i.e., the number of significant downstream nodes that are pointed to, either
# directly or via other significant nodes).
# An initiating child node, ni, was ignored if non-significant, and the algorithm proceeds to the next child.
# Otherwise, we increase the weight counter by 1 and look for children of this node
# and so on. All non-significant nodes have NW = 0.

def sign_children(graph,weight=0):
    """Return ``weight`` plus the number of entries of ``graph`` that are not below the global ``threshold``."""
    total = weight
    for value in graph:
        # Values below the threshold are skipped; every other value counts once.
        if not value < threshold:
            total += 1
    return total

# Node weight (NW) per node of every pathway. For a significant node this is
# the number of significant nodes from that node to the end of the path, as
# counted by sign_children on the remaining slice. A non-significant node
# (value below the threshold) gets NW = 0, as stated in the rule above;
# previously such nodes were given the downstream count as well.
node_weight = {}
for path, values in node_value.items():
    node_weight[path] = [
        0 if values[n] < threshold else sign_children(values[n:])
        for n in range(len(values))
    ]
node_weight

#### 2.1.1.4. Node_score (NS)
NV and NW values are combined to calculate a Node_Score (NS): $NS = NV \times NW$ for nodes with $NV > 1$, and $NS = 0$ otherwise.

In [None]:
def Node_score(NV,NW):
    """Combine node values and node weights into node scores.

    For every path in ``NV``, a node whose value is greater than 1 scores
    ``value * weight`` (weight taken from the same position in ``NW``); every
    other node scores 0.

    Returns a dict ``{path: [node score, ...]}``.
    """
    return {
        path: [
            value * NW[path][position] if value > 1 else 0
            for position, value in enumerate(values)
        ]
        for path, values in NV.items()
    }
# Node score (NS) of every node, from the node values and node weights; displayed as the cell output.
node_score=Node_score(node_value,node_weight)
node_score

#### 2.1.1.5. PRS

In [None]:
def PRS(NS):
    """Return ``{path: sum of that path's node scores}`` (0 for a path with no nodes)."""
    return {path: sum(scores) for path, scores in NS.items()}

# Raw pathway score: the sum of node scores per pathway; displayed as the cell output.
prs=PRS(node_score)
prs

### 2.1.2. Normalizing pathway scores
A normalization step is required to control for two key features: (i)
pathway size and (ii) statistical bias contributed by pathway-specific PRS score null distributions.

#### 2.1.2.1.  Pathway size 
Multiply each PRS score by the ratio of the number of DEGs (NDEGs) in a pathway to the total number of expressed genes (NEGs): $PRS_{size} = PRS \times \frac{NDEGs}{NEGs}$. A pathway with $NEGs = 0$ is given a score of 0.

In [None]:
def path_size_PRS(prs, node_values=None, cutoff=None):
    """Scale each pathway score by the share of expressed nodes that are significant.

    Parameters
    ----------
    prs : dict
        Raw score per pathway.
    node_values : dict, optional
        Node values per pathway. Defaults to the notebook-level ``node_value``.
    cutoff : float, optional
        Significance cutoff. Defaults to the notebook-level ``threshold``.

    Returns
    -------
    dict
        ``{path: prs[path] * NDEGs / NEGs}``, or 0 for a pathway that has no
        node with a value above 0.
    """
    if node_values is None:
        node_values = node_value
    if cutoff is None:
        cutoff = threshold

    new_prs = {}
    for path, score in prs.items():
        values = node_values[path]
        # A value equal to the cutoff counts as significant, matching how the
        # node values and node weights classify it.
        NDEGs = sum(1 for value in values if value >= cutoff)
        NEGs = sum(1 for value in values if value > 0)
        # Guard against dividing by zero when nothing in the pathway is expressed.
        new_prs[path] = score * (NDEGs / NEGs) if NEGs else 0
    return new_prs
        
# Pathway scores after the pathway-size normalisation; displayed as the cell output.
new_prs=path_size_PRS(prs)
new_prs

#### 2.1.2.2. Statistical bias contributed by pathway-specific PRS score null distributions