# Correct approach

In [None]:
import pandas as pd
import numpy as np
import math

# 1. Read the data and make it easier to understand


## 1.1. GSE2034

In [None]:
# Load the zipped, tab-separated GSE2034 table (breast cancer).
raw_gse2034_df = pd.read_csv(
    'Data/GSE2034.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)
# Work on a copy so the raw table stays available unchanged.
gse2034_df = raw_gse2034_df.copy()

# Preprocess: 'Class' is split on '#' into a 'Gene' and a 'KEGG-ID' column,
# after which the combined column is no longer needed.
class_parts = gse2034_df['Class'].str.split('#', expand=True)
gse2034_df[['Gene', 'KEGG-ID']] = class_parts
gse2034_df = gse2034_df.drop(columns='Class')

# The two new columns were appended at the end; move them to the front.
trailing_two = list(gse2034_df.columns[-2:])
cols = trailing_two + list(gse2034_df.columns[:-2])
gse2034_df = gse2034_df[cols]

#gse2034_df

In [None]:
# Change column names (estrogen receptor): every sample column whose name
# starts with 'ERpos' / 'ERneg' is renamed to exactly that class label.
# The first two columns ('Gene', 'KEGG-ID') are left untouched.
# A pandas Index is meant to be immutable, so build the new names and assign
# them explicitly instead of writing into the Index's underlying array.
renamed = []
for name in gse2034_df.columns[2:]:
    if name.startswith('ERpos'):
        renamed.append('ERpos')
    elif name.startswith('ERneg'):
        renamed.append('ERneg')
    else:
        # Any other column keeps its original name.
        renamed.append(name)
gse2034_df.columns = list(gse2034_df.columns[:2]) + renamed

# Sample labels after renaming (same name the rest of the notebook uses).
labels = gse2034_df.columns[2:]

In [None]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary
# where the keys are the genes and the values are the lists of corresponding KEGG-IDs
# ('a gene can be mapped to more than one Entrez identifier').
gene_list = sorted(set(gse2034_df['Gene'].tolist()))

# Pre-create one empty list per gene so the keys keep the sorted order.
gene_dict = {gene: [] for gene in gene_list}

# One pass over the rows: each KEGG-ID is stored under its gene, in row order.
# (Every list has to be stored in gene_dict, otherwise the dictionary stays empty.)
for gene, kegg_id in zip(gse2034_df['Gene'], gse2034_df['KEGG-ID']):
    gene_dict[gene].append(kegg_id)

print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [None]:
# Flip the table so that every column is a gene and every row is a sample
# (class: ERpos or ERneg).
genes = gse2034_df['Gene'].copy()
# Remove 'KEGG-ID', skip the leading 'Gene' column, then transpose what is left.
gse2034_df = gse2034_df.drop(columns='KEGG-ID').iloc[:, 1:].T
# Name the columns after the genes, in the original row order.
gse2034_df.columns = genes.tolist()
gse2034_df

In [None]:
# Because one gene might correspond to more than one KEGG-ID, the frame can hold
# several columns with the same gene name; collapse them into a single column
# per gene by averaging (max would be the alternative).
# Grouping is done on the transposed frame because groupby(..., axis=1) is
# deprecated in recent pandas releases; the result is transposed back.
gse2034_df = gse2034_df.T.groupby(level=0).mean().T
# Per-sample mean over all genes, used later as the stand-in value for genes
# that are not present in this dataset.
gse2034_df['noProbe'] = gse2034_df.mean(axis=1)
gse2034_df

In [None]:
# Keep the samples' labels: the row index of gse2034_df is stored under `labels`.
# This rebinds the `labels` name that the column-renaming cell defined earlier.
labels=gse2034_df.index

## 1.2. Selected
Cellular processes (15), Signal transduction (Environmental information processing) (24), Cancer overview (8).

In [None]:
# Load the zipped, tab-separated "Selected" table and keep only its 'SubPathID' column.
# The double brackets make the result a one-column DataFrame rather than a Series.
raw_selected_df = pd.read_csv('Data/Selected.zip', compression='zip', header=0, sep='\t', quotechar='"')[['SubPathID']]
#raw_selected_df

In [None]:
# Two types of relations between nodes, each mapped to the arrow token that
# split_path() looks up here and splits the sub-path strings on.
relations_dict={'Activation':'-->','Inhibition':'--|'}
#relations_dict

In [None]:
from itertools import chain

def split_path(data,relation):
    """Split a pathway string on one relation arrow, keeping the arrows as tokens.

    Parameters
    ----------
    data : str
        Pathway (or pathway fragment) text.
    relation : str
        Key of ``relations_dict`` ('Activation' or 'Inhibition') selecting
        the arrow to split on.

    Returns
    -------
    list of str
        The non-empty segments of ``data`` in order. The k-th non-empty
        segment (counting from 1) is followed by the arrow token whenever k is
        smaller than the total number of segments produced by the split, empty
        ones included. For a plain ``a<arrow>b`` string this yields
        ``[a, arrow, b]`` with no trailing arrow.
    """
    arrow = relations_dict[relation]
    pieces = data.split(arrow)
    total = len(pieces)

    tokens = []
    # Empty segments are skipped and do not advance the position counter.
    for position, piece in enumerate(filter(None, pieces), start=1):
        tokens.append(piece)
        if position < total:
            tokens.append(arrow)
    return tokens

def get_pathway(data):
    """Break one pathway string into a flat list of node and edge tokens.

    The string is first split on the 'Activation' arrow; every resulting token
    longer than one character is then split again on the 'Inhibition' arrow.
    Tokens of a single character are passed through unchanged.

    Parameters
    ----------
    data : str
        One pathway description.

    Returns
    -------
    list of str
        Nodes and relation arrows in their original order.
    """
    tokens = []
    for segment in split_path(data, 'Activation'):
        if len(segment) > 1:
            tokens.extend(split_path(segment, 'Inhibition'))
        else:
            # Extending with a string adds its characters, i.e. the token itself.
            tokens.extend(segment)
    return tokens

# Tokenise every 'SubPathID' entry into its nodes and edges, one list per pathway.
subpaths_list = list(map(get_pathway, raw_selected_df['SubPathID']))
#subpaths_list

In [None]:
# One row per pathway, one column per token position (nodes and edges as produced by get_pathway).
# Pathways shorter than the longest one leave missing cells, which fillna sets to NaN.
selected_df=pd.DataFrame(subpaths_list).fillna(value=np.nan) # Rows: pathways, Cols: edges and nodes
selected_df

## 1.3. Important values


### 1.3.1. Node genes: all genes of each node

In [None]:
def Node_genes(df):
    """Map each pathway to the genes of each of its nodes.

    Each node in a pathway represents a discrete function mapping to one or
    more transcripts, written as space-separated ``gene#id`` entries.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pathway. Nodes are read from every second non-null
        position of the row, starting at position 0.

    Returns
    -------
    dict
        ``{row position: [[gene, ...], ...]}`` with one inner list per node,
        holding the first non-empty '#'-separated part of each entry.
    """
    genes_by_path = {}
    for path in range(df.shape[0]):
        row = df.iloc[path]
        # Number of filled cells in this row; the rest is padding.
        filled = int(row.notna().sum())

        nodes = []
        for position in range(0, filled, 2):
            entries = [entry.strip() for entry in row.iloc[position].split(' ')]
            nodes.append([
                [part.strip() for part in entry.split('#') if part.strip()][0]
                for entry in entries
                if entry
            ])
        genes_by_path[path] = nodes
    return genes_by_path

# Build the pathway -> nodes -> genes mapping once from the tokenised pathways.
# path_expression_value() reads this module-level name.
node_genes=Node_genes(selected_df)
node_genes

### 1.3.2. Expression value

In [None]:
from statistics import mean

def sample_expression_value(sample,path):
    """Return one expression value per node of a sub-path for a single sample.

    For a node consisting of more than one gene, the value is the average of
    the genes' expression values.

    Parameters
    ----------
    sample : pandas.Series
        Expression values of one sample, indexed by gene name. Must contain a
        'noProbe' entry if any gene of ``path`` is missing from the index.
    path : list of list of str
        The nodes of the sub-path, each given as a list of gene names.

    Returns
    -------
    list
        One averaged value per node, in node order.

    Raises
    ------
    statistics.StatisticsError
        If a node contains no genes.
    """
    cur_sample=[]
    for node in path:
        # Membership is checked against the sample itself rather than a
        # module-level frame, so the function works for any sample passed in.
        # Genes without a measurement fall back to the 'noProbe' value.
        cur_node=[
            sample.loc[gene] if gene in sample.index else sample.loc['noProbe']
            for gene in node
        ]
        cur_sample.append(mean(cur_node))
    return cur_sample

def path_expression_value(path_no,samples,paths=None):
    """Compute the node expression values of one sub-path for every sample.

    Parameters
    ----------
    path_no : hashable
        Key of the sub-path in ``paths``.
    samples : pandas.DataFrame
        One row per sample; each row is handed to ``sample_expression_value``.
    paths : mapping, optional
        Sub-path key -> list of nodes (each a list of gene names). When
        omitted, the module-level ``node_genes`` mapping is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    list of list
        One list of node values per sample, in row order.
    """
    if paths is None:
        paths = node_genes
    return [
        sample_expression_value(samples.iloc[row], paths[path_no])
        for row in range(samples.shape[0])
    ]


def get_expression_values(node_genes,samples):
    """Compute the expression values of every sub-path for every sample.

    Parameters
    ----------
    node_genes : mapping
        Sub-path key -> list of nodes (each a list of gene names).
    samples : pandas.DataFrame
        One row per sample.

    Returns
    -------
    dict
        Sub-path key -> result of ``path_expression_value`` for that sub-path.
    """
    expression_values={}
    for path in node_genes:
        print(path)  # progress output
        # Pass the mapping on explicitly so the helper uses the argument given
        # here rather than whatever the module-level name currently holds.
        expression_values[path]=path_expression_value(path,samples,node_genes)
    return expression_values

#expression_values_dict=get_expression_values(node_genes,gse2034_df) # It takes a lot of time to compute
#expression_values_df=pd.DataFrame(expression_values_dict.values(),index=expression_values_dict.keys(),columns=labels)
#expression_values_df.to_csv(r'C:\Users\Foteini Droumalia\Desktop\Φωτεινή Δρουμαλιά\Project\data\expression_values.csv',index = False, header=True) # Convert to csv for future use

In [None]:
import csv

# Expression values for each sub-path and each sample were already calculated and saved in CSV form,
# because computing them takes a long time; load the saved file instead of recomputing.
expression_values_df = pd.read_csv ('Data/expression_values.csv')
expression_values_df

### 1.3.3. P-value and threshold <= 0.05 (gene is significant)

In [None]:
# Convert a cell value from its string form to a list
import ast

# Parse the first cell (row 0, column 0) as a Python literal; the bare expression
# displays the parsed value in the notebook.
ast.literal_eval(expression_values_df.iloc[0,0])