# Main script
## Main idea:
- Each method is implemented for a specific sample and a specific sub-path. 
- Because of the large amount of data available, we currently compute the score of one specific sub-path (path_no = 0) for every sample.
- In the end, each methodology will be applied to each sample and subpathway using iterative methods.
- Current results: DataFrame of each method, where rows are samples and column is sub-path #0

## Ideas: 
- Each tool could be implemented in another file and then be imported as a complete function.

In [None]:
# For examples
sample_no=0
path_no=0

In [None]:
import pandas as pd
import numpy as np
import math

# 1. Read data and make them easier to understand

## 1.1. GSE2034

In [None]:
# Load the GSE2034 table (breast cancer) and keep the raw frame untouched by working on a copy
raw_gse2034_df = pd.read_csv(
    'Data/GSE2034.zip',
    compression='zip',
    header=0,
    sep='\t',
    quotechar='"',
)
gse2034_df = raw_gse2034_df.copy()

# Preprocess dataset: 'Class' is split on '#' into a 'Gene' and a 'KEGG-ID' column,
# then these two new columns are moved in front of the sample columns
gse2034_df[['Gene', 'KEGG-ID']] = gse2034_df['Class'].str.split('#', expand=True)
gse2034_df = gse2034_df.drop(columns='Class')
cols = gse2034_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
gse2034_df = gse2034_df[cols]

#gse2034_df

In [None]:
# Change column names (estrogen receptor): every sample column whose name starts with
# 'ERpos' / 'ERneg' is renamed to exactly that class label; the first two columns are kept.
# The columns are reassigned as a whole instead of writing into `Index.values`: an Index is
# meant to be immutable, and mutating its backing array only reaches the DataFrame when the
# slice happens to share memory with it.
renamed_columns=list(gse2034_df.columns[:2])
for label in gse2034_df.columns[2:]:
    if(label.startswith('ERpos')):
        renamed_columns.append('ERpos')
    elif(label.startswith('ERneg')):
        renamed_columns.append('ERneg')
    else:
        renamed_columns.append(label) # unrecognized labels are left untouched
gse2034_df.columns=renamed_columns
labels=gse2034_df.columns[2:]

In [None]:
# Get the genes that are present in the GSE2034 dataset and create a dictionary 
# where the keys are the genes and the values are the corresponding KEGG-IDs
# 'a gene can be mapped to more than one Entrez identifier'
gene_list=sorted(set(gse2034_df['Gene'].tolist()))
# One entry per gene: all KEGG-IDs found on the rows carrying that gene, in row order
gene_dict={
    gene:list(gse2034_df.loc[gse2034_df['Gene']==gene,'KEGG-ID'])
    for gene in gene_list
}
#print('Gene dictionary (key: Genes, values: KEGG-IDs): '+str(gene_dict))

In [None]:
# transpose dataframe so that the columns indicate the genes
# and rows correspond to samples (class: ERpos or ERneg)
# Remember the gene names, drop the two descriptive columns and flip the table so that
# rows are samples and columns are genes
genes=gse2034_df['Gene'].copy()
gse2034_df=gse2034_df.drop(columns='KEGG-ID').iloc[:,1:].T
gse2034_df.columns=genes.tolist()
#gse2034_df

In [None]:
# Because one gene might correspond to more than one KEGG-IDs, we calculate the average (or max)
# value and get the following simplified dataframe
# Columns sharing a gene name are collapsed into their mean. The grouping is done on the
# transposed frame because `groupby(..., axis=1)` is deprecated in recent pandas releases.
gse2034_df=gse2034_df.T.groupby(level=0).mean().T
gse2034_df['noProbe']=gse2034_df.mean(axis=1) # Row-wise mean over all genes, used later as the value of genes without a column
genes=gse2034_df.columns
gse2034_df

In [None]:
classes=gse2034_df.index
classes

## 1.2. Selected
Cellular processes (15), Signal transduction (Environmental information process) (24), Cancer overview (8).

In [None]:
raw_selected_df = pd.read_csv('Data/Selected.zip', compression='zip', header=0, sep='\t', quotechar='"')[['SubPathID']]
#raw_selected_df

In [None]:
# Two types of nodes relations
relations_dict={'Activation':'-->','Inhibition':'--|'}
#relations_dict

In [None]:
from itertools import chain

# Split each pathway based on the relation
def split_path(data,relation):
    """Split a sub-path string on one relation symbol, keeping the symbol between the pieces.

    Parameters:
        data: sub-path (or fragment of one) as text.
        relation: key of the module-level `relations_dict` ('Activation' or 'Inhibition').

    Returns:
        list with the non-empty pieces of `data` and the relation symbol inserted between
        consecutive pieces. A string that does not contain the symbol comes back as a
        one-element list; an empty string gives [].

    Raises:
        KeyError: if `relation` is not a key of `relations_dict`.
    """
    symbol=relations_dict[relation]
    pieces=[piece for piece in data.split(symbol) if piece]
    s=[]
    for position,piece in enumerate(pieces):
        # The symbol is an edge between two nodes, so it is only emitted between two kept
        # pieces. Counting separators against the raw split (empty pieces included) would
        # leave a dangling symbol at the end whenever a piece is empty.
        if(position>0):
            s.append(symbol)
        s.append(piece)
    return s

# Make the selected dataset easier to understand, by splitting each pathway based on their nodes and edges
def get_pathway(data):
    """Flatten one sub-path string into an alternating list of nodes and relation symbols.

    The string is first split on the activation symbol; every resulting piece longer than
    one character is split again on the inhibition symbol. Depends on `split_path`.
    """
    flat=[]
    for piece in split_path(data,'Activation'):
        if(len(piece)>1):
            flat.extend(split_path(piece,'Inhibition'))
        else:
            # A one-character piece is kept as is (extending with a string adds its characters)
            flat.extend(piece)
    return flat

subpaths_list=[get_pathway(row) for row in raw_selected_df['SubPathID']]
#subpaths_list

In [None]:
selected_df=pd.DataFrame(subpaths_list).fillna(value=np.nan) # Rows: pathways, Cols: edges and nodes
selected_df

In [None]:
# Two types of nodes relations
relations_dict={'Activation':'-->','Inhibition':'--|'}

## 1.3. Important values

### 1.3.1. Node genes- all genes of each node

In [None]:
# Each node in a pathway represents a discrete function mapping to one or more transcript.
# Returns a dictionary corresponding each node of each pathway to its gene or genes.
def Node_genes(df):
    """Map every row (sub-path) of `df` to the gene names of its nodes.

    Only the even column positions below the row's count of non-null cells are read; each
    such cell is split on spaces into gene tokens, and from every token the first non-empty
    part before '#' is kept.

    Returns:
        dict {row position: [[gene, ...] for each node]}.
    """
    genes_per_path={}
    for path_idx in range(df.shape[0]):
        filled=int(df.iloc[path_idx].notnull().sum())
        path_nodes=[]
        for col in range(0,filled,2):
            tokens=[tok.strip() for tok in df.iloc[path_idx,col].split(' ')]
            path_nodes.append([
                [part.strip() for part in tok.split('#') if part.strip()][0]
                for tok in tokens if tok
            ])
        genes_per_path[path_idx]=path_nodes
    return genes_per_path

node_genes=Node_genes(selected_df)
node_genes

### 1.3.2. Expression value
For each subpath assign each sample's genes expression values (3D: samples,subpaths,nodes)

In [None]:
from statistics import mean

# For a specific subpath match the expression values of each gene based on the values in a specific sample.
# In case of multiple genes in one node assign the average value.
def sample_expression_value(path,sample):
    """Return one expression value per node of `path` for a single sample.

    Parameters:
        path: list of nodes, each node being a list of gene names.
        sample: pandas Series of expression values indexed by gene name; it must contain a
            'noProbe' entry, which is used for genes that are not in the index.

    Returns:
        list with the average expression value of each node's genes.

    Raises:
        statistics.StatisticsError: if a node has no genes.
        KeyError: if a gene is missing and the sample has no 'noProbe' entry.
    """
    sample_expr_val=[]
    for node in path:
        node_values=[]
        for gene in node:
            # Look the value up by gene name. Indexing the sample with the loop counter would
            # read the first few entries of the sample instead of this node's genes.
            if(gene in sample.index):
                node_values.append(sample[gene])
            else:
                node_values.append(sample['noProbe'])
        sample_expr_val.append(mean(node_values)) # Average expression value of the node's genes
    return sample_expr_val

#sample_no=0
#path_no=0
#sample_expression_value(node_genes[path_no],gse2034_df.iloc[sample_no])

In [None]:
# For a specific subpath assign the expression values based on each sample and return a 2D array
def subpath_expression_value(path,samples):
    """Return a 2D list: one row per row of `samples`, one column per node of `path`.

    Each row is produced by `sample_expression_value` for the corresponding sample.
    """
    return [
        sample_expression_value(path,samples.iloc[row])
        for row in range(samples.shape[0])
    ]

Because of the time complexity, the expression values are calculated for only one sub-path (path_no) across all samples.

In [None]:
path_no=0
subpath_expr_val=subpath_expression_value(node_genes[path_no],gse2034_df)
subpath_expr_val

### 1.3.3. P-value and threshold <= 0.05 (gene is significant)
For each gene find the p-value (ttest_ind)

In [None]:
import scipy.stats as stats

#gse2034_df2=gse2034_df.copy().T # Columns: samples and rows: genes
#gse2034_df2

#### 1.3.3.1. For each gene find cancer and normal mean value and calculate their difference

In [None]:
#gse2034_df2['ERnegMean']=gse2034_df2['ERneg'].mean(axis=1)
#gse2034_df2['ERposMean']=gse2034_df2['ERpos'].mean(axis=1)
#gse2034_df2['Diffs']=gse2034_df2['ERnegMean']-gse2034_df2['ERposMean']
#gse2034_df2

In [None]:
def pvalue005(genes):
    """Run an independent two-sample t-test (ERneg vs ERpos) on every row of `genes`.

    Parameters:
        genes: DataFrame whose rows are tested one by one; the values under the 'ERneg'
            column label(s) are compared with those under 'ERpos'.

    Returns:
        tuple (all p-values, p-values below 0.05, row positions of those p-values); the
        three items are float numpy arrays.
    """
    n_rows=genes.shape[0]
    all_pvalues=np.empty(n_rows)
    significant_pvalues=[]
    significant_rows=[]
    for row in range(n_rows):
        _,pval=stats.ttest_ind(genes['ERneg'].iloc[row],genes['ERpos'].iloc[row])
        all_pvalues[row]=pval
        if(pval<0.05):
            significant_pvalues.append(pval)
            significant_rows.append(row)
    return (
        all_pvalues,
        np.array(significant_pvalues,dtype=float),
        np.array(significant_rows,dtype=float),
    )

#pvalList2,pvalListUnder005_2,indices005_2=pvalue005(gse2034_df2)

In [None]:
#gse2034_df2['P-Value']=pvalList2
#gse2034_df2

In [None]:
# For each node of a sub-path consisting of more than one genes, get the average value of the p-values.
def path_pvalue(path,df):
    """Return the mean 'P-Value' of each node of `path`.

    Genes that are not in `df.index` take the 'P-Value' of the 'noProbe' row.
    """
    cur_path=[]
    for node in path:
        rows=[gene if gene in df.index else 'noProbe' for gene in node]
        cur_path.append(np.mean([df.loc[row]['P-Value'] for row in rows]))
    return cur_path
         
def get_pvalues(node_genes,df):
    """Apply `path_pvalue` to every sub-path of `node_genes` and return {path: node p-values}."""
    return {path:path_pvalue(node_genes[path],df) for path in node_genes}

pvalue_threshold=0.05

#pvalues_dict=get_pvalues(node_genes,gse2034_df2) # All sub-paths
#pvalues_dict

#pvalues=path_pvalue(node_genes[path_no],gse2034_df2) # One sub-path
#pvalues

### 1.3.4. Fold Change and Log Fold Change
Add fold change column to genes_df

In [None]:
# Calculate fold change for each gene
def fold_change(df):
    """Return {gene: mean of its 'ERneg' values / mean of its 'ERpos' values} for every row label of `df`."""
    fc_dict={}
    for gene in df.index:
        row=df.loc[gene]
        # Fold change (B/A) between the two class averages
        fc_dict[gene]=row['ERneg'].mean()/row['ERpos'].mean()
    return fc_dict

#fc=fold_change(gse2034_df2)
#gse2034_df2['Fold Change']=fold_change(gse2034_df2).values()
#gse2034_df2['Log FC']=[math.log(fc+1-min(gse2034_df2['Fold Change'])) for fc in gse2034_df2['Fold Change']] # Calculate log fold change
#gse2034_df2

In [None]:
# For each node of a sub-path consisting of more than one genes, get the average value of the fold change.
def path_fc(path,df):
    """Return (mean 'Fold Change' per node, mean 'Log FC' per node) for one sub-path.

    Genes that are not in `df.index` take the values of the 'noProbe' row.
    """
    fc=[]
    log_fc=[]
    for node in path:
        rows=[gene if gene in df.index else 'noProbe' for gene in node]
        fc.append(np.mean([df.loc[row]['Fold Change'] for row in rows]))
        log_fc.append(np.mean([df.loc[row]['Log FC'] for row in rows]))
    return fc,log_fc
            

def get_fc(node_genes,df):
    """Apply `path_fc` to every sub-path; return ({path: fold changes}, {path: log fold changes})."""
    fc={}
    log_fc={}
    for path,nodes in node_genes.items():
        fc[path],log_fc[path]=path_fc(nodes,df)
    return fc,log_fc

log_fc_threshold=1.5 

#fc_dict,log_fc_dict=get_fc(node_genes,gse2034_df2) # Return fold change and log fold change in dictionary form for all sub-paths

#fc,log_fc=path_fc(node_genes[path_no],gse2034_df2) # Specific sub-path

### 1.3.5. Extract the DataFrame for future use and create dictionaries for sub-paths based on the previous implemented functions
- Values from 1.3.3. to 1.3.4. are extracted to GSE2034_data.csv so they don't have to be computed every time.
- Create the dictionaries for each subpath using the GSE2034_data file.

In [None]:
#gse2034_df2.to_csv(r'C:\Users\Foteini Droumalia\Desktop\Φωτεινή Δρουμαλιά\Project\data\GSE2034_data.csv',header=True)
# Forward slash in the path: '\G' is an invalid escape sequence in a normal string literal and a
# backslash separator only resolves on Windows; the other reads in this notebook use 'Data/...'.
gse2034_df2 = pd.read_csv('Data/GSE2034_data.csv').iloc[:,1:] # the first stored column is dropped
# Column names: the sample labels of gse2034_df followed by the names of the last six stored columns
new_columns=list(gse2034_df.index)+list(gse2034_df2.columns[-6:])
gse2034_df2.columns=new_columns
gse2034_df2.index=genes
gse2034_df2

#### 1.3.5.1. P-Value corresponding to each path's nodes 

In [None]:
pvalues=path_pvalue(node_genes[path_no],gse2034_df2) # Specific sub-path
pvalues

In [None]:
pvalues_dict=get_pvalues(node_genes,gse2034_df2) # All sub-paths
pvalues_dict

#### 1.3.5.2. Fold-Change and Log fold-Change corresponding to each path's nodes

In [None]:
fc,log_fc=path_fc(node_genes[path_no],gse2034_df2) # Specific sub-path

In [None]:
fc_dict,log_fc_dict=get_fc(node_genes,gse2034_df2) # Return fold change and log fold change in dictionary form for all sub-paths

### 1.3.6. Differentially Expressed Genes (DEG)

In [None]:
# Get the differentialy expressed genes of a specific sub-path
def path_de_genes(pvalues):
    """Flag each node of a sub-path: 1 when its p-value is <= the module-level `pvalue_threshold`, else 0."""
    return [1 if pvalue<=pvalue_threshold else 0 for pvalue in pvalues]

# Get the differentially expressed genes of all sub-paths in dictionary form
def get_de_genes(pvalues_dict):
    """Apply `path_de_genes` to every sub-path and return {path: 0/1 flags}."""
    return {path:path_de_genes(pvalues_dict[path]) for path in pvalues_dict}

de_genes=path_de_genes(pvalues) # Specific sub-path
de_genes_dict=get_de_genes(pvalues_dict) # All sub-paths

# 2. Methods

## 2.1. MinePath

### 2.1.1. Discretization of gene expression values
Transform gene expression values into high (expressed / up-regulated) or low (not-expressed / down-regulated) gene expression binary equivalents

#### 2.1.1.1. The midpoints between each two consecutive values are calculated;

In [None]:
def midpoint(num1,num2):
    """Return the value halfway between `num1` and `num2`."""
    total=num1+num2
    return total/2

# Returns the midpoints for a specific gene
def gene_midpoints(gene):
    """Return the midpoints between consecutive expression values of one gene.

    Parameters:
        gene: pandas Series with the gene's expression value in every sample.

    Returns:
        dict {position: midpoint}. The values are sorted in descending order; for every pair
        of neighbours in that order the key is the original position (0..n-1) of the larger
        value and the midpoint is the mean of the two values. A gene with fewer than two
        values gives {}.
    """
    # Positional labels, so every midpoint can be traced back to a sample position. The index
    # is dropped rather than moved into a column, which also works for a named index or an
    # unnamed Series.
    values=gene.reset_index(drop=True)

    # The expression values of the gene across all samples are sorted in descending order.
    # They are NOT divided by the number of samples: the midpoints must stay on the scale of
    # the expression values they are later compared with.
    sorted_gene=values.sort_values(ascending=False)

    midpoints_dict={}
    for i in range(sorted_gene.shape[0]-1):
        upper=sorted_gene.iloc[i]
        lower=sorted_gene.iloc[i+1]
        midpoints_dict[sorted_gene.index[i]]=(upper+lower)/2
    return midpoints_dict

# Returns the midpoints for all the samples
def get_midpoints(df):
    """Return a list with the `gene_midpoints` dictionary of every column of `df`, in column order."""
    # Returns the collected list (the name `midpoints_dict` does not exist in this scope)
    return [gene_midpoints(df.iloc[:,gene]) for gene in range(df.shape[1])]

#### 2.1.1.2. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S,μi) to denote the IG of the system for midpoint μi.

In [None]:
import math 

# Calculate the proportion of samples in S that belong in Class C
def P(C,S):
    """Return the fraction of the entries of `S` that are equal to the class `C`."""
    members=list(S)
    return members.count(C)/len(S)

def E(S,classes,m=1):
    """Entropy (natural logarithm) of the class labels in `S`, with every term divided by `m`.

    Parameters:
        S: sized iterable with the class label of every sample.
        classes: the class labels to sum over.
        m: divisor applied to every term (default 1, i.e. the plain entropy).

    Returns:
        -sum(p_c * log(p_c) / m) over the classes that occur in S.

    Raises:
        ZeroDivisionError: if `S` is empty or `m` is 0.

    NOTE: the name `E` is defined a second time in the DEAP section further down this file;
    once that cell has run, `E` no longer refers to this entropy function.
    """
    labels=list(S)
    total=len(labels)
    tmp=0
    for c in classes:
        p=labels.count(c)/total
        # A class that does not occur contributes nothing (0*log(0) is taken as 0);
        # math.log(0) would raise a ValueError.
        if(p>0):
            tmp+=p*math.log(p)/m
    return -(tmp)

# Calculate the Information Gain (IG) of the system
def gene_IG(gene,midpoints):
    """Information gain of splitting the samples of one gene at each candidate midpoint.

    Parameters:
        gene: pandas Series of expression values whose index holds the class of each sample.
        midpoints: dict {key: midpoint value}, e.g. as returned by `gene_midpoints`.

    Returns:
        dict {key: IG}, where IG = entropy of all class labels minus the size-weighted
        entropy of the two groups `value < midpoint` and `value >= midpoint` (the same
        comparison that is used when the gene is binarized).

    Raises:
        ZeroDivisionError: if `gene` is empty while `midpoints` is not.
    """
    labels=list(gene.index) # the samples' classes
    values=list(gene)
    total=len(labels)

    def _entropy(subset):
        # Entropy (natural log) of the class labels in `subset`; an empty subset gives 0
        size=len(subset)
        result=0.0
        for c in dict.fromkeys(subset): # distinct classes in first-seen order
            p=subset.count(c)/size
            result-=p*math.log(p)
        return result

    base_entropy=_entropy(labels)
    information_gain={}
    for m,threshold in midpoints.items():
        below=[c for c,v in zip(labels,values) if v<threshold]
        above=[c for c,v in zip(labels,values) if v>=threshold]
        # Entropy of the system after its division into two groups around the midpoint
        split_entropy=(len(below)*_entropy(below)+len(above)*_entropy(above))/total
        information_gain[m]=base_entropy-split_entropy
    return information_gain

# Calculate the Information Gain (IG) of the system
def IG(df):
    """Return {gene: {midpoint key: information gain}} for every column of `df`.

    The midpoints of each gene are computed with `gene_midpoints` and passed on to `gene_IG`,
    which requires them as its second argument.
    """
    information_gain={}
    for gene in df.columns:
        information_gain[gene]=gene_IG(df[gene],gene_midpoints(df[gene]))
    return information_gain

#### 2.1.1.3. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values greater than or equal to the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [None]:
def discretization_point(midpoints,information_gain):
    """Return the midpoint whose key has the highest information gain.

    On ties the first such key in iteration order wins; a key that is missing from
    `midpoints` gives None. An empty `information_gain` raises ValueError.
    """
    best_position,_=max(information_gain.items(),key=lambda item:item[1])
    return midpoints.get(best_position)

def gene_discretization(gene):
    """Binarize the expression values of one gene around its discretization point.

    Parameters:
        gene: pandas Series of expression values (one per sample).

    Returns:
        integer Series with the same index: 1 where the value is >= the discretization point
        (over-expressed), 0 otherwise (under-expressed; this includes missing values).

    The discretization point comes from `gene_midpoints`, `gene_IG` and
    `discretization_point`.
    """
    midpoints=gene_midpoints(gene)
    information_gain=gene_IG(gene,midpoints)
    dis_point=discretization_point(midpoints,information_gain)

    # One comparison against the untouched values. Thresholding in two in-place passes
    # re-tests the zeros written by the first pass and turns them into ones whenever the
    # discretization point is <= 0.
    gene_dis=(gene>=dis_point).astype('int')

    return gene_dis

def discretization(genes):
    """Return {gene: list of binarized values} for every column of `genes` (see `gene_discretization`)."""
    return {gene:list(gene_discretization(genes[gene])) for gene in genes.columns}

In [None]:
# Compute discretized gene expression values
#dis_gse2034_df=pd.DataFrame(discretization(gse2034_df),index=gse2034_df.index)
#dis_gse2034_df

### 2.1.2. Functional sub-paths: Matching sub-paths with gene expression profiles
Due to the large volume of data, we refer only to one patient and in the future we will apply the analysis to the rest

In [None]:
import statistics

# Get the nodes of a specific sub-path and sample in binary form
def sample_functional_subpath(sample,path):
    """Return one value per node of `path` for a single sample.

    Parameters:
        sample: pandas Series indexed by gene name; it must contain a 'noProbe' entry, which
            is used for genes that are not in the index.
        path: list of nodes, each node being a list of gene names.

    Returns:
        list with the mean of each node's gene values.

    Raises:
        statistics.StatisticsError: if a node has no genes.
    """
    expr_prof=[]
    for node in path:
        tmp_node=[]
        for gene in node:
            # Logical `not`, not bitwise `~`: `~True` is -2 and `~False` is -1, both truthy,
            # which would send every gene to the 'noProbe' fallback.
            if(gene not in sample.index):
                tmp_node.append(sample['noProbe'])
                continue
            tmp_node.append(sample[gene])
        expr_prof.append(statistics.mean(tmp_node))
    return expr_prof

#funct_subpath_0=sample_functional_subpath(dis_gse2034_df.iloc[sample_no],node_genes[0])
#funct_subpath_0

In [None]:
# The following functions compute the 'and' and 'xor' boolean operations
def and_boolean_op(num1,num2):
    """'and' of two values, computed as their product (for 0/1 inputs it is 1 only when both are 1)."""
    product=num1*num2
    return product
    
def xor_boolean_op(num1,num2):
    """'xor' of two values: 1 when exactly one of them is truthy, otherwise 0."""
    return 1 if bool(num1)!=bool(num2) else 0

In [None]:
# Calculate the pathway expression of a specific sample and sub-path with boolean operations
def calc_pathway_expression(path,edges,prev_result):
    """Fold the node values of one sub-path into a single value with boolean operations.

    Parameters:
        path: list of node values; path[0] is the node `prev_result` belongs to.
        edges: list of relation symbols, edges[i] connecting path[i] and path[i+1].
        prev_result: value accumulated so far (the callers pass path[0]).

    Returns:
        the value obtained by combining `prev_result` with path[1], path[2], ... in order:
        'and' (product) across an activation edge '-->', 'xor' across an inhibition
        edge '--|'. A path with a single node returns `prev_result` unchanged.

    Raises:
        IndexError: if `edges` has fewer than len(path)-1 entries.
        ValueError: if an edge is not one of the two relation symbols.
    """
    # Two types of nodes relations; the operations mirror and_boolean_op / xor_boolean_op
    operations={
        '-->':lambda left,right:left*right,
        '--|':lambda left,right:1 if(left and not right) or (not left and right) else 0,
    }

    # Walk the path and carry the running result forward, so that the value of the last
    # combination is what gets returned.
    result=prev_result
    for position in range(1,len(path)):
        edge=edges[position-1]
        if(edge not in operations):
            raise ValueError('%r is not a known relation'%(edge,))
        result=operations[edge](result,path[position])
    return result

# Calculate a specific sub-path's expression for each sample with boolean operations
def calc_all_samples_expression(path,edges,samples):
    """Return the expression of one sub-path for every row of `samples`.

    For each sample the node values come from `sample_functional_subpath` and are folded
    with `calc_pathway_expression`, starting from the value of the first node.
    """
    def one_sample(row):
        profile=sample_functional_subpath(samples.iloc[row],path)
        return calc_pathway_expression(profile,edges,profile[0])

    return [one_sample(row) for row in range(samples.shape[0])]

In [None]:
# For a specific sub-path and each sample calculate the sub-path's expression 
# Rows: samples, column: pathway #0
'''
path_no=0
path=node_genes[path_no]
edges=list(selected_df.iloc[path_no][~selected_df.iloc[path_no].isnull()][1::2])
subpath_0_expression=calc_all_samples_expression(path,edges,dis_gse2034_df)
    
print('Expression of sub-path #%d : %s'%(path_no,subpath_0_expression))
'''

In [None]:
# For each sample and each sub-path calculate the sub-paths' expression
'''
subpaths_expression={}
for path in node_genes:
    print((path+1)*100/len(node_genes))
    cur_path=node_genes[path]
    edges=list(selected_df.iloc[path][~selected_df.iloc[path].isnull()][1::2])
    sample_expression=[]
    for sample in range(dis_gse2034_df.shape[0]):
        funct_subpath=sample_functional_subpath(dis_gse2034_df.iloc[sample],cur_path)
        sample_expression.append(calc_pathway_expression(funct_subpath,edges,funct_subpath[0]))
    subpaths_expression.update({path:sample_expression})
'''

### 2.1.3. DataFrame with final score of each sample for a specific sub-path

In [None]:
#minepath_df=pd.DataFrame(subpath_0_expression,columns=[path_no])
#minepath_df.to_csv('Results/MinePath.csv',index=False) # Already computed

minepath_df = pd.read_csv ('Results/MinePath.csv')
minepath_df.index=gse2034_df.index
minepath_df

## 2.2. TAPPA

### 2.2.1. Adjacency matrix

In [None]:
# The adjacency matrix is defined as A=(a_ij), where a_ij=1 if i=j or (g_i, g_j) belongs to E and a_ij=0 if (g_i, g_j) does 
# not belong to E.
def path_adjacency_matrix(nodes):
    """Return the adjacency matrix (list of lists) of a linear sub-path.

    a_ij is 1 on the diagonal (i=j) and for neighbouring nodes (|i-j|=1), since in a linear
    sub-path every node is only connected to the next one; all other entries are 0.
    """
    size=len(nodes)
    return [
        [1 if abs(row-col)<=1 else 0 for col in range(size)]
        for row in range(size)
    ]

# Returns the adjacency matrices of each sub-path
def adjacency_matrix(paths):
    """Return {path: adjacency matrix} for every sub-path in `paths` (see `path_adjacency_matrix`)."""
    return {path:path_adjacency_matrix(paths[path]) for path in paths}

#### 2.2.1.2. Define PCI
Assuming that x_is is the normalized log expression measurement for gene i in sample s.

In [None]:
def sigmoid_normalization(df):
    """Standardize every column of `df` and squash the result into (-0.5, 0.5).

    Parameters:
        df: DataFrame of expression values (rows: samples, columns: genes).

    Returns:
        dict {column: list of normalized values}, one value per row, computed as
        sigmoid(z) - 0.5 where z = (value - column mean) / column standard deviation.
    """
    # Each column's expression values are normalized to zero mean and unit standard deviation.
    tmp_df=(df-df.mean())/df.std()

    # Further normalize to (-0.5,0.5) with the sigmoid function to lower the effect of
    # extremely large/small values. The sigmoid is applied to the standardized values
    # (tmp_df); applying it to `df` would discard the standardization above.
    x={}
    for gene in tmp_df.columns:
        x[gene]=[1/(1+math.exp(-value))-0.5 for value in tmp_df[gene]]
    return x

#gene_expression_df=pd.DataFrame(sigmoid_normalization(gse2034_df),index=gse2034_df.index)

In [None]:
from statistics import mean

# Define x for a specific sub-path and sample
def path_x(path,sample):
    """Return the value x of every node of `path` for one sample.

    Each node consists of one or more genes, so it gets the average of their values; genes
    that are not in `sample.index` take the sample's 'noProbe' value.
    """
    x=[]
    for node in path:
        node_values=[
            sample[gene] if gene in sample.index else sample['noProbe']
            for gene in node
        ]
        x.append(mean(node_values))
    return x

# Define x for a specific sub-path and each sample
def get_x(path,samples):
    """Return `path_x` of the sub-path for every row of `samples`, as a list of lists."""
    return [path_x(path,samples.iloc[row]) for row in range(samples.shape[0])]
        
# Node_genes was initialized on a previous method (PRS)
#x=get_x(node_genes[path_no],gene_expression_df)

In [None]:
def path_PCI(path,x,a):
    """Return the PCI of one sub-path for one sample.

    PCI = sum over all node pairs (i,j) of sgn(x_i+x_j) * |x_i|^0.5 * a_ij * |x_j|^0.5,
    where `x` holds the node values and `a` is the adjacency matrix. Only the first
    len(path) entries of `x` and `a` are used.
    """
    N=len(path) # Number of nodes (edges are not part of `path`)
    roots=[abs(x[i])**0.5 for i in range(N)]
    return sum(
        np.sign(x[i]+x[j])*roots[i]*a[i][j]*roots[j]
        for i in range(N)
        for j in range(N)
    )

# Calculate PCI for a specific sub-path and each sample
def PCI(path,x,a):
    """Return `path_PCI` of the sub-path for every sample in `x` (a list of per-sample node values)."""
    return [path_PCI(path,sample_x,a) for sample_x in x]

#pci=PCI(node_genes[path_no],x,path_adjacency_matrix(node_genes[path_no]))

### 2.2.2. Normalize PCI (divided by the gene number in pathway)

In [None]:
def normalize_PCI(pci,path):
    """Divide every sample's PCI by the number of nodes in `path`."""
    return [sample_pci/len(path) for sample_pci in pci]

#norm_pci=normalize_PCI(pci,node_genes[path_no])
#tappa_df=pd.DataFrame(norm_pci,columns=[path_no])

### 2.2.3. DataFrame with final score of each sample for a specific sub-path

In [None]:
#tappa_df.to_csv('Results/TAPPA.csv',index=False) # Already computed
# Load the stored TAPPA scores and label the rows with the sample classes of gse2034_df
tappa_df = pd.read_csv ('Results/TAPPA.csv')
tappa_df.index=gse2034_df.index
tappa_df

## 2.3. HiPathia

### 2.3.1. Normalize the gene expression values

In [None]:
# The normalization process is not defined,so the normalized gene expression values calculated for the TAPPA method are used.
#hipathia_norm_gse2034_df=gene_expression_df.copy()

### 2.3.2. The Hipathia mechanistic model

In [None]:
# Normalized gene expression value for specific sub-path and sample
def sample_u(path,sample):
    """Return the value u of every node of `path` for one sample.

    A node's value is the mean of its genes' values; genes that are not in `sample.index`
    take the sample's 'noProbe' value.
    """
    def gene_value(gene):
        key=gene if gene in sample.index else 'noProbe'
        return sample[key]

    return [mean([gene_value(gene) for gene in node]) for node in path]

# Normalized gene expression for specific sub-path and each sample
def get_u(path,samples):
    """Return `sample_u` of the sub-path for every row of `samples`, as a list of lists."""
    return [sample_u(path,samples.iloc[row]) for row in range(samples.shape[0])]

#u=get_u(node_genes[path_no],hipathia_norm_gse2034_df)

In [None]:
# Signal intensity of each node for a specific sub-path and sample
def sample_mechanistic_model(path,edges,u,relations_dict):
    """Propagate the signal along one sub-path for one sample and return it at the last node.

    Parameters:
        path: the sub-path's nodes (not used by the computation).
        edges: list of relation symbols; edges[i] is the edge leaving node i.
        u: list with the normalized value of every node for this sample (non-empty).
        relations_dict: mapping with at least the key 'Activation'; every edge that differs
            from relations_dict['Activation'] is treated as an inhibition.

    Returns:
        the signal intensity of the last node. A sub-path with a single node returns u[0].

    NOTE(review): every upstream node (not only the direct parent) is folded into the
    activation/inhibition products of a node, and a node without any upstream activation
    edge gets a signal of 0 — confirm that this is the intended model for linear sub-paths.
    """
    cur_s=[u[0]]
    for node in range(1,len(u)):
        cur_s_a=1
        cur_s_i=1
        for prev_node in range(node):
            # Activation signals
            if(edges[prev_node]==relations_dict['Activation']):
                cur_s_a=cur_s_a*(1-cur_s[prev_node])
            # Inhibition signals
            else:
                cur_s_i=cur_s_i*(1-cur_s[prev_node])
        new_u=u[node]*(1-cur_s_a)*cur_s_i
        cur_s.append(new_u)
    # Last computed signal; indexing with the loop variable fails when the loop never runs
    return cur_s[-1]

# Signal intensity of each node for specific sub-path and each sample
def mechanistic_model(path,edges,u,relations_dict):
    """Return the last-node signal of the sub-path for every sample in `u`.

    Changes in the activity of the nodes are reflected (or remain unnoticed) in the last
    effector node, so only that value is kept per sample (see `sample_mechanistic_model`).
    """
    return [
        sample_mechanistic_model(path,edges,sample_values,relations_dict)
        for sample_values in u
    ]

#edges=list(selected_df.iloc[path_no][~selected_df.iloc[path_no].isnull()][1::2])
#s=mechanistic_model(node_genes[path_no],edges,u,relations_dict)

### 2.3.3. Create DataFrame with final score of each sample for a specific sub-path

In [None]:
#hipathia_df=pd.DataFrame(s,columns=[path_no])
#hipathia_df.to_csv('Results/HiPathia.csv',index=False) # Already computed
# Load the stored HiPathia scores and label the rows with the sample classes of gse2034_df
hipathia_df = pd.read_csv ('Results/HiPathia.csv')
hipathia_df.index=gse2034_df.index
hipathia_df

## 2.4. TEAK
### 2.4.1. SubPathway Ranking
#### 2.4.1.1. Conditional Probability Distribution
- Node Y with m continuous parents X1,...,Xm
- β0, ... , βm are the regression coefficients
- σ^2 is the variance

In [None]:
# Regression coefficients: b1 = Σ[(xi-x̄)(yi-ȳ)] / Σ[(xi-x̄)^2] and b0 = ȳ - b1*x̄, where x̄ and ȳ are
# the mean values of x and y respectively.
# Since our subpaths are linear, each node Y has one parent node x, and so only 2 coefficients (β0,β1).
def estimate_coef(x, y):
    """Return (b_0, b_1) of the least-squares line y = b_0 + b_1*x.

    `x` and `y` are numpy arrays of equal length. With a single observation the raw products
    y*x and x*x are used in place of the deviations; when the deviation about x is zero the
    slope is set to 0.
    """
    n = np.size(x) # number of observations/points
    m_x, m_y = np.mean(x), np.mean(y)

    # cross-deviation and deviation about x
    if n == 1:
        SS_xy, SS_xx = (y*x)[0], (x*x)[0]
    else:
        SS_xy = sum(y*x) - n*m_y*m_x
        SS_xx = sum(x*x) - n*m_x*m_x

    b_1 = SS_xy / SS_xx if SS_xx else 0
    b_0 = m_y - b_1*m_x
    return (b_0, b_1)

In [None]:
import statistics
import scipy.stats

# Calculate the Conditional Probability Distribution for a specific node and sub-path
def node_cond_prob_distr(node_no,expr_vals,reg_coef,variance):
    """Draw one value from the linear Gaussian distribution of a node given its parent.

    Parameters:
        node_no: position of the node in the sub-path.
        expr_vals: expression values of the sub-path's nodes for one sample.
        reg_coef: pair (β0, β1) of regression coefficients.
        variance: variance σ^2 of the distribution.

    Returns:
        β0 for the root node (node_no == 0, no parents); otherwise one random draw from
        N(β0 + β1 * value of the previous node, σ^2). The draw is random, so repeated calls
        give different results unless numpy's global random state is seeded.
    """
    distr_mean=reg_coef[0]
    if(node_no==0): # If current node is root (without parents), then return β0
        return distr_mean
    distr_mean+=reg_coef[1]*expr_vals[node_no-1]

    # scipy's `scale` is the standard deviation, so the variance is square-rooted first.
    # A variance that is not positive (zero, negative or NaN) degenerates to a scale of 0,
    # i.e. the mean itself is returned.
    std_dev=math.sqrt(variance) if variance>0 else 0.0
    distribution = scipy.stats.norm(loc=distr_mean,scale=std_dev)
    sample = distribution.rvs(size=1)[0] # linear networks: size=1

    return sample
    
# Calculate the Conditonal Probability Distribution for a specific sample and sub-path
def sample_cond_prob_distr(expr_vals):
    """Return one value per node of a sub-path for one sample (see `node_cond_prob_distr`).

    Parameters:
        expr_vals: expression values of the sub-path's nodes for one sample; at least two
            nodes are needed (a shorter list fails when the variance is looked up).

    Returns:
        list with β0 for the root node followed by one random draw per remaining node.
    """
    x = np.array(expr_vals[:-1]) # Continuous parents x1,...,xm
    y = np.array(expr_vals[1:]) # Continuous nodes Y
    # estimate_coef(x, y) fits y = β0 + β1*x. The coefficients are later applied to the
    # parent's value to predict the child, so the parents must be passed as x.
    reg_coef=estimate_coef(x,y)

    if(len(y)>1):
        variance=statistics.variance(y)
    else:
        # NOTE(review): with a single child there is no sample variance; the child's value
        # itself is used here — confirm that this fallback is intended.
        variance=y[0]

    cond_prob=[]
    for node in range(len(expr_vals)):
        cond_prob.append(node_cond_prob_distr(node,expr_vals,reg_coef,variance))
    return cond_prob
    
# Calculate the Conditional Probability Distribution for a specific sub-path and each sample
def cond_prob_distr(expr_vals):
    """Return `sample_cond_prob_distr` for every sample in `expr_vals` (a list of per-sample node values)."""
    # Iterates the samples directly; the loop variable and the name used in the body must match
    return [sample_cond_prob_distr(sample_vals) for sample_vals in expr_vals]

#cond_prob=cond_prob_distr(subpath_expr_val)

#### 2.4.1.2. $\mathrm{Score}_{BIC} = \log P(D \mid \theta) - \frac{d}{2}\log N$
- D: gene expression data
- θ: maximum likelihood estimate of the parameters used to represent the linear Gaussian node
- d: number of parameters
- N: number of samples in the gene expression data

In [None]:
# Get node's BIC score for a specific sub-path and sample
def node_score_BIC(cond_prob_distr,node_no,N):
    """Return the BIC score of one node for a specific sub-path and sample.

    The node's value is shifted before the logarithm (log(Y + 1 - min(Y))) so that negative
    values can be handled.
    Source: https://blogs.sas.com/content/iml/2011/04/27/log-transformations-how-to-handle-negative-data-values.html
    The root node (node_no == 0, no parents) gets no penalty; every other node has one
    parent (d=1) and is penalized by 0.5*log(N).
    """
    shifted_log=math.log(cond_prob_distr[node_no]+1-min(cond_prob_distr))
    if(node_no==0):
        return shifted_log
    return shifted_log-0.5*math.log(N)

# Get BIC score for a specific sub-path and sample
def sample_score_BIC(cond_prob_distr,N):
    """Return the BIC score of a sub-path for one sample: the sum of its nodes' scores (see `node_score_BIC`)."""
    return sum(
        [node_score_BIC(cond_prob_distr,node,N) for node in range(len(cond_prob_distr))]
    )

# Get BIC score for a specific sub-path and each sample
def score_BIC(cond_prob,N):
    """Return `sample_score_BIC` for every sample in `cond_prob` (a list of per-sample node values)."""
    return [sample_score_BIC(sample_probs,N) for sample_probs in cond_prob]
    
#bic=score_BIC(cond_prob,gse2034_df.shape[1])

#### 2.4.1.3. Normalize scores
The score for each subpathway is normalized by its number of nodes, so that the scores are comparable.

In [None]:
def get_normalized_scores(scores,path):
    """Divide every sample's score by the number of nodes in `path`, so that sub-paths of different length are comparable."""
    return [score/len(path) for score in scores]

#norm_score_BIC=get_normalized_scores(bic,node_genes[path_no])

### 2.4.2. Create DataFrame with final score of each sample for a specific sub-path

In [None]:
#teak_df=pd.DataFrame(norm_score_BIC,columns=[path_no])
#teak_df.to_csv('Results/TEAK.csv',index=False) # Already computed
# Load the stored TEAK scores and label the rows with the sample classes of gse2034_df
teak_df = pd.read_csv ('Results/TEAK.csv')
teak_df.index=gse2034_df.index
teak_df

In [None]:
# Calculate score for each subpath and sample
'''
all_norm_pci={}
for path in node_genes:
    print((path+1)*100/len(node_genes))
    subpath_expr_val=subpath_expression_value(node_genes[path],gse2034_df)
    cond_prob=cond_prob_distr(subpath_expr_val)
    bic=score_BIC(cond_prob,gse2034_df.shape[1])
    all_norm_pci.update({path:get_normalized_scores(bic,node_genes[path])})
'''

## 2.5. DEAP

### 2.5.1. Expression data
- Formula: E=d(μ+g)+e

In [None]:
# The indicator of whether a gene is ‘on’ or ‘off’
def get_d(pvalues,fc,up_threshold=0,p_threshold=None):
    """Return the on/off indicator d of every node of a sub-path.

    Parameters:
        pvalues: p-value of every node.
        fc: (log) fold change of every node, same length as `pvalues`.
        up_threshold: a node that is 'on' counts as up-regulated when its `fc` value is
            greater than this. The default 0 fits log fold changes; a plain fold-change
            ratio is never negative, so pass 1 for ratios.
        p_threshold: p-value up to which a node is 'on'; defaults to the module-level
            `pvalue_threshold`.

    Returns:
        list with 0 (off), 1 (on, up-regulated) or -1 (on, down-regulated) per node.
    """
    if(p_threshold is None):
        p_threshold=pvalue_threshold
    d=[]
    for p in range(len(pvalues)):
        if(pvalues[p]>p_threshold): # Gene is off
            d.append(0)
        elif(fc[p]>up_threshold): # Gene is on and up-regulated
            d.append(1)
        else: # Gene is on and down-regulated
            d.append(-1)
    return d

#get_d(pvalues,fc)

In [None]:
# The mean of the absolute value of expression for ‘on’ genes (pathway effect) of a specific sub-path and sample
def get_m(pvalues,pvalue_threshold,expr_vals,samples):
    """Return the mean absolute expression value of the 'on' nodes of a sub-path for one sample.

    A node is 'on' when its p-value is <= `pvalue_threshold`; when no node is on, 0 is
    returned. `samples` is not used.
    """
    on_values=[
        abs(expr_vals[node])
        for node in range(len(expr_vals))
        if pvalues[node]<=pvalue_threshold
    ]
    return statistics.mean(on_values) if on_values else 0

#get_m(pvalues,pvalue_threshold,subpath_expr_val[0],gse2034_df)

In [None]:
# Calculate the variance of individual gene for a specific sub-path and sample
def variance(data): # σ
    """Return the population variance of `data` (mean squared deviation, divided by n)."""
    count=len(data)
    center=sum(data)/count
    squared_deviations=[(value-center)**2 for value in data]
    return sum(squared_deviations)/count

# Normal distribution of specific sub-path, sample and node
def normal_dist(x , mean , sd):
    """Return the probability density of the normal distribution N(mean, sd**2) at x.

    x    -- point (or numpy array of points) at which the density is evaluated
    mean -- mean of the distribution
    sd   -- standard deviation of the distribution (must be non-zero)

    The density is exp(-0.5*((x-mean)/sd)**2) / (sd*sqrt(2*pi)). The previous
    implementation multiplied the exponential by pi*sd instead of dividing it
    by sd*sqrt(2*pi), which is not a probability density.
    """
    normalisation = sd * np.sqrt(2 * np.pi)
    prob_density = np.exp(-0.5*((x-mean)/sd)**2) / normalisation
    return prob_density

# The variable g for a specific sub-path and sample is assumed to come from a normal distribution with mean 0 and variance σ
def sample_g(expr_val):
    """Return g for every node of one sample.

    Each entry is the zero-mean normal density evaluated at the node's
    expression value, with the spread estimated from expr_val itself.

    expr_val -- expression value of each node for one sample

    normal_dist expects a standard deviation, so the square root of the
    variance is passed (previously the variance itself was passed as `sd`).
    When the variance is zero the density is undefined (division by zero)
    and g is set to 0 for every node.
    """
    mean=0
    sd=math.sqrt(variance(expr_val))
    if(sd==0): # No spread: avoid dividing by zero inside normal_dist
        return len(expr_val)*[0]
    return [normal_dist(expr_val[node],mean,sd) for node in range(len(expr_val))]

# g for a specific sub-path and each sample
def get_g(samples_expr_val):
    """Return a list holding sample_g(...) of every sample, in sample order.

    samples_expr_val -- indexable collection with the expression values of each sample
    """
    return [sample_g(samples_expr_val[position]) for position in range(len(samples_expr_val))]

#sample_g(subpath_expr_val[0])

In [None]:
# e for a specific sub-path and sample
def sample_e(expr_val):
    """Return e for every node of one sample.

    Each entry is normal_dist evaluated at the node's expression value with
    mean 0 and sd 1.

    expr_val -- expression value of each node for one sample
    """
    centre,spread=0,1
    return [normal_dist(expr_val[position],centre,spread) for position in range(len(expr_val))]

# e for a specific sub-path and each sample
def get_e(samples_expr_val):
    """Return a list holding sample_e(...) of every sample, in sample order.

    samples_expr_val -- indexable collection with the expression values of each sample
    """
    return [sample_e(samples_expr_val[position]) for position in range(len(samples_expr_val))]

#get_e(subpath_expr_val)

In [None]:
# Expression data (presumably on a log scale) for each gene in a pathway was simulated using a multivariate normal distribution
def sample_E(expr_val,d,m):
    """Return E=d*(m+g)+e for every node of one sample.

    expr_val -- expression value of each node for one sample
    d        -- on/off direction indicator of each node
    m        -- pathway effect of the sample (a single number)

    g and e are obtained from sample_g and sample_e for the same expr_val.
    """
    g_terms=sample_g(expr_val)
    e_terms=sample_e(expr_val)
    return [
        d[position]*(m+g_terms[position])+e_terms[position]
        for position in range(len(expr_val))
    ]
        
# E for a specific sub-path and each sample
def E(samples_expr_val,pvalues,pvalue_threshold,fc,genes):
    """Return the simulated expression E of every sample of one sub-path.

    samples_expr_val -- indexable collection with the expression values of each sample
    pvalues          -- p-value of each node
    pvalue_threshold -- p-value cut-off handed to get_m
    fc               -- fold change of each node
    genes            -- forwarded to get_m as its last argument

    Returns a list with one sample_E(...) result per sample.
    """
    # d depends only on pvalues and fc, not on the sample, so it is computed once.
    # NOTE: get_d is called without a threshold and therefore uses the
    # module-level pvalue_threshold, not the argument of this function.
    d=get_d(pvalues,fc)

    expression=[]
    for sample in range(len(samples_expr_val)):
        sample_values=samples_expr_val[sample]
        m=get_m(pvalues,pvalue_threshold,sample_values,genes)
        expression.append(sample_E(sample_values,d,m))
    return expression

#E(subpath_expr_val,pvalues,pvalue_threshold,fc,gse2034_df)

### 2.5.2. Calculate differential expression
- A recursive function calculates the differential expression for each path by adding or subtracting all downstream nodes with catalytic or inhibitory relationships, respectively.
- The absolute value of the expression level is utilized as the DEAP score.

In [None]:
# B1+(B2*relation+(B3*relation+(...)))
def sample_deap_score(expr_val,edges,relations_dict,path=None):
    """Return the DEAP score of one sub-path for one sample.

    The score starts from the value of the final node. Walking backwards,
    every earlier node is added when the edge assigned to it equals
    relations_dict['Activation'] and subtracted otherwise. The absolute value
    of the total is returned.

    expr_val       -- value of each node, ordered along the sub-path (non-empty)
    edges          -- relation code of each edge; the last len(expr_val)-1
                      entries are the ones that are read
    relations_dict -- maps relation names to codes; only 'Activation' is read
    path           -- optional replacement for expr_val; ignored when None or empty

    The default of `path` is None rather than a shared mutable list.
    """
    if path is not None and len(path)>0: # A specific sub-path was provided
        expr_val=path

    last=len(expr_val)-1
    score=expr_val[last]
    for node in range(last-1,-1,-1): # Iterate backwards, starting next to the final node
        e=len(edges)-(last-node) # Edge assigned to this node, counted from the end of edges
        if(edges[e]==relations_dict['Activation']): # Activation: +1
            score+=expr_val[node]
        else: # Inhibition: -1
            score-=expr_val[node]
    # Return the absolute value of the score
    return abs(score)

# DEAP score of a specific sub-path for each sample
def deap_score(samples_expr_val,edges,relations_dict):
    """Return a list holding sample_deap_score(...) of every sample, in sample order.

    samples_expr_val -- indexable collection with the node values of each sample
    edges            -- relation code of each edge of the sub-path
    relations_dict   -- maps relation names to codes
    """
    return [
        sample_deap_score(samples_expr_val[position],edges,relations_dict)
        for position in range(len(samples_expr_val))
    ]

#score=deap_score(subpath_expr_val,edges,relations_dict)

### 2.5.3. Random rotation
- Rotate data n times and recalculate DEAP score for every rotation sample.

In [None]:
# Rotate a list to the left
def rotateList(arr,d=1):
    """Rotate arr left by d positions in place and return the same object.

    arr -- list to rotate (modified in place through slice assignment)
    d   -- number of leading elements moved to the end
    """
    leading,remaining=arr[:d],arr[d:]
    arr[:]=remaining+leading
    return arr

# Rotate specific sub-path with specific sample's expression values
def sample_random_rotation(expr_val,edges,relations_dict,score_list=None,k=0,n=100):
    """Return the DEAP scores of expr_val and of its successive rotations.

    The score of expr_val is recorded first. Then, while the counter k is
    below n, the values are rotated one position to the left and the score of
    the rotated values is recorded. With the defaults this gives n+1 scores.

    expr_val       -- node values of one sample; never modified (a copy is rotated)
    edges          -- relation code of each edge of the sub-path
    relations_dict -- maps relation names to codes
    score_list     -- optional list that the scores are appended to; a new
                      list is created when omitted
    k              -- number of rotations already performed
    n              -- total number of rotations

    Changes from the previous recursive version:
    - the default score_list is no longer a single list shared by every call,
      so scores of earlier calls do not leak into later ones;
    - a caller-supplied n is honoured for the whole run (it used to be dropped
      after the first step);
    - each rotation is scored once instead of twice;
    - k > n returns after the first score instead of recursing without end.
    """
    if score_list is None:
        score_list=[]

    current=list(expr_val) # Work on a copy: rotateList rotates in place
    score_list.append(sample_deap_score(current,edges,relations_dict))
    while k<n:
        rotateList(current)
        score_list.append(sample_deap_score(current,edges,relations_dict))
        k+=1
    return score_list

# Rotate specific sub-path for each sample
def random_rotation(samples_expr_val,edges,relations_dict):
    """Return, for every sample, the mean DEAP score over its rotations.

    samples_expr_val -- indexable collection with the node values of each sample
    edges            -- relation code of each edge of the sub-path
    relations_dict   -- maps relation names to codes

    Returns a list with one mean score per sample, in sample order.
    """
    new_score=[]
    for sample in range(len(samples_expr_val)):
        # A fresh list is passed explicitly for every sample so that the scores
        # of one sample can never be accumulated into the mean of the next one
        # through the callee's default argument.
        rotation_scores=sample_random_rotation(samples_expr_val[sample],edges,relations_dict,[])
        new_score.append(statistics.mean(rotation_scores))
    return new_score

#rotated_score=random_rotation(subpath_expr_val,edges,relations_dict)

### 2.5.4. Create DataFrame with final score of each sub-path for a specific sample

In [None]:
# The DEAP scores were computed once and written to 'Results/DEAP.csv' by the two
# commented-out lines below; this cell only reloads them from that file.
#deap_df=pd.DataFrame(rotated_score,columns=[path_no])
#deap_df.to_csv('Results/DEAP.csv',index=False) # Already computed
deap_df = pd.read_csv ('Results/DEAP.csv')
deap_df.index=gse2034_df.index # Label the rows with the index of gse2034_df
deap_df

## 2.6. PRS
Each node in a pathway has three attributes: Node_genes, Node_value (NV), Node_weight (NW)
### 2.6.1.  Development of the PRS algorithm
#### 2.6.1.1. Node_genes

In [None]:
# Already calculated in section 1.3.1; the bare expression only displays the value.
node_genes

#### 2.6.1.2. Node_value (NV)
Instead of p-value, the z-score was computed for each gene and sample, in order to categorize them as non-expressed, expressed but non-significant and expressed and significant (above threshold). The value selected for z-score threshold is 1.96 (corresponding to p-value's threshold=0.05).

In [None]:
# Calculate for each gene and sample the z-score and determine the threshold.
# NOTE(review): axis=1 standardizes along each row of gse2034_df, and later cells read
# one row of the result per sample through .iloc -- confirm this orientation is intended.

import scipy.stats as stats
z_threshold=1.96 # Stated in the text above to correspond to a p-value threshold of 0.05
z_score_df=stats.zscore(gse2034_df, axis=1)
z_score_df

In [None]:
# For a specific sub-path and sample assign to each node consisting of one or more genes the corresponding z-values
def Sample_Z_Score(path,sample_z_score):
    """Return the z-scores of one sample grouped by node.

    path           -- sub-path given as a sequence of nodes, each node being a
                      sequence of gene labels
    sample_z_score -- z-scores of one sample, indexed by gene label; must
                      contain the label 'noProbe'

    A gene that is missing from the index gets the value stored under
    'noProbe'. Returns one list of z-scores per node.
    """
    known_genes=sample_z_score.index
    return [
        [sample_z_score[gene if gene in known_genes else 'noProbe'] for gene in node]
        for node in path
    ]

# For a specific sub-path and each sample assign the corresponding z-score values
def Z_Score(path,sample_z):
    """Return the node-grouped z-scores of every sample.

    path     -- sub-path given as a sequence of nodes (sequences of gene labels)
    sample_z -- table of z-scores with one row per sample, read with .iloc

    Returns a list with one Sample_Z_Score(...) result per row. The helper is
    called under its defined name, Sample_Z_Score; the previous lowercase
    spelling referred to a name that is not defined.
    """
    z=[]
    for sample in range(sample_z.shape[0]):
        z.append(Sample_Z_Score(path,sample_z.iloc[sample]))
    return z

# z-scores of the example sub-path (path_no) for every row of z_score_df
z_score=Z_Score(node_genes[path_no],z_score_df)
z_score

In [None]:
# For a specific sub-path and sample calculate the Node_value.
# Each node is assigned a value derived from expression data. The following values are assigned to the node: 0 if the 
# corresponding gene or genes are not expressed (z-score < 0), 1 if they are expressed but remain unchanged (z-score > 0 
# and z-score < threshold), or the maximum fold-change value if one or more of the mapped transcripts is above threshold
# (z-score > threshold).

def gene_status(z,threshold):
    """Classify a gene from its z-score.

    z         -- z-score of the gene
    threshold -- significance cut-off

    Returns "not expressed" when z < 0, "non-significant" when
    0 <= z < threshold and "significant" otherwise.
    """
    if z<0:
        return "not expressed"
    # Expressed from here on (z-score >= 0)
    return "non-significant" if z<threshold else "significant"

# Node_value of every node of a specific sub-path for a specific sample
def sample_Node_Value(path,sample,z,threshold,fc):
    """Return the Node_value (NV) of every node of the sub-path.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    sample    -- data of one sample; only its index is used, to test whether a
                 gene is present
    z         -- z-scores of the sample, indexed by gene label
    threshold -- z-score significance cut-off handed to gene_status
    fc        -- fold changes indexed by gene label

    A gene missing from sample.index is looked up under the label 'noProbe'
    in both z and fc. When at least one gene of a node is significant the
    node value is the largest fold change among the node's genes; otherwise
    it is the mean of 1 (non-significant) and 0 (not expressed) over the
    node's distinct genes.
    """
    node_value=[]
    for node in path:
        status={}
        node_fc=[]
        for gene in node:
            label=gene if gene in sample.index else 'noProbe'
            status[gene]=gene_status(z[label],threshold)
            node_fc.append(fc[label])

        labels=list(status.values())
        if 'significant' in labels:
            # One or more genes are significant: use the maximum fold-change value
            node_value.append(max(node_fc))
            continue
        # 1 for a non-significant gene, 0 for a gene that is not expressed
        flags=[1 if label=='non-significant' else 0 for label in labels]
        node_value.append(statistics.mean(flags)) # Get average
    return node_value
 
# For a specific sub-path and each sample 
def Node_Value(path,samples,z,threshold,fc):
    """Return the Node_value list of every sample.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    samples   -- table with one row per sample, read with .iloc
    z         -- table of z-scores with one row per sample, read with .iloc
    threshold -- z-score significance cut-off
    fc        -- fold changes indexed by gene label

    Returns a list with one sample_Node_Value(...) result per row of samples.
    The `threshold` argument is forwarded; previously the module-level
    z_threshold was used and the argument was silently ignored.
    """
    node_value=[]
    for sample in range(samples.shape[0]):
        node_value.append(sample_Node_Value(path,samples.iloc[sample],z.iloc[sample],threshold,fc))
    return node_value

#NV=Node_Value(node_genes[path_no],gse2034_df,z_score_df,z_threshold,gse2034_df2['Fold Change'])

In [None]:
# Each node is assigned a value derived from expression data. The following values are assigned to the node: 0 if the 
# corresponding gene or genes are not expressed, 1 if they are expressed but remain unchanged (non-significant), or the
# maximum fold-change value if one or more of the mapped transcripts is above threshold.
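# Alternative approach (disabled: kept inside the string literal below), based on 'Log FC' and 'P-Value' thresholds instead of z-scores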

'''
def gene_status(data,log_fc_threshold,pvalue_threshold):
    if(data['Log FC']>log_fc_threshold): # not expressed
        return 'not expressed'
    # Expressed
    if(data['P-Value']<pvalue_threshold): # non-significant
        return 'non-significant'
    else: # significant
        return 'significant'

# Assign node values for a specific sub-path and sample
def sample_expressed_genes(path,sample,data,log_fc_threshold,pvalue_threshold):
    node_value=[]
    for node in path:
        gene_value=[]
        genes_status={}
        for gene in node:
            # Check if gene exists in dataset
            if(not(gene in sample.index)):
                gene_data=data.loc['noProbe']
            else:
                gene_data=data.loc[gene]
                
            genes_status.update({gene:gene_status(gene_data,log_fc_threshold,pvalue_threshold)})
        
        # Check if one or more are significant
        if(list(genes_status.values()).count('significant')):
            node_value.append(max([sample[k] for k in genes_status if genes_status[k]=='significant']))
        else:
            tmp_value=[]
            for gene in genes_status:
                if(genes_status[gene]=='non-significant'): # 1*gene expression value
                    tmp_value.append(sample[gene])
                else: # 0*gene expression value (not expressed)
                    tmp_value.append(0)
            node_value.append(statistics.mean(tmp_value))
    return node_value

def Node_Value(path,samples,data,lof_fc_threshold,pvalue_threshold):
    node_value=[]
    for sample in range(samples.shape[0]):
        node_value.append(sample_expressed_genes(path,samples.iloc[sample],data,log_fc_threshold,pvalue_threshold))
    return node_value

NV=Node_Value(node_genes[path_no],gse2034_df,gse2034_df2.iloc[:,-6:],log_fc_threshold,pvalue_threshold)
NV
'''

#### 2.6.1.3. Node_weight (NW)

In [None]:
# All significant (above-threshold) nodes were assigned a weighting that reflected
# their topological strength (i.e., the number of significant downstream nodes that are pointed to, either
# directly or via other significant nodes).
# An initiating child node, n_i, was ignored if non-significant, and the algorithm proceeds to the next child.
# Otherwise, we increase the weight counter by 1 and look for children of this node
# and so on. All non-significant nodes have NW = 0.

# Returns the number of significant nodes among the given nodes for a specific sub-path and sample
def significant_children(nodes,z,threshold,weight=0):
    """Count the nodes that contain at least one significant gene.

    nodes     -- sequence of nodes (sequences of gene labels); every node of
                 the sequence is examined, the first one included
    z         -- z-scores of one sample, indexed by gene label
    threshold -- z-score significance cut-off handed to gene_status
    weight    -- starting value of the counter

    A gene missing from z.index is looked up under the label 'noProbe'.
    Returns weight plus the number of nodes with a significant gene.
    """
    for node in nodes:
        labels=[
            gene_status(z[gene if gene in z.index else 'noProbe'],threshold)
            for gene in node
        ]
        if 'significant' in labels: # Significant
            weight+=1
    return weight

# Node_weight of every node of a specific sub-path for a specific sample
def sample_Node_Weight(path,z,threshold):
    """Return the Node_weight (NW) of every node of the sub-path.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    z         -- z-scores of one sample, indexed by gene label
    threshold -- z-score significance cut-off

    The weight of the node at position i is significant_children(path[i:], ...),
    i.e. it is computed over the node itself and everything after it.
    """
    return [
        significant_children(path[start:],z,threshold)
        for start in range(len(path))
    ]

# Node_weight of a specific sub-path for each sample
def Node_Weight(path,z_samples,threshold):
    """Return the Node_weight list of every sample.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    z_samples -- table of z-scores with one row per sample, read with .iloc
    threshold -- z-score significance cut-off
    """
    return [
        sample_Node_Weight(path,z_samples.iloc[row],threshold)
        for row in range(z_samples.shape[0])
    ]

#NW=Node_Weight(node_genes[path_no],z_score_df,z_threshold)

#### 2.6.1.4. Node_score (NS)
NV and NW values are combined to calculate a Node_Score (NS)

In [None]:
# Node_score of every node of a specific sub-path for a specific sample
def sample_Node_Score(NV,NW):
    """Combine node values and node weights into node scores.

    NV -- Node_value of each node
    NW -- Node_weight of each node, aligned with NV

    A node whose value is greater than 1 scores NV*NW; every other node scores 0.
    """
    return [
        value*NW[position] if value>1 else 0
        for position,value in enumerate(NV)
    ]

# Node_score of a specific sub-path for each sample
def Node_Score(NV,NW):
    """Return the node scores of every sample.

    NV -- Node_value list of each sample
    NW -- Node_weight list of each sample, aligned with NV
    """
    return [
        sample_Node_Score(sample_values,NW[position])
        for position,sample_values in enumerate(NV)
    ]

#NS=Node_Score(NV,NW)

#### 2.6.1.5. PRS

In [None]:
# PRS of a specific sub-path for each sample
def PRS(NS):
    """Return the PRS of every sample: the sum of that sample's node scores.

    NS -- node scores of each sample (one sequence of numbers per sample)
    """
    return [sum(NS[position]) for position in range(len(NS))]

#prs=PRS(NS)

### 2.6.2. Normalizing pathway scores
A normalization step is required to control for two key features: 
- pathway size 
- statistical bias contributed by pathway-specific PRS score null distributions.

#### 2.6.2.1.  Pathway size 
Multiply each PRS score by the ratio of the number of DEGs (NDEGs) in a pathway to the total number of expressed genes (NEGs)

In [None]:
# Return the number of expressed genes and significant genes for a node in a specific sub-path with a specific sample's data
def node_status(node,z,threshold):
    """Count the expressed and the significant genes of one node.

    node      -- sequence of gene labels
    z         -- z-scores of one sample, indexed by gene label
    threshold -- z-score significance cut-off handed to gene_status

    A gene missing from z.index is classified from the value stored under
    'noProbe'. Returns (NEGs, NDEGs): NDEGs is the number of significant
    genes and NEGs is NDEGs plus the number of non-significant genes.

    The fallback status is appended to the local list; it used to be appended
    to the function object itself, which raised AttributeError.
    """
    status=[]
    for gene in node:
        label=gene if gene in z.index else 'noProbe'
        status.append(gene_status(z[label],threshold))
    NDEGs=status.count('significant')
    NEGs=NDEGs+status.count('non-significant')
    return NEGs,NDEGs

# Total number of expressed genes and significant genes over all nodes of a specific sub-path with a specific sample's data
def sample_NEG_NDEG(path,z,threshold):
    """Sum the per-node gene counts over the whole sub-path.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    z         -- z-scores of one sample, indexed by gene label
    threshold -- z-score significance cut-off

    Returns (NEGs, NDEGs), each being the sum of the corresponding value
    returned by node_status for every node.
    """
    per_node=[node_status(node,z,threshold) for node in path]
    NEGs=sum(counts[0] for counts in per_node)
    NDEGs=sum(counts[1] for counts in per_node)
    return NEGs,NDEGs

In [None]:
# Control a specific pathway's score for the pathway size, for a specific sample
def sample_Path_Size_Normalization(path,prs,z,threshold):
    """Return prs multiplied by the ratio NDEGs/NEGs of the sub-path.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    prs       -- PRS of the sample for this sub-path
    z         -- z-scores of the sample, indexed by gene label
    threshold -- z-score significance cut-off

    Returns 0.0 when the sub-path has no expressed gene (NEGs is 0), which
    avoids a division by zero.
    """
    NEGs,NDEGs=sample_NEG_NDEG(path,z,threshold)
    return prs*(NDEGs/NEGs) if NEGs!=0 else 0.0

# Control a specific pathway's score for the pathway size, for each sample
def Path_Size_Normalization(path,prs,z,threshold):
    """Return the size-normalized PRS of every sample.

    path      -- sub-path given as a sequence of nodes (sequences of gene labels)
    prs       -- PRS of each sample for this sub-path
    z         -- table of z-scores with one row per sample, read with .iloc
    threshold -- z-score significance cut-off
    """
    return [
        sample_Path_Size_Normalization(path,prs[position],z.iloc[position],threshold)
        for position in range(len(prs))
    ]

#normalized_prs=Path_Size_Normalization(node_genes[path_no],prs,z_score_df,z_threshold)

#### 2.6.2.2. Statistical bias contributed by pathway-specific PRS score null distributions
Not implemented in this notebook because of its computational time.

### 2.6.3. Create DataFrame with final score of each sub-path for a specific sample

In [None]:
# The PRS scores were computed once and written to 'Results/PRS.csv' by the two
# commented-out lines below; this cell only reloads them from that file.
#prs_df=pd.DataFrame(normalized_prs,columns=[path_no])
#prs_df.to_csv('Results/PRS.csv',index=False) # Already computed
prs_df = pd.read_csv ('Results/PRS.csv')
prs_df.index=gse2034_df.index # Label the rows with the index of gse2034_df
prs_df

# 3. Pathway Ranking
## 3.1. Select the first sub-paths (currently 250) and calculate the score for each sample
In this case we used the scores computed with the PRS methodology.

In [None]:
# Compute the size-normalized PRS score of the first `subpaths_no` sub-paths for every sample.
prs_scores={} # key: sub-path number, value: list with one normalized score per sample
subpaths_no=250
for path in range(subpaths_no):
    # Node values and node weights of this sub-path for every sample
    NV=Node_Value(node_genes[path],gse2034_df,z_score_df,z_threshold,gse2034_df2['Fold Change'])
    NW=Node_Weight(node_genes[path],z_score_df,z_threshold)
    # Node scores, summed per sample into the raw PRS
    NS=Node_Score(NV,NW)
    prs=PRS(NS)
    # Control the raw PRS for the pathway size before storing it
    prs_scores.update({path:Path_Size_Normalization(node_genes[path],prs,z_score_df,z_threshold)})
prs_scores

In [None]:
# Collect the per-sub-path scores in a single DataFrame.
prs_df=pd.DataFrame(prs_scores) # Rows: samples and columns: sub-paths
prs_df.index=gse2034_df.index # Label the rows with the index of gse2034_df
prs_df

## 3.2. Split data in training and testing sets (size reduction)
- Training set size: 70%
- Testing set size: remaining 30%

In [None]:
from sklearn.model_selection import train_test_split

# Split the PRS score table built above (prs_df: rows are samples, columns are sub-paths)
# into 70% training and 30% testing data. The row index of prs_df is used as the label
# of each sample. The previously referenced name `prs5_df` is not defined in this notebook.
training_data, testing_data, training_labels, testing_labels = train_test_split(prs_df, prs_df.index, test_size = 0.3 ,random_state = 0)

## 3.3.  Machine Learning

### 3.3.1. K-nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
error = [] # error[j] is the error obtained with n_neighbors=j+1

# Calculating the error for K values from 1 to 39 (range(1, 40) excludes 40)
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(training_data, training_labels)
    pred_i = knn.predict(testing_data)
    error.append(np.mean(pred_i != testing_labels)) # Fraction of wrong predictions on the testing set

In [None]:
# k equals the number of neighbors that has the lowest error.
# The error list starts at K=1, so the position of the minimum is zero-based and
# must be shifted by one to obtain the number of neighbors (otherwise the best
# K=1 would become the invalid n_neighbors=0 and every other choice would be off by one).
k=error.index(min(error))+1
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(training_data, training_labels)
predictions=knn.predict(testing_data)
knn.score(testing_data, testing_labels)