# 1.  Discretization of gene expression values
Transform gene expression values into their binary equivalents: high (expressed / up-regulated) or low (not expressed / down-regulated).

#### 1.1. The expression values are sorted in descending order and the midpoint between every two consecutive values is calculated.

In [None]:
def midpoint(num1,num2):
    """Return the arithmetic mean of ``num1`` and ``num2``."""
    total = num1 + num2
    return total / 2

# Returns the midpoints for a specific sample
def sample_midpoints(sample):
    """Return the midpoints between consecutive values of ``sample``.

    The values are first sorted in descending order; entry ``i`` of the
    result is the mean of the ``i``-th and ``(i+1)``-th sorted values.

    Parameters
    ----------
    sample : pandas.Series
        Numeric values to process (must provide ``sort_values``).

    Returns
    -------
    list
        ``len(sample) - 1`` midpoints, or an empty list when ``sample`` has
        fewer than two values.
    """
    # Work on the bare values: indexing the sorted Series with ``[i]`` is
    # label-based for an integer index, which would ignore the sort order.
    ordered = sample.sort_values(ascending=False).to_numpy()
    # Pair every value with its successor and average them element-wise.
    return ((ordered[:-1] + ordered[1:]) / 2).tolist()

# Returns the midpoints for all the samples
def get_midpoints(df):
    """Map every row position of ``df`` to the midpoints of that row.

    Returns a dict whose keys are the integer row positions
    ``0 .. df.shape[0]-1`` and whose values are the lists produced by
    ``sample_midpoints`` for the corresponding row.
    """
    row_count = df.shape[0]
    return {pos: sample_midpoints(df.iloc[pos]) for pos in range(row_count)}

#### 1.2. For each midpoint, μi, the Information Gain (IG) of the system is computed. Let IG(S, μi) denote the IG of the system for midpoint μi.

In [None]:
import math 

# Calculate the proportion of samples in S that belong in Class C
def P(C,S):
    """Return the fraction of the entries of ``S`` that are equal to ``C``."""
    labels = list(S)
    matches = labels.count(C)
    return matches / len(S)

def E(S,classes,m=1):
    """Return the entropy of the class labels ``S``, scaled by ``1/m``.

    Computes ``-sum(p_c * ln(p_c) / m)`` over ``classes``, where ``p_c`` is
    the proportion of entries of ``S`` equal to class ``c``.

    Parameters
    ----------
    S : iterable
        Class label of every sample.
    classes : iterable
        The distinct classes to sum over.
    m : number, optional
        Divisor applied to every term. With the default of 1 the plain
        entropy (natural logarithm) is returned; callers pass a midpoint
        value here.
        NOTE(review): dividing by the midpoint value only rescales the
        entropy; it does not split ``S`` into subgroups around the midpoint.
        Confirm this matches the intended information-gain definition.

    Returns
    -------
    number
        The (scaled) entropy.

    Raises
    ------
    ZeroDivisionError
        If ``S`` is empty while ``classes`` is not, or if ``m`` is 0.
    """
    labels = list(S)  # materialise once instead of once per class lookup
    total = len(labels)
    weighted_log_sum = 0
    for c in classes:
        p = labels.count(c) / total
        # A class that never occurs contributes nothing (0 * log 0 := 0);
        # evaluating math.log(0) would raise ValueError.
        if p > 0:
            weighted_log_sum += p * math.log(p) / m
    return -weighted_log_sum

In [None]:
# Calculate the Information Gain (IG) of the system for a specific sample
def IG(df,midpoints):
    """Return the information gain of the system for every midpoint.

    The class label of each sample is read from ``df.index``. For each
    midpoint ``m`` the gain is ``E(S, classes) - E(S, classes, m)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the class labels.
    midpoints : iterable of numbers
        Candidate midpoints.

    Returns
    -------
    list
        One gain per midpoint, in the order of ``midpoints``.
    """
    classes = sorted(set(df.index))  # the classes to which a sample may belong
    S = df.index  # class label of every sample

    midpoints = list(midpoints)
    if not midpoints:
        return []

    # The unscaled entropy does not depend on the midpoint: compute it once
    # instead of once per loop iteration.
    baseline = E(S, classes)
    return [baseline - E(S, classes, m) for m in midpoints]

In [None]:
# The midpoint with the highest information gain is selected as the discretization point
#max_value = max(information_gain)
#max_mid_pos = information_gain.index(max(information_gain))
#dis_point=midpoints_dict.get(max_mid_pos)
#print('Discretization point: %.3f' %dis_point)

def discretization_point(df,midpoints):
    """Return the midpoint with the highest information gain.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed through to ``IG`` to compute the gain of every midpoint.
    midpoints : sequence of numbers
        Candidate midpoints; must support integer indexing.

    Returns
    -------
    number
        The element of ``midpoints`` whose information gain is largest
        (the first one in case of ties).

    Raises
    ------
    ValueError
        If there are no midpoints to choose from.
    """
    information_gain = IG(df, midpoints)
    if not information_gain:
        raise ValueError("cannot select a discretization point: no midpoints given")
    best_gain = max(information_gain)
    # list.index returns the first position holding the maximum gain.
    return midpoints[information_gain.index(best_gain)]

#### 1.4. The sample cases with expression values lower than the discretization point are assigned the '0' value (meaning that the gene is under-expressed), and the sample cases with expression values greater than or equal to the discretization point are assigned the '1' value (the gene is over-expressed).
The discretization process is applied for each gene separately, and the final dataset is a matrix of discretized, actually binarized, values.

In [None]:
def discretization(df):
    """Return a binarized copy of ``df``, processed one row at a time.

    For every row a discretization point is selected with
    ``sample_midpoints`` and ``discretization_point``. Values below the
    point become 0 (under-expressed); values greater than or equal to it
    become 1 (over-expressed). The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric expression values. The whole frame is passed to
        ``discretization_point``, which reads the class labels from its index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the same shape and labels, cast to ``int``.
    """
    new_df = df.copy()
    for row_pos in range(new_df.shape[0]):
        row = new_df.iloc[row_pos]
        midpoints = sample_midpoints(row)
        dis_point = discretization_point(new_df, midpoints)
        # Build both masks from the ORIGINAL values. Re-reading the row after
        # writing the zeros would turn them into ones whenever dis_point <= 0.
        below = row < dis_point
        at_or_above = row >= dis_point
        binarized = row.mask(below, 0).mask(at_or_above, 1)
        # Write through a single .iloc assignment: chained indexing
        # (new_df.iloc[i][mask] = v) may modify a temporary copy instead.
        new_df.iloc[row_pos] = binarized.to_numpy()
    return new_df.astype('int')

# 2. Functional sub-paths: Matching sub-paths with gene expression profiles
...