# Data preparation

In this notebook, we create the datasets used by the algorithms, once the Kaggle data has been downloaded:
1. Data splitting into training and test datasets --> useful for the Off-The-Shelf algorithms

2. Dosage-Time data partition --> useful for the Off-The-Shelf algorithms

3. Data files creation for SBM --> useful for the stochastic block model

## 1. Data splitting into training and test datasets

The training and test datasets were originally created at random. To obtain exactly the same split every time, we saved a file (`test_conditions.txt`) containing the row numbers of the conditions that belong to the test set.

In [61]:
import pandas as pd
# Load the raw Kaggle tables; the first CSV column becomes the row index.
features = pd.read_csv("../lish-moa/train_features.csv", index_col=0)
targets = pd.read_csv("../lish-moa/train_targets_scored.csv", index_col=0)

# Integer codes that replace the categorical labels, column by column.
label_codes = {
    "cp_type": {"trt_cp": 0, "ctl_vehicle": 1},
    "cp_dose": {"D1": 0, "D2": 1},
}
for column_name, codes in label_codes.items():
    for label, code in codes.items():
        # Overwrite, in place, every row that still carries this label.
        features.loc[features[column_name] == label, column_name] = code

In [14]:
# Read the saved test-set row positions: one integer per line.
# `with` guarantees the file is closed even if a line fails to parse, and
# blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of making int() raise.
with open('test_conditions.txt', 'r') as fh:
    cond_test = [int(line) for line in fh if line.strip()]

In [17]:
# Rows whose position is listed in cond_test form the test set.
xtest = features.iloc[cond_test, :]
ytest = targets.iloc[cond_test, :]

# Every other row position belongs to the training set. Membership is checked
# against a set (constant-time lookup) and the position list is built once and
# shared by features and targets, instead of scanning the cond_test list for
# every row, twice.
test_positions = set(cond_test)
train_positions = [pos for pos in range(len(features)) if pos not in test_positions]

xtrain = features.iloc[train_positions, :]
ytrain = targets.iloc[train_positions, :]

In [19]:
# Persist the four pieces of the split next to the notebook.
split_outputs = (
    (ytrain, r'ytrain.csv'),
    (ytest, r'ytest.csv'),
    (xtrain, r'xtrain.csv'),
    (xtest, r'xtest.csv'),
)
for frame, csv_path in split_outputs:
    frame.to_csv(csv_path)

## 2. Dosage-Time data partition

We will create different datasets: each one having a specific combination of cp_time and cp_dose. So, in the end, there will be 6 combinations.

In [20]:
# Keep only the treated samples (cp_type encoded as 0); the same row mask is
# applied to the features and to the targets.
is_treated = xtrain['cp_type'] == 0
x_tr = xtrain[is_treated]
y_tr = ytrain[is_treated]

In [21]:
# One entry per (dose, time) combination, in sorted dose-major order.
features_DoseTime = []
targets_DoseTime = []
# Label of each partition, kept in case we need to know which dose/time
# combination a given dataframe was built from.
dataset_DoseTime_names = []

for dose in sorted(x_tr.cp_dose.unique().tolist()):
    for time in sorted(x_tr.cp_time.unique().tolist()):
        # Rows of the treated training set with this exact dose and time.
        in_partition = (x_tr['cp_dose'] == dose) & (x_tr['cp_time'] == time)
        features_DoseTime.append(x_tr[in_partition])
        targets_DoseTime.append(y_tr[in_partition])
        dataset_DoseTime_names.append("Dose" + str(dose) + "-Time" + str(time))

In [23]:
# File-name suffix for each list position; positions follow the order in which
# the partitions were appended (dose-major, then time).
partition_suffixes = ['D0T24', 'D0T48', 'D0T72', 'D1T24', 'D1T48', 'D1T72']

# Feature partitions first ...
for position, suffix in enumerate(partition_suffixes):
    features_DoseTime[position].to_csv('xtrain_' + suffix + '.csv')

# ... then the matching target partitions.
for position, suffix in enumerate(partition_suffixes):
    targets_DoseTime[position].to_csv('ytrain_' + suffix + '.csv')

## 3. Data files creation for SBM

We must first discretise gene expression and cell viability. Then we will create files containing the links for each kind of pair: condition-MoA, condition-gene and condition-cell. Each file holds the pairs that share one value; for example, the condition-MoA pairs are split into two files, one for the value 0 and one for the value 1.

### Discretization (given the z-scores):

In [25]:
# Split the feature columns by name prefix: 'g-' columns and 'c-' columns.
all_feature_columns = features.columns.tolist()
g_columns = [name for name in all_feature_columns if name.startswith('g-')]
c_columns = [name for name in all_feature_columns if name.startswith('c-')]

In [26]:
def z_scores_over_controls(features, keep_index=False):
    """Z-score every gene and cell column against the control samples.

    The mean and (sample) standard deviation of each column listed in the
    module-level ``g_columns`` and ``c_columns`` are computed over the rows
    where ``cp_type == 1`` only. Those statistics are then used to standardise
    all rows, treated samples included.

    The computation is vectorised. It does not rely on integer lookups such as
    ``features[col][i]``, which are positional on a label-indexed Series and
    deprecated in recent pandas.

    Parameters
    ----------
    features : pandas.DataFrame
        Table containing a ``cp_type`` column plus every column named in
        ``g_columns`` and ``c_columns``.
    keep_index : bool, default False
        If False (historical behaviour), the result has a fresh 0..n-1 index
        and the row labels of ``features`` are discarded. If True, the result
        keeps the index of ``features``.
        NOTE(review): with the default, anything saved from this result is
        keyed by row position rather than by the original row label -- confirm
        that this is what downstream files expect.

    Returns
    -------
    pandas.DataFrame
        One column per gene, then one per cell, then ``cp_type`` copied from
        ``features``; one row per row of ``features``, in the same order.
    """
    columns = list(g_columns) + list(c_columns)

    # Statistics come from the control samples only.
    controls = features.loc[features.cp_type == 1, columns]
    means = controls.mean()
    stds = controls.std()

    # Standardise all samples (treated ones included) with the control statistics.
    zscores = (features[columns] - means) / stds
    if not keep_index:
        zscores = zscores.reset_index(drop=True)

    # Carry the sample type along, matched by row position.
    zscores['cp_type'] = features['cp_type'].tolist()

    return zscores

In [27]:
# Standardise the whole feature table (training and test rows together).
z_scores = z_scores_over_controls(features=features)

In [28]:
def gene_discretization(names_cols, z_scores, threshold):
    """Discretise z-scores into three levels: -1, 0 and 1.

    For each selected column, a value strictly below ``-threshold`` becomes
    -1, a value strictly above ``threshold`` becomes 1, and anything else
    (missing values included) becomes 0.

    The computation is vectorised and works for any index on ``z_scores``;
    it does not look rows up by integer label.

    Parameters
    ----------
    names_cols : list of str
        Columns of ``z_scores`` to discretise.
    z_scores : pandas.DataFrame
        Table holding the z-scores.
    threshold : number
        Cut-off applied symmetrically around zero.

    Returns
    -------
    pandas.DataFrame
        Integer table with the index of ``z_scores`` and the columns
        ``names_cols``.
    """
    selected = z_scores[list(names_cols)]
    below = selected < -threshold
    # The "below" test wins when both comparisons hold (possible only with a
    # negative threshold), matching an if/elif evaluated in that order.
    above = (selected > threshold) & ~below
    return above.astype('int64') - below.astype('int64')

In [77]:
# Discretise the gene columns with a z-score threshold of 5 and save the table.
genes_discretized = gene_discretization(names_cols=g_columns, z_scores=z_scores, threshold=5)
genes_discretized.to_csv('Genes_discretization_5.csv')

In [30]:
def cell_discretization(names_cols, z_scores, threshold):
    """Discretise z-scores into two levels: 0 and 1.

    For each selected column, a value strictly below ``-threshold`` becomes 0
    and anything else (missing values included) becomes 1.

    The computation is vectorised and works for any index on ``z_scores``;
    it does not look rows up by integer label.

    Parameters
    ----------
    names_cols : list of str
        Columns of ``z_scores`` to discretise.
    z_scores : pandas.DataFrame
        Table holding the z-scores.
    threshold : number
        Values below ``-threshold`` are mapped to 0.

    Returns
    -------
    pandas.DataFrame
        Integer table with the index of ``z_scores`` and the columns
        ``names_cols``.
    """
    selected = z_scores[list(names_cols)]
    below = selected < -threshold
    return (~below).astype('int64')

In [78]:
# Discretise the cell columns with a z-score threshold of 6 and save the table.
cells_discretized = cell_discretization(names_cols=c_columns, z_scores=z_scores, threshold=6)
cells_discretized.to_csv('Cells_discretization_6.csv')

### Condition-MoA files:

Besides the files containing the combinations that have a 1 or a 0, we will save the ids.

In [75]:
# Build the integer-ID lookup tables for MoAs and for conditions from the raw
# targets file. All files are handled with `with` so they are closed even if a
# step fails.
with open('../lish-moa/train_targets_scored.csv', "r") as fh:
    igot = fh.readlines()  # every row of the file as one string

# Header row: 'sig_id' followed by the MoA names.
names = igot[0].strip().split(',')
del names[0]  # remove the first string (i.e., 'sig_id')
del igot[0]   # header processed; only data rows remain

#### MOA - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
with open('MoAsID.csv', 'w') as fout:
    for i, moa_name in enumerate(names):
        # name of the MoA and the integer assigned to it (0,1,2,3...)
        fout.write('%s,%s\n' % (moa_name, i))

# Collect each condition id once per row (the first field of the row).
# Rows without any value after the id are ignored.
unique_ids = set()
for line in igot:
    about = line.strip().split(',')
    if len(about) > 1:
        unique_ids.add(about[0])

#### CONDITION - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
nc = sorted(unique_ids)  # unique condition ids, in sorted order
cond = {}  # condition id -> integer; reused when the 0/1 link files are written
with open('conditionsID.csv', 'w') as fout2:
    for ii, e in enumerate(nc):
        cond[e] = ii
        fout2.write('%s,%s\n' % (e, ii))

In [76]:
# Split the training targets into (condition, MoA) pairs by value and write one
# link file per value.
# Relies on `cond` (condition id -> integer) already being defined, and it must
# contain every id found in ytrain.csv.
# NOTE(review): `cond` is reassigned by other cells of this notebook; run this
# cell right after the one that writes conditionsID.csv.
with open('ytrain.csv', "r") as fh:
    igot = fh.readlines()  # every row of the file as one string

names = igot[0].strip().split(',')  # header fields: id column label, then the MoA names
del igot[0]  # header processed; only data rows remain

l0 = []  # (condition id, MoA index) pairs whose value is 0
l1 = []  # (condition id, MoA index) pairs with any other value (i.e. 1)

for line in igot:
    about = line.strip().split(',')  # first field is the id, the rest are 0s or 1s
    condition_id = about[0]
    for z, value in enumerate(about[1:]):  # z = MoA index, counted from 0
        if int(value) == 0:
            l0.append((condition_id, z))
        else:
            l1.append((condition_id, z))

#### CONDITIONS/MOA WITH 0s ####
with open('c-moastrain0.csv', 'w') as fout3:
    for e1, e2 in l0:
        # cond[e1]: integer assigned to that condition; e2: index of the MoA
        fout3.write('%s,%s\n' % (cond[e1], e2))

#### CONDITIONS/MOA WITH 1s ####
with open('c-moastrain1.csv', 'w') as fout4:
    for e1, e2 in l1:
        fout4.write('%s,%s\n' % (cond[e1], e2))

### Condition-gene files (whole dataset, including training and test):

In [79]:
# Turn the discretised gene table into an ID table for genes, an ID table for
# conditions, and one (condition, gene) link file per discrete value.
# All files are handled with `with` so they are closed even if a step fails.
with open('Genes_discretization_5.csv', "r") as fh:
    igot = fh.readlines()  # every row of the file as one string

# Header row: label of the id column followed by the gene names.
names = igot[0].strip().split(',')
del names[0]  # drop the id-column label, keep only the gene names
del igot[0]   # header processed; only data rows remain

#### GENES - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
with open('GenesID.csv', 'w') as fout:
    for i, gene_name in enumerate(names):
        # name of the gene and the integer assigned to it (0,1,2,3...)
        fout.write('%s,%s\n' % (gene_name, i))

l_1 = []  # link -1: (condition id, gene index) pairs whose value is -1
l0 = []   # link 0
l1 = []   # link 1 (any value other than -1 and 0)
unique_ids = set()  # each condition id is recorded once per row

for line in igot:
    about = line.strip().split(',')  # first field is the id, the rest are -1s, 0s or 1s
    condition_id = about[0]
    if len(about) > 1:
        unique_ids.add(condition_id)
    for z, value in enumerate(about[1:]):  # z = gene index, counted from 0
        level = int(value)
        if level == -1:
            l_1.append((condition_id, z))
        elif level == 0:
            l0.append((condition_id, z))
        else:
            l1.append((condition_id, z))

#### CONDITION - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
# The ids are the values of the first column of Genes_discretization_5.csv,
# i.e. whatever index that table was saved with.
# NOTE(review): this table equals conditionsID.csv (built from the targets
# file) only if that first column holds the same sig_id strings -- verify
# before mixing link files that use the two tables.
nc = sorted(unique_ids)  # unique condition ids, in sorted (string) order
cond = {}  # condition id -> integer, used below for the link files
with open('conditionsID_genes.csv', 'w') as fout2:
    for ii, e in enumerate(nc):
        cond[e] = ii
        fout2.write('%s,%s\n' % (e, ii))

#### CONDITIONS/GENES WITH -1s ####
with open('c-genestrain-1.csv', 'w') as fout3:
    for e1, e2 in l_1:
        # cond[e1]: integer assigned to that condition; e2: index of the gene
        fout3.write('%s,%s\n' % (cond[e1], e2))

#### CONDITIONS/GENES WITH 0s ####
with open('c-genestrain0.csv', 'w') as fout4:
    for e1, e2 in l0:
        fout4.write('%s,%s\n' % (cond[e1], e2))

#### CONDITIONS/GENES WITH 1s ####
with open('c-genestrain1.csv', 'w') as fout5:
    for e1, e2 in l1:
        fout5.write('%s,%s\n' % (cond[e1], e2))

### Condition-cell files (whole dataset, including training and test):

In [82]:
# Turn the discretised cell table into an ID table for cells, an ID table for
# conditions, and one (condition, cell) link file per discrete value.
# All files are handled with `with` so they are closed even if a step fails.
with open('Cells_discretization_6.csv', "r") as fh:
    igot = fh.readlines()  # every row of the file as one string

# Header row: label of the id column followed by the cell names.
names = igot[0].strip().split(',')
del names[0]  # drop the id-column label, keep only the cell names
del igot[0]   # header processed; only data rows remain

#### CELL - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
with open('CellsID.csv', 'w') as fout:
    for i, cell_name in enumerate(names):
        # name of the cell and the integer assigned to it (0,1,2,3...)
        fout.write('%s,%s\n' % (cell_name, i))

l0 = []  # link 0: (condition id, cell index) pairs whose value is 0
l1 = []  # link 1: pairs with any other value
unique_ids = set()  # each condition id is recorded once per row

for line in igot:
    about = line.strip().split(',')  # first field is the id, the rest are 0s or 1s
    condition_id = about[0]
    if len(about) > 1:
        unique_ids.add(condition_id)
    for z, value in enumerate(about[1:]):  # z = cell index, counted from 0
        if int(value) == 0:
            l0.append((condition_id, z))
        else:
            l1.append((condition_id, z))

#### CONDITION - INTEGER NUMBER ASSIGNATION, OUTPUT FILE ####
# The ids are the values of the first column of Cells_discretization_6.csv,
# i.e. whatever index that table was saved with.
# NOTE(review): this table equals conditionsID.csv (built from the targets
# file) only if that first column holds the same sig_id strings -- verify
# before mixing link files that use the two tables.
nc = sorted(unique_ids)  # unique condition ids, in sorted (string) order
cond = {}  # condition id -> integer, used below for the link files
with open('conditionsID_cells.csv', 'w') as fout2:
    for ii, e in enumerate(nc):
        cond[e] = ii
        fout2.write('%s,%s\n' % (e, ii))

#### CONDITIONS/CELLS WITH 1s ####
with open('c-cellstrain1.csv', 'w') as fout3:
    for e1, e2 in l1:
        # cond[e1]: integer assigned to that condition; e2: index of the cell
        fout3.write('%s,%s\n' % (cond[e1], e2))

#### CONDITIONS/CELLS WITH 0s ####
with open('c-cellstrain0.csv', 'w') as fout4:
    for e1, e2 in l0:
        fout4.write('%s,%s\n' % (cond[e1], e2))