## Normalization Script

Author: Kristina Preuer

This script shows how the data was split and how the features were normalized. The normalized data is then saved in a pickle file, which is loaded during the cross-validation procedure.

In [1]:
import numpy as np
import pandas as pd
import pickle 
import gzip

##### Define the parameters for data generation: folds for testing and validation and normalization strategy

In [2]:
# Normalization strategy applied to every feature matrix in this notebook.
# normalize() below handles 'norm', 'tanh' and 'tanh_norm'; this example uses tanh.
# Original note: fold 0 is used for testing and fold 1 for validation
# (hyperparameter selection).
# NOTE(review): no fold variables are set in this cell - confirm where the folds are chosen.
norm = 'tanh'

#### Define normalization function
It normalizes the input data X. If X is used for training, the mean and the standard deviation are calculated during normalization. If X is used for validation or testing, the previously calculated mean and standard deviation of the training data should be used. If "tanh_norm" is used as the normalization strategy, the mean and standard deviation are calculated twice: once before and once after the tanh transformation. Features with a standard deviation of 0 are filtered out.

In [3]:
def normalize(X, means1=None, std1=None, means2=None, std2=None, feat_filt=None, norm='tanh_norm'):
    """Standardize the columns of X and optionally squash them with tanh.

    Call it once on the training matrix with only ``X`` and ``norm`` so that
    the statistics are estimated from that data; then pass the returned
    statistics back in to transform validation/test matrices the same way.

    Parameters
    ----------
    X : 2-D numpy array
        Data matrix; columns are selected with ``X[:, feat_filt]``.
    means1 : array or None
        Per-column means of the *filtered* columns. Computed from X if None.
    std1 : array or None
        Per-column standard deviations of the *unfiltered* columns (it is
        indexed with ``feat_filt`` before use). Computed from X if None.
    means2, std2 : array or None
        Mean / standard deviation of the tanh-transformed data. Only used
        for ``norm='tanh_norm'``; computed from the data if None.
    feat_filt : boolean array or None
        Column mask. If None, columns whose ``std1`` is 0 are dropped.
    norm : {'norm', 'tanh', 'tanh_norm'}
        'norm' z-scores the columns, 'tanh' applies tanh to the z-scores,
        'tanh_norm' additionally z-scores the tanh output.

    Returns
    -------
    tuple
        ``(X, means1, std1, feat_filt)`` for 'norm' and 'tanh';
        ``(X, means1, std1, means2, std2, feat_filt)`` for 'tanh_norm'.

    Raises
    ------
    ValueError
        If ``norm`` is not one of the supported strategies. Previously the
        function silently returned None, which only surfaced later as an
        unpacking error at the call site.
    """
    if norm not in ('norm', 'tanh', 'tanh_norm'):
        raise ValueError(
            "norm must be 'norm', 'tanh' or 'tanh_norm', got {!r}".format(norm)
        )

    # std1 is estimated on all columns because it also defines the filter.
    # Note: nanstd ignores NaNs, whereas the means below use np.mean.
    if std1 is None:
        std1 = np.nanstd(X, axis=0)
    if feat_filt is None:
        feat_filt = std1 != 0

    # Drop the filtered-out columns first; means1 therefore has the reduced width.
    X = X[:, feat_filt]
    X = np.ascontiguousarray(X)
    if means1 is None:
        means1 = np.mean(X, axis=0)
    X = (X - means1) / std1[feat_filt]

    if norm == 'norm':
        return (X, means1, std1, feat_filt)
    if norm == 'tanh':
        return (np.tanh(X), means1, std1, feat_filt)

    # norm == 'tanh_norm': standardize a second time after the tanh squashing.
    X = np.tanh(X)
    if means2 is None:
        means2 = np.mean(X, axis=0)
    if std2 is None:
        std2 = np.std(X, axis=0)
    X = (X - means2) / std2
    # Columns that are constant after tanh would be NaN/inf from the division; zero them.
    X[:, std2 == 0] = 0
    return (X, means1, std1, means2, std2, feat_filt)

#### Data

In [4]:
data_name = "all_test"

# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open(f"cv_example/{data_name}.pkl", "rb") as f:
    file = pickle.load(f)

# The pickle is unpacked positionally into 30 names: one feature matrix and
# one `y_*` entry per split, in exactly this order.
(
    train_dc, y_train,
    test_dc, y_test,
    val_dc, y_val,
    mix_val, y_mix_val,
    mix_test, y_mix_test,
    blind_cell_val, y_blind_cell_val,
    blind_cell_test, y_blind_cell_test,
    blind_1_drug_val, y_blind_1_drug_val,
    blind_1_drug_test, y_blind_1_drug_test,
    blind_1_drug_cell_val, y_blind_1_drug_cell_val,
    blind_1_drug_cell_test, y_blind_1_drug_cell_test,
    blind_2_drug_val, y_blind_2_drug_val,
    blind_2_drug_test, y_blind_2_drug_test,
    blind_all_val, y_blind_all_val,
    blind_all_test, y_blind_all_test,
) = file

#### Normalize training and validation data for hyperparameter selection

In [5]:
# Fit the normalization statistics on the training matrix only.
# (The 'tanh_norm' branch previously passed an undefined name `X_tr`,
# which raised NameError; the training matrix is `train_dc`.)
if norm == "tanh_norm":
    train_dc, mean, std, mean2, std2, feat_filt = normalize(train_dc, norm=norm)
    _train_stats = dict(means1=mean, std1=std, means2=mean2, std2=std2,
                        feat_filt=feat_filt, norm=norm)
else:
    train_dc, mean, std, feat_filt = normalize(train_dc, norm=norm)
    _train_stats = dict(means1=mean, std1=std, feat_filt=feat_filt, norm=norm)

# Transform every other split with the training statistics. normalize()
# returns the transformed matrix as its first element and hands the supplied
# statistics back unchanged, so only element [0] is needed here.
# Each split is rebound immediately so its raw matrix can be freed.
test_dc = normalize(test_dc, **_train_stats)[0]
val_dc = normalize(val_dc, **_train_stats)[0]
mix_val = normalize(mix_val, **_train_stats)[0]
mix_test = normalize(mix_test, **_train_stats)[0]
blind_cell_val = normalize(blind_cell_val, **_train_stats)[0]
blind_cell_test = normalize(blind_cell_test, **_train_stats)[0]
blind_1_drug_val = normalize(blind_1_drug_val, **_train_stats)[0]
blind_1_drug_test = normalize(blind_1_drug_test, **_train_stats)[0]
blind_1_drug_cell_val = normalize(blind_1_drug_cell_val, **_train_stats)[0]
blind_1_drug_cell_test = normalize(blind_1_drug_cell_test, **_train_stats)[0]
blind_2_drug_val = normalize(blind_2_drug_val, **_train_stats)[0]
blind_2_drug_test = normalize(blind_2_drug_test, **_train_stats)[0]
blind_all_val = normalize(blind_all_val, **_train_stats)[0]
blind_all_test = normalize(blind_all_test, **_train_stats)[0]

#### Normalize training and test data for methods comparison

#### Save data as pickle file

In [8]:
# Bundle all splits in the same positional order in which they were unpacked
# from the loaded pickle.
data_for_deepsy = (
    train_dc, y_train,
    test_dc, y_test,
    val_dc, y_val,
    mix_val, y_mix_val,
    mix_test, y_mix_test,
    blind_cell_val, y_blind_cell_val,
    blind_cell_test, y_blind_cell_test,
    blind_1_drug_val, y_blind_1_drug_val,
    blind_1_drug_test, y_blind_1_drug_test,
    blind_1_drug_cell_val, y_blind_1_drug_cell_val,
    blind_1_drug_cell_test, y_blind_1_drug_cell_test,
    blind_2_drug_val, y_blind_2_drug_val,
    blind_2_drug_test, y_blind_2_drug_test,
    blind_all_val, y_blind_all_val,
    blind_all_test, y_blind_all_test,
)

# NOTE(review): this writes to the same path the raw data was loaded from, so
# the un-normalized pickle is overwritten and re-running the notebook would
# normalize the data a second time - confirm this is intended.
with open(f"cv_example/{data_name}.pkl", "wb") as f:
    pickle.dump(data_for_deepsy, f)

In [9]:
# Display the dataset name that forms the pickle path used for loading and saving.
data_name

'all_test'

In [7]:
# Sanity check: number of entries in the saved tuple.
len(data_for_deepsy)

30