## Normalization Script

Author: Kristina Preuer

This script shows how the data was split and how the features were normalized. The data is then saved in a pickle file, which will be loaded during the cross-validation procedure.

In [1]:
import numpy as np
import pandas as pd
import pickle 
import os

##### Define the parameters for data generation: folds for testing and validation and normalization strategy

#### Define normalization function
It normalizes the input data X. If X is used for training, the mean and the standard deviation are calculated during normalization. If X is used for validation or testing, the previously calculated mean and standard deviation of the training data should be used. If "tanh_norm" is used as the normalization strategy, the mean and standard deviation are calculated twice: once before and once after the tanh transformation. Features with a standard deviation of 0 are filtered out. 

#### Load features and labels

In [2]:
# Cell-line name -> feature vector, stored as a pickled dict.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
cl_features_path = os.path.join('data', 'dict_cl2features.pkl')
with open(cl_features_path, 'rb') as handle:
    dict_cl2features = pickle.load(handle)
dict_cl2features.keys()

dict_keys(['A2058', 'A2780', 'A375', 'A427', 'CAOV3', 'COLO320DM', 'DLD1', 'EFM192B', 'ES2', 'HCT116', 'HT144', 'HT29', 'KPL1', 'LNCAP', 'LOVO', 'MDAMB436', 'MSTO', 'NCIH1650', 'NCIH2122', 'NCIH23', 'NCIH460', 'NCIH520', 'OCUBM', 'OV90', 'OVCAR3', 'PA1', 'RKO', 'RPMI7951', 'SKMEL30', 'SKMES1', 'SKOV3', 'SW620', 'SW837', 'T47D', 'UACC62', 'UWB1289BRCA1', 'UWB1289', 'VCAP', 'ZR751'])

In [3]:
# Drug name -> feature vector, stored as a pickled dict.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
drug_features_path = os.path.join('data', 'dict_drug2features.pkl')
with open(drug_features_path, 'rb') as handle:
    dict_drug2features = pickle.load(handle)
dict_drug2features.keys()

dict_keys(['5-FU', 'ABT-888', 'AZD1775', 'BEZ-235', 'BORTEZOMIB', 'CARBOPLATIN', 'CYCLOPHOSPHAMIDE', 'DASATINIB', 'DEXAMETHASONE', 'DINACICLIB', 'DOXORUBICIN', 'ERLOTINIB', 'ETOPOSIDE', 'GELDANAMYCIN', 'GEMCITABINE', 'L778123', 'LAPATINIB', 'METFORMIN', 'METHOTREXATE', 'MITOMYCINE', 'MK-2206', 'MK-4541', 'MK-4827', 'MK-5108', 'MK-8669', 'MRK-003', 'OXALIPLATIN', 'PACLITAXEL', 'PD325901', 'SN-38', 'SORAFENIB', 'SUNITINIB', 'TEMOZOLOMIDE', 'TOPOTECAN', 'VINBLASTINE', 'VINORELBINE', 'ZOLINZA', 'MK-8776'])

In [4]:
# One row per (drug A, drug B, cell line) sample: synergy value plus the
# fold assignments (numbers 0-4) used for splitting.
labels_path = os.path.join('data', 'CSV3_cv5folds_cvrandom_concat_regression.csv')
labels = pd.read_csv(labels_path, index_col=0)
print(labels.shape[0])
labels.head()

26752


Unnamed: 0,drug_a_name,drug_b_name,cell_line,synergy,fold,fold_random
0,5-FU,DINACICLIB,A2058,4.331695,0,0
1,5-FU,MK-8669,A2058,32.336232,0,2
2,5-FU,PD325901,A2058,2.264835,0,1
4,5-FU,AZD1775,A2058,13.052687,1,1
5,5-FU,BEZ-235,A2058,13.679112,1,4


In [5]:
# Build one feature row per sample by concatenating the features of drug A,
# drug B and the cell line (in that order). The remaining three columns of
# `labels` (synergy and the two fold assignments) are not part of the features.
X = np.array([
    np.concatenate((
        dict_drug2features[drug_a],
        dict_drug2features[drug_b],
        dict_cl2features[cell_line],
    ))
    for drug_a, drug_b, cell_line, _synergy, _fold, _fold_random in labels.values
])

In [6]:
# Shape check: one row per entry of `labels`, one column per concatenated feature.
X.shape

(26752, 12758)

#### normalization function

In [7]:
def normalize(X, means1=None, std1=None, means2=None, std2=None, feat_filt=None, norm='tanh_norm'):
    """Standardize the columns of ``X`` and optionally squash them with tanh.

    Statistics that are passed as ``None`` are computed from ``X`` itself
    (use this for training data). For validation/test data, pass the values
    returned by the call on the training data so the same transformation is
    applied.

    Parameters
    ----------
    X : 2-D array, samples in rows and features in columns.
    means1 : per-feature mean of the *filtered* columns, or None to compute it.
    std1 : per-feature standard deviation of the *unfiltered* columns, or
        None to compute it (NaN-ignoring).
    means2, std2 : mean / standard deviation of the tanh-transformed data;
        only used when ``norm == 'tanh_norm'``. None means compute from ``X``.
    feat_filt : boolean column mask selecting the features to keep, or None
        to keep every column whose ``std1`` is non-zero.
    norm : one of
        ``'norm'``      -> standardize only,
        ``'tanh'``      -> standardize, then tanh,
        ``'tanh_norm'`` -> standardize, tanh, then standardize again.

    Returns
    -------
    ``(X, means1, std1, feat_filt)`` for ``'norm'`` and ``'tanh'``;
    ``(X, means1, std1, means2, std2, feat_filt)`` for ``'tanh_norm'``.

    Raises
    ------
    ValueError
        If ``norm`` is not one of the supported strategies.
    """
    # Fail loudly: previously an unknown strategy fell through every branch
    # and the function silently returned None.
    if norm not in ('norm', 'tanh', 'tanh_norm'):
        raise ValueError(
            "unknown normalization strategy %r; expected 'norm', 'tanh' or 'tanh_norm'" % (norm,)
        )

    if std1 is None:
        std1 = np.nanstd(X, axis=0)
    if feat_filt is None:
        # Constant features carry no information and would divide by zero.
        feat_filt = std1 != 0

    X = np.ascontiguousarray(X[:, feat_filt])

    # means1 refers to the already filtered columns, std1 to all columns.
    if means1 is None:
        means1 = np.mean(X, axis=0)
    X = (X - means1) / std1[feat_filt]

    if norm == 'norm':
        return (X, means1, std1, feat_filt)

    X = np.tanh(X)
    if norm == 'tanh':
        return (X, means1, std1, feat_filt)

    # norm == 'tanh_norm': standardize a second time after the tanh squashing.
    if means2 is None:
        means2 = np.mean(X, axis=0)
    if std2 is None:
        std2 = np.std(X, axis=0)
    # Columns with std2 == 0 divide by zero here on purpose; they are reset to
    # 0 right below, so the resulting warnings are only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        X = (X - means2) / std2
    X[:, std2 == 0] = 0
    return (X, means1, std1, means2, std2, feat_filt)

#### Define indices for splitting

In [36]:
# Fold ids (values of labels['fold_random']) held out for testing and for
# validation; all remaining folds are used for training.
test_fold, val_fold = 4, 0

In [37]:
# Boolean masks over the random fold assignment of every sample
fold_of_sample = labels['fold_random']
in_test_fold = fold_of_sample == test_fold
in_val_fold = fold_of_sample == val_fold

# Hyperparameter selection: train on every fold that is neither the
# validation fold nor the test fold, validate on `val_fold`
idx_tr = np.where(~in_test_fold & ~in_val_fold)
idx_val = np.where(in_val_fold)

# Model testing: train on every fold except `test_fold` (this includes the
# validation fold), test on `test_fold`
idx_train = np.where(~in_test_fold)
idx_test = np.where(in_test_fold)

#### Split data 

In [38]:
# Feature matrices for the four subsets (row selection by position)
X_tr, X_val = X[idx_tr], X[idx_val]
X_train, X_test = X[idx_train], X[idx_test]

# Matching regression targets: the synergy value of each selected row
synergy = labels['synergy'].values
y_tr, y_val = synergy[idx_tr], synergy[idx_val]
y_train, y_test = synergy[idx_train], synergy[idx_test]

In [39]:
# Report the (samples, features) shape of each fit/evaluate pair, one pair per line
for fit_part, eval_part in ((X_tr, X_val), (X_train, X_test)):
    print(fit_part.shape, eval_part.shape)

(16050, 12758) (5342, 12758)
(21392, 12758) (5360, 12758)


#### Normalize training and validation data for hyperparameter selection

#### Normalize training and test data for methods comparison

In [40]:
# Normalization strategy handed to normalize(): 'norm', 'tanh' or 'tanh_norm'.
# The value also becomes part of the name of the saved pickle file.
norm = 'tanh_norm'

In [41]:
# NOTE: this cell overwrites X_tr / X_val / X_train / X_test with their
# normalized versions. Run it exactly once after the split cell; running it
# again would normalize already normalized data.
#
# normalize() returns (X, <statistics...>, feat_filt); the number of
# statistics depends on `norm`, so they are collected with star-unpacking and
# forwarded positionally instead of duplicating the calls per strategy.

# Hyperparameter selection: fit statistics and feature filter on X_tr and
# apply them unchanged to X_val.
X_tr, *stats, feat_filt = normalize(X_tr, norm=norm)
X_val = normalize(X_val, *stats, feat_filt=feat_filt, norm=norm)[0]

print(X_tr.shape, X_val.shape)

# Model testing: statistics are re-fitted on X_train, while the feature
# filter obtained from X_tr is reused. This is now done for every strategy
# (previously only for "tanh_norm"), so all four matrices always share the
# same feature columns.
X_train, *stats, feat_filt = normalize(X_train, feat_filt=feat_filt, norm=norm)
X_test = normalize(X_test, *stats, feat_filt=feat_filt, norm=norm)[0]

print(X_train.shape, X_test.shape)

(16050, 8466) (5342, 8466)
(21392, 8466) (5360, 8466)


#### Save data as pickle file

In [42]:

path_data_cv = 'data_cv'+os.sep
# Make sure the output directory exists so the dump does not fail on a fresh checkout.
os.makedirs(path_data_cv, exist_ok=True)

# File name encodes the held-out test fold and the normalization strategy.
out_file = path_data_cv+'data_test_fold%d_%s.p'%(test_fold, norm)
# Use a context manager so the file is flushed and closed even if pickling fails.
with open(out_file, 'wb') as out_handle:
    pickle.dump((X_tr, X_val, X_train, X_test, y_tr, y_val, y_train, y_test), out_handle)