In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
from sys import argv
from comboFM_core_data.utils import concatenate_features, standardize
from sklearn.ensemble import RandomForestRegressor

In [10]:
seed = 123 # Random seed
data_dir = "comboFM_core_data/data/data/"

nfolds_outer = 10 # Number of folds in the outer loop
nfolds_inner = 5 # Number of folds in the inner loop

# Experiment: 1) new_dose-response_matrix_entries, 2) new_dose-response_matrices, 3) new_drug_combinations"""
experiment = "new_drug_combinations"

In [11]:
id_in = 2
print("\nJob ID: %d" %id_in)

 # Features in position 1: Drug A - Drug B
features_tensor_1 = (
    "drug1_concentration__one-hot_encoding.csv", 
    "drug2_concentration__one-hot_encoding.csv", 
    "drug1__one-hot_encoding.csv", 
    "drug2__one-hot_encoding.csv", 
    "cell_lines__one-hot_encoding.csv"
)
features_auxiliary_1 = (
    "drug1_drug2_concentration__values.csv", 
    "drug1__estate_fingerprints.csv", 
    "drug2__estate_fingerprints.csv", 
    "cell_lines__gene_expression.csv"
)
X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
X_1 = np.concatenate((X_tensor_1, X_auxiliary_1), axis = 1)

# Features in position 2: Drug B - Drug A
features_tensor_2 = (
    "drug2_concentration__one-hot_encoding.csv", 
    "drug1_concentration__one-hot_encoding.csv", 
    "drug2__one-hot_encoding.csv", 
    "drug1__one-hot_encoding.csv", 
    "cell_lines__one-hot_encoding.csv"
)
features_auxiliary_2 =(
    "drug2_drug1_concentration__values.csv", 
    "drug2__estate_fingerprints.csv", 
    "drug1__estate_fingerprints.csv", 
    "cell_lines__gene_expression.csv"
)
X_tensor_2 = concatenate_features(data_dir, features_tensor_2)
X_auxiliary_2 = concatenate_features(data_dir, features_auxiliary_2)
X_2 = np.concatenate((X_tensor_2, X_auxiliary_2), axis = 1)

# Concatenate the features from both positions vertically
X = np.concatenate((X_1, X_2), axis=0)
print('Dataset shape: {}'.format(X.shape))
print('Non-zeros rate: {:.05f}'.format(np.mean(X != 0)))
print('Number of one-hot encoding features: {}'.format(X_tensor_1.shape[1]))
print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
i_aux = X_tensor_1.shape[1]
del X_tensor_1, X_auxiliary_1, X_tensor_2, X_auxiliary_2, X_1, X_2


Job ID: 2
Reading file: drug1_concentration__one-hot_encoding.csv
Reading file: drug2_concentration__one-hot_encoding.csv
Reading file: drug1__one-hot_encoding.csv
Reading file: drug2__one-hot_encoding.csv
Reading file: cell_lines__one-hot_encoding.csv
... done!
Reading file: drug1_drug2_concentration__values.csv
Reading file: drug1__estate_fingerprints.csv
Reading file: drug2__estate_fingerprints.csv
Reading file: cell_lines__gene_expression.csv
... done!
Reading file: drug2_concentration__one-hot_encoding.csv
Reading file: drug1_concentration__one-hot_encoding.csv
Reading file: drug2__one-hot_encoding.csv
Reading file: drug1__one-hot_encoding.csv
Reading file: cell_lines__one-hot_encoding.csv
... done!
Reading file: drug2_drug1_concentration__values.csv
Reading file: drug2__estate_fingerprints.csv
Reading file: drug1__estate_fingerprints.csv
Reading file: cell_lines__gene_expression.csv
... done!
Dataset shape: (1110600, 400)
Non-zeros rate: 0.25707
Number of one-hot encoding featur

In [12]:
# Read responses
y  = np.loadtxt("../data/data/responses.csv", delimiter = ",", skiprows = 1)
y = np.concatenate((y, y), axis=0)

inner_folds = list(range(1, nfolds_inner+1))
outer_folds = list(range(1, nfolds_outer+1))

outer_fold = outer_folds[id_in]
te_idx = np.loadtxt('cross-validation_folds/%s/test_idx_outer_fold-%d.txt'%(experiment, outer_fold)).astype(int)
tr_idx = np.loadtxt('cross-validation_folds/%s/train_idx_outer_fold-%d.txt'%(experiment, outer_fold)).astype(int)

X_tr, X_te, y_tr, y_te = X[tr_idx,:], X[te_idx,:], y[tr_idx], y[te_idx]

print('Training set shape: {}'.format(X_tr.shape))
print('Test set shape: {}'.format(X_te.shape))

OSError: ../data/data/responses.csv not found.

In [None]:
data = {
    "X_tr": X_tr,
    "X_te": X_te,
    "y_tr": y_tr,
    "y_te": y_te
}

In [None]:
import pickle

In [None]:
pickle.dump(data, open("split0.pkl", "wb"))