# Merged data
Loads the BDM data computed on the different servers and saves it in a single file to be loaded by the deep learning model.

In [10]:
import numpy as np
import scipy.sparse as sp
import pickle
import shelve
from pybdm import BDM
from algorithms import PerturbationExperiment, NodePerturbationExperiment
import datetime

In [6]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Loading data
The final version can receive the filename of the data shelf as a parameter.

### Importing of toy data
Run the following cell if the desired dataset is the toy dataset

In [None]:
# Path of the pickled toy dataset.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filename = './data_structures/DS_toy_DSE_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
# Publish every entry of the loaded dictionary as a notebook-level variable,
# then report each imported name.
globals().update(DS)
for key in DS:
    print(key, "Imported successfully")

### Importing of real data
Run the following cell if the desired dataset is the real dataset from DECAGON

In [2]:
# Path of the pickled real dataset.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filename = './data_structures/DS_real_DSE_NPF_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
# Publish every entry of the loaded dictionary as a notebook-level variable,
# then report each imported name.
globals().update(DS)
for key in DS:
    print(key, "Imported successfully")

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully
norm_prot_feat Imported successfully


### BDM DDI

In [None]:
# Path of the pickled DDI BDM results.
# NOTE(review): the path ends in '...' and looks like a placeholder — confirm
# the actual file name before running.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filename = './data_structures/ddi_bdm_se964_drugs_639...'
with open(filename, 'rb') as f:
    ddi = pickle.load(f)
# Publish every entry of the loaded dictionary as a notebook-level variable,
# then report each imported name.
globals().update(ddi)
for key in ddi:
    print(key, "Imported successfully")

In [None]:
# Stack the DDI BDM vectors side by side: every list element is reshaped
# into a single column and the columns are joined horizontally.
nodebdm_ddi_array = np.hstack(
    [vector.reshape(-1, 1) for vector in nodebdm_ddi_list]
)
edgebdm_ddi_array = np.hstack(
    [vector.reshape(-1, 1) for vector in edgebdm_ddi_list]
)

In [None]:
# Show the shapes of both stacked DDI BDM arrays, one per line.
print(nodebdm_ddi_array.shape, edgebdm_ddi_array.shape, sep='\n')
# Unpack the two dimensions of the node array.
# NOTE(review): the names suggest nd = number of drugs and se = number of
# side effects — confirm.
nd, se = nodebdm_ddi_array.shape

### BDM DTI

In [None]:
# Path of the pickled DTI BDM results.
# NOTE(review): the path ends in '...' and looks like a placeholder — confirm
# the actual file name before running.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filename = 'data_structures/dti_bdm_genes16271_drugs_639...'
with open(filename, 'rb') as f:
    dti = pickle.load(f)
# Publish every entry of the loaded dictionary as a notebook-level variable,
# then report each imported name.
globals().update(dti)
for key in dti:
    print(key, "Imported successfully")

### BDM PPI

In [4]:
# Path of the pickled PPI BDM results.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
filename = 'data_structures/PPI_BDM_genes16271_juadia48'
with open(filename, 'rb') as f:
    ppi = pickle.load(f)
# Publish every entry of the loaded dictionary as a notebook-level variable,
# then report each imported name.
globals().update(ppi)
for key in ppi:
    print(key, "Imported successfully")

nodebdm_ppi Imported successfully
edgebdm_ppi Imported successfully
vms_ppi Imported successfully
rss_ppi Imported successfully
time_ppi Imported successfully
jobs_ppi Imported successfully


In [15]:
# Shapes of the node and edge PPI BDM vectors, one per line.
print(nodebdm_ppi.shape, edgebdm_ppi.shape, sep='\n')
# Elapsed time (time_ppi is interpreted as seconds) and the recorded job count.
print(datetime.timedelta(seconds=time_ppi), jobs_ppi, 'cores')
# Memory figures scaled by 1e-9 for display in GB.
# NOTE(review): assumes vms_ppi / rss_ppi are stored in bytes — confirm.
# NOTE(review): "memroy" is a typo in the printed label; it is kept so the
# output stays unchanged.
print('Virtual memroy:', vms_ppi*1e-9, 'GB. RSS memory:', rss_ppi*1e-9, 'GB')

(16271,)
(16271,)
1 day, 19:54:46.236318 48 cores
Virtual memroy: 6.8048855040000005 GB. RSS memory: 4.434952192 GB


# Concatenation of features

In [43]:
# Protein features: the dense protein feature matrix followed by one column
# per BDM vector (PPI node, PPI edge, DTI node, DTI edge).
# NOTE(review): prot_feat is rebound here, so re-running this cell operates on
# the already augmented value.
prot_feat = np.column_stack(
    [prot_feat.todense()]
    + [
        bdm.reshape(-1, 1)
        for bdm in (nodebdm_ppi, edgebdm_ppi, nodebdm_dti, edgebdm_dti)
    ]
)
print(prot_feat.shape)

(16271, 7)


In [24]:
def sparse_to_tuple(sparse_mx):
    """Convert a SciPy sparse matrix into a ``(coords, values, shape)`` triple.

    The matrix is converted to COO format when it is not already in it.

    Returns:
        coords: array with one ``[row, col]`` pair per stored entry.
        values: the stored entries, in the same order as ``coords``.
        shape: the shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack the row and column indices, then flip to one pair per line.
    index_pairs = np.vstack((coo.row, coo.col)).T
    return index_pairs, coo.data, coo.shape

In [25]:
# Feature bookkeeping for genes and drugs: both counters of each node type are
# set to the number of feature columns. The shapes are read before the
# matrices are replaced by their tuple form.
gene_nonzero_feat = gene_num_feat = norm_prot_feat.shape[1]
drug_nonzero_feat = drug_num_feat = drug_feat.shape[1]
# Convert the sparse feature matrices to (coords, values, shape) tuples.
# The drug tuple is bound to `drug_feat` (not the misspelled `dsrug_feat`) so
# that the `feat` dictionary built later stores drugs in the same tuple form
# as genes instead of the raw sparse matrix.
# NOTE: drug_feat is rebound here; reload the data before re-running this cell.
drug_feat = sparse_to_tuple(drug_feat.tocoo())
gene_feat = sparse_to_tuple(norm_prot_feat.tocoo())

In [26]:
# Sanity check of the feature counters: gene pair first, drug pair second.
print(gene_nonzero_feat, gene_num_feat, drug_nonzero_feat, drug_num_feat)

3 3 9702 9702


# Creation of Decagon dictionaries

In [27]:
# Adjacency matrices keyed by (source node type, target node type), where
# 0 is the gene/protein side and 1 is the drug side. Each matrix is
# accompanied by a transposed copy for the opposite direction.
adj_mats_orig = {
    (0, 0): [ppi_adj, ppi_adj.transpose(copy=True)],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): [
        *ddi_adj_list,
        *(matrix.transpose(copy=True) for matrix in ddi_adj_list),
    ],
}

In [28]:
# Degree data per node type (0: gene/protein, 1: drug). Each entry is listed
# twice, giving as many items as adj_mats_orig holds for (0, 0) and (1, 1).
degrees = {
    0: 2 * [ppi_degrees],
    1: 2 * ddi_degrees_list,
}

In [29]:
# Shape of every adjacency matrix, grouped by node-type pair.
edge_type2dim = {
    pair: [matrix.shape for matrix in matrices]
    for pair, matrices in adj_mats_orig.items()
}

In [30]:
# Decoder name per node-type pair: 'bilinear' for every pair that involves
# node type 0, 'dedicom' for the (1, 1) pair.
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

In [31]:
# Number of adjacency matrices stored for each node-type pair.
edge_types = {
    pair: len(matrices) for pair, matrices in adj_mats_orig.items()
}

In [32]:
# Total number of edge types over all node-type pairs.
num_edge_types = sum(edge_types[pair] for pair in edge_types)
print("Edge types:", "%d" % num_edge_types)

Edge types: 1932


In [33]:
# Feature counter per node type (0: gene, 1: drug).
num_feat = {0: gene_num_feat, 1: drug_num_feat}

In [34]:
# Non-zero feature counter per node type (0: gene, 1: drug).
nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}

In [35]:
# Feature data per node type (0: gene, 1: drug).
feat = {0: gene_feat, 1: drug_feat}

## Exporting

In [37]:
# Output path of the merged data-structure pickle written below.
filename = './data_structures/DECAGON_real_DSE_NPF'

In [38]:
# Everything that gets exported, gathered in a single dictionary.
data_structures = {
    # Graph data structures
    'adj_mats_orig': adj_mats_orig,
    'degrees': degrees,
    'edge_type2dim': edge_type2dim,
    'edge_type2decoder': edge_type2decoder,
    'edge_types': edge_types,
    'num_edge_types': num_edge_types,
    # Feature data structures
    'num_feat': num_feat,
    'nonzero_feat': nonzero_feat,
    'feat': feat,
    # Name-to-index dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
}

In [39]:
with open(filename, 'wb') as f:
    # Pickle the 'data_structures' dictionary with protocol 2 (this is not the
    # highest protocol available).
    # NOTE(review): presumably kept at 2 so that Python 2 code can still read
    # the file — confirm before raising it.
    pickle.dump(data_structures, f, protocol=2)

In [None]:
# Echo the output path as the cell result.
filename