# Merged data
Loads the BDM data computed on the different servers and saves it in a single file to be loaded by the deep learning model.

## Python 3

In [1]:
import numpy as np
import scipy.sparse as sp
import pickle
import datetime

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Loading data
The final version can receive the filename of the data shelf as a parameter.

#### Importing of toy data
Run the following cell if the desired dataset is the toy dataset

In [3]:
# Toy dataset: unpickle the data shelf (a mapping of name -> data structure).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
filename = './data_structures/DS_toy_genes16271_drugs639_se664'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
# Publish every stored entry as a notebook-level variable of the same name.
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")
# Only this branch densifies the two feature matrices right after loading.
# NOTE(review): the feature-concatenation cells further down call .todense()
# again on these names, so they look incompatible with this branch - confirm.
drug_feat = drug_feat.todense()
prot_feat = prot_feat.todense()

ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


#### Importing of real data
Run the following cell if the desired dataset is the real dataset from DECAGON

In [None]:
# Real dataset: unpickle the data shelf (a mapping of name -> data structure).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
filename = './data_structures/DS_real_DSE_NPF_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
# Publish every stored entry as a notebook-level variable of the same name.
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")

### BDM DDI

In [None]:
# Load the pickled DDI BDM results (a mapping of name -> value).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
filename = './data_structures/DDI_BDM_se964_drugs639_juadia48'
with open(filename, 'rb') as f:
    ddi = pickle.load(f)
# Publish every stored entry as a notebook-level variable of the same name.
for key in ddi:
    globals()[key] = ddi[key]
    print(key, "Imported successfully")

In [None]:
# Report the resources recorded for the DDI BDM computation.
# The header previously read 'PPI resources' although every value printed
# here is a *_ddi variable; it now matches the section it belongs to.
# (time_ddi, jobs_ddi, vms_ddi and rss_ddi are presumably published by the
# DDI loading cell above - verify against the pickle's keys.)
print('DDI resources')
print('Time:', datetime.timedelta(seconds=time_ddi), 'in', jobs_ddi, 'cores')
# Memory figures are scaled by 1e-9 before being labelled GB
# (assumes the stored values are in bytes - TODO confirm).
print('Virtual memory:', vms_ddi*1e-9, 'GB. RSS memory:', rss_ddi*1e-9, 'GB')

In [None]:
# Build the DDI BDM feature block: every array in each list becomes one column.
node_ddi = np.concatenate(
    [vec.reshape(-1, 1) for vec in nodebdm_ddi_list], axis=1
)
edge_ddi = np.concatenate(
    [vec.reshape(-1, 1) for vec in edgebdm_ddi_list], axis=1
)
# Node columns first, edge columns after them.
to_add_bdm_ddi = np.concatenate([node_ddi, edge_ddi], axis=1)
# Sanity check: resulting shape and container type.
print(to_add_bdm_ddi.shape, type(to_add_bdm_ddi))

### BDM DTI

In [None]:
# This file has the gene and drug names swapped.
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
filename = './data_structures/DTI_BDM_genes16271_drugs639_juadia16'
with open(filename, 'rb') as f:
    dti = pickle.load(f)
# Publish every stored entry as a notebook-level variable of the same name.
for key in dti:
    globals()[key] = dti[key]
    print(key, "Imported successfully")

In [None]:
# Report the resources recorded for the DTI BDM computation.
print('DTI resources')
elapsed_dti = datetime.timedelta(seconds=time_dti)
print('Time:', elapsed_dti, 'in', jobs_dti, 'cores')
# Memory figures are scaled by 1e-9 before being labelled GB
# (assumes the stored values are in bytes - TODO confirm).
print(
    'Virtual memory:', vms_dti*1e-9,
    'GB. RSS memory:', rss_dti*1e-9, 'GB',
)

In [None]:
# The arrays in the DTI file were saved under swapped names, so the gene
# block is deliberately built from the "*_drugs_*" arrays and vice versa.
to_add_bdm_genes_dti = np.concatenate(
    (nodebdm_drugs_dti.reshape(-1, 1), edgebdm_drugs_dti.reshape(-1, 1)),
    axis=1,
)
to_add_bdm_drugs_dti = np.concatenate(
    (nodebdm_genes_dti.reshape(-1, 1), edgebdm_genes_dti.reshape(-1, 1)),
    axis=1,
)
# Sanity check: shape and container type of both blocks.
print(to_add_bdm_genes_dti.shape, type(to_add_bdm_genes_dti))
print(to_add_bdm_drugs_dti.shape, type(to_add_bdm_drugs_dti))

### BDM PPI

In [None]:
# Load the pickled PPI BDM results (a mapping of name -> value).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
filename = 'data_structures/PPI_BDM_genes16271_juadia48'
with open(filename, 'rb') as f:
    ppi = pickle.load(f)
# Publish every stored entry as a notebook-level variable of the same name.
for key in ppi:
    globals()[key] = ppi[key]
    print(key, "Imported successfully")

In [None]:
# Report the resources recorded for the PPI BDM computation.
print('PPI resources')
elapsed_ppi = datetime.timedelta(seconds=time_ppi)
print('Time:', elapsed_ppi, 'in', jobs_ppi, 'cores')
# Memory figures are scaled by 1e-9 before being labelled GB
# (assumes the stored values are in bytes - TODO confirm).
print(
    'Virtual memory:', vms_ppi*1e-9,
    'GB. RSS memory:', rss_ppi*1e-9, 'GB',
)

In [None]:
# PPI BDM feature block: node values in column 0, edge values in column 1.
to_add_bdm_ppi = np.concatenate(
    (nodebdm_ppi.reshape(-1, 1), edgebdm_ppi.reshape(-1, 1)),
    axis=1,
)
# Sanity check: resulting shape and container type.
print(to_add_bdm_ppi.shape, type(to_add_bdm_ppi))

## Concatenation of features

In [None]:
# Protein features: the densified original matrix followed by the DTI (gene
# side) and PPI BDM columns.
# NOTE: prot_feat is rebound to the dense result, so this cell is meant to
# run once per load (the new object is no longer a sparse matrix).
prot_feat = np.concatenate(
    [prot_feat.todense(), to_add_bdm_genes_dti, to_add_bdm_ppi],
    axis=1,
)
# Sanity check: resulting shape and container type.
print(prot_feat.shape, type(prot_feat))

In [None]:
# Normalized protein features: same layout as the protein features, but
# starting from the normalized matrix.
norm_prot_feat = np.concatenate(
    [norm_prot_feat.todense(), to_add_bdm_genes_dti, to_add_bdm_ppi],
    axis=1,
)
# Sanity check: resulting shape and container type.
print(norm_prot_feat.shape, type(norm_prot_feat))

In [None]:
# Drug features: the densified original matrix followed by the DTI (drug
# side) and DDI BDM columns, converted to a plain ndarray.
stacked_drug_feat = np.concatenate(
    [drug_feat.todense(), to_add_bdm_drugs_dti, to_add_bdm_ddi],
    axis=1,
)
drug_feat = np.asarray(stacked_drug_feat)
# Sanity check: resulting shape and container type.
print(drug_feat.shape, type(drug_feat))

## Feature matrix processing

In [10]:
def sparse_to_tuple(sparse_mx):
    """Break a SciPy sparse matrix into its coordinate-format pieces.

    Parameters
    ----------
    sparse_mx : scipy sparse matrix
        Converted to COO format first unless it already is a COO matrix.

    Returns
    -------
    tuple
        ``(coords, values, shape)``: ``coords`` holds one ``(row, col)``
        pair per stored entry, ``values`` the matching stored data and
        ``shape`` the shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack the row and column indices, then flip to one (row, col) per line.
    coords = np.vstack([coo.row, coo.col]).T
    return coords, coo.data, coo.shape

In [11]:
# Drug features: both counters take the number of feature columns.
drug_nonzero_feat = drug_num_feat = drug_feat.shape[1]
# NOTE: drug_feat is rebound to the (coords, values, shape) tuple here, so
# re-running this cell without rebuilding the matrix fails on `.shape`.
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))
# Protein (gene) features, taken from the un-normalized matrix.
gene_nonzero_feat = gene_num_feat = prot_feat.shape[1]
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))
# To use the normalized protein features instead, swap in:
#gene_nonzero_feat, gene_num_feat = 2*[norm_prot_feat.shape[1]]
#gene_feat = sparse_to_tuple(sp.coo_matrix(norm_prot_feat))
# Sanity check of the feature dimensions.
print(gene_nonzero_feat, gene_num_feat, drug_nonzero_feat, drug_num_feat)

16269 16269 630 630


## Creation of Decagon dictionaries

In [12]:
# Adjacency matrices keyed by (source node type, target node type), where
# type 0 carries the PPI/protein data and type 1 the DDI/drug data.
# Each matrix is paired with an independent transposed copy.
adj_mats_orig = {
    (0, 0): [ppi_adj, ppi_adj.transpose(copy=True)],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): ddi_adj_list + [mat.transpose(copy=True) for mat in ddi_adj_list],
}

In [13]:
# Degree entries per node type, listed twice so they line up with the
# matrix-plus-transpose entries of adj_mats_orig.
degrees = {
    0: 2 * [ppi_degrees],
    1: ddi_degrees_list + ddi_degrees_list,
}

In [14]:
# For every edge type, the shape of each of its adjacency matrices.
edge_type2dim = {
    edge_type: [mat.shape for mat in mats]
    for edge_type, mats in adj_mats_orig.items()
}

In [15]:
# Decoder name per edge type: every pair except (1, 1) uses 'bilinear';
# (1, 1) uses 'dedicom'.
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

In [16]:
# Number of adjacency matrices registered under each edge type.
edge_types = {
    edge_type: len(mats) for edge_type, mats in adj_mats_orig.items()
}

In [17]:
# Total number of adjacency matrices across all edge types.
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 16


In [18]:
# Feature count per node type (0: genes, 1: drugs).
num_feat = {0: gene_num_feat, 1: drug_num_feat}

In [19]:
# Non-zero feature count per node type (0: genes, 1: drugs).
nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}

In [20]:
# Feature tuples per node type (0: genes, 1: drugs).
feat = {0: gene_feat, 1: drug_feat}

## Exporting

In [21]:
# Output path of the merged data structures (currently the toy-named file).
filename = './data_structures/DECAGON_toy_red'
# Alternative output path, named for the real dataset with BDM features:
#filename = './data_structures/DECAGON_real_DSE_NPF_BDM'

In [22]:
# Everything that gets exported, gathered into a single dictionary.
data_structures = {
    # Graph data structures
    'adj_mats_orig': adj_mats_orig,
    'degrees': degrees,
    'edge_type2dim': edge_type2dim,
    'edge_type2decoder': edge_type2decoder,
    'edge_types': edge_types,
    'num_edge_types': num_edge_types,
    # Feature data structures
    'num_feat': num_feat,
    'nonzero_feat': nonzero_feat,
    'feat': feat,
    # Name-to-index dictionaries are currently left out of the export:
    # 'gene2idx': gene2idx,
    # 'drug2idx': drug2idx,
    # 'se_mono_name2idx': se_mono_name2idx,
    # 'se_combo_name2idx': se_combo_name2idx,
}

In [23]:
with open(filename, 'wb') as f:
    # Pickle the 'data_structures' dictionary with protocol 2 explicitly,
    # NOT the highest protocol available. Protocol 2 is also readable from
    # Python 2, which is presumably why it is pinned - TODO confirm with the
    # consumer of this file before raising it.
    pickle.dump(data_structures, f, protocol=2)