# Selected data structures
Creates a file with DECAGON data structures for a dataset of any size, reusing the matrices calculated for the complete dataset. The code selects the relevant parts of those matrices and assembles the data structures for a given subset of the data.

## Python 3

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import pickle
import datetime

## Parameters

In [2]:
# Parameters
# Pickled data-structure (DS) file describing the dataset to assemble
input_file = './data_structures/DS/DS_toy_DSE_9688_PF_5_genes_16266_drugs_627_se_6'
# Protein feature mode: one of 'PF', 'NPF' or None
prot_status = None
# Feature switches: DSE keeps the loaded drug features, BDM appends the BDM columns
DSE, BDM = True, True
# BDM filenames (only defined when the BDM features are requested)
if BDM:
    PPI_file, DTI_file, DDI_file = (
        './data_structures/BDM/PPI_BDM_toy_genes_16271_juadia48',
        './data_structures/BDM/DTI_BDM_toy_genes_16271_drugs_639_juadia8',
        './data_structures/BDM/DDI_BDM_toy_se_964_drugs_639_juadia48',
    )

In [3]:
# Sentinel initialization: every flag starts switched off and is only
# raised later from the parameters and the input file name
PF = NPF = toy = red = False

In [4]:
# Name import and decomposition
# The whole path is split on '_' so that the dataset sizes encoded in the
# file name can be read (and later replaced) by position. Directory names
# containing '_' are split as well; the pieces are re-joined with '_' when
# the full filename is generated.
words = input_file.split('_')
print(words)

['./data', 'structures/DS/DS', 'toy', 'DSE', '9688', 'PF', '5', 'genes', '16266', 'drugs', '627', 'se', '6']


In [5]:
# Determine the value of toy from the file name components
toy = toy or 'toy' in words
# Protein sentinel update: prot_status switches on at most one of NPF / PF
NPF = NPF or prot_status == 'NPF'
PF = PF or prot_status == 'PF'
if toy and NPF:
    raise ValueError("Toy model does not have normalized protein features")
# The last component is the side-effect count; fewer than 964 marks a reduced dataset
red = int(words[-1]) < 964 or red

In [6]:
# Generate full filename if reduced, otherwise use given filename
if not red:
    filename_full = input_file
else:
    # Overwrite the size fields of the name with the sizes of the complete dataset
    words[4], words[-5], words[-3], words[-1] = '9702', '16271', '639', '964'
    filename_full = '_'.join(words)
print(filename_full)

./data_structures/DS/DS_toy_DSE_9702_PF_5_genes_16271_drugs_639_se_964


## Import datasets

In [7]:
# Import full dataset
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(filename_full, 'rb') as f:
    DS = pickle.load(f)
# Publish every entry of the pickled dictionary as a notebook-level variable
globals().update(DS)
for key in DS:
    print(key, "Imported successfully")

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [8]:
# Keep the objects of the full dataset under a 'cmp' suffix so that loading
# the reduced dataset does not overwrite them
if red:
    ddi_adj_listcmp, ddi_degrees_listcmp = ddi_adj_list, ddi_degrees_list
    dti_adjcmp, ppi_adjcmp, ppi_degreescmp = dti_adj, ppi_adj, ppi_degrees
    drug_featcmp = drug_feat
    gene2idxcmp, drug2idxcmp = gene2idx, drug2idx
    se_mono_name2idxcmp, se_combo_name2idxcmp = se_mono_name2idx, se_combo_name2idx
    prot_featcmp = prot_feat
    if not toy:
        norm_prot_featcmp = norm_prot_feat
    # Load the reduced dataset over the same variable names
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    with open(input_file, 'rb') as f:
        DS = pickle.load(f)
    globals().update(DS)
    for key in DS:
        print(key, "Imported successfully")
    # New dictionaries: keys of the reduced dataset, index values looked up
    # in the dictionaries of the full dataset
    gene2idx = {name: gene2idxcmp[name] for name in gene2idx}
    drug2idx = {name: drug2idxcmp[name] for name in drug2idx}
    se_mono_name2idx = {name: se_mono_name2idxcmp[name] for name in se_mono_name2idx}
    se_combo_name2idx = {name: se_combo_name2idxcmp[name] for name in se_combo_name2idx}


gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [9]:
# Index values stored in the dictionaries; used below to pick rows from the BDM arrays
idx_genes, idx_drugs, idx_se = (
    list(mapping.values())
    for mapping in (gene2idx, drug2idx, se_combo_name2idx)
)
# Number of entries of each kind
n_drugs, n_genes, n_se_combo, n_se_mono = map(
    len, (drug2idx, gene2idx, se_combo_name2idx, se_mono_name2idx)
)

In [10]:
# No feature case
# Without a protein feature mode the genes get an identity feature matrix
# (identity test with `is`, as recommended for comparisons to None)
if prot_status is None:
    prot_feat = sp.identity(n_genes)
# When DSE is disabled the loaded drug_feat is replaced by an identity matrix
if not DSE:
    drug_feat = sp.identity(n_drugs)

## BDM Features

In [11]:
if BDM:
    # NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
    # PPI BDM dataset import
    with open(PPI_file, 'rb') as f:
        DS = pickle.load(f)
    globals().update(DS)
    for key in DS:
        print(key, "Imported successfully")
    # Keep only the entries of the selected genes
    nodebdm_ppi, add_edgebdm_ppi, rem_edgebdm_ppi = (
        vec[idx_genes]
        for vec in (nodebdm_ppi, add_edgebdm_ppi, rem_edgebdm_ppi)
    )
    # One column per BDM vector (node, add-edge, remove-edge)
    to_add_bdm_ppi = np.hstack([
        col.reshape(-1, 1)
        for col in (nodebdm_ppi, add_edgebdm_ppi, rem_edgebdm_ppi)
    ])
    print(np.shape(to_add_bdm_ppi))
    # DTI BDM dataset import
    with open(DTI_file, 'rb') as f:
        DS = pickle.load(f)
    globals().update(DS)
    for key in DS:
        print(key, "Imported successfully")
    # Drug side of the DTI vectors, restricted to the selected drugs
    nodebdm_drugs_dti, add_edgebdm_drugs_dti, rem_edgebdm_drugs_dti = (
        vec[idx_drugs]
        for vec in (nodebdm_drugs_dti, add_edgebdm_drugs_dti, rem_edgebdm_drugs_dti)
    )
    # Gene side of the DTI vectors, restricted to the selected genes
    nodebdm_genes_dti, add_edgebdm_genes_dti, rem_edgebdm_genes_dti = (
        vec[idx_genes]
        for vec in (nodebdm_genes_dti, add_edgebdm_genes_dti, rem_edgebdm_genes_dti)
    )
    to_add_bdm_drugs_dti = np.hstack([
        col.reshape(-1, 1)
        for col in (nodebdm_drugs_dti, add_edgebdm_drugs_dti, rem_edgebdm_drugs_dti)
    ])
    to_add_bdm_genes_dti = np.hstack([
        col.reshape(-1, 1)
        for col in (nodebdm_genes_dti, add_edgebdm_genes_dti, rem_edgebdm_genes_dti)
    ])
    # Verification of the resulting dimensions
    print('Dimension checking')
    print('Should be ~16k,3', np.shape(to_add_bdm_genes_dti))
    print('Should be ~630,3', np.shape(to_add_bdm_drugs_dti))
    # DDI BDM dataset import
    with open(DDI_file, 'rb') as f:
        DS = pickle.load(f)
    globals().update(DS)
    for key in DS:
        print(key, "Imported successfully")
    # For every selected side effect keep the entries of the selected drugs
    nodebdm_ddi_list = [nodebdm_ddi_list[se][idx_drugs] for se in idx_se]
    add_edgebdm_ddi_list = [add_edgebdm_ddi_list[se][idx_drugs] for se in idx_se]
    rem_edgebdm_ddi_list = [rem_edgebdm_ddi_list[se][idx_drugs] for se in idx_se]
    # One column per side effect, then the three groups side by side
    node_ddi = np.hstack([col.reshape(-1, 1) for col in nodebdm_ddi_list])
    add_edge_ddi = np.hstack([col.reshape(-1, 1) for col in add_edgebdm_ddi_list])
    rem_edge_ddi = np.hstack([col.reshape(-1, 1) for col in rem_edgebdm_ddi_list])
    to_add_bdm_ddi = np.hstack([node_ddi, add_edge_ddi, rem_edge_ddi])
    print(np.shape(to_add_bdm_ddi))

nodebdm_ppi Imported successfully
add_edgebdm_ppi Imported successfully
rem_edgebdm_ppi Imported successfully
vms_ppi Imported successfully
rss_ppi Imported successfully
time_ppi Imported successfully
jobs_ppi Imported successfully
(16266, 3)
nodebdm_drugs_dti Imported successfully
nodebdm_genes_dti Imported successfully
add_edgebdm_drugs_dti Imported successfully
add_edgebdm_genes_dti Imported successfully
rem_edgebdm_drugs_dti Imported successfully
rem_edgebdm_genes_dti Imported successfully
vms_dti Imported successfully
rss_dti Imported successfully
time_dti Imported successfully
jobs_dti Imported successfully
Dimension checking
Should be ~16k,3 (16266, 3)
Should be ~630,3 (627, 3)
nodebdm_ddi_list Imported successfully
add_edgebdm_ddi_list Imported successfully
rem_edgebdm_ddi_list Imported successfully
vms_ddi Imported successfully
rss_ddi Imported successfully
time_ddi Imported successfully
jobs_ddi Imported successfully
(627, 18)


In [12]:
# Protein and drug feature assembly.
# With BDM enabled the BDM columns are appended to (or replace) the base
# features. Sparse inputs are densified with .toarray() so that both
# prot_feat and drug_feat end up as plain ndarrays (not numpy.matrix).
if BDM:
    # Protein features
    if PF:
        prot_feat = np.hstack([prot_feat.toarray(), to_add_bdm_genes_dti, to_add_bdm_ppi])
    # Normalized protein features
    elif NPF:
        prot_feat = np.hstack([norm_prot_feat.toarray(), to_add_bdm_genes_dti, to_add_bdm_ppi])
    else:
        prot_feat = np.hstack([to_add_bdm_genes_dti, to_add_bdm_ppi])
    # Drug features
    if DSE:
        drug_feat = np.hstack([drug_feat.toarray(), to_add_bdm_drugs_dti, to_add_bdm_ddi])
    else:
        drug_feat = np.hstack([to_add_bdm_drugs_dti, to_add_bdm_ddi])
elif NPF:
    # Without BDM the normalized features must still be selected explicitly,
    # otherwise prot_feat would keep the un-normalized matrix while the
    # output file name advertises NPF.
    prot_feat = norm_prot_feat
print(np.shape(drug_feat))
print(np.shape(prot_feat))

(627, 9709)
(16266, 6)


## Feature matrix processing

In [13]:
def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix into a ``(coords, values, shape)`` triple.

    The matrix is converted to COO format when it is not already in it.
    ``coords`` has one ``[row, col]`` pair per stored entry, ``values``
    holds the matching stored data and ``shape`` is the matrix shape.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack rows over columns, then flip so each line is one (row, col) pair
    coords = np.vstack((coo.row, coo.col)).T
    return coords, coo.data, coo.shape

In [14]:
# Drug features
# The feature count and the number of non-zero entries are recorded BEFORE
# drug_feat is rebound on the last line to its (coords, values, shape) form.
drug_num_feat = drug_feat.shape[1]
drug_nonzero_feat = len(np.nonzero(drug_feat)[0])
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))

In [15]:
# Protein features
# Number of feature columns and number of non-zero entries of prot_feat
gene_num_feat = prot_feat.shape[1]
gene_nonzero_feat = len(np.nonzero(prot_feat)[0])
# gene_feat receives the (coords, values, shape) form; prot_feat itself is not rebound
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))

## Creation of Decagon dictionaries

In [16]:
# Adjacency matrices keyed by a pair of node types (0 = genes, 1 = drugs).
# Every relation is stored together with a transposed copy: PPI under (0, 0),
# DTI split over (0, 1) / (1, 0), and all DDI matrices under (1, 1).
adj_mats_orig = {
    (0, 0): [ppi_adj, ppi_adj.transpose(copy=True)],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): ddi_adj_list + [x.transpose(copy=True) for x in ddi_adj_list],
}

In [17]:
# Degrees per node type (0 = genes, 1 = drugs). Each entry is listed twice,
# matching the matrix + transposed copy layout of adj_mats_orig.
degrees = {
    0: [ppi_degrees, ppi_degrees],
    1: ddi_degrees_list + ddi_degrees_list, 
}

In [18]:
# Shape of every adjacency matrix, grouped under the same keys as adj_mats_orig
edge_type2dim = {
    edge_type: [mat.shape for mat in mats]
    for edge_type, mats in adj_mats_orig.items()
}

In [19]:
# Decoder name per edge type: 'bilinear' for every pair involving genes
# (node type 0), 'dedicom' for the drug-drug pair (1, 1)
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

In [20]:
# Number of adjacency matrices stored under each pair of node types
edge_types = {
    edge_type: len(mats)
    for edge_type, mats in adj_mats_orig.items()
}

In [21]:
# Total number of adjacency matrices over all pairs of node types
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 16


In [22]:
# Number of feature columns per node type (0 = genes, 1 = drugs)
num_feat = {
    0: gene_num_feat,
    1: drug_num_feat,
}

In [23]:
# Number of non-zero feature entries per node type (0 = genes, 1 = drugs)
nonzero_feat = {
    0: gene_nonzero_feat,
    1: drug_nonzero_feat,
}

In [24]:
# Feature matrices per node type (0 = genes, 1 = drugs), each in the
# (coords, values, shape) form returned by sparse_to_tuple
feat = {
    0: gene_feat,
    1: drug_feat,
}

In [25]:
# All DECAGON dictionaries defined together in a single cell.
# Node type 0 = genes, node type 1 = drugs.

# Adjacency matrices per pair of node types, each stored with a transposed copy
adj_mats_orig = {
    (0, 0): [ppi_adj, ppi_adj.transpose(copy=True)],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): ddi_adj_list + [ddi.transpose(copy=True) for ddi in ddi_adj_list],
}

# Degrees, listed twice to line up with the matrix + transposed copy pairs
degrees = {
    0: [ppi_degrees] * 2,
    1: ddi_degrees_list + ddi_degrees_list,
}

# Shape of every adjacency matrix
edge_type2dim = {
    edge_type: [mat.shape for mat in mats]
    for edge_type, mats in adj_mats_orig.items()
}

# Decoder name per pair of node types
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

# Number of adjacency matrices per pair of node types, and their total
edge_types = {
    edge_type: len(mats)
    for edge_type, mats in adj_mats_orig.items()
}

num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

# Feature bookkeeping per node type
num_feat = {0: gene_num_feat, 1: drug_num_feat}

nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}

feat = {0: gene_feat, 1: drug_feat}

Edge types: 16


## Saving

In [26]:
# Fragments of the output file name derived from the sentinels
real = not toy
data_str = '_toy' if toy else '_real'
PF_str = ('_PF_5' if PF else '') + ('_NPF_3' if NPF else '')

In [27]:
# Output file name: optional fragments collapse to '' when their switch is off
filename_out = ''.join([
    './data_structures/DECAGON/DECAGON',
    data_str,
    ('_DSE_' + str(n_se_mono)) if DSE else '',
    PF_str,
    '_BDM' if BDM else '',
    '_genes_', str(n_genes),
    '_drugs_', str(n_drugs),
    '_se_', str(n_se_combo),
])
print(filename_out)

./data_structures/DECAGON/DECAGON_toy_DSE_9688_BDM_genes_16266_drugs_627_se_6


In [28]:
# Everything that is written to the output file, collected in one dictionary
data_structures = {
    # Graph data structures
    'adj_mats_orig': adj_mats_orig,
    'degrees': degrees,
    'edge_type2dim': edge_type2dim,
    'edge_type2decoder': edge_type2decoder,
    'edge_types': edge_types,
    'num_edge_types': num_edge_types,
    # Feature data structures
    'num_feat': num_feat,
    'nonzero_feat': nonzero_feat,
    'feat': feat,
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
}

In [29]:
# Write the assembled dictionary to filename_out.
# NOTE(review): protocol=2 is presumably chosen so that Python 2 code can read the file - confirm.
with open(filename_out, 'wb') as f:
    pickle.dump(data_structures, f, protocol=2)