# Selected data structures
Builds and saves a file with the DECAGON data structures for a (possibly reduced) dataset, using the original and reduced matrices and, optionally, the complete BDM files.

## Python 3

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import pickle
import datetime

In [2]:
# Parameters
# Path of the pickled dataset to process. The name is later split on '_' to
# detect the 'toy' / 'DSE' tags and the side-effect count (trailing 'se<N>').
filename = './data_structures/DS_toy_genes16269_drugs630_se6'
# Protein-feature mode; None makes the notebook use identity protein features.
prot_status = None # can be 'PF', 'NPF' or None
# When True, the BDM files are loaded and, depending on PF/NPF/DSE, stacked
# onto the feature matrices.
BDM = False

In [3]:
# Sentinel initialization: every mode flag starts out disabled
PF = NPF = red = toy = DSE = False

In [4]:
# Name import and decomposition
# Splitting on '_' also cuts the directory name ('./data_structures' becomes
# './data' + 'structures/DS'); joining the tokens with '_' restores the path.
words = filename.split('_')
print(words)

['./data', 'structures/DS', 'toy', 'genes16269', 'drugs630', 'se6']


In [5]:
# Sentinel update: a flag is switched on when its condition holds and is
# otherwise left at whatever value it already had
toy |= 'toy' in words
DSE |= 'DSE' in words
NPF |= prot_status == 'NPF'
PF |= prot_status == 'PF'
# Number of side effects, parsed from the trailing 'se<N>' token
SE = int(words[-1][2:])
# Fewer than 964 side effects marks the dataset as reduced
red |= SE < 964

In [6]:
# Generate full filename if reduced, otherwise use given filename.
# The full name keeps every token except the last three and ends with the
# gene/drug/side-effect counts of the full dataset.
if red:
    filename_full = (
        ''.join(token + '_' for token in words[:-3])
        + 'genes16271_drugs639_se964'
    )
else:
    filename_full = filename
print(filename_full)

./data_structures/DS_toy_genes16271_drugs639_se964


In [7]:
# Import full dataset
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(filename_full, 'rb') as f:
    DS = pickle.load(f)
# Publish every entry of the pickled dictionary as a notebook-level variable
for key, value in DS.items():
    globals()[key] = value
    print(key,"Imported successfully")
if toy:
    # Toy data: densify the feature matrices ...
    drug_feat = drug_feat.todense()
    norm_prot_feat = prot_feat.todense()
    prot_feat = prot_feat.todense() #Necessary?
    # ... and use identity index maps (entry i maps to i)
    gene2idx = dict(zip(range(n_genes), range(n_genes)))
    drug2idx = dict(zip(range(n_drugs), range(n_drugs)))
    se_mono_name2idx = dict(zip(range(n_se_mono), range(n_se_mono)))
    se_combo_name2idx = dict(zip(range(n_se_combo), range(n_se_combo)))

n_genes Imported successfully
n_drugs Imported successfully
n_se_combo Imported successfully
n_se_mono Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [8]:
# Change names if red: keep the full-dataset objects under a 'cmp' suffix so
# that loading the reduced dataset can reuse the plain names
if red:
    # Graph structures
    ddi_adj_listcmp, ddi_degrees_listcmp = ddi_adj_list, ddi_degrees_list
    dti_adjcmp, ppi_adjcmp, ppi_degreescmp = dti_adj, ppi_adj, ppi_degrees
    # Feature matrices
    drug_featcmp, prot_featcmp, norm_prot_featcmp = (
        drug_feat, prot_feat, norm_prot_feat)
    # Index dictionaries
    gene2idxcmp, drug2idxcmp = gene2idx, drug2idx
    se_mono_name2idxcmp, se_combo_name2idxcmp = (
        se_mono_name2idx, se_combo_name2idx)

In [18]:
# Load reduced dataset if red
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
if red:
    with open(filename, 'rb') as f:
        DS = pickle.load(f)
    # Publish every pickled entry as a notebook-level variable
    for key, value in DS.items():
        globals()[key] = value
        print(key,"Imported successfully")
    # NOTE(review): the recorded output of this cell lists 'n_secombo' and no
    # 'n_se_mono' / 'n_se_combo', so the side-effect counts used below may
    # still be the ones loaded from the full dataset -- confirm.
    if toy:
        drug_feat = drug_feat.todense()
        norm_prot_feat = prot_feat.todense()
        prot_feat = prot_feat.todense() #Necessary?
        # Identity index maps (entry i maps to i)
        gene2idx = dict(zip(range(n_genes), range(n_genes)))
        drug2idx = dict(zip(range(n_drugs), range(n_drugs)))
        se_mono_name2idx = dict(zip(range(n_se_mono), range(n_se_mono)))
        se_combo_name2idx = dict(zip(range(n_se_combo), range(n_se_combo)))
    # New dictionaries: keys of the reduced dataset looked up in the
    # full-dataset ('cmp') dictionaries
    gene2idx = {name: gene2idxcmp[name] for name in gene2idx}
    drug2idx = {name: drug2idxcmp[name] for name in drug2idx}
    se_mono = {name: se_mono_name2idxcmp[name] for name in se_mono_name2idx}
    se_combo = {name: se_combo_name2idxcmp[name] for name in se_combo_name2idx}
    # Full-dataset positions of the retained genes, drugs and side effects
    idx_genes = [*gene2idx.values()]
    idx_drugs = [*drug2idx.values()]
    idx_se = [*se_combo.values()]

n_genes Imported successfully
n_drugs Imported successfully
n_secombo Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [19]:
# Display the number of entries in gene2idx
len(gene2idx)

16269

In [20]:
# No protein-feature mode selected: use identity (one-hot) protein features
if prot_status is None:
    prot_feat = sp.identity(n_genes)
# No 'DSE' tag in the filename: use identity (one-hot) drug features
if not DSE:
    drug_feat = sp.identity(n_drugs)

## BDM Features

In [11]:
if BDM:
    # Loads the PPI, DTI and DDI BDM pickles, restricts them to the retained
    # genes / drugs / side effects and stacks them onto the feature matrices.
    # NOTE(review): idx_genes, idx_drugs and idx_se are only assigned inside
    # the `if red:` cell, so this cell presumably assumes a reduced dataset
    # -- confirm.
    # NOTE: `filename` is rebound to the BDM file paths from here on.
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    # PPI BDM dataset import
    filename = './data_structures/PPI_BDM_genes16271_juadia48'
    with open(filename, 'rb') as f:
        DS = pickle.load(f)
        # Publish every pickled entry as a notebook-level variable
        # (presumably including nodebdm_ppi and edgebdm_ppi used below)
        for key in DS.keys():
            globals()[key]=DS[key]
            print(key,"Imported successfully")
    # Keep only the entries at the retained gene indices
    nodebdm_ppi = nodebdm_ppi[idx_genes]
    edgebdm_ppi = edgebdm_ppi[idx_genes]
    # Two columns per gene: node value and edge value
    to_add_bdm_ppi = np.hstack([nodebdm_ppi.reshape(-1,1),edgebdm_ppi.reshape(-1,1)])
    # DTI BDM dataset import
    filename = './data_structures/DTI_BDM_genes16271_drugs639_juadia16'
    with open(filename, 'rb') as f:
        DS = pickle.load(f)
        for key in DS.keys():
            globals()[key]=DS[key]
            print(key,"Imported successfully")
    # The '*_drugs_*' arrays are indexed with gene indices and the '*_genes_*'
    # arrays with drug indices; see the naming remark below.
    nodebdm_drugs_dti = nodebdm_drugs_dti[idx_genes]
    nodebdm_genes_dti = nodebdm_genes_dti[idx_drugs]
    edgebdm_drugs_dti = edgebdm_drugs_dti[idx_genes]
    edgebdm_genes_dti = edgebdm_genes_dti[idx_drugs]
    # Taking into account that the arrays were saved with the wrong names (Already corrected)
    # NOTE(review): it is unclear whether "already corrected" refers to the
    # saved files or to this code -- confirm the swap is still needed.
    to_add_bdm_genes_dti = np.hstack([nodebdm_drugs_dti.reshape(-1,1),
                                      edgebdm_drugs_dti.reshape(-1,1)])
    to_add_bdm_drugs_dti = np.hstack([nodebdm_genes_dti.reshape(-1,1),
                                      edgebdm_genes_dti.reshape(-1,1)])
    #verif
    print('Dimension checking')
    print('Should be ~16k,2',np.shape(to_add_bdm_genes_dti))
    print('Should be ~630,2',np.shape(to_add_bdm_drugs_dti))
    # DDI BDM dataset import
    filename = './data_structures/DDI_BDM_se964_drugs639_juadia48'
    with open(filename, 'rb') as f:
        DS = pickle.load(f)
        for key in DS.keys():
            globals()[key]=DS[key]
            print(key,"Imported successfully")
    # One array per retained side effect, restricted to the retained drugs
    nodebdm_ddi_list = [nodebdm_ddi_list[i][idx_drugs] for i in idx_se]
    edgebdm_ddi_list = [edgebdm_ddi_list[i][idx_drugs] for i in idx_se]
    # One column per side effect: node columns first, then edge columns
    node_ddi = np.hstack([i.reshape(-1,1) for i in nodebdm_ddi_list])
    edge_ddi = np.hstack([i.reshape(-1,1) for i in edgebdm_ddi_list])
    to_add_bdm_ddi = np.hstack([node_ddi,edge_ddi])
    # NOTE(review): the .todense() calls below need sparse inputs, while the
    # toy branches of the loading cells already densify drug_feat / prot_feat
    # -- confirm these combinations work for toy data.
    # Protein Features
    if PF:
        prot_feat = np.hstack([prot_feat.todense(),to_add_bdm_genes_dti,to_add_bdm_ppi])
    # Normalized Protein features
    elif NPF:
        prot_feat = np.hstack([norm_prot_feat.todense(),to_add_bdm_genes_dti,to_add_bdm_ppi])
    # Drug features
    if DSE:
        drug_feat = np.asarray(np.hstack([drug_feat.todense(),to_add_bdm_drugs_dti,to_add_bdm_ddi]))

## Feature matrix processing

In [12]:
def sparse_to_tuple(sparse_mx):
    """Return a sparse matrix as a ``(coords, values, shape)`` triple.

    The matrix is converted to COO format when it is not already in it.

    Args:
        sparse_mx: SciPy sparse matrix.

    Returns:
        coords: array with one ``[row, col]`` pair per stored entry.
        values: the stored entries, in the same order as ``coords``.
        shape: the shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack rows over columns, then flip so each line is one (row, col) pair
    indices = np.vstack([coo.row, coo.col]).T
    return indices, coo.data, coo.shape

In [13]:
# Drug features: both counters hold the number of feature columns
drug_nonzero_feat = drug_num_feat = drug_feat.shape[1]
# Store the features as a (coords, values, shape) triple
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))

In [14]:
# Protein features: both counters hold the number of feature columns
gene_nonzero_feat = gene_num_feat = prot_feat.shape[1]
# Store the features as a (coords, values, shape) triple
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))
print(gene_nonzero_feat,gene_num_feat,drug_nonzero_feat,drug_num_feat)

16271 16271 639 639


## Creation of Decagon dictionaries

In [None]:
# Adjacency matrices per (row type, column type) pair; type 0 holds the
# gene/protein data and type 1 the drug data. Each matrix is followed by (or
# paired with) its transposed copy.
adj_mats_orig = dict([
    ((0, 0), [ppi_adj, ppi_adj.transpose(copy=True)]),
    ((0, 1), [dti_adj]),
    ((1, 0), [dti_adj.transpose(copy=True)]),
    ((1, 1), ddi_adj_list + [adj.transpose(copy=True) for adj in ddi_adj_list]),
])

In [None]:
# Degree arrays per node type, repeated once so they line up with the
# matrix + transposed-copy layout of adj_mats_orig
degrees = dict([
    (0, [ppi_degrees, ppi_degrees]),
    (1, ddi_degrees_list + ddi_degrees_list),
])

In [None]:
# Shape of every adjacency matrix, grouped under the same keys as adj_mats_orig
edge_type2dim = dict(
    (edge_key, [mat.shape for mat in mats])
    for edge_key, mats in adj_mats_orig.items()
)

In [None]:
# Decoder name per edge-type key: 'dedicom' for (1, 1), 'bilinear' elsewhere
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

In [None]:
# Number of adjacency matrices stored under each edge-type key
edge_types = dict(
    (edge_key, len(mats)) for edge_key, mats in adj_mats_orig.items()
)

In [None]:
# Total number of adjacency matrices over all edge-type keys
num_edge_types = sum(edge_types.values())
print("Edge types:", num_edge_types)

In [None]:
# Feature count per node type (0: genes, 1: drugs)
num_feat = dict(zip((0, 1), (gene_num_feat, drug_num_feat)))

In [None]:
# Non-zero feature counter per node type (0: genes, 1: drugs)
nonzero_feat = dict([(0, gene_nonzero_feat), (1, drug_nonzero_feat)])

In [None]:
# Feature triples per node type (0: genes, 1: drugs)
feat = dict(zip((0, 1), (gene_feat, drug_feat)))

In [None]:
# All Decagon dictionaries built in a single cell.
# Node type 0 holds the gene/protein data and node type 1 the drug data.

# Adjacency matrices per (row type, column type) pair, each together with its
# transposed copy
adj_mats_orig = dict([
    ((0, 0), [ppi_adj, ppi_adj.transpose(copy=True)]),
    ((0, 1), [dti_adj]),
    ((1, 0), [dti_adj.transpose(copy=True)]),
    ((1, 1), ddi_adj_list + [adj.transpose(copy=True) for adj in ddi_adj_list]),
])

# Degree arrays per node type, repeated to match the matrix + transpose layout
degrees = dict([
    (0, [ppi_degrees, ppi_degrees]),
    (1, ddi_degrees_list + ddi_degrees_list),
])

# Shape of every adjacency matrix, under the same keys as adj_mats_orig
edge_type2dim = dict(
    (edge_key, [mat.shape for mat in mats])
    for edge_key, mats in adj_mats_orig.items()
)

# Decoder name per edge-type key: 'dedicom' for (1, 1), 'bilinear' elsewhere
edge_type2decoder = dict.fromkeys([(0, 0), (0, 1), (1, 0)], 'bilinear')
edge_type2decoder[(1, 1)] = 'dedicom'

# Number of adjacency matrices per edge-type key, and their total
edge_types = dict(
    (edge_key, len(mats)) for edge_key, mats in adj_mats_orig.items()
)
num_edge_types = sum(edge_types.values())
print("Edge types:", num_edge_types)

# Feature bookkeeping per node type
num_feat = dict(zip((0, 1), (gene_num_feat, drug_num_feat)))
nonzero_feat = dict(zip((0, 1), (gene_nonzero_feat, drug_nonzero_feat)))
feat = dict(zip((0, 1), (gene_feat, drug_feat)))

In [None]:
# Dataset tag used in the output filename
data_str = 'toy_' if toy else 'real_'
# Protein-feature tag: 'PF_' and/or 'NPF_' when the matching flag is set
PF_str = ('PF_' if PF else '') + ('NPF_' if NPF else '')

In [None]:
# Output path: fixed prefix followed by one tag per active option
filename_out = ''.join([
    './data_structures/DECAGON_',
    data_str,
    'reduced_' if red else '',
    'DSE_' if DSE else '',
    PF_str,
    'BDM' if BDM else '',
])

In [None]:
# Collect everything that is written to the output file
data_structures = {}
# Graph data structures
data_structures['adj_mats_orig'] = adj_mats_orig
data_structures['degrees'] = degrees
data_structures['edge_type2dim'] = edge_type2dim
data_structures['edge_type2decoder'] = edge_type2decoder
data_structures['edge_types'] = edge_types
data_structures['num_edge_types'] = num_edge_types
# Feature data structures
data_structures['num_feat'] = num_feat
data_structures['nonzero_feat'] = nonzero_feat
data_structures['feat'] = feat
# Dictionaries
data_structures['gene2idx'] = gene2idx
data_structures['drug2idx'] = drug2idx
# se_mono / se_combo are only created when the dataset is reduced; for a full
# dataset they do not exist (NameError), so the loaded dictionaries are used.
data_structures['se_mono_name2idx'] = se_mono if red else se_mono_name2idx
data_structures['se_combo_name2idx'] = se_combo if red else se_combo_name2idx

In [None]:
with open(filename_out, 'wb') as f:
    # Pickle the 'data_structures' dictionary with protocol 2 (not the highest
    # protocol available) -- presumably so older Python versions can read the
    # file; confirm before changing.
    pickle.dump(data_structures, f, protocol=2)