# Selected data structures
Creates a file with the DECAGON data structures of either a reduced or the complete dataset. The structures are built from the original and/or reduced matrices and, optionally, from the complete BDM files.

## Python 3

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import pickle
import datetime

## Paths initialization

In [2]:
# Parameters
# Pickled dataset to convert. Its name is later split on '_' to look for the
# 'toy' token and to read the trailing side-effect count.
input_file = './data_structures/DS/DS_real_DSE_9702_PF_5_NPF_3_genes_16271_drugs_639_se_964'
prot_status = None # can be 'PF', 'NPF' or None
# When False, the drug features are replaced by an identity matrix further down.
DSE = True
# When True, the BDM files are loaded and their columns are stacked onto the features.
BDM = True

In [3]:
# Sentinel initialization: every flag starts disabled and is only switched on
# after the parameters and the input file name have been inspected.
PF = NPF = toy = red = False

In [4]:
# Name import and decomposition
# Splitting on '_' also cuts the directory name 'data_structures' in two, so
# the first tokens are path fragments; the later tokens carry the dataset tags.
words = input_file.split('_')
print(words)

['./data', 'structures/DS/DS', 'real', 'DSE', '9702', 'PF', '5', 'NPF', '3', 'genes', '16271', 'drugs', '639', 'se', '964']


In [5]:
# Determines the value of toy
if 'toy' in words:
    toy = True
# Protein sentinel update
if prot_status == 'NPF':
    NPF = True
elif prot_status == 'PF':
    PF = True
# Raised before any data is loaded: this combination is rejected outright.
if toy and NPF:
    raise ValueError("Toy model does not have normalized protein features")
# The last token of the file name is the number of side effects. The complete
# dataset holds 964 of them (the 'se_964' suffix of the complete file name),
# so a smaller count is what identifies a reduced dataset.
if int(words[-1]) < 964:
    red = True

In [6]:
# Control printing: echo every run flag as "<name> <value>", one per line
for flag_name, flag_value in (
        ('toy', toy),
        ('DSE', DSE),
        ('prot_status', prot_status),
        ('PF', PF),
        ('NPF', NPF),
        ('red', red),
):
    print(flag_name, flag_value)

toy False
DSE True
prot_status PF
PF True
NPF False
red False


In [None]:
# Generate full filename if reduced, otherwise use given filename.
# The name is assembled only from values available at this point, so the cell
# no longer depends on strings that are defined in the saving section.
if red:
    # Dataset family tag, derived from the `toy` flag.
    full_tag = 'toy' if toy else 'real'
    # NOTE(review): the counts below are the hard-coded ones of the complete
    # 'real' dataset (same pattern as input_file); confirm them for toy data.
    filename_full = ('./data_structures/DS/DS_' + full_tag + '_DSE_9702_'
                     + 'PF_5_NPF_3' + '_genes_16271_drugs_639_se_964')
else:
    filename_full = input_file
print(filename_full)

In [12]:
# Generate BDM filenames
# One pickle per network: PPI, DTI and DDI.
# NOTE(review): these paths are hard-coded to the 'toy' BDM files while
# input_file names a 'real' dataset - confirm they match the dataset in use.
PPI_file = './data_structures/BDM/PPI_BDM_toy_genes_500_juan8'
DTI_file = './data_structures/BDM/DTI_BDM_toy_genes_500_drugs_400_juan8'
DDI_file = './data_structures/BDM/DDI_BDM_toy_se_3_drugs_400_juan8'

## Import datasets

In [None]:
# Import full dataset
# NOTE: unpickling can run arbitrary code; only load files from a trusted source.
with open(filename_full, 'rb') as f:
    DS = pickle.load(f)
# Publish every entry of the loaded mapping as a notebook-level variable
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")

In [None]:
# Control Printing
# Each figure is the sum of a matrix's entries divided by its number of cells,
# expressed as a percentage.
print('The PPI adj matrix is filled in a',round(np.sum(ppi_adj)/pow(len(gene2idx),2)*100,5),'%')
print('The DTI adj matrix is filled in a',
      round(np.sum(dti_adj)/(len(gene2idx)*len(drug2idx))*100,2),'%')
print('Drug feat',round(np.sum(drug_feat)/(len(drug2idx)*len(se_mono_name2idx))*100,2),'%')
# Protein features are first turned into a mask of strictly positive entries,
# so the percentage counts positions rather than summing feature values.
PF_bin = prot_feat>0
print(np.sum(PF_bin))
print('prot feat',round(np.sum(PF_bin)/(np.shape(PF_bin)[0]*np.shape(PF_bin)[1])*100,2),'%')
# For the DDI matrices the percentage is computed per matrix and then averaged.
print('The DDI adj matrix is filled in average in a',
      round(np.mean(np.fromiter((np.sum(x)/(len(drug2idx)*len(drug2idx))
                      *100 for x in ddi_adj_list),float)),2),'%')

In [None]:
# Change names if red: keep the objects loaded from the complete dataset under
# a 'cmp' suffix so the plain names can be rebound afterwards.
if red:
    # Adjacency matrices and degrees
    ddi_adj_listcmp, ddi_degrees_listcmp = ddi_adj_list, ddi_degrees_list
    dti_adjcmp, ppi_adjcmp, ppi_degreescmp = dti_adj, ppi_adj, ppi_degrees
    # Feature matrices
    drug_featcmp, prot_featcmp, norm_prot_featcmp = drug_feat, prot_feat, norm_prot_feat
    # Name-to-index dictionaries
    gene2idxcmp, drug2idxcmp = gene2idx, drug2idx
    se_mono_name2idxcmp, se_combo_name2idxcmp = se_mono_name2idx, se_combo_name2idx

In [None]:
# Load reduced dataset if red
if red:
    with open(input_file, 'rb') as f:
        DS = pickle.load(f)
    # Publish every entry of the loaded mapping as a notebook-level variable
    for key in DS:
        globals()[key] = DS[key]
        print(key, "Imported successfully")
    # New dictionaries: keep the keys of the reduced dataset, but take each
    # index from the corresponding dictionary of the complete dataset
    gene2idx = {name: gene2idxcmp[name] for name in gene2idx}
    drug2idx = {name: drug2idxcmp[name] for name in drug2idx}
    se_mono_name2idx = {name: se_mono_name2idxcmp[name] for name in se_mono_name2idx}
    se_combo_name2idx = {name: se_combo_name2idxcmp[name] for name in se_combo_name2idx}

# Index lists (used below to select entries of the BDM arrays) and entity counts
idx_genes, idx_drugs = list(gene2idx.values()), list(drug2idx.values())
idx_se = list(se_combo_name2idx.values())
n_drugs, n_genes = len(drug2idx), len(gene2idx)
n_se_combo, n_se_mono = len(se_combo_name2idx), len(se_mono_name2idx)

In [None]:
# No feature case: fall back to an identity matrix, one row/column per node
if prot_status is None:
    prot_feat = sp.identity(n_genes)
if not DSE:
    drug_feat = sp.identity(n_drugs)

## BDM Features

In [None]:
if BDM:
    # PPI BDM dataset import
    with open(PPI_file, 'rb') as f:
        DS = pickle.load(f)
    # Publish every entry of the loaded mapping as a notebook-level variable
    for key in DS:
        globals()[key] = DS[key]
        print(key, "Imported successfully")
    # Keep only the entries at the selected gene indices
    nodebdm_ppi = nodebdm_ppi[idx_genes]
    add_edgebdm_ppi = add_edgebdm_ppi[idx_genes]
    rem_edgebdm_ppi = rem_edgebdm_ppi[idx_genes]
    # Side-by-side columns, in order: node, add_edge, rem_edge
    to_add_bdm_ppi = np.hstack([
        column.reshape(-1, 1)
        for column in (nodebdm_ppi, add_edgebdm_ppi, rem_edgebdm_ppi)
    ])

In [None]:
if BDM:
    # DTI BDM dataset import
    with open(DTI_file, 'rb') as f:
        DS = pickle.load(f)
    # Publish every entry of the loaded mapping as a notebook-level variable
    for key in DS:
        globals()[key] = DS[key]
        print(key, "Imported successfully")
    # Drug side: keep only the entries at the selected drug indices
    nodebdm_drugs_dti = nodebdm_drugs_dti[idx_drugs]
    add_edgebdm_drugs_dti = add_edgebdm_drugs_dti[idx_drugs]
    rem_edgebdm_drugs_dti = rem_edgebdm_drugs_dti[idx_drugs]
    # Gene side: keep only the entries at the selected gene indices
    nodebdm_genes_dti = nodebdm_genes_dti[idx_genes]
    add_edgebdm_genes_dti = add_edgebdm_genes_dti[idx_genes]
    rem_edgebdm_genes_dti = rem_edgebdm_genes_dti[idx_genes]
    # Side-by-side columns, in order: node, add_edge, rem_edge
    to_add_bdm_drugs_dti = np.hstack([
        column.reshape(-1, 1)
        for column in (nodebdm_drugs_dti, add_edgebdm_drugs_dti, rem_edgebdm_drugs_dti)
    ])
    to_add_bdm_genes_dti = np.hstack([
        column.reshape(-1, 1)
        for column in (nodebdm_genes_dti, add_edgebdm_genes_dti, rem_edgebdm_genes_dti)
    ])
    # Verification
    print('Dimension checking')
    print('Should be ~16k,3', np.shape(to_add_bdm_genes_dti))
    print('Should be ~630,3', np.shape(to_add_bdm_drugs_dti))

In [None]:
if BDM:
    # DDI BDM dataset import. This must read DDI_file: the DTI file does not
    # provide the *_ddi_list entries used below.
    with open(DDI_file, 'rb') as f:
        DS = pickle.load(f)
    # Publish every entry of the loaded mapping as a notebook-level variable
    for key in DS:
        globals()[key] = DS[key]
        print(key, "Imported successfully")
    # For each selected side effect, keep the entries at the selected drug indices
    nodebdm_ddi_list = [nodebdm_ddi_list[i][idx_drugs] for i in idx_se]
    add_edgebdm_ddi_list = [add_edgebdm_ddi_list[i][idx_drugs] for i in idx_se]
    rem_edgebdm_ddi_list = [rem_edgebdm_ddi_list[i][idx_drugs] for i in idx_se]
    # One column per selected side effect in each of the three blocks
    node_ddi = np.hstack([i.reshape(-1, 1) for i in nodebdm_ddi_list])
    add_edge_ddi = np.hstack([i.reshape(-1, 1) for i in add_edgebdm_ddi_list])
    rem_edge_ddi = np.hstack([i.reshape(-1, 1) for i in rem_edgebdm_ddi_list])
    # Final layout: node block, then add_edge block, then rem_edge block
    to_add_bdm_ddi = np.hstack([node_ddi, add_edge_ddi, rem_edge_ddi])
    print(np.shape(to_add_bdm_ddi))

In [None]:
# Protein Features
# Stacks the BDM columns next to the selected base features (or uses the BDM
# columns alone when no base feature is selected). The base matrices are made
# dense with .todense() before being stacked with the BDM arrays.
# NOTE(review): when BDM is False this cell changes nothing, so in the NPF case
# prot_feat is not replaced by norm_prot_feat here - confirm that is intended.
if BDM:
    if PF:
        prot_feat = np.hstack([prot_feat.todense(),to_add_bdm_genes_dti,to_add_bdm_ppi])
    # Normalized Protein features
    elif NPF:
        prot_feat = np.hstack([norm_prot_feat.todense(),to_add_bdm_genes_dti,to_add_bdm_ppi])
    else:
        # No protein features selected: the BDM columns are the only features
        prot_feat = np.hstack([to_add_bdm_genes_dti,to_add_bdm_ppi])
    # Drug features
    if DSE:
        drug_feat = np.asarray(np.hstack([drug_feat.todense(),
                                          to_add_bdm_drugs_dti,to_add_bdm_ddi]))
    else:
        # No drug features selected: the BDM columns are the only features
        drug_feat = np.hstack([to_add_bdm_drugs_dti,to_add_bdm_ddi])
print(np.shape(drug_feat))
print(np.shape(prot_feat))

## Feature matrix processing

In [None]:
def sparse_to_tuple(sparse_mx):
    """Break a sparse matrix down into its coordinate-format components.

    The matrix is converted to COO format unless it already is in that format.

    Returns:
        A ``(coords, values, shape)`` tuple: ``coords`` has one ``[row, col]``
        pair per stored entry, ``values`` holds the matching stored data, and
        ``shape`` is the shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Rows and columns are stacked as two rows, then flipped into (n, 2) pairs
    coords = np.vstack((coo.row, coo.col)).transpose()
    return coords, coo.data, coo.shape

In [None]:
# Drug features: both counters are set to the number of feature columns
drug_num_feat = drug_feat.shape[1]
drug_nonzero_feat = drug_num_feat
# Replace the matrix by its (coords, values, shape) tuple form
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))

In [None]:
# Protein features: both counters are set to the number of feature columns
gene_num_feat = prot_feat.shape[1]
gene_nonzero_feat = gene_num_feat
# Stored under a new name as a (coords, values, shape) tuple
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))
print(gene_nonzero_feat, gene_num_feat, drug_nonzero_feat, drug_num_feat)

## Creation of Decagon dictionaries

In [None]:
# Adjacency matrices per (source type, target type) pair; node type 0 is
# built from the gene matrices and node type 1 from the drug matrices.
adj_mats_orig = {}
adj_mats_orig[(0, 0)] = [ppi_adj, ppi_adj.transpose(copy=True)]
adj_mats_orig[(0, 1)] = [dti_adj]
adj_mats_orig[(1, 0)] = [dti_adj.transpose(copy=True)]
# Every DDI matrix is listed first, followed by the transposed copies
adj_mats_orig[(1, 1)] = ddi_adj_list + [mat.transpose(copy=True) for mat in ddi_adj_list]

In [None]:
# Degrees per node type, repeated twice to line up with the matrix lists
# that hold each adjacency matrix followed by its transposed copy.
degrees = dict([
    (0, [ppi_degrees] * 2),
    (1, ddi_degrees_list + ddi_degrees_list),
])

In [None]:
# Shape of every adjacency matrix, grouped by edge type
edge_type2dim = {
    pair: [mat.shape for mat in mats]
    for pair, mats in adj_mats_orig.items()
}

In [None]:
# Decoder name per edge type: 'bilinear' everywhere except for (1, 1)
edge_type2decoder = {pair: 'bilinear' for pair in ((0, 0), (0, 1), (1, 0))}
edge_type2decoder[(1, 1)] = 'dedicom'

In [None]:
# Number of adjacency matrices stored for each edge type
edge_types = {pair: len(mats) for pair, mats in adj_mats_orig.items()}

In [None]:
# Total number of adjacency matrices over all edge types
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

In [None]:
# Number of features per node type (0: genes, 1: drugs)
num_feat = {0: gene_num_feat, 1: drug_num_feat}

In [None]:
# Non-zero feature counter per node type (0: genes, 1: drugs)
nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}

In [None]:
# Feature tuples per node type (0: genes, 1: drugs)
feat = {0: gene_feat, 1: drug_feat}

In [None]:
# Single-cell construction of all Decagon dictionaries.
# Node type 0 is built from the gene objects and node type 1 from the drug objects.

# Adjacency matrices per (source type, target type) pair
adj_mats_orig = {}
adj_mats_orig[(0, 0)] = [ppi_adj, ppi_adj.transpose(copy=True)]
adj_mats_orig[(0, 1)] = [dti_adj]
adj_mats_orig[(1, 0)] = [dti_adj.transpose(copy=True)]
# Every DDI matrix is listed first, followed by the transposed copies
adj_mats_orig[(1, 1)] = ddi_adj_list + [mat.transpose(copy=True) for mat in ddi_adj_list]

# Degrees, repeated twice to line up with the matrix + transposed-copy lists
degrees = dict([
    (0, [ppi_degrees] * 2),
    (1, ddi_degrees_list + ddi_degrees_list),
])

# Shape of every adjacency matrix, grouped by edge type
edge_type2dim = {
    pair: [mat.shape for mat in mats]
    for pair, mats in adj_mats_orig.items()
}

# Decoder name per edge type: 'bilinear' everywhere except for (1, 1)
edge_type2decoder = {pair: 'bilinear' for pair in ((0, 0), (0, 1), (1, 0))}
edge_type2decoder[(1, 1)] = 'dedicom'

# Number of adjacency matrices per edge type, and their total
edge_types = {pair: len(mats) for pair, mats in adj_mats_orig.items()}
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

# Feature descriptors per node type
num_feat = {0: gene_num_feat, 1: drug_num_feat}
nonzero_feat = {0: gene_nonzero_feat, 1: drug_nonzero_feat}
feat = {0: gene_feat, 1: drug_feat}

## Saving

In [8]:
# Name fragments of the output file, chosen from the run flags
real = not toy
data_str = '_toy' if toy else '_real'
# Each protein-feature tag is only present when its flag is set
PF_str = ('_PF_5' if PF else '') + ('_NPF_3' if NPF else '')

In [9]:
# Output path: optional tags are included only when their flag is set, and
# the entity counts come from the dictionaries of the dataset in use.
filename_out = ''.join([
    './data_structures/DECAGON',
    data_str,
    ('_DSE_' + str(n_se_mono)) if DSE else '',
    PF_str,
    '_BDM' if BDM else '',
    '_genes_', str(n_genes),
    '_drugs_', str(n_drugs),
    '_se_', str(n_se_combo),
])

In [None]:
# Everything that gets written to the output file, in a single dictionary
data_structures = {
    # Graph data structures
    'adj_mats_orig': adj_mats_orig,
    'degrees': degrees,
    'edge_type2dim': edge_type2dim,
    'edge_type2decoder': edge_type2decoder,
    'edge_types': edge_types,
    'num_edge_types': num_edge_types,
    # Feature data structures
    'num_feat': num_feat,
    'nonzero_feat': nonzero_feat,
    'feat': feat,
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
}

In [10]:
print(filename_out)
# Control printing: echo every run flag as "<name> <value>", one per line
for flag_name, flag_value in (
        ('toy', toy),
        ('DSE', DSE),
        ('prot_status', prot_status),
        ('PF', PF),
        ('NPF', NPF),
        ('red', red),
):
    print(flag_name, flag_value)

./data_structures/DECAGON_real_DSE_7374_PF_5_BDM_genes_80085_drugs_5350_se_69
toy False
DSE True
prot_status PF
PF True
NPF False
red False


In [None]:
# Write the collected data structures to the output file.
# protocol=2 is requested explicitly instead of the interpreter's default.
with open(filename_out, 'wb') as f:
    pickle.dump(data_structures, f, protocol=2)