In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import os
import psutil
import shelve
import pickle
from pybdm import BDM
from pybdm.utils import decompose_dataset
from joblib import Parallel, delayed
from joblib import parallel_backend
from data.algorithms import PerturbationExperiment, NodePerturbationExperiment
import math
import datetime
from itertools import product

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## Selection of the reduced dataset from the full dataset

In [80]:
# Load the full dataset and publish every stored entry as a notebook-level variable
filename = './data/data_structures/DS_real_DSE_NPF_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully
norm_prot_feat Imported successfully


In [81]:
# Alias the full-dataset objects with a `cmp` suffix so they stay reachable
# once the reduced dataset is loaded under the same base names.
(gene2idxcmp, drug2idxcmp,
 se_mono_name2idxcmp, se_combo_name2idxcmp) = (gene2idx, drug2idx,
                                               se_mono_name2idx, se_combo_name2idx)
ddi_adj_listcmp, ddi_degrees_listcmp = ddi_adj_list, ddi_degrees_list
dti_adjcmp, ppi_adjcmp, ppi_degreescmp = dti_adj, ppi_adj, ppi_degrees
drug_featcmp, prot_featcmp, norm_prot_featcmp = drug_feat, prot_feat, norm_prot_feat

In [82]:
# Load the reduced dataset; its entries replace the same-named globals of the full one
filename = './data/data_structures/DS_real_DSE_NPF_genes16269_drugs630_se6'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully
norm_prot_feat Imported successfully


In [83]:
# For every name present in the reduced dataset, look up its index in the FULL dataset
genes = {name: gene2idxcmp[name] for name in gene2idx.keys()}
drugs = {name: drug2idxcmp[name] for name in drug2idx.keys()}
se_mono = {name: se_mono_name2idxcmp[name] for name in se_mono_name2idx.keys()}
se_combo = {name: se_combo_name2idxcmp[name] for name in se_combo_name2idx.keys()}

In [84]:
# For each entity type print: number of kept entries, largest full-dataset index
# among them, and the size of the full-dataset map.
for reduced_map, full_map in (
    (genes, gene2idxcmp),
    (drugs, drug2idxcmp),
    (se_mono, se_mono_name2idxcmp),
    (se_combo, se_combo_name2idxcmp),
):
    print(len(reduced_map))
    print(max(reduced_map.values()))
    print(len(full_map))

16269
16270
16271
630
637
639
9688
9701
9702
6
42
964


In [85]:
# Full-dataset indices of the kept genes, drugs and combo side effects (as lists)
idx_genes, idx_drugs, idx_se = (
    list(mapping.values()) for mapping in (genes, drugs, se_combo)
)

In [86]:
# Load the PPI BDM results and publish every stored entry as a notebook-level variable
filename = './data/data_structures/PPI_BDM_genes16271_juadia48'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

nodebdm_ppi Imported successfully
edgebdm_ppi Imported successfully
vms_ppi Imported successfully
rss_ppi Imported successfully
time_ppi Imported successfully
jobs_ppi Imported successfully


In [87]:
# Keep only the PPI BDM entries of the genes present in the reduced dataset.
# Note: the arrays are overwritten, so this cell must run once per load.
nodebdm_ppi, edgebdm_ppi = (
    bdm[idx_genes] for bdm in (nodebdm_ppi, edgebdm_ppi)
)

In [88]:
# Two-column block: node BDM | edge BDM, one row per gene
to_add_bdm_ppi = np.concatenate(
    [nodebdm_ppi.reshape(-1, 1), edgebdm_ppi.reshape(-1, 1)],
    axis=1,
)
# sanity check
print(to_add_bdm_ppi.shape, type(to_add_bdm_ppi))

(16269, 2) <class 'numpy.ndarray'>


In [89]:
# Load the DTI BDM results and publish every stored entry as a notebook-level variable
filename = './data/data_structures/DTI_BDM_genes16271_drugs639_juadia16'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

nodebdm_drugs_dti Imported successfully
nodebdm_genes_dti Imported successfully
edgebdm_drugs_dti Imported successfully
edgebdm_genes_dti Imported successfully
vms_dti Imported successfully
rss_dti Imported successfully
time_dti Imported successfully
jobs_dti Imported successfully


In [90]:
# The `*_drugs_dti` arrays are indexed by gene and the `*_genes_dti` arrays by drug:
# the stored names are swapped (see the note in the following cell).
nodebdm_drugs_dti, edgebdm_drugs_dti = (
    bdm[idx_genes] for bdm in (nodebdm_drugs_dti, edgebdm_drugs_dti)
)
nodebdm_genes_dti, edgebdm_genes_dti = (
    bdm[idx_drugs] for bdm in (nodebdm_genes_dti, edgebdm_genes_dti)
)

In [91]:
# The arrays were saved under swapped names: the `*_drugs_dti` arrays hold the
# per-gene values and the `*_genes_dti` arrays hold the per-drug values.
to_add_bdm_genes_dti = np.concatenate(
    [nodebdm_drugs_dti.reshape(-1, 1), edgebdm_drugs_dti.reshape(-1, 1)],
    axis=1,
)
to_add_bdm_drugs_dti = np.concatenate(
    [nodebdm_genes_dti.reshape(-1, 1), edgebdm_genes_dti.reshape(-1, 1)],
    axis=1,
)
# sanity check
print(to_add_bdm_genes_dti.shape, type(to_add_bdm_genes_dti))
print(to_add_bdm_drugs_dti.shape, type(to_add_bdm_drugs_dti))

(16269, 2) <class 'numpy.ndarray'>
(630, 2) <class 'numpy.ndarray'>


In [92]:
# Load the DDI BDM results and publish every stored entry as a notebook-level variable
filename = './data/data_structures/DDI_BDM_se964_drugs639_juadia48'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

nodebdm_ddi_list Imported successfully
edgebdm_ddi_list Imported successfully
vms_ddi Imported successfully
rss_ddi Imported successfully
time_ddi Imported successfully
jobs_ddi Imported successfully


In [93]:
# Keep only the selected side effects and, within each, only the selected drugs
nodebdm_ddi_list, edgebdm_ddi_list = (
    [per_se[se][idx_drugs] for se in idx_se]
    for per_se in (nodebdm_ddi_list, edgebdm_ddi_list)
)

In [94]:
# Build the DDI BDM feature block: one node column per side effect,
# followed by one edge column per side effect.
node_ddi = np.concatenate([col.reshape(-1, 1) for col in nodebdm_ddi_list], axis=1)
edge_ddi = np.concatenate([col.reshape(-1, 1) for col in edgebdm_ddi_list], axis=1)
to_add_bdm_ddi = np.concatenate([node_ddi, edge_ddi], axis=1)
# sanity check
print(np.shape(to_add_bdm_ddi), type(to_add_bdm_ddi))

(630, 12) <class 'numpy.ndarray'>


## Concatenation of features

In [95]:
# Protein features: original (densified) features | DTI BDM columns | PPI BDM columns
prot_feat = np.concatenate(
    (prot_feat.todense(), to_add_bdm_genes_dti, to_add_bdm_ppi),
    axis=1,
)
# sanity check
print(np.shape(prot_feat), type(prot_feat))

(16269, 9) <class 'numpy.matrix'>


In [96]:
# Normalized protein features: densified features | DTI BDM columns | PPI BDM columns
norm_prot_feat = np.concatenate(
    (norm_prot_feat.todense(), to_add_bdm_genes_dti, to_add_bdm_ppi),
    axis=1,
)
print(np.shape(norm_prot_feat), type(norm_prot_feat))

(16269, 7) <class 'numpy.matrix'>


In [97]:
# Drug features: densified features | DTI BDM columns | DDI BDM columns,
# converted to a plain ndarray at the end.
drug_blocks = (drug_feat.todense(), to_add_bdm_drugs_dti, to_add_bdm_ddi)
drug_feat = np.asarray(np.concatenate(drug_blocks, axis=1))
# sanity check
print(np.shape(drug_feat), type(drug_feat))

(630, 9702) <class 'numpy.ndarray'>


## Feature matrix processing

In [98]:
def sparse_to_tuple(sparse_mx):
    """Split a SciPy sparse matrix into its COO components.

    The matrix is converted to COO format unless it already is one.

    Returns:
        (coords, values, shape) where ``coords`` is an (nnz, 2) array of
        (row, col) pairs, ``values`` the matching stored entries and
        ``shape`` the matrix shape.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack rows over cols, then flip so that each line is one (row, col) pair
    index_pairs = np.vstack((coo.row, coo.col)).T
    return index_pairs, coo.data, coo.shape

In [99]:
# Drugs: both counters are set to the number of feature columns, then the
# dense matrix is replaced by its (coords, values, shape) representation.
drug_num_feat = drug_feat.shape[1]
drug_nonzero_feat = drug_num_feat
drug_feat = sparse_to_tuple(sp.coo_matrix(drug_feat))


In [101]:
# Gene features: an identity matrix with one row per gene. This overwrites the
# `prot_feat` matrix assembled in the "Concatenation of features" section.
prot_feat = sp.identity(len(genes))
gene_num_feat = prot_feat.shape[1]
gene_nonzero_feat = gene_num_feat
gene_feat = sparse_to_tuple(sp.coo_matrix(prot_feat))
# Alternative (disabled): use the normalized protein features instead
#gene_nonzero_feat, gene_num_feat = 2*[norm_prot_feat.shape[1]]
#gene_feat = sparse_to_tuple(sp.coo_matrix(norm_prot_feat))
# sanity check
print(gene_nonzero_feat,gene_num_feat,drug_nonzero_feat,drug_num_feat)

16269 16269 9702 9702


## Creation of Decagon dictionaries

In [102]:
# Adjacency matrices per (source node type, target node type); 0 = gene, 1 = drug.
# Each relation is listed together with its transposed copy.
adj_mats_orig = {}
adj_mats_orig[(0, 0)] = [ppi_adj, ppi_adj.transpose(copy=True)]
adj_mats_orig[(0, 1)] = [dti_adj]
adj_mats_orig[(1, 0)] = [dti_adj.transpose(copy=True)]
adj_mats_orig[(1, 1)] = ddi_adj_list + [adj.transpose(copy=True) for adj in ddi_adj_list]

In [103]:
# Degree entries per node type, repeated once for the transposed relations
degrees = {}
degrees[0] = [ppi_degrees, ppi_degrees]
degrees[1] = ddi_degrees_list + ddi_degrees_list

In [104]:
# Shape of every adjacency matrix, grouped by edge type
edge_type2dim = {
    edge: [matrix.shape for matrix in matrices]
    for edge, matrices in adj_mats_orig.items()
}

In [105]:
# Decoder name per edge type: 'bilinear' everywhere except drug-drug, which uses 'dedicom'
edge_type2decoder = {edge: 'bilinear' for edge in [(0, 0), (0, 1), (1, 0)]}
edge_type2decoder[(1, 1)] = 'dedicom'

In [106]:
# Number of adjacency matrices stored for each edge type
edge_types = {edge: len(matrices) for edge, matrices in adj_mats_orig.items()}

In [107]:
# Total number of adjacency matrices over all edge types
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 16


In [108]:
# Feature count per node type (0 = gene, 1 = drug)
num_feat = dict(zip((0, 1), (gene_num_feat, drug_num_feat)))

In [109]:
# `*_nonzero_feat` counter per node type (0 = gene, 1 = drug)
nonzero_feat = dict(zip((0, 1), (gene_nonzero_feat, drug_nonzero_feat)))

In [110]:
# Feature tuples (coords, values, shape) per node type (0 = gene, 1 = drug)
feat = dict(zip((0, 1), (gene_feat, drug_feat)))

In [111]:
# Single-cell version of the dictionary construction done step by step in the
# preceding cells; every value is rebuilt from the same inputs.

# Adjacency matrices per (source node type, target node type); 0 = gene, 1 = drug
adj_mats_orig = {}
adj_mats_orig[(0, 0)] = [ppi_adj, ppi_adj.transpose(copy=True)]
adj_mats_orig[(0, 1)] = [dti_adj]
adj_mats_orig[(1, 0)] = [dti_adj.transpose(copy=True)]
adj_mats_orig[(1, 1)] = ddi_adj_list + [adj.transpose(copy=True) for adj in ddi_adj_list]

# Degree entries per node type, repeated once for the transposed relations
degrees = {}
degrees[0] = [ppi_degrees, ppi_degrees]
degrees[1] = ddi_degrees_list + ddi_degrees_list

# Shape of every adjacency matrix, grouped by edge type
edge_type2dim = {
    edge: [matrix.shape for matrix in matrices]
    for edge, matrices in adj_mats_orig.items()
}

# Decoder name per edge type
edge_type2decoder = {edge: 'bilinear' for edge in [(0, 0), (0, 1), (1, 0)]}
edge_type2decoder[(1, 1)] = 'dedicom'

# Number of adjacency matrices per edge type, and their total
edge_types = {edge: len(matrices) for edge, matrices in adj_mats_orig.items()}
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

# Per-node-type feature information (0 = gene, 1 = drug)
num_feat = dict(zip((0, 1), (gene_num_feat, drug_num_feat)))
nonzero_feat = dict(zip((0, 1), (gene_nonzero_feat, drug_nonzero_feat)))
feat = dict(zip((0, 1), (gene_feat, drug_feat)))

Edge types: 16


In [77]:
# Output path of the pickled Decagon data structures (opened for writing below).
# Earlier toy-test target, kept for reference:
#filename = './data_structures/DECAGON_toy_test'
filename = './data/data_structures/DECAGON_real_reduced_DSE_NPF_BDM'

In [78]:
# Bundle everything that is written to the output pickle.
data_structures = {}
# Graph data structures
data_structures['adj_mats_orig'] = adj_mats_orig
data_structures['degrees'] = degrees
data_structures['edge_type2dim'] = edge_type2dim
data_structures['edge_type2decoder'] = edge_type2decoder
data_structures['edge_types'] = edge_types
data_structures['num_edge_types'] = num_edge_types
# Feature data structures
data_structures['num_feat'] = num_feat
data_structures['nonzero_feat'] = nonzero_feat
data_structures['feat'] = feat
# Dictionaries
# Store the index maps of the REDUCED dataset (the globals loaded from the reduced
# pickle). `genes`, `drugs`, `se_mono` and `se_combo` map names to FULL-dataset
# indices (e.g. max(genes.values()) is 16270 for 16269 genes) and are only meant
# for selecting BDM rows; saving them here would pair out-of-range indices with
# the reduced adjacency and feature matrices stored above.
data_structures['gene2idx'] = gene2idx
data_structures['drug2idx'] = drug2idx
data_structures['se_mono_name2idx'] = se_mono_name2idx
data_structures['se_combo_name2idx'] = se_combo_name2idx

In [79]:
with open(filename, 'wb') as f:
    # Serialize the bundled `data_structures` dictionary with pickle protocol 2.
    # NOTE(review): protocol 2 is NOT the highest available protocol; presumably it
    # was chosen so older Python versions can read the file — confirm before changing.
    pickle.dump(data_structures, f, protocol=2)

## Visualization of database for ppt

In [None]:
# Raw source tables. All files are comma-separated (the read_csv default)
# except proteins.csv, which uses ';'.
PPI = pd.read_csv('data/original_data/bio-decagon-ppi.csv')
PF = pd.read_csv('data/original_data/proteins.csv', sep=';')
DTI = pd.read_csv('data/original_data/bio-decagon-targets-all.csv')
DDI = pd.read_csv('data/original_data/bio-decagon-combo.csv')
DSE = pd.read_csv('data/original_data/bio-decagon-mono.csv')

In [None]:
# Preview the first rows of the table read from bio-decagon-mono.csv
DSE.head()

## Modifying toy data to add drug features (fast)

In [None]:
# Load the toy dataset and publish every stored entry as a notebook-level variable
filename = './data/data_structures/DS_toy_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

In [None]:
# Dataset sizes: rows/columns of the drug-target matrix and number of DDI degree entries
n_genes, n_drugs = dti_adj.shape
n_se = len(ddi_degrees_list)
print(n_genes,n_drugs,n_se)

In [None]:
# Random binary drug features, one row per drug. The row count comes from `n_drugs`
# (measured on `dti_adj`) instead of a hard-coded 637, so the feature matrix always
# matches the other matrices saved with it and the `_drugs<n>` part of the filename.
# 9702 columns: presumably the number of mono side effects in the full dataset — confirm.
drug_feat = sp.csr_matrix(np.random.randint(0,2,(n_drugs,9702)))

In [None]:
# Output name built from the measured sizes. The side-effect count now uses `n_se`
# (computed from `ddi_degrees_list`) instead of a hard-coded 964, consistent with the
# filename built in the "SAVING DATA STRUCTURES" cell.
filename = ('./data/data_structures/DS_toy_DSE_genes' + str(n_genes)
            + '_drugs' + str(n_drugs) + '_se' + str(n_se))
data = {
    # DDI
    'ddi_adj_list': ddi_adj_list,
    'ddi_degrees_list': ddi_degrees_list,
    # DTI
    'dti_adj': dti_adj,
    # PPI
    'ppi_adj': ppi_adj,
    'ppi_degrees': ppi_degrees,
    # DSE
    'drug_feat': drug_feat,
    # PF
    'prot_feat': prot_feat,
}

In [None]:
# Write the `data` dictionary to `filename` using pickle protocol 3
with open(filename, 'wb') as out_file:
    pickle.dump(data, out_file, 3)

## Modifying data to remove features

In [None]:
# Reload the full dataset and publish every stored entry as a notebook-level variable
filename = './data/data_structures/DS_real_DSE_NPF_genes16271_drugs639_se964'
with open(filename, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    # Each pickled entry becomes a global named after its dictionary key
    globals()[key] = value
    print(key,"Imported successfully")

In [None]:
# Dataset sizes: rows/columns of the drug-target matrix and number of DDI degree entries
n_genes, n_drugs = dti_adj.shape
n_se = len(ddi_degrees_list)
print(n_genes,n_drugs,n_se)

In [None]:
# Replace both DSE and PF with random square binary matrices.
# Drug features are drawn first so the random stream is consumed in the same order.
drug_feat = sp.csr_matrix(np.random.randint(2, size=(n_drugs, n_drugs)))
prot_feat = sp.csr_matrix(np.random.randint(2, size=(n_genes, n_genes)))

In [None]:
# Replace only PF with a random square binary matrix (DSE is left untouched)
prot_feat = sp.csr_matrix(np.random.randint(2, size=(n_genes, n_genes)))

In [None]:
# SAVING DATA STRUCTURES
# The output name encodes the measured numbers of genes, drugs and side effects.
filename = ''.join([
    './data/data_structures/DS_real_DSE_genes', str(n_genes),
    '_drugs', str(n_drugs),
    '_se', str(n_se),
])
data = {
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
    # DDI
    'ddi_adj_list': ddi_adj_list,
    'ddi_degrees_list': ddi_degrees_list,
    # DTI
    'dti_adj': dti_adj,
    # PPI
    'ppi_adj': ppi_adj,
    'ppi_degrees': ppi_degrees,
    # DSE
    'drug_feat': drug_feat,
    # PF
    'prot_feat': prot_feat,
    # Normalized PF is intentionally left out:
    #'norm_prot_feat': norm_prot_feat,
}
with open(filename, 'wb') as out_file:
    pickle.dump(data, out_file, 3)