# bin-BDM
This notebook transforms the algorithmic complexity feature vectors (BDM) of the different adjacency matrices involved in DECAGON into sparse feature vectors. This is done by replacing the most positive values with $1$, the most negative values with $-1$, and the values in between with zeros. The thresholds are set one standard deviation above and below the mean of each vector.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
from itertools import product

In [2]:
# Input pickle files, one per network (PPI, DTI, DDI), read by the loading cells below.
# The paths are relative, so they resolve against the notebook's working directory.
# Each output path is later derived from these by replacing '_BDM_' with '_BINBDM_'.
ppi_in_file = 'data_structures/BDM/PPI_BDM_real_genes_19081'
dti_in_file = 'data_structures/BDM/DTI_BDM_real_genes_19081_drugs_639_juadia16'
ddi_in_file = 'data_structures/BDM/DDI_BDM_real_se_964_drugs_639_juadia72'

# PPI

In [3]:
# Load the pickled PPI dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(ppi_in_file, 'rb') as in_fh:
    DS = pickle.load(in_fh)

# Publish every stored entry as a notebook-level variable named after its key;
# the cells below refer to these names directly.
for key, stored in DS.items():
    globals()[key] = stored
    print(key,"Imported successfully")

nodebdm_ppi Imported successfully
add_edgebdm_ppi Imported successfully
rem_edgebdm_ppi Imported successfully
vms_ppi Imported successfully
rss_ppi Imported successfully
time_ppi Imported successfully
jobs_ppi Imported successfully


In [4]:
# Mean and standard deviation of the PPI node and remove-edge BDM vectors
m_nodes, s_nodes = np.mean(nodebdm_ppi), np.std(nodebdm_ppi)
m_rem, s_rem = np.mean(rem_edgebdm_ppi), np.std(rem_edgebdm_ppi)

# Report the statistics that define the thresholds of the next cell
print('Mean of node BDM is ', m_nodes, ' and std is ', s_nodes)
print('Mean of remove edges BDM is ', m_rem, ' and std is ', s_rem)

Mean of node BDM is  43.83172647140848  and std is  1105.180424359469
Mean of remove edges BDM is  0.32772930517954074  and std is  1.6333241564528327


In [5]:
# Up and down thresholds
d_thr_nodes = m_nodes-s_nodes
u_thr_nodes = m_nodes+s_nodes
d_thr_rem = m_rem-s_rem
u_thr_rem = m_rem+s_rem

In [6]:
# Node complexity sorting
neg_nodes = nodebdm_ppi<d_thr_nodes
pos_nodes = nodebdm_ppi>u_thr_nodes
bin_nodebdm_ppi = neg_nodes.astype(int)*-1+pos_nodes.astype(int)

In [7]:
# Edge complexity sorting
neg_rem = rem_edgebdm_ppi<d_thr_rem
pos_rem = rem_edgebdm_ppi>u_thr_rem
bin_rembdm_ppi = neg_rem.astype(int)*-1+pos_rem.astype(int)

In [8]:
# Filling proportion of vectors
sp_n = np.count_nonzero(bin_nodebdm_ppi)/len(nodebdm_ppi)
sp_r = np.count_nonzero(bin_rembdm_ppi)/len(rem_edgebdm_ppi)
print('The node feature vector is filled in a ',sp_n*100,'%')
print('The remove edge feature vector is filled in a ',sp_r*100,'%')

The node feature vector is filled in a  17.996960327026883 %
The remove edge feature vector is filled in a  7.756406896913159 %


In [9]:
# Dictionary written to the PPI output file: the two binarized vectors stored
# under the original BDM key names, plus the vms/rss/time/jobs entries loaded
# from the input file, passed through unchanged.
output_data = {
    'nodebdm_ppi': bin_nodebdm_ppi,
    'rem_edgebdm_ppi': bin_rembdm_ppi,
    'vms_ppi': vms_ppi,
    'rss_ppi': rss_ppi,
    'time_ppi': time_ppi,
    'jobs_ppi': jobs_ppi,
}
# Compatibility with previous versions: forward 'partition_type' only if it was loaded
if 'partition_type' in locals():
    output_data['partition_type'] = partition_type

In [10]:
# Output path: the input path with its '_BDM_' tag replaced by '_BINBDM_'
words = ppi_in_file.split('_BDM_')
ppi_out_file = ''.join([words[0], '_BINBDM_', words[1]])
print(ppi_out_file)

data_structures/BDM/PPI_BINBDM_real_genes_19081_juadia64


In [11]:
# Serialize the binarized PPI data (pickle protocol 3) to the derived output path
with open(ppi_out_file, 'wb') as out_fh:
    pickle.dump(output_data, out_fh, protocol=3)

# DTI

In [12]:
# Load the pickled DTI dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(dti_in_file, 'rb') as in_fh:
    DS = pickle.load(in_fh)

# Publish every stored entry as a notebook-level variable named after its key;
# the cells below refer to these names directly.
for key, stored in DS.items():
    globals()[key] = stored
    print(key,"Imported successfully")

nodebdm_drugs_dti Imported successfully
nodebdm_genes_dti Imported successfully
add_edgebdm_drugs_dti Imported successfully
add_edgebdm_genes_dti Imported successfully
rem_edgebdm_drugs_dti Imported successfully
rem_edgebdm_genes_dti Imported successfully
vms_dti Imported successfully
rss_dti Imported successfully
time_dti Imported successfully
jobs_dti Imported successfully


In [13]:
# Mean and Standard deviation
m_nodes_drugs = np.mean(nodebdm_drugs_dti)
s_nodes_drugs = np.std(nodebdm_drugs_dti)
m_nodes_genes = np.mean(nodebdm_genes_dti)
s_nodes_genes = np.std(nodebdm_genes_dti)
m_rem_drugs = np.mean(rem_edgebdm_drugs_dti)
s_rem_drugs = np.std(rem_edgebdm_drugs_dti)
m_rem_genes = np.mean(rem_edgebdm_genes_dti)
s_rem_genes = np.std(rem_edgebdm_genes_dti)
print('Mean of drug node BDM is ',m_nodes_drugs,' and std is ', s_nodes_drugs)
print('Mean of gene node BDM is ',m_nodes_genes,' and std is ', s_nodes_genes)
print('Mean of remove drug edges BDM is ',m_rem_drugs,' and std is ', s_rem_drugs)
print('Mean of remove gene edges BDM is ',m_rem_genes,' and std is ', s_rem_genes)

Mean of drug node BDM is  -569.4602678542178  and std is  379.02318825640225
Mean of gene node BDM is  -247.65085236073713  and std is  181.51448562851988
Mean of remove drug edges BDM is  -0.09328876250167308  and std is  3.8496203935935607
Mean of remove gene edges BDM is  0.004674763263332341  and std is  0.941720947409857


In [14]:
# Up and down thresholds
d_thr_nodes_drugs = m_nodes_drugs-s_nodes_drugs
u_thr_nodes_drugs = m_nodes_drugs+s_nodes_drugs
d_thr_nodes_genes = m_nodes_genes-s_nodes_genes
u_thr_nodes_genes = m_nodes_genes+s_nodes_genes
d_thr_rem_drugs = m_rem_drugs-s_rem_drugs
u_thr_rem_drugs = m_rem_drugs+s_rem_drugs
d_thr_rem_genes = m_rem_genes-s_rem_genes
u_thr_rem_genes = m_rem_genes+s_rem_genes

In [15]:
# Node complexity sorting
neg_nodes_drugs = nodebdm_drugs_dti<d_thr_nodes_drugs
pos_nodes_drugs = nodebdm_drugs_dti>u_thr_nodes_drugs
bin_nodebdm_drugs_dti = neg_nodes_drugs.astype(int)*-1+pos_nodes_drugs.astype(int)
neg_nodes_genes = nodebdm_genes_dti<d_thr_nodes_genes
pos_nodes_genes = nodebdm_genes_dti>u_thr_nodes_genes
bin_nodebdm_genes_dti = neg_nodes_genes.astype(int)*-1+pos_nodes_genes.astype(int)

In [16]:
# Edge complexity sorting
neg_rem_drugs = rem_edgebdm_drugs_dti<d_thr_rem_drugs
pos_rem_drugs = rem_edgebdm_drugs_dti>u_thr_rem_drugs
bin_rembdm_drugs_dti = neg_rem_drugs.astype(int)*-1+pos_rem_drugs.astype(int)
neg_rem_genes = rem_edgebdm_genes_dti<d_thr_rem_genes
pos_rem_genes = rem_edgebdm_genes_dti>u_thr_rem_genes
bin_rembdm_genes_dti = neg_rem_genes.astype(int)*-1+pos_rem_genes.astype(int)

In [17]:
# Sparsity of vectors
sp_n_drugs = np.count_nonzero(bin_nodebdm_drugs_dti)/len(nodebdm_drugs_dti)
sp_r_drugs = np.count_nonzero(bin_rembdm_drugs_dti)/len(rem_edgebdm_drugs_dti)
print('The drug node feature vector is filled in a ',sp_n_drugs*100,'%')
print('The remove drug edge feature vector is filled in a ',sp_r_drugs*100,'%')
sp_n_genes = np.count_nonzero(bin_nodebdm_genes_dti)/len(nodebdm_genes_dti)
sp_r_genes = np.count_nonzero(bin_rembdm_genes_dti)/len(rem_edgebdm_genes_dti)
print('The gene node feature vector is filled in a ',sp_n_genes*100,'%')
print('The remove drug gene feature vector is filled in a ',sp_r_genes*100,'%')

The drug node feature vector is filled in a  53.20813771517997 %
The remove drug edge feature vector is filled in a  7.668231611893583 %
The gene node feature vector is filled in a  46.868612756144856 %
The remove drug gene feature vector is filled in a  0.7599182432786541 %


In [18]:
# Dictionary written to the DTI output file: the four binarized vectors stored
# under the original BDM key names, plus the vms/rss/time/jobs entries loaded
# from the input file, passed through unchanged.
# Every pass-through key must store the identically named loaded variable.
output_data = {}
output_data['nodebdm_drugs_dti'] = bin_nodebdm_drugs_dti
output_data['nodebdm_genes_dti'] = bin_nodebdm_genes_dti
output_data['rem_edgebdm_drugs_dti'] = bin_rembdm_drugs_dti
output_data['rem_edgebdm_genes_dti'] = bin_rembdm_genes_dti
output_data['vms_dti'] = vms_dti
output_data['rss_dti'] = rss_dti
output_data['time_dti'] = time_dti
output_data['jobs_dti'] = jobs_dti
# Compatibility with previous versions: forward 'partition_type' only if it was loaded
if 'partition_type' in locals():
    output_data['partition_type'] = partition_type

In [19]:
# Output path: the input path with its '_BDM_' tag replaced by '_BINBDM_'
words = dti_in_file.split('_BDM_')
dti_out_file = ''.join([words[0], '_BINBDM_', words[1]])
print(dti_out_file)

data_structures/BDM/DTI_BINBDM_real_genes_19081_drugs_639_juadia16


In [20]:
# Serialize the binarized DTI data (pickle protocol 3) to the derived output path
with open(dti_out_file, 'wb') as out_fh:
    pickle.dump(output_data, out_fh, protocol=3)

# DDI

In [21]:
# Load the pickled DDI dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(ddi_in_file, 'rb') as in_fh:
    DS = pickle.load(in_fh)

# Publish every stored entry as a notebook-level variable named after its key;
# the cells below refer to these names directly.
for key, stored in DS.items():
    globals()[key] = stored
    print(key,"Imported successfully")

nodebdm_ddi_list Imported successfully
add_edgebdm_ddi_list Imported successfully
rem_edgebdm_ddi_list Imported successfully
vms_ddi Imported successfully
rss_ddi Imported successfully
time_ddi Imported successfully
jobs_ddi Imported successfully


In [22]:
# Means and std
n_se = len(nodebdm_ddi_list)
m_nodes = np.mean(nodebdm_ddi_list,axis=1)
m_rem = np.mean(rem_edgebdm_ddi_list,axis=1)
s_nodes = np.std(nodebdm_ddi_list,axis=1)
s_rem = np.std(rem_edgebdm_ddi_list,axis=1)

In [23]:
# Up & down thresholds
d_thr_nodes = m_nodes-s_nodes
u_thr_nodes = m_nodes+s_nodes
d_thr_rem = m_rem-s_rem
u_thr_rem = m_rem+s_rem
bin_nodebdm_ddi_list = []
bin_rem_edgebdm_ddi_list = []

In [24]:
# Complexity sorting
for i in range(n_se):
    neg_nodes = nodebdm_ddi_list[i]<d_thr_nodes[i]
    pos_nodes = nodebdm_ddi_list[i]>u_thr_nodes[i]
    bin_nodebdm_ddi_list.append(neg_nodes.astype(int)*-1+pos_nodes.astype(int))
    neg_rem = rem_edgebdm_ddi_list[i]<d_thr_rem[i]
    pos_rem = rem_edgebdm_ddi_list[i]>u_thr_rem[i]
    bin_rem_edgebdm_ddi_list.append(neg_rem.astype(int)*-1+pos_rem.astype(int))

In [25]:
# Average filled fraction across the DDI vectors: the non-zero count of each
# binarized vector is scaled by 1/length (length taken from the first vector),
# then averaged over all vectors.
L_inv = 1 / len(nodebdm_ddi_list[0])
nm = np.mean(np.count_nonzero(bin_nodebdm_ddi_list, axis=1) * L_inv)
rm = np.mean(np.count_nonzero(bin_rem_edgebdm_ddi_list, axis=1) * L_inv)

In [26]:
# Total number of entries over all binarized node vectors
# (not used by the prints in this cell)
norm = n_se * len(bin_nodebdm_ddi_list[0])

# Average filled fractions, shown as percentages
print('The node feature vectors are filled in average a ', 100 * nm, '%')
print('The remove edge feature vectors are filled in average a ', 100 * rm, '%')

The node feature vectors are filled in average a  28.587036279456363 %
The remove edge feature vectors are filled in average a  10.750719160514029 %


In [27]:
# Dictionary written to the DDI output file: the two lists of binarized vectors
# stored under the original BDM key names, plus the vms/rss/time/jobs entries
# loaded from the input file, passed through unchanged.
output_data = {
    'nodebdm_ddi_list': bin_nodebdm_ddi_list,
    'rem_edgebdm_ddi_list': bin_rem_edgebdm_ddi_list,
    'vms_ddi': vms_ddi,
    'rss_ddi': rss_ddi,
    'time_ddi': time_ddi,
    'jobs_ddi': jobs_ddi,
}
# Compatibility with previous versions: forward 'partition_type' only if it was loaded
if 'partition_type' in locals():
    output_data['partition_type'] = partition_type

In [28]:
# Output path: the input path with its '_BDM_' tag replaced by '_BINBDM_'
words = ddi_in_file.split('_BDM_')
ddi_out_file = ''.join([words[0], '_BINBDM_', words[1]])
print(ddi_out_file)

data_structures/BDM/DDI_BINBDM_real_se_964_drugs_639_juadia72


In [29]:
# Serialize the binarized DDI data (pickle protocol 3) to the derived output path
with open(ddi_out_file, 'wb') as out_fh:
    pickle.dump(output_data, out_fh, protocol=3)