# Chop network with BDM
Takes the data structures of a network and calculates the BDM (algorithmic complexity) of the PPI matrix. Discards a given fraction of the PPI edges (those with the lowest absolute complexity value) and saves a new DS file with the chopped PPI matrix.<br>
**Assumptions:**<br>
+ The PPI adjacency matrix is always symmetric and has no nonzero elements on its diagonal (no loops or self-edges)
+ The algorithmic complexity matrix is symmetric

In [1]:
import numpy as np
import scipy.sparse as sp
import pickle
from pybdm import BDM
from pybdm.utils import decompose_dataset
from pybdm.partitions import PartitionIgnore
from pybdm.partitions import PartitionRecursive
from algorithms import PerturbationExperiment, NodePerturbationExperiment

In [2]:
# Pickled data structures of the network to chop
in_file = 'data_structures/DS/DS_toy_DSE_600_genes_500_drugs_400_se_4'
# The simulation type is the third underscore-separated token of the path
# ('toy' here). The directory part of the path contributes one underscore
# ('data_structures'), so fail loudly when the path is too short to parse.
words = in_file.split('_')
if len(words) < 3:
    raise ValueError('Cannot parse the simulation type from in_file: ' + in_file)
sim_type = words[2]
# Fraction of edges to be discarded
cut_frac = 0.25
# Values outside [0, 1] would give a negative or oversized edge threshold
if not 0 <= cut_frac <= 1:
    raise ValueError('cut_frac must lie in [0, 1], got ' + str(cut_frac))

In [3]:
# Helper for the sparse matrices of DECAGON (only for option 2)
# TODO: consider importing it from a shared module instead of defining it here
def sparse_to_tuple(sparse_mx):
    """Break a SciPy sparse matrix into its COO components.

    Parameters
    ----------
    sparse_mx : scipy.sparse matrix
        Matrix in any sparse format; it is converted to COO when needed.

    Returns
    -------
    coords : numpy.ndarray of shape (nnz, 2)
        ``(row, col)`` index of every stored entry.
    values : numpy.ndarray
        Stored values, in the same order as ``coords``.
    shape : tuple
        Shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack rows over cols (2, nnz) and flip to one (row, col) pair per line
    coords = np.vstack([coo.row, coo.col]).T
    return coords, coo.data, coo.shape

In [41]:
# Load the original data structures and expose every entry of the pickled
# dictionary as a notebook-level variable named after its key.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open(in_file, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    globals()[key] = value
    print(key,"Imported successfully")
# Sizes of the index dictionaries before the network is chopped
old_genes, old_drugs = len(gene2idx), len(drug2idx)
old_se_combo, old_se_mono = len(se_combo_name2idx), len(se_mono_name2idx)

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [5]:
# Dense version of the PPI adjacency matrix, used as input of the complexity
# computation. todense() materialises every entry of the matrix, zeros included.
ppi_mat = ppi_adj.todense() # High memory requirement for big matrices

In [6]:
# Calculate algorithmic complexity: run a BDM perturbation experiment
# (2-dimensional BDM, recursive partition) on the dense PPI matrix
bdm = BDM(ndim=2, partition=PartitionRecursive)
ppi_per = PerturbationExperiment(bdm, metric='bdm', bipartite_network=False)
# np.array hands the experiment its own ndarray copy of the dense matrix
ppi_per.set_data(np.array(ppi_mat))
edge_complexity = ppi_per.run()
# Give the result the shape of the adjacency matrix, so that entry (i, j)
# can be looked up with the coordinates of the corresponding PPI entry
complexity_mat = edge_complexity.reshape(ppi_adj.shape)
print('Is it symmetric?', np.array_equal(complexity_mat, complexity_mat.T))

Is it symmetric? True


In [None]:
# Save the algorithmic complexity (raw output of the perturbation experiment);
# the file name carries the simulation type and the original number of genes
out_file_bdm = 'data_structures/BDM/EDGES_PPI_{}_genes_{}'.format(sim_type, old_genes)
print(out_file_bdm)
with open(out_file_bdm, 'wb') as f:
    pickle.dump(edge_complexity, f)

In [66]:
# Coordinates of every stored (nonzero) entry of the PPI adjacency matrix
coords, _, _ = sparse_to_tuple(ppi_adj)
# Take the upper triangular coordinates (col > row): one entry per undirected edge
upper_coords = coords[coords[:, 1] > coords[:, 0]]
# Absolute complexity value of the selected entries. asarray + ravel keep the
# result one-dimensional even when there is a single edge or none at all.
true_cmplx = np.abs(
    np.asarray(complexity_mat)[upper_coords[:, 0], upper_coords[:, 1]]
).ravel()
# Edge indices sorted from greatest to lowest complexity
idx = np.argsort(true_cmplx)[::-1]
# Number of edges to keep according to the cut fraction
threshold = int(np.floor(len(idx) * (1 - cut_frac)))
# Keep the `threshold` edges with the highest complexity
idx = idx[:threshold]
# Generate row and col indices of the full matrix: every kept edge (i, j)
# is written twice, as (i, j) and (j, i), so the result stays symmetric
row_ind = np.concatenate((upper_coords[idx, 0], upper_coords[idx, 1]), axis=0)
col_ind = np.concatenate((upper_coords[idx, 1], upper_coords[idx, 0]), axis=0)
# Form the new adjacency matrix
new_ppi_adj = sp.csr_matrix((np.ones(2 * threshold), (row_ind, col_ind)),
                            shape=ppi_adj.shape, dtype=int)
print('Nonzero entries before',len(coords))
print('Nonzero entries after',new_ppi_adj.count_nonzero())
# Sparse comparison: no entry differs from the transpose <=> symmetric
print('Is it symmetric?',(new_ppi_adj != new_ppi_adj.T).nnz == 0)

Nonzero entries before 615
Nonzero entries after 450
Is it symmetric? True


### Remove genes that may have become disconnected (and their rows in the DTI matrix)

In [67]:
# Densify the chopped PPI matrix. Guarded so that running the cell twice does
# not call todense() on the resulting dense np.matrix, which has no such method.
if sp.issparse(new_ppi_adj):
    new_ppi_adj = new_ppi_adj.todense()

In [68]:
# Drop the genes left without any interaction after the chopping, and keep the
# gene index, the degree list and the DTI matrix aligned with the reduced PPI.
# NOTE: this cell overwrites new_ppi_adj and gene2idx, so it must be run once,
# right after the cell that builds the chopped matrix.
# Accept either the sparse chopped matrix or an already densified one
new_ppi_adj = sp.csr_matrix(new_ppi_adj)
# Find rows of zeros (indices); the matrix is symmetric, so the same indices
# are also the empty columns
has_edge = np.asarray(abs(new_ppi_adj).sum(axis=1)).ravel() > 0
genes_zero = np.flatnonzero(~has_edge)
genes_kept = np.flatnonzero(has_edge)
print('Number of zero rows/columns in PPI matrix: ',len(genes_zero))
# The DTI rows are indexed by gene, like the PPI rows
new_dti_adj = sp.csr_matrix(dti_adj)
if new_dti_adj.shape[0] != new_ppi_adj.shape[0]:
    raise RuntimeError('DTI rows (%d) do not match PPI genes (%d); re-run the '
                       'notebook from the chopping cell'
                       % (new_dti_adj.shape[0], new_ppi_adj.shape[0]))
# If there are
if len(genes_zero)>0:
    #### PPI ####
    # Delete those rows and columns
    new_ppi_adj = new_ppi_adj[genes_kept][:, genes_kept]
    print('New shape PPI matrix: ',np.shape(new_ppi_adj))
    # Update index dictionary: surviving genes are renumbered 0..n-1 following
    # their old index, independently of the insertion order of the dictionary
    removed = set(genes_zero.tolist())
    surviving = sorted((gene for gene, old_idx in gene2idx.items()
                        if old_idx not in removed), key=gene2idx.get)
    gene2idx = {gene: i for i, gene in enumerate(surviving)}
    #### DTI ####
    # Deletes the corresponding rows in DTI
    new_dti_adj = new_dti_adj[genes_kept]
    print('New shape of DTI matrix: ',np.shape(new_dti_adj))
else:
    print('No further modifications to the matrices are needed')
# Update degree list: edges were cut even when no gene became isolated, so the
# degrees are always recomputed from the chopped matrix
new_ppi_degrees = np.array(new_ppi_adj.sum(axis=0).astype(int)).squeeze()

Number of zero rows/columns in PPI matrix:  211
New shape PPI matrix:  (289, 289)
[  1   3   5   6   7  11  12  13  14  15  16  17  19  20  21  22  24  25
  26  27  28  31  32  33  36  37  38  39  41  44  46  48  50  51  52  53
  54  58  59  60  63  64  65  66  68  70  74  75  76  77  79  80  81  82
  83  84  85  86  87  89  91  95  96  97 101 102 104 105 107 108 109 112
 113 116 117 118 120 122 123 124 125 126 130 131 134 136 141 143 144 146
 147 148 149 151 152 153 154 156 157 159 163 165 166 168 169 170 171 172
 173 177 178 181 182 183 184 185 186 187 188 190 191 192 193 197 198 199
 201 202 205 207 210 211 215 217 220 221 222 223 225 226 227 229 231 234
 237 239 240 243 244 246 247 248 249 251 253 254 256 258 259 262 263 264
 268 270 271 272 274 277 278 280 285 286 287 288 289 290 291 293 296 297
 299 301 304 306 307 309 311 313 314 315 321 322 323 326 327 328 329 332
 335 337 341 342 344 345 346 347 349 353 354 355 357 358 360 362 364 366
 368 369 372 373 376 377 378 381 382 384 3

In [None]:
# Make sure the updated matrices are stored as CSR sparse matrices.
# No `new_drug_feat` is built anywhere in this notebook (the saved structure
# reuses `drug_feat` unchanged), so only the PPI and DTI matrices are converted.
new_ppi_adj = sp.csr_matrix(new_ppi_adj)
new_dti_adj = sp.csr_matrix(new_dti_adj)

In [None]:
# Sizes of the index dictionaries after the chopping
n_genes, n_drugs = len(gene2idx), len(drug2idx)
n_se_combo, n_se_mono = len(se_combo_name2idx), len(se_mono_name2idx)
# Report how many genes survived
for label, count in (('Previous number of genes: ', old_genes),
                     ('New number of genes: ', n_genes)):
    print(label, count)

In [None]:
# Collect the data structures to save: unchanged entries are copied from the
# loaded structure, the PPI/DTI entries come from the chopped network.
# NOTE(review): the loaded structure also carried 'prot_feat', which is not
# written back here - confirm whether downstream code needs it (it may have to
# be rebuilt for the new number of genes).
data = {
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
    # DDI
    'ddi_adj_list': ddi_adj_list,
    'ddi_degrees_list': ddi_degrees_list,
    # DTI
    'dti_adj': new_dti_adj,
    # PPI
    'ppi_adj': new_ppi_adj,
    'ppi_degrees': new_ppi_degrees,
    # DSE
    'drug_feat': drug_feat,
}

In [None]:
# SAVING
# The file name records the simulation type, the cut fraction and the sizes of
# the chopped network
out_file = 'data_structures/CHOP/DS_{}_cutfrac_{}_DSE_{}_genes_{}_drugs_{}_se_{}'.format(
    sim_type, cut_frac, n_se_mono, n_genes, n_drugs, n_se_combo)
print(out_file)
with open(out_file, 'wb') as f:
    pickle.dump(data, f)