# Chop network with BDM
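Loads the data structures (DS) of a network and computes the BDM (Block Decomposition Method) complexity of the edges of the PPI adjacency matrix. A given fraction of the edges (those with the lowest complexity) is then discarded, genes and drugs that become disconnected are removed, and the DS file is updated with the new PPI matrix.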

In [1]:
import numpy as np
import scipy.sparse as sp
import pickle
from pybdm import BDM
from pybdm.utils import decompose_dataset
from pybdm.partitions import PartitionIgnore
from pybdm.partitions import PartitionRecursive
from algorithms import PerturbationExperiment, NodePerturbationExperiment

In [2]:
# Pickle file holding the original data structures
in_file = 'data_structures/DS/DS_toy_DSE_600_genes_500_drugs_400_se_4'
# Dataset label used when naming the output file. It is taken from the input
# file name, which follows the pattern 'DS_<sim_type>_DSE_...', so that the
# notebook does not depend on a variable left over from a previous session.
sim_type = in_file.split('/')[-1].split('_DSE_')[0][len('DS_'):]
# Fraction of edges to be discarded
cut_frac = 0.25

In [3]:
# Helper for the sparse matrices of DECAGON (only needed by option 2)
# TODO: consider importing it from a shared module instead of redefining it here
def sparse_to_tuple(sparse_mx):
    """Split a scipy sparse matrix into its COO components.

    Parameters
    ----------
    sparse_mx : scipy sparse matrix
        Converted to COO format first when it is stored in another format.

    Returns
    -------
    coords : ndarray
        One ``(row, col)`` pair per stored entry.
    values : ndarray
        The stored values, in the same order as ``coords``.
    shape : tuple
        Shape of the matrix.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    # Stack the row and column indices and flip them into (n_entries, 2)
    coords = np.vstack((coo.row, coo.col)).T
    return coords, coo.data, coo.shape

In [4]:
# Load the original data structures and expose every entry of the pickled
# dictionary as a notebook-level variable named after its key.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(in_file, 'rb') as f:
    DS = pickle.load(f)
for key, value in DS.items():
    globals()[key] = value
    print(key, "Imported successfully")
# Sizes before chopping, kept for the before/after comparison further down
old_genes, old_drugs = len(gene2idx), len(drug2idx)
old_se_combo, old_se_mono = len(se_combo_name2idx), len(se_mono_name2idx)

gene2idx Imported successfully
drug2idx Imported successfully
se_mono_name2idx Imported successfully
se_combo_name2idx Imported successfully
ddi_adj_list Imported successfully
ddi_degrees_list Imported successfully
dti_adj Imported successfully
ppi_adj Imported successfully
ppi_degrees Imported successfully
drug_feat Imported successfully
prot_feat Imported successfully


In [5]:
# Dense copy of the PPI adjacency matrix (high memory requirement for big matrices)
ppi_mat = ppi_adj.todense()
# Run the BDM perturbation experiment on the PPI network
# (presumably one complexity value per matrix entry, since the result is
# reshaped to the adjacency shape below - confirm against `algorithms`)
bdm = BDM(ndim=2, partition=PartitionRecursive)
ppi_per = PerturbationExperiment(bdm, bipartite_network=False, metric='bdm')
ppi_per.set_data(np.array(ppi_mat))
edge_complexity = ppi_per.run()
# Lay the result out with the same shape as the adjacency matrix
complexity_mat = edge_complexity.reshape(ppi_adj.shape)

In [6]:
# OPTION 1: USE ELEMENTWISE MULT TO FORM DENSE MATRIX
if not 0 <= cut_frac <= 1:
    # A value outside [0, 1] would silently wrap around through negative indexing
    raise ValueError('cut_frac must be in [0, 1], got {}'.format(cut_frac))
# Small offset added to the complexities so that an existing edge whose
# complexity is exactly 0 is not wiped out by the product below (this keeps
# the number of nonzero entries equal to the number of edges)
eps = 0.0001
# Elementwise multiplication: complexity of existing edges, 0 everywhere else
true_cmplx = np.multiply(ppi_mat,complexity_mat+eps)
# Absolute complexities of the existing edges as a flat 1-D array
# (ravel instead of squeeze so that a single edge still gives a 1-D array)
cmplx = np.abs(np.asarray(true_cmplx[true_cmplx != 0])).ravel()
# Sort from largest to smallest
sorted_cmplx = np.sort(cmplx)[::-1]
# Get the cutting threshold based on the cutting fraction of data
l = len(sorted_cmplx)
cut_idx = int(np.floor(l*(1-cut_frac)))
if cut_idx >= l:
    # Nothing to discard (cut_frac == 0 or no edges at all): every nonzero
    # entry must survive, and indexing sorted_cmplx[l] would raise IndexError
    threshold = 0.0
else:
    threshold = sorted_cmplx[cut_idx]
# Choose the entries that exceed the threshold, discard the rest.
# The comparison is strict, so every entry tied with the threshold is dropped
# as well; with many ties fewer than (1 - cut_frac) of the edges remain.
new_ppi_adj = (np.abs(true_cmplx)>threshold).astype(int)
print('Nonzero entries before',np.count_nonzero(true_cmplx))
print('Nonzero entries after',np.count_nonzero(new_ppi_adj))
print('Is it symmetric?',np.array_equal(new_ppi_adj,new_ppi_adj.T))

Nonzero entries before 615
Nonzero entries after 360
Is it symmetric? True


In [None]:
# OPTION 2 (EXPERIMENTAL, NOT USED DOWNSTREAM): USE INDICES TO FORM SPARSE MATRIX
# Known limitation: every stored entry is ranked on its own, so (i, j) may be
# kept while (j, i) is discarded and THE MATRIX IS NOT NECESSARILY SYMMETRIC.
# The results are stored under option-2 specific names so that running the
# whole notebook does not overwrite the dense `new_ppi_adj` built by option 1,
# which is the matrix the following cells work on.
# Get coordinates and complexities of positive edges
coords,_,_ = sparse_to_tuple(ppi_adj)
n_edges = np.shape(coords)[0]
edge_cmplx = np.abs(np.asarray(complexity_mat[coords[:,0],coords[:,1]])).ravel()
# Rank the edges from the highest to the lowest complexity
ranking = edge_cmplx.argsort()[::-1]
# Discard the lowest complexity edges
n_keep = int(np.floor(n_edges*(1-cut_frac)))
indices = ranking[:n_keep]
new_coords = coords[indices,:]
new_l = np.shape(new_coords)[0]
# New adjacency matrix (sparse)
new_ppi_adj_sparse = sp.csr_matrix((np.ones(new_l), (new_coords[:,0], new_coords[:,1])),
                                   shape=np.shape(ppi_adj))
# Densify once for the two checks below
new_ppi_adj_sparse_dense = new_ppi_adj_sparse.todense()
print(np.array_equal(new_ppi_adj_sparse_dense,new_ppi_adj_sparse_dense.T))
print(np.count_nonzero(new_ppi_adj_sparse_dense))

### Remove genes and drugs that may have become disconnected

In [7]:
def _reindex(name2idx, removed):
    """Drop the entries whose index is in `removed` and renumber the rest.

    The surviving names are numbered 0..n-1 following their previous index
    order, which is the order in which np.delete leaves the matrix rows and
    columns. Sorting by the old index means the result does not depend on the
    insertion order of the dictionary.
    """
    removed = set(np.asarray(removed).ravel().tolist())
    kept = [name for name, idx in sorted(name2idx.items(), key=lambda item: item[1])
            if idx not in removed]
    return {name: new_idx for new_idx, name in enumerate(kept)}


# This cell rebinds new_ppi_adj and the index dictionaries, so it must start
# from the matrix produced by the chopping cell. Running it a second time would
# mix pruned and unpruned structures, hence the explicit check.
if np.shape(new_ppi_adj) != np.shape(ppi_adj):
    raise RuntimeError('new_ppi_adj has already been pruned; '
                       're-run the notebook from the data loading cell')

# Find rows of zeros (indices)
genes_zero = np.flatnonzero(~np.asarray(new_ppi_adj.any(axis=1)).ravel())
print('Number of zero rows/columns in PPI matrix: ',len(genes_zero))

# Defaults used when nothing has to be pruned, so that every new_* variable
# exists whichever branch is taken. DTI and drug features are made dense to
# match what the pruning branches produce.
new_dti_adj = dti_adj.todense()
new_drug_feat = drug_feat.todense()
new_ddi_adj_list = list(ddi_adj_list)
new_ddi_degrees_list = list(ddi_degrees_list)

# If there are
if len(genes_zero)>0:
    #### PPI ####
    # Delete those rows and columns
    new_ppi_adj = np.delete(np.delete(new_ppi_adj,genes_zero,axis=1),genes_zero,axis=0)
    print('New shape PPI matrix: ',np.shape(new_ppi_adj))
    # Update index dictionary
    gene2idx = _reindex(gene2idx, genes_zero)
    #### DTI ####
    # Deletes the corresponding rows in DTI
    new_dti_adj = np.delete(new_dti_adj,genes_zero,axis=0)
    print('New shape of DTI matrix: ',np.shape(new_dti_adj))
    #### DRUGS ####
    # Finds drugs that became disconnected from network (all-zero DTI columns)
    drugs_zero = np.flatnonzero(~np.asarray(new_dti_adj.any(axis=0)).ravel())
    print('Number of disconnected drugs: ',len(drugs_zero))
    if len(drugs_zero)>0:
        # Remove drugs from DTI matrix
        new_dti_adj = np.delete(new_dti_adj,drugs_zero,axis=1)
        # Remove drugs (rows) from drug feature matrix
        new_drug_feat = np.delete(new_drug_feat,drugs_zero,axis=0)
        # Find drug side effects that have no drug: these are the all-zero
        # COLUMNS of the drug feature matrix, hence the reduction over axis 0
        mono_zero = np.flatnonzero(~np.asarray(new_drug_feat.any(axis=0)).ravel())
        print('Number of side effects without drug: ',len(mono_zero))
        if len(mono_zero)>0:
            # Remove them from drug feature matrix
            new_drug_feat = np.delete(new_drug_feat,mono_zero,axis=1)
            # Update index dictionary
            se_mono_name2idx = _reindex(se_mono_name2idx, mono_zero)
        #### DDI ####
        # Remove drugs from adjacency matrices
        new_ddi_degrees_list = []
        new_ddi_adj_list = []
        for ddi in ddi_adj_list:
            # Remove drugs from DDI matrices
            ddi_mat = np.delete(np.delete(ddi.todense(),drugs_zero,axis=0),
                                drugs_zero,axis=1)
            new_ddi_adj_list.append(sp.csr_matrix(ddi_mat))
            # Update degree list
            new_ddi_degrees_list.append(np.array(ddi_mat.sum(axis=0)).squeeze())
        # Update index dictionary
        drug2idx = _reindex(drug2idx, drugs_zero)
        print('New size of DDI matrices: ',np.shape(new_ddi_adj_list[0]))
else:
    print('No further modifications to the matrices are needed')

# Update degree list. Done in every case: edges were removed from the PPI
# matrix, so the degrees change even when no gene ends up disconnected.
new_ppi_degrees = np.array(new_ppi_adj.sum(axis=0).astype(int)).squeeze()

Number of zero rows/columns in ppi 253
New shape ppi (247, 247)
New shape of DTI (247, 400)
Number of disconnected drugs 255
Number of side effects without drug 0


In [8]:
# Sizes of the index dictionaries as they stand at this point of the notebook
n_genes, n_drugs = len(gene2idx), len(drug2idx)
n_se_combo, n_se_mono = len(se_combo_name2idx), len(se_mono_name2idx)
# Before/after report, one line per quantity
size_report = (
    ('Previous number of genes: ', old_genes),
    ('New number of genes: ', n_genes),
    ('Previous number of drugs: ', old_drugs),
    ('New number of drugs: ', n_drugs),
    ('Previous number of joint side effects: ', old_se_combo),
    ('New number of joint side effects: ', n_se_combo),
    ('Previous number of single side effects: ', old_se_mono),
    ('New number of single sige effects: ', n_se_mono),
)
for label, count in size_report:
    print(label, count)

247 145 4 600


In [9]:
# Collect the updated structures under the same keys used by the input file,
# plus the edge complexities computed in this notebook.
# NOTE(review): `prot_feat` is loaded from the input file but is not carried
# over here - confirm whether it has to be pruned and stored as well.
data = {
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
    # DDI
    'ddi_adj_list': new_ddi_adj_list,
    'ddi_degrees_list': new_ddi_degrees_list,
    # DTI
    'dti_adj': new_dti_adj,
    # PPI
    'ppi_adj': new_ppi_adj,
    'ppi_degrees': new_ppi_degrees,
    # DSE
    'drug_feat': new_drug_feat,
    # BDM
    'ppi_edge_bdm': edge_complexity,
}

In [10]:
# SAVING
# Output path encoding the dataset label, the cut fraction and the new sizes.
# Expects `sim_type`, `cut_frac` and the n_* counters to exist in the kernel.
name_parts = [
    'data_structures/CHOP/DS_', sim_type,
    '_cutfrac_', str(cut_frac),
    '_DSE_', str(n_se_mono),
    '_genes_', str(n_genes),
    '_drugs_', str(n_drugs),
    '_se_', str(n_se_combo),
]
out_file = ''.join(name_parts)
print(out_file)

data_structures/CHOP/DS_toy_cutfrac_0.25_DSE_600_genes_247_drugs_145_se_4


In [11]:
# Write the whole dictionary of updated data structures, not only the PPI
# matrix, so that the output can be used as a DS file like the input one.
with open(out_file,'wb') as f:
    pickle.dump(data, f)