In [1]:
import numpy as np
import networkx as nx
import scipy.sparse as sp
import pandas as pd
from itertools import combinations, chain
import shelve
from getpass import getuser
from pybdm import BDM
from node import NodePerturbationExperiment

In [2]:
# %pylab inline enables inline figures and populates the interactive namespace
# from numpy and matplotlib (as reported in this cell's output).
# NOTE(review): none of the cells visible in this notebook appear to use those
# injected names or to plot anything; confirm before narrowing to %matplotlib inline.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### PF: protein features

In [3]:
# Read the protein-feature table. Column names are supplied explicitly, so
# pandas treats every line of the CSV as a data row.
PF = pd.read_csv(
    'data/clean_data/genes_mini.csv',
    sep=',',
    names=['GeneID','Length','Mass','n_helices','n_strands','n_turns'],
)

# Distinct gene IDs in order of first appearance, each paired with a
# consecutive integer index.
genes = pd.unique(PF['GeneID'].values)
gene2idx = dict(zip(genes, range(len(genes))))
n_genes = len(gene2idx)
print('Number of genes in network:',n_genes)

# Sparse (COO) copy of the whole table; the GeneID column is included as-is.
prot_feat = sp.coo_matrix(PF.to_numpy())
print('Protein feature matrix calculated')
print('Important structures: prot_feat, gene2idx')

Number of genes in network: 16227
Protein feature matrix calculated
Important structures: prot_feat, gene2idx


### PPI: protein-protein interactions

In [4]:
PPI = pd.read_csv('data/clean_data/ppi_mini.csv',sep=',',names=["Gene_1", "Gene_2"])
# PPI adjacency matrix, assembled directly in sparse form.
# A dense n_genes x n_genes integer array is almost entirely zeros and needs
# gigabytes once n_genes reaches the tens of thousands, so the edge list is
# converted to index arrays and handed straight to scipy.
# Plain dict lookups are used so a gene absent from gene2idx still raises KeyError.
ppi_rows = np.array([gene2idx[g] for g in PPI['Gene_1']], dtype=np.intp)
ppi_cols = np.array([gene2idx[g] for g in PPI['Gene_2']], dtype=np.intp)
# Each edge is inserted in both directions to make the matrix symmetric.
ppi_adj = sp.coo_matrix(
    (
        np.ones(2 * len(ppi_rows), dtype=int),
        (np.concatenate([ppi_rows, ppi_cols]), np.concatenate([ppi_cols, ppi_rows])),
    ),
    shape=(n_genes, n_genes),
).tocsr()
# Repeated edges (and self-loops, which are inserted twice) are summed during
# the conversion; clamp every stored entry back to 1 so the matrix stays binary.
ppi_adj.sum_duplicates()
ppi_adj.data[:] = 1
# Column sums as a flat 1-D integer array (one degree per gene).
ppi_degrees = np.asarray(ppi_adj.sum(axis=0)).ravel()
print('PPI adjacency matrix and degrees calculated')
# NOTE(review): the message below lists bdm_ppi, which this cell does not create.
print('Important structures from PPI: ppi_adj, ppi_degrees,bdm_ppi')

PPI adjacency matrix and degrees calculated
Important structures from PPI: ppi_adj, ppi_degrees,bdm_ppi


### DDI: drug-drug interactions (combination side effects)

In [5]:
DDI = pd.read_csv('data/clean_data/combo_mini.csv', sep=','
                  ,names=["STITCH_1", "STITCH_2", "SE", "SE_name"])
# Every drug that appears on either side of a pair gets a consecutive index.
drugs = pd.unique(np.hstack((DDI['STITCH_1'].values,DDI['STITCH_2'].values)))
drug2idx = {drug: i for i, drug in enumerate(drugs)}
n_drugs = len(drug2idx)
print('Number of drugs in the network',n_drugs)
# Distinct combination side-effect names, indexed in order of first appearance.
se_names = pd.unique(DDI['SE_name'].values)
se_combo_name2idx = {se: i for i, se in enumerate(se_names)}
n_secombo = len(se_combo_name2idx)
print('Number of DDI side effects',n_secombo)
# DDI adjacency matrices: one symmetric binary drug x drug matrix per side
# effect, in the iteration order of se_combo_name2idx.
ddi_adj_list = []
for se_name in se_combo_name2idx:
    m = np.zeros([n_drugs,n_drugs],dtype=int)
    # Select rows by exact equality. A regex prefix match (str.match) would
    # also pick up names that merely start with se_name and would misread
    # names containing regex metacharacters such as '(' or '+'.
    seDDI = DDI[DDI['SE_name'] == se_name]
    # Dict lookups keep the KeyError for a drug missing from drug2idx.
    rows = np.array([drug2idx[d] for d in seDDI['STITCH_1']], dtype=np.intp)
    cols = np.array([drug2idx[d] for d in seDDI['STITCH_2']], dtype=np.intp)
    m[rows, cols] = 1
    m[cols, rows] = 1
    ddi_adj_list.append(sp.csr_matrix(m))
# Per-side-effect column sums, one 1-D array per adjacency matrix.
ddi_degrees_list = [np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in ddi_adj_list]
print('DDI adjacency matrix list and degree list calculated')
# NOTE(review): the message below names se_mono_name2idx, but the index built
# in this cell is se_combo_name2idx.
print('Important structures: drug2idx, se_mono_name2idx, ddi_adj_list, ddi_degrees_list')

Number of drugs in the network 357
Number of DDI side effects 3
DDI adjacency matrix list and degree list calculated
Important structures: drug2idx, se_mono_name2idx, ddi_adj_list, ddi_degrees_list


### DTI: drug-target interactions

In [6]:
DTI = pd.read_csv(
    'data/clean_data/targets_mini.csv',
    sep=',',
    names=["STITCH", "GENE"],
)
# Counts of distinct drugs and genes that occur in the drug-target table.
dti_drugs = len(pd.unique(DTI['STITCH'].values))
dti_genes = len(pd.unique(DTI['GENE'].values))
print('Number of DTI drugs:',dti_drugs)
print('Number of DTI genes:',dti_genes)
# DTI adjacency matrix: rows are indexed by gene2idx, columns by drug2idx;
# an entry is 1 when the (gene, drug) pair is listed in the table.
dti_adj = np.zeros((n_genes, n_drugs), dtype=int)
for target_gene, stitch_id in zip(DTI['GENE'], DTI['STITCH']):
    dti_adj[gene2idx[target_gene], drug2idx[stitch_id]] = 1
dti_adj = sp.csr_matrix(dti_adj)
print('DTI adjacency matrix calculated')
print('Important structures: dti_drugs, dti_genes, dti_adj')

Number of DTI drugs: 172
Number of DTI genes: 3464
DTI adjacency matrix calculated
Important structures: dti_drugs, dti_genes, dti_adj


### DSE: single-drug side effects

In [7]:
DSE = pd.read_csv(
    'data/clean_data/mono_mini.csv',
    sep=',',
    names=["STITCH","SE", "SE_name"],
)
# Distinct single-drug side-effect names, indexed in order of first appearance.
se_mono_names = pd.unique(DSE['SE_name'].values)
se_mono_name2idx = dict(zip(se_mono_names, range(len(se_mono_names))))
n_semono = len(se_mono_name2idx)
print('Number of DSE side effects:',n_semono)
# Drug feature matrix: one row per drug (drug2idx), one column per side
# effect; an entry is 1 when the table lists that side effect for the drug.
drug_feat = np.zeros((n_drugs, n_semono), dtype=int)
for stitch_id, effect_name in zip(DSE['STITCH'], DSE['SE_name']):
    drug_feat[drug2idx[stitch_id], se_mono_name2idx[effect_name]] = 1
drug_feat = sp.csr_matrix(drug_feat)
print('Drug feature matrix calculated')
print('Important structures: drug_feat, se_mono_name2idx')

Number of DSE side effects: 8774
Drug feature matrix calculated
Important structures: drug_feat, se_mono_name2idx


### Save

In [8]:
# Open the results shelf with flag 'n' (always create a new, empty database)
# and pickle protocol 2 for the stored values.
data = shelve.open('./results/decagon', flag='n', protocol=2)

In [9]:
# Persist every structure built above into the open shelf, grouped by the
# section that produced it.
#PF
data['prot_feat'] = prot_feat
data['gene2idx'] = gene2idx
#PPI
data['ppi_adj'] = ppi_adj
data['ppi_degrees'] = ppi_degrees
#DDI
# se_combo_name2idx gives the order of the matrices in ddi_adj_list; it was
# previously never stored (se_mono_name2idx was written here instead, and is
# still written in the DSE section below).
data['se_combo_name2idx'] = se_combo_name2idx
data['ddi_adj_list'] = ddi_adj_list
data['ddi_degrees_list'] = ddi_degrees_list
data['drug2idx'] = drug2idx
#DTI
data['dti_drugs'] = dti_drugs
data['dti_genes'] = dti_genes
data['dti_adj'] = dti_adj
#DSE
data['drug_feat'] = drug_feat
data['se_mono_name2idx'] = se_mono_name2idx

In [10]:
# Close the shelf so the stored entries are flushed to disk.
data.close()