# Reduced Data Structures
This notebook selects a small, consistent subset of the treated dataset so that it can be run on small machines. The generated dataset is limited to the **drug-drug interactions** that belong to a fixed number of polypharmacy side effects, specified by the variable $N$ (or by the explicit index list `idx`, which overrides $N$ when it is defined). From this subset, it generates the corresponding data structures, such as dictionaries and adjacency matrices, to be fed directly to DECAGON.<br>
This code is in part a `pandas` adaptation of the script `drug_dataset.sh`, merged with `data_structures.ipynb`.

Author: Juan Sebastian Diaz Boada, May 2020

## Python 3

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
from joblib import Parallel, delayed

In [2]:
# Number of side effects to keep
# (the maximum is 1317, but 964 is chosen to be consistent with the authors of DECAGON)
N = 964

In [3]:
# Positions, within the frequency-sorted list of joint side effects, of the side effects to keep.
# When `idx` is defined, the selection step uses it instead of N and overwrites N with len(idx).
# Alternative selection, noted by the author as: Nausea, Diarrhea, Emesis, Increased body temp,
# bleeding, Renal disorder, Leucopenia (in that order)
#idx = [3,6,9,11,58,150,168]
# Active selection: the first seven positions of the sorted list
idx = list(range(7))

### Import databases

In [4]:
# Reduced DECAGON tables (comma-separated, which is the `read_csv` default)
PPI = pd.read_csv('amir_docking_data/small-decagon-ppi.csv')      # protein-protein interactions
DF = pd.read_csv('amir_docking_data/small-decagon-docking.csv')   # docking features
DTI = pd.read_csv('amir_docking_data/small-decagon-targets.csv')  # drug-target interactions
DDI = pd.read_csv('amir_docking_data/small-decagon-combo.csv')    # drug-drug interactions
DSE = pd.read_csv('amir_docking_data/small-decagon-mono.csv')     # single-drug side effects
# Side effect categories from the original data folder
SE = pd.read_csv('original_data/bio-decagon-effectcategories.csv')

In [5]:
# Row (edge) counts of every table as loaded, before any filtering
orig_ppi = PPI.shape[0]
orig_df = DF.shape[0]
orig_dti = DTI.shape[0]
orig_ddi = DDI.shape[0]
orig_dse = DSE.shape[0]
# Node counts; pairwise tables contribute both of their endpoint columns
orig_ddi_drugs = len(pd.unique(DDI[['STITCH 1','STITCH 2']].to_numpy().ravel()))
orig_ppi_genes = len(pd.unique(PPI[['Gene 1','Gene 2']].to_numpy().ravel()))
# dropna=False keeps a missing value counted as one distinct entry, as pd.unique does
orig_dti_drugs = DTI['STITCH'].nunique(dropna=False)
orig_dti_genes = DTI['Gene'].nunique(dropna=False)
orig_dse_drugs = DSE['STITCH'].nunique(dropna=False)
orig_df_genes = DF['Gene'].nunique(dropna=False)
orig_df_drugs = DF['Drug'].nunique(dropna=False)
# Distinct side effects (single-drug names and joint side-effect codes)
orig_se_mono = DSE['Side Effect Name'].nunique(dropna=False)
orig_se_combo = DDI['Polypharmacy Side Effect'].nunique(dropna=False)

### Choose Side effects

In [6]:
# Order the DDI records by how often their side effect occurs (most frequent first),
# to be consistent with the authors. The helper column `freq` exists only during the sort.
DDI = (
    DDI.assign(freq=DDI.groupby('Polypharmacy Side Effect')['Polypharmacy Side Effect']
                       .transform('count'))
       .sort_values(by='freq', ascending=False)
       .drop(columns='freq')
)
# Side-effect codes and names in the order in which they appear in the sorted table
se = pd.unique(DDI['Polypharmacy Side Effect'].values)
se_name = pd.unique(DDI['Side Effect Name'].values)

In [7]:
# Without an explicit index list, keep the first N side effects;
# otherwise keep the listed positions and let N follow the size of that selection.
if 'idx' not in locals():
    se = se[:N]
    print(len(se))
else:
    se = se[idx]
    N = len(se)
    print(N)

7


### Select DDIs

In [8]:
# Keep only the interactions whose side-effect code was selected
DDI = DDI.loc[DDI['Polypharmacy Side Effect'].isin(se)].reset_index(drop=True)
# Unique drugs over both endpoint columns, numbered in order of appearance
DDI_drugs = pd.unique(DDI[['STITCH 1','STITCH 2']].values.ravel())
n_drugs = len(DDI_drugs)
drug2idx = dict(zip(DDI_drugs, range(n_drugs)))
# Unique joint side-effect names, numbered in order of appearance
se_names = pd.unique(DDI['Side Effect Name'])
se_combo_name2idx = dict(zip(se_names, range(len(se_names))))

### Select Drug side effects

In [9]:
# Restrict single-drug side effects to the drugs present in the selected DDIs
DSE = DSE.loc[DSE['STITCH'].isin(DDI_drugs)].reset_index(drop=True)
dse_drugs = DSE['STITCH'].nunique(dropna=False)
# Unique individual side effects, numbered in order of appearance
se_mono_names = pd.unique(DSE['Side Effect Name'].values)
n_semono = len(se_mono_names)
se_mono_name2idx = dict(zip(se_mono_names, range(n_semono)))

### Select DTIs

In [10]:
# Restrict drug-target interactions to the drugs present in the selected DDIs
DTI = DTI.loc[DTI['STITCH'].isin(DDI_drugs)].reset_index(drop=True)
DTI_drugs = pd.unique(DTI['STITCH'])  # unique drugs left in DTI
DTI_genes = pd.unique(DTI['Gene'])    # unique genes left in DTI
dti_genes = len(DTI_genes)
dti_drugs = len(DTI_drugs)

### Select Docking features

In [11]:
# A docking record is kept only when both its gene and its drug survive in DTI
DF = DF.loc[DF['Gene'].isin(DTI_genes) & DF['Drug'].isin(DTI_drugs)]\
    .reset_index(drop=True)
df_genes = DF['Gene'].nunique(dropna=False)
df_drugs = DF['Drug'].nunique(dropna=False)

### Select PPIs

In [12]:
# Keep every protein-protein interaction that touches at least one DTI gene
PPI = PPI.loc[PPI['Gene 1'].isin(DTI_genes) | PPI['Gene 2'].isin(DTI_genes)]\
    .reset_index(drop=True)
# Unique genes in PPI (both endpoints), numbered in order of appearance
PPI_genes = pd.unique(PPI[['Gene 1','Gene 2']].values.ravel())
n_genes = len(PPI_genes)
gene2idx = dict(zip(PPI_genes, range(n_genes)))

## Adjacency matrices and degrees

In [13]:
# DDI adjacency matrices
def se_adj_matrix(se_name):
    """Build the symmetric drug-drug adjacency matrix of one joint side effect.

    Reads the notebook-level `DDI` table, `drug2idx` mapping and `n_drugs` count.

    Parameters
    ----------
    se_name : str
        Value of the 'Side Effect Name' column whose interactions are collected.

    Returns
    -------
    tuple
        `(adjacency, se_name)`, where `adjacency` is an `n_drugs x n_drugs`
        `scipy.sparse.csr_matrix` holding 1 at `[i, j]` and `[j, i]` for every
        DDI record of this side effect. The name is returned alongside so the
        caller can keep matrices paired with their side effects.

    Raises
    ------
    KeyError
        If a drug of a matching record is missing from `drug2idx`.
    """
    m = np.zeros([n_drugs,n_drugs],dtype=int)
    # Exact comparison on purpose: `str.match` interprets the name as a regular
    # expression anchored only at the start, so e.g. 'Pain' would also select
    # 'Pain in throat', and names containing regex metacharacters would misbehave.
    seDDI = DDI[DDI['Side Effect Name'] == se_name]
    for drug_1, drug_2 in zip(seDDI['STITCH 1'], seDDI['STITCH 2']):
        row = drug2idx[drug_1]
        col = drug2idx[drug_2]
        # Undirected interaction: mirror the entry across the diagonal
        m[row,col] = m[col,row] = 1
    return sp.csr_matrix(m), se_name
# One adjacency matrix per joint side effect, built in parallel; the results are
# unzipped into the tuple of matrices and the tuple of matching side-effect names
ddi_adj_list, se_name = zip(*Parallel(n_jobs=8)(
    delayed(se_adj_matrix)(name) for name in se_combo_name2idx
))
# Per-drug degree (column sums) of every side-effect graph
ddi_degrees_list = [np.array(adj.sum(axis=0)).squeeze() for adj in ddi_adj_list]

In [14]:
# Check that the names returned by the parallel job follow the index mapping
print(se_name, se_combo_name2idx, n_genes, sep='\n')

('arterial pressure NOS decreased', 'Hypoventilation', 'Difficulty breathing', 'High blood pressure', 'Pain', 'nausea', 'Fatigue')
{'arterial pressure NOS decreased': 0, 'Hypoventilation': 1, 'Difficulty breathing': 2, 'High blood pressure': 3, 'Pain': 4, 'nausea': 5, 'Fatigue': 6}
4116


In [15]:
# DTI adjacency matrices (normalized docking value and binding energy)
dti_norm_dock = np.zeros([n_genes,n_drugs])
dti_bind_energy = np.zeros([n_genes,n_drugs])
# Binary DTI adjacency; only allocated here, it is filled later from the DTI table
dti_adj = np.zeros([n_genes,n_drugs],dtype=np.int8)
# One record per (gene, drug) pair. When a pair is repeated, the first record is
# the one used, which is what looking up the first match for every row amounts to.
# Doing this once avoids re-filtering the whole table for each of its rows.
df_pairs = DF.drop_duplicates(subset=['Gene','Drug'], keep='first')
# Dictionary lookups (rather than `.map`) so an unknown gene or drug raises KeyError
pair_rows = np.array([gene2idx[g] for g in df_pairs['Gene']], dtype=np.intp)
pair_cols = np.array([drug2idx[d] for d in df_pairs['Drug']], dtype=np.intp)
dti_norm_dock[pair_rows, pair_cols] = df_pairs['Norm_docking'].values
dti_bind_energy[pair_rows, pair_cols] = df_pairs['Norm_binding_Energy'].values
dti_norm_dock = sp.csr_matrix(dti_norm_dock)
dti_bind_energy = sp.csr_matrix(dti_bind_energy)

In [16]:
# Mark every known drug-target pair in the binary gene x drug matrix
for gene, drug in zip(DTI['Gene'], DTI['STITCH']):
    dti_adj[gene2idx[gene], drug2idx[drug]] = 1
dti_adj = sp.csr_matrix(dti_adj)

In [17]:
# Number of non-zero entries in each DTI matrix (docking, binding energy, binary)
print(dti_norm_dock.nonzero()[0].shape,
      dti_bind_energy.nonzero()[0].shape,
      dti_adj.nonzero()[0].shape,
      sep='\n')

(6032,)
(6032,)
(2921,)


In [18]:
# PPI adjacency matrix and degrees
ppi_adj = np.zeros([n_genes,n_genes],dtype=int)
for gene_a, gene_b in zip(PPI['Gene 1'], PPI['Gene 2']):
    src, dst = gene2idx[gene_a], gene2idx[gene_b]
    # Undirected interaction: set both mirrored entries
    ppi_adj[src,dst] = ppi_adj[dst,src] = 1
# Degrees are taken from the dense matrix, before the conversion to sparse
ppi_degrees = ppi_adj.sum(axis=0)
ppi_adj = sp.csr_matrix(ppi_adj)

In [19]:
# Drug feature matrix: one row per drug, one column per single-drug side effect
drug_feat = np.zeros([n_drugs,n_semono],dtype=int)
for stitch_id, effect_name in zip(DSE['STITCH'], DSE['Side Effect Name']):
    drug_feat[drug2idx[stitch_id], se_mono_name2idx[effect_name]] = 1
drug_feat = sp.csr_matrix(drug_feat)

### Print

In [20]:
# Summary report: sizes before ("Original") and after ("New") the reduction.
# Every print('\n') emits two line breaks, i.e. a blank gap between groups.

# Interactions (edges)
print('Interactions (edges)')
print('Original number of PPI interactions:', orig_ppi)
print('New number of PPI interactions:', len(PPI.index))
print('\n')
print('Original number of DTI interactions:', orig_dti)
print('New number of DTI interactions:', len(DTI.index))
print('\n')
print('Original number of DDI interactions:', orig_ddi)
print('New number of DDI interactions:', len(DDI.index))
print('\n')
print('Original number of DSE interactions:', orig_dse)
print('New number of DSE interactions:', len(DSE.index))
print('\n')
print('Original number of DF interactions:', orig_df)
print('New number of DF interactions:', len(DF.index))
print('\n')
# Drugs and genes (nodes)
print('Drugs and genes (nodes)')
print("Original number of drugs in DSE:",orig_dse_drugs)
print("New number of drugs in DSE:",dse_drugs)
print('\n')
print("Original number drugs in DTI",orig_dti_drugs)
print("New number of drugs in DTI",dti_drugs)
print('\n')
print('Original number of genes in DTI:', orig_dti_genes)
print('New number of genes in DTI:',dti_genes)
print('\n')
# Gene totals are counted over the PPI table (both endpoint columns)
print('Original number of genes:',orig_ppi_genes)
print('New number of genes:', n_genes)
print('\n')
# Drug totals are counted over the DDI table (both endpoint columns)
print('Original number of drugs:',orig_ddi_drugs)
print('New number of drugs:', n_drugs)
print('\n')
print("Original number of genes in DF:",orig_df_genes)
print("New number of genes in DF:",df_genes)
print('\n')
print("Original number of drugs in DF:",orig_df_drugs)
print("New number of drugs in DF:",df_drugs)
print('\n')
# Side effects
print('Side effects')
# Originals count distinct joint side-effect codes; the new value counts distinct names
print('Original number of joint side effects:',orig_se_combo)
print('New number of joint side effects:', len(se_names))
print('\n')
print('Original number of single side effects:', orig_se_mono)
print('New number of single side effects:', n_semono)

Interactions (edges)
Original number of PPI interactions: 20565
New number of PPI interactions: 16820


Original number of DTI interactions: 2974
New number of DTI interactions: 2921


Original number of DDI interactions: 97694
New number of DDI interactions: 3291


Original number of DSE interactions: 19701
New number of DSE interactions: 19498


Original number of DF interactions: 7552
New number of DF interactions: 6032


Drugs and genes (nodes)
Original number of drugs in DSE: 59
New number of drugs in DSE: 58


Original number drugs in DTI 59
New number of drugs in DTI 58


Original number of genes in DTI: 104
New number of genes in DTI: 104


Original number of genes: 5298
New number of genes: 4116


Original number of drugs: 59
New number of drugs: 58


Original number of genes in DF: 128
New number of genes in DF: 104


Original number of drugs in DF: 59
New number of drugs in DF: 58


Side effects
Original number of joint side effects: 1232
New number of joint side effects: 7


## Save

In [21]:
# Everything that is serialised, gathered in a single dictionary
data = {
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
    # DDI
    'ddi_adj_list': ddi_adj_list,
    'ddi_degrees_list': ddi_degrees_list,
    # DTI
    'dti_norm_dock': dti_norm_dock,
    'dti_bind_energy': dti_bind_energy,
    'dti_adj': dti_adj,
    # PPI
    'ppi_adj': ppi_adj,
    'ppi_degrees': ppi_degrees,
    # DSE
    'drug_feat': drug_feat,
}

In [22]:
# The output name encodes the size of the reduced dataset:
# single side effects, genes, drugs and joint side effects
filename = './data_structures/DS/DS_real_docking_DSE_{}_genes_{}_drugs_{}_se_{}'.format(
    n_semono, n_genes, n_drugs, N)
print(filename)

./data_structures/DS/DS_real_docking_DSE_5233_genes_4116_drugs_58_se_7


In [23]:
# Serialise the collected data structures with pickle protocol 3
with open(filename, 'wb') as out_file:
    pickle.dump(data, out_file, protocol=3)