# Reduced Data Structures
This notebook chooses a small, consistent subset of the treated dataset so that it can be run on small machines. The generated dataset is limited to the **drug-drug interactions** that belong to a chosen set of polypharmacy side effects: either the $N$ most frequent ones, or the ones picked explicitly through the list of positions `idx`. From this dataset, it generates the corresponding data structures, such as dictionaries and adjacency matrices, to be fed directly to DECAGON.<br>
This code is partly an adaptation to `pandas` of the script `drug_dataset.sh`, merged with `data_structures.ipynb`.

Author: Juan Sebastian Diaz Boada, May 2020

## Python 3

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import pickle
from joblib import Parallel, delayed

In [2]:
# Number of polypharmacy side effects to keep; they are taken in decreasing order of frequency.
# (The dataset has 1317 of them, but 964 is used to be consistent with the authors of DECAGON.)
# NOTE: if `idx` is defined in the notebook, this value is overridden by len(idx) further down.
N = 964

In [3]:
# Explicit choice of side effects, given as positions in the list of side effects
# sorted by decreasing frequency. According to the original author these are:
# pneumonia, high blood pressure, acute kidney failure, heart rate increased
# (originally written "High rate increased" -- TODO confirm), heart attack and
# respiratory failure.
# When `idx` is defined it takes precedence over N, which is then reset to len(idx).
idx = [4,20,29,35,41,42]

### Import databases

In [4]:
# Load the cleaned DECAGON tables and the original side-effect category table.
# ',' is already the default separator of `pd.read_csv`, so it is not passed explicitly.
PPI = pd.read_csv('clean_data/new-decagon-ppi.csv')      # gene-gene pairs
PF = pd.read_csv('clean_data/new-decagon-genes.csv')     # per-gene (protein) features
DTI = pd.read_csv('clean_data/new-decagon-targets.csv')  # drug-gene pairs
DDI = pd.read_csv('clean_data/new-decagon-combo.csv')    # drug-drug pairs with a joint side effect
DSE = pd.read_csv('clean_data/new-decagon-mono.csv')     # single-drug side effects
SE = pd.read_csv('original_data/bio-decagon-effectcategories.csv')

In [5]:
# Sizes of the unfiltered tables, kept to compare against the reduced dataset later.
# Number of rows (one row per interaction / record) in each table
orig_ppi, orig_pf, orig_dti, orig_ddi, orig_dse = (
    len(table.index) for table in (PPI, PF, DTI, DDI, DSE))
# Number of distinct nodes: both pair columns are flattened before deduplicating
orig_ddi_drugs = len(pd.unique(DDI[['STITCH 1','STITCH 2']].to_numpy().ravel()))
orig_ppi_genes = len(pd.unique(PPI[['Gene 1','Gene 2']].to_numpy().ravel()))
orig_dti_drugs = len(DTI['STITCH'].unique())
orig_dti_genes = len(DTI['Gene'].unique())
orig_dse_drugs = len(DSE['STITCH'].unique())
# Number of distinct side effects (single-drug by name, joint by code)
orig_se_mono = len(DSE['Side Effect Name'].unique())
orig_se_combo = len(DDI['Polypharmacy Side Effect'].unique())

### Choose Side effects

In [6]:
DDI.head()

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,CID000002173,CID000003345,C0151714,hypermagnesemia
1,CID000002173,CID000003345,C0035344,retinopathy of prematurity
2,CID000002173,CID000003345,C0004144,atelectasis
3,CID000002173,CID000003345,C0002063,alkalosis
4,CID000002173,CID000003345,C0004604,Back Ache


In [7]:
# Order the DDI rows by how often their side effect occurs (most frequent first),
# to be consistent with the authors. The per-row frequency is only a sort key and
# is dropped again afterwards.
se_freq = DDI.groupby('Polypharmacy Side Effect')['Polypharmacy Side Effect'].transform('count')
DDI = (DDI.assign(freq=se_freq)
          .sort_values(by=['freq'], ascending=False)
          .drop(columns=['freq']))
# Side effect codes and names in order of first appearance in the sorted table
se = DDI['Polypharmacy Side Effect'].unique()
se_name = DDI['Side Effect Name'].unique()

In [8]:
DDI

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
630074,CID000003386,CID000003446,C0020649,arterial pressure NOS decreased
378049,CID000001983,CID000003958,C0020649,arterial pressure NOS decreased
638233,CID000093860,CID000148211,C0020649,arterial pressure NOS decreased
2529493,CID000003386,CID000004595,C0020649,arterial pressure NOS decreased
3590526,CID000003419,CID000003958,C0020649,arterial pressure NOS decreased
...,...,...,...,...
428164,CID000003075,CID000003780,C0011593,skin abrasion
4121993,CID000004585,CID000060787,C0022735,hypogonadotropic hypogonadism
3170584,CID000000450,CID000060852,C0014935,estrogen replacement
1775067,CID000004585,CID000005076,C0022735,hypogonadotropic hypogonadism


In [9]:
# Choose the side effects to keep: the first N of the sorted list by default,
# or the explicit positions in `idx` when that variable has been defined
# (in which case N is reset to the number of chosen side effects).
if 'idx' not in locals():
    se = se[:N]
    print(len(se))
else:
    se = se[idx]
    N = len(se)
    print(N)

6


### Select DDIs

In [10]:
# Keep only the interactions whose side effect code was selected
DDI = DDI.loc[DDI['Polypharmacy Side Effect'].isin(se)].reset_index(drop=True)
# Unique drugs, in order of first appearance when reading both columns row by row
DDI_drugs = pd.unique(DDI[['STITCH 1','STITCH 2']].to_numpy().ravel())
drug2idx = dict(zip(DDI_drugs, range(len(DDI_drugs))))
# Unique joint side effect names and their integer index
se_names = DDI['Side Effect Name'].unique()
se_combo_name2idx = {name: pos for pos, name in enumerate(se_names)}
n_drugs = len(DDI_drugs)

In [11]:
drug2idx

{'CID000002375': 0,
 'CID000003672': 1,
 'CID000004946': 2,
 'CID000028112': 3,
 'CID000002022': 4,
 'CID000004595': 5,
 'CID000003143': 6,
 'CID000093860': 7,
 'CID000002771': 8,
 'CID000060184': 9,
 'CID000004635': 10,
 'CID000002153': 11,
 'CID000002541': 12,
 'CID000004205': 13,
 'CID000005426': 14,
 'CID000003877': 15,
 'CID000004583': 16,
 'CID000004666': 17,
 'CID000005650': 18,
 'CID000003958': 19,
 'CID000005556': 20,
 'CID000002955': 21,
 'CID000003348': 22,
 'CID006447131': 23,
 'CID000003405': 24,
 'CID000003793': 25,
 'CID000001972': 26,
 'CID000003657': 27,
 'CID000002806': 28,
 'CID000003869': 29,
 'CID000002156': 30,
 'CID000005245': 31,
 'CID000002662': 32,
 'CID000005203': 33,
 'CID000003652': 34,
 'CID000054688': 35,
 'CID000000206': 36,
 'CID000003883': 37,
 'CID000004679': 38,
 'CID000004168': 39,
 'CID000004264': 40,
 'CID000003310': 41,
 'CID000004585': 42,
 'CID000005267': 43,
 'CID000039765': 44,
 'CID000004909': 45,
 'CID000005487': 46,
 'CID000064147': 47,
 '

### Select Drug side effects

In [12]:
# Keep the single-drug side effects of the drugs that remain in DDI
DSE = DSE.loc[DSE['STITCH'].isin(DDI_drugs)].reset_index(drop=True)
dse_drugs = len(DSE['STITCH'].unique())
# Unique individual side effect names and their integer index
se_mono_names = DSE['Side Effect Name'].unique()
se_mono_name2idx = dict(zip(se_mono_names, range(len(se_mono_names))))
n_semono = len(se_mono_names)

In [13]:
se_mono_name2idx

{'central nervous system mass': 0,
 'Photosensitivity reaction': 1,
 'leukaemic infiltration brain': 2,
 'platelet adhesiveness abnormal': 3,
 'Ventricular dysfunction': 4,
 'cytogenetic analysis abnormal': 5,
 'pollakiuria': 6,
 'myelocytosis': 7,
 'gingival infection': 8,
 'retroperitoneal lymphadenopathy': 9,
 'iliac artery stenosis': 10,
 'neutrophil count decreased': 11,
 'electromechanical dissociation': 12,
 'transplant failure': 13,
 'shock haemorrhagic': 14,
 'incision site oedema': 15,
 'troponin increased': 16,
 'subdiaphragmatic abscess': 17,
 'Macular oedema': 18,
 'capillary disorder': 19,
 'Conjunctival haemorrhage': 20,
 'escherichia urinary tract infection': 21,
 'Rash papular': 22,
 'Rash macular': 23,
 'gastrointestinal mucosal disorder': 24,
 'chylothorax': 25,
 'diastolic dysfunction': 26,
 'Tongue disorder': 27,
 'diverticulum': 28,
 'Periorbital oedema': 29,
 'acute respiratory failure': 30,
 'abnormal loss of weight': 31,
 'drug intolerance': 32,
 'meningitis li

### Select DTIs

In [13]:
# Keep the drug-gene pairs whose drug remains in DDI
DTI = DTI.loc[DTI['STITCH'].isin(DDI_drugs)].reset_index(drop=True)
DTI_genes = DTI['Gene'].unique()    # unique genes in DTI
DTI_drugs = DTI['STITCH'].unique()  # unique drugs in DTI
dti_drugs, dti_genes = len(DTI_drugs), len(DTI_genes)

### Select PPIs

In [14]:
# Keep every gene-gene pair in which at least one of the two genes appears in DTI
touches_dti = PPI['Gene 1'].isin(DTI_genes) | PPI['Gene 2'].isin(DTI_genes)
PPI = PPI.loc[touches_dti].reset_index(drop=True)
# Unique genes in PPI, in order of first appearance when reading both columns row by row
PPI_genes = pd.unique(PPI[['Gene 1','Gene 2']].to_numpy().ravel())
gene2idx = dict(zip(PPI_genes, range(len(PPI_genes))))
n_genes = len(PPI_genes)

### Select PFs

In [15]:
# Keep the feature rows of the genes that remain in PPI
PF = PF.loc[PF['GeneID'].isin(PPI_genes)].reset_index(drop=True)

## Adjacency matrices and degrees

In [16]:
# DDI adjacency matrices
def se_adj_matrix(se_name, ddi=None, drug_index=None):
    """Build the symmetric drug-drug adjacency matrix of a single side effect.

    Parameters
    ----------
    se_name : str
        Side effect name; rows of `ddi` are selected when their
        'Side Effect Name' is exactly equal to it.
    ddi : pandas.DataFrame, optional
        Table with columns 'STITCH 1', 'STITCH 2' and 'Side Effect Name'.
        Defaults to the notebook-level `DDI`.
    drug_index : dict, optional
        Mapping from drug identifier to row/column position.
        Defaults to the notebook-level `drug2idx`.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape (len(drug_index), len(drug_index)) with a 1 at
        (i, j) and (j, i) for every selected drug pair.

    Raises
    ------
    KeyError
        If a selected row contains a drug that is not in `drug_index`.
    """
    if ddi is None:
        ddi = DDI
    if drug_index is None:
        drug_index = drug2idx
    n = len(drug_index)
    # Exact comparison on purpose: `Series.str.match` interprets the name as a
    # regular expression anchored only at the start, so a name that is a prefix
    # of another name (or that contains regex metacharacters) would select the
    # wrong rows.
    pairs = ddi.loc[ddi['Side Effect Name'] == se_name, ['STITCH 1', 'STITCH 2']]
    # Explicit integer dtype keeps the index arrays valid even with no rows selected
    rows = np.fromiter((drug_index[d] for d in pairs['STITCH 1']), dtype=int)
    cols = np.fromiter((drug_index[d] for d in pairs['STITCH 2']), dtype=int)
    m = np.zeros((n, n), dtype=int)
    # Fill both triangles so the matrix is symmetric
    m[rows, cols] = 1
    m[cols, rows] = 1
    return sp.csr_matrix(m)
# One adjacency matrix per joint side effect, built in parallel (8 workers) and kept
# in the iteration order of se_combo_name2idx
build_adj = delayed(se_adj_matrix)
ddi_adj_list = Parallel(n_jobs=8)(build_adj(name) for name in se_combo_name2idx)
# Degree of every drug in each matrix, as a flat array of column sums
ddi_degrees_list = [np.array(adj.sum(axis=0)).squeeze() for adj in ddi_adj_list]

In [17]:
# DTI adjacency matrix
# Rows are genes and columns are drugs; an entry is 1 when the pair is listed in DTI.
dti_adj = np.zeros((n_genes, n_drugs), dtype=int)
for gene, drug in zip(DTI['Gene'], DTI['STITCH']):
    dti_adj[gene2idx[gene], drug2idx[drug]] = 1
dti_adj = sp.csr_matrix(dti_adj)

In [18]:
# PPI adjacency matrix and degrees
# The matrix is assembled directly in sparse form: a dense n_genes x n_genes
# integer array needs n_genes**2 * 8 bytes, which is prohibitive on small
# machines once n_genes reaches the tens of thousands.
ppi_rows = np.fromiter((gene2idx[g] for g in PPI['Gene 1']), dtype=int)
ppi_cols = np.fromiter((gene2idx[g] for g in PPI['Gene 2']), dtype=int)
# Each pair is inserted in both directions so the matrix is symmetric
ppi_adj = sp.coo_matrix(
    (np.ones(2 * len(ppi_rows), dtype=int),
     (np.concatenate([ppi_rows, ppi_cols]), np.concatenate([ppi_cols, ppi_rows]))),
    shape=(n_genes, n_genes)).tocsr()
# The COO -> CSR conversion adds up repeated (i, j) entries; reset them to 1 so
# the result is a 0/1 adjacency matrix
ppi_adj.data[:] = 1
# Column sums as a flat 1-D array (number of neighbours of each gene)
ppi_degrees = np.asarray(ppi_adj.sum(axis=0)).ravel()

In [19]:
# Drug Feature matrix
# Rows are drugs and columns are single-drug side effects; an entry is 1 when the
# drug is listed with that side effect in DSE.
drug_feat = np.zeros((n_drugs, n_semono), dtype=int)
for drug, effect in zip(DSE['STITCH'], DSE['Side Effect Name']):
    drug_feat[drug2idx[drug], se_mono_name2idx[effect]] = 1
drug_feat = sp.csr_matrix(drug_feat)

In [20]:
# Protein feature matrices
# The adjacency matrices index genes through gene2idx, whereas PF keeps the row
# order of its source file. Sort the PF rows by gene index so that, when PF has
# exactly one row per gene in gene2idx, row i of the feature matrices describes
# the gene with index i. The stable sort leaves an already aligned table unchanged.
# NOTE(review): assumes one PF row per gene -- confirm against the cleaned table.
pf_order = np.argsort(PF['GeneID'].map(gene2idx).to_numpy(), kind='stable')
PF_by_gene = PF.iloc[pf_order]
prot_feat = sp.coo_matrix(
    PF_by_gene[['Length', 'Mass', 'n_helices', 'n_strands', 'n_turns']].to_numpy())
norm_prot_feat = sp.coo_matrix(
    PF_by_gene[['Normalized Helices(Mean)',
                'Normalized Strands(Mean)',
                'Normalized Turns(Mean)']].to_numpy())
# Number of raw and normalized protein features (columns)
n_pf = np.shape(prot_feat)[1]
n_npf = np.shape(norm_prot_feat)[1]

### Print

In [21]:
# Summary of the reduction: each pair of lines reports a size of the original
# tables next to the size after filtering. print('\n') emits two line breaks,
# which separates the groups with blank lines.
# Interactions (edges)
print('Interactions (edges)')
print('Original number of PPI interactions:', orig_ppi)
print('New number of PPI interactions:', len(PPI.index))
print('\n')
print('Original number of DTI interactions:', orig_dti)
print('New number of DTI interactions:', len(DTI.index))
print('\n')
print('Original number of DDI interactions:', orig_ddi)
print('New number of DDI interactions:', len(DDI.index))
print('\n')
print('Original number of DSE interactions:', orig_dse)
print('New number of DSE interactions:', len(DSE.index))
print('\n')
# Drugs and genes (nodes)
print('Drugs and genes (nodes)')
print("Original number of drugs in DSE:",orig_dse_drugs)
print("New number of drugs in DSE:",dse_drugs)
print('\n')
# NOTE(review): these two values are row counts of PF; they equal the number of
# genes only if PF has one row per gene -- confirm.
print("Original number of genes in PF:",orig_pf)
print("New number of genes in PF:",len(PF.index))
print('\n')
print("Original number drugs in DTI",orig_dti_drugs)
print("New number of drugs in DTI",dti_drugs)
print('\n')
print('Original number of genes in DTI:', orig_dti_genes)
print('New number of genes in DTI:',dti_genes)
print('\n')
# Genes counted over PPI and drugs counted over DDI
print('Original number of genes:',orig_ppi_genes)
print('New number of genes:', n_genes)
print('\n')
print('Original number of drugs:',orig_ddi_drugs)
print('New number of drugs:', n_drugs)
print('\n')
# Side effects
print('Side effects')
print('Original number of joint side effects:',orig_se_combo)
print('New number of joint side effects:', len(se_names))
print('\n')
print('Original number of single side effects:', orig_se_mono)
print('New number of single side effects:', n_semono)
print('\n')
# Protein features
print('Number of protein features:',n_pf)
print('Number of normalized protein features:',n_npf)

Interactions (edges)
Original number of PPI interactions: 693353
New number of PPI interactions: 310983


Original number of DTI interactions: 18291
New number of DTI interactions: 18091


Original number of DDI interactions: 4615522
New number of DDI interactions: 110776


Original number of DSE interactions: 174977
New number of DSE interactions: 174173


Drugs and genes (nodes)
Original number of drugs in DSE: 639
New number of drugs in DSE: 630


Original number of genes in PF: 17929
New number of genes in PF: 16269


Original number drugs in DTI 283
New number of drugs in DTI 279


Original number of genes in DTI: 3587
New number of genes in DTI: 3582


Original number of genes: 17929
New number of genes: 16269


Original number of drugs: 639
New number of drugs: 630


Side effects
Original number of joint side effects: 1317
New number of joint side effects: 6


Original number of single side effects: 9702
New number of single side effects: 9688


Number of protein features: 5
Num

## Save

In [22]:
# Everything that is saved, gathered in a single dictionary
data = {
    # Dictionaries
    'gene2idx': gene2idx,
    'drug2idx': drug2idx,
    'se_mono_name2idx': se_mono_name2idx,
    'se_combo_name2idx': se_combo_name2idx,
    # DDI
    'ddi_adj_list': ddi_adj_list,
    'ddi_degrees_list': ddi_degrees_list,
    # DTI
    'dti_adj': dti_adj,
    # PPI
    'ppi_adj': ppi_adj,
    'ppi_degrees': ppi_degrees,
    # DSE
    'drug_feat': drug_feat,
    # PF
    'prot_feat': prot_feat,
    'norm_prot_feat': norm_prot_feat,
}

In [25]:
# The file name records the dimensions of the saved dataset
filename = (f'./data_structures/DS/DS_real_DSE_{n_semono}_PF_{n_pf}'
            f'_NPF_{n_npf}_genes_{n_genes}_drugs_{n_drugs}_se_{N}')
with open(filename, 'wb') as out_file:
    pickle.dump(data, out_file, protocol=3)