# Remove Outliers
This notebook filters the original databases used in DECAGON plus the affinity feature database to remove any unlinked node in the network. In short, it keeps only the elements present in all the databases so that the output database is consistent.<br>
This code is in part the adaptation in `pandas` of the script `remove_outliers.sh`.

Author: Juan Sebastian Diaz Boada, August 2020

## Python 3

In [1]:
import pandas as pd
import numpy as np

Import DECAGON Data as `pandas` dataframes

In [2]:
PPI = pd.read_csv('original_data/bio-decagon-ppi.csv',sep=',')
DTI = pd.read_csv('original_data/bio-decagon-targets-all.csv',sep=',')
DDI = pd.read_csv('original_data/bio-decagon-combo.csv',sep=',')
DSE = pd.read_csv('original_data/bio-decagon-mono.csv',sep=',')
AF = pd.read_csv('affinities/DTI-affinity.csv',sep=',')

In [3]:
# Original number of interactions
orig_ppi = len(PPI.index)
orig_dti = len(DTI.index)
orig_ddi = len(DDI.index)
orig_dse = len(DSE.index)
orig_af = len(AF.index)

### Initial number of genes, drugs and side effects of datasets

In [4]:
# PPI genes
orig_genes_ppi = len(pd.unique((PPI[["Gene 1", "Gene 2"]].values.ravel())))
# Affinity genes
AF_genes = pd.unique(AF['Target'].values)
orig_genes_af = len(AF_genes)
# Affinity drugs
AF_drugs = pd.unique(AF['Drug'].values)
orig_drugs_af = len(AF_drugs)

In [5]:
# DDI drugs
DDI_drugs = pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
orig_drugs_ddi = len(DDI_drugs)
# DDI Side effects
orig_se_combo = len(pd.unique(DDI['Polypharmacy Side Effect'].values))
# Drugs with single side effects
DSE_drugs = pd.unique(DSE['STITCH'].values)
orig_drug_dse = len(DSE_drugs)
# DSE Side effects
orig_se_mono = len(pd.unique(DSE['Side Effect Name']))

In [6]:
# DTI genes
DTI_genes = pd.unique(DTI['Gene'].values)
orig_genes_dti = len(DTI_genes)
# DTI drugs
DTI_drugs = pd.unique(DTI['STITCH'].values)
orig_drugs_dti = len(DTI_drugs)

### Node intersection of datasets
1. DTI and AF drug intersection (I1)<br>
2. DSE and DDI intersection (I2)
3. I2 and I1 total intersection (I3)
4. Selection of DTI, DSE and DDI drugs
5. DTI and AF gene intersection (I4)
6. PPI and I4 partial intersection (Formation of PPI network)
7. DTI update with PPI genes

In [7]:
# 1. DTI and AF drug intersection
I1 = np.intersect1d(DTI_drugs,AF_drugs,assume_unique=True)
# 2. DSE and DDI intersections
I2 = np.intersect1d(DSE_drugs,DDI_drugs,assume_unique=True)
# 3. I2 and I1 intersection
I3 = np.intersect1d(I2,I1,assume_unique=True)
# 4. Selection of DTI, DDI and DSE drugs
DDI = DDI[np.logical_and(DDI['STITCH 1'].isin(I3).values,
                     DDI['STITCH 2'].isin(I3).values)]
DDI_drugs = pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
AF = AF[AF['Drug'].isin(DDI_drugs).values]
DTI = DTI[DTI['STITCH'].isin(DDI_drugs).values]
DSE = DSE[DSE['STITCH'].isin(DDI_drugs).values]
new_drugs_dti = len(pd.unique(DTI['STITCH'].values))
new_drugs_ddi = len(DDI_drugs)
new_drugs_dse = len(pd.unique(DSE['STITCH'].values))
new_drugs_af = len(pd.unique(AF['Drug'].values))
new_se_combo = len(pd.unique(DDI['Polypharmacy Side Effect'].values))
new_se_mono = len(pd.unique(DSE['Side Effect Name']))

In [8]:
# 5. DTI and AF gene intersection
I4 = np.intersect1d(DTI_genes,AF_genes,assume_unique=True)
# 6. PPI and I4 partial intersection (Formation of PPI network)
PPI = PPI[np.logical_or(PPI['Gene 1'].isin(I4).values,
                     PPI['Gene 2'].isin(I4).values)]
PPI_genes = pd.unique(PPI[["Gene 1", "Gene 1"]].values.ravel())
new_genes_ppi = len(PPI_genes)
# 7. DTI update with PPI genes
AF = AF[AF['Target'].isin(PPI_genes).values]
DTI = DTI[DTI['Gene'].isin(PPI_genes).values]
new_genes_af = len(pd.unique(AF['Target'].values))
new_genes_dti = len(pd.unique(DTI['Gene'].values))

In [9]:
# Interactions (edges)
print('Interactions (edges)')
print ('Original number of PPI interactions',orig_ppi)
print ('New number of PPI interactions',len(PPI.index))
print('\n')
print ('Original number of DTI interactions',orig_dti)
print ('New number of DTI interactions',len(DTI.index))
print('\n')
print ('Original number of DDI interactions',orig_ddi)
print ('New number of DDI interactions', len(DDI.index))
print('\n')
print ('Original number of DSE interactions',orig_dse)
print('New number of DSE interactions',len(DSE.index))
print('\n')
print ('Original number of AF interactions',orig_af)
print('New number of AF interactions',len(AF.index))
print('\n')
# Drugs and genes (nodes)
print('Drugs and genes (nodes)')
print("Original number of drugs in DSE:",orig_drug_dse)
print("New number of drugs in DSE:",new_drugs_dse)
print('\n')
print("Original number drugs in DTI",orig_drugs_dti)
print("New number of drugs in DTI",new_drugs_dti)
print('\n')
print('Original number of genes in DTI:',orig_genes_dti)
print('New number of genes in DTI:',new_genes_dti)
print('\n')
print('Original number of genes in PPI:',orig_genes_ppi)
print('New number of genes in PPI:',new_genes_ppi)
print('\n')
print('Original number of drugs in DDI:',orig_drugs_ddi)
print('New number of drugs in DDI:',new_drugs_ddi)
print('\n')
print('Original number of genes in AF:',orig_genes_af)
print('New number of genes in AF:',new_genes_af)
print('\n')
print("Original number drugs in AF",orig_drugs_af)
print("New number of drugs in AF",new_drugs_af)
print('\n')
# Side effects
print('Side effects')
print('Original number of joint side effects:',orig_se_combo)
print('New number of joint side effects:', new_se_combo)
print('\n')
print('Original number of single side effects:', orig_se_mono)
print('New number of single side effects:', new_se_mono)

Interactions (edges)
Original number of PPI interactions 715612
New number of PPI interactions 585448


Original number of DTI interactions 131034
New number of DTI interactions 17802


Original number of DDI interactions 4649441
New number of DDI interactions 1208035


Original number of DSE interactions 174977
New number of DSE interactions 81275


Original number of AF interactions 131034
New number of AF interactions 17802


Drugs and genes (nodes)
Original number of drugs in DSE: 639
New number of drugs in DSE: 282


Original number drugs in DTI 1774
New number of drugs in DTI 282


Original number of genes in DTI: 7795
New number of genes in DTI: 3534


Original number of genes in PPI: 19081
New number of genes in PPI: 16466


Original number of drugs in DDI: 645
New number of drugs in DDI: 282


Original number of genes in AF: 7795
New number of genes in AF: 3534


Original number drugs in AF 1774
New number of drugs in AF 282


Side effects
Original number of joint side effects

## Export to csv

In [10]:
PPI.to_csv('affinities/small-decagon-ppi.csv',index=False,sep=',')
DTI.to_csv('affinities/small-decagon-targets.csv',index=False,sep=',')
DDI.to_csv('affinities/small-decagon-combo.csv',index=False,sep=',')
DSE.to_csv('affinities/small-decagon-mono.csv',index=False,sep=',')
AF.to_csv('affinities/small-decagon-affinities.csv',index=False,sep=',')