# Remove Outliers
This notebook filters the original databases used in DECAGON, plus the protein feature databases, to remove any unlinked nodes from the network. In short, it keeps only the elements present in all the databases so that the output database is consistent.<br>
This code is partly a `pandas` adaptation of the script `remove_outliers.sh`.

Author: Juan Sebastian Diaz Boada, May 2020

## Python 3

In [1]:
import pandas as pd
import numpy as np

Import the DECAGON data and the docking results as `pandas` dataframes

In [2]:
# Input tables, in the order they are unpacked below.
# Comma is already the default delimiter of `read_csv`, so it is not passed explicitly.
source_files = (
    'original_data/bio-decagon-ppi.csv',            # PPI
    'original_data/bio-decagon-targets-all.csv',    # DTI
    'original_data/bio-decagon-combo.csv',          # DDI
    'original_data/bio-decagon-mono.csv',           # DSE
    'amir_docking_data/docking_result_sample.csv',  # DF (docking results)
)
PPI, DTI, DDI, DSE, DF = (pd.read_csv(path) for path in source_files)

In [3]:
# Row count of every table before any filtering (one row = one interaction)
orig_ppi, orig_dti, orig_ddi, orig_dse, orig_df = (
    table.shape[0] for table in (PPI, DTI, DDI, DSE, DF)
)

### Initial number of genes, drugs and side effects of datasets

In [4]:
# Distinct genes found in either endpoint column of the PPI table
ppi_endpoints = PPI[["Gene 1", "Gene 2"]].values.ravel()
orig_genes_ppi = len(pd.unique(ppi_endpoints))

# Distinct genes and drugs of the docking table; the arrays themselves are
# reused later when intersecting with the DTI table
DF_genes = DF['Gene'].unique()
DF_drugs = DF['Drug'].unique()
orig_genes_df = len(DF_genes)
orig_drugs_df = len(DF_drugs)

In [5]:
# Distinct drugs found in either endpoint column of the DDI table
ddi_endpoints = DDI[["STITCH 1", "STITCH 2"]].values.ravel()
DDI_drugs = pd.unique(ddi_endpoints)
# Distinct drugs listed in the single-drug side effect table
DSE_drugs = DSE['STITCH'].unique()

# Node counts before filtering
orig_drugs_ddi = len(DDI_drugs)
orig_drug_dse = len(DSE_drugs)
# Side effect counts before filtering (drug-pair and single-drug)
orig_se_combo = len(DDI['Polypharmacy Side Effect'].unique())
orig_se_mono = len(DSE['Side Effect Name'].unique())

In [6]:
# Distinct genes and drugs of the DTI table; the arrays are reused for the
# intersections with the docking table
DTI_genes = DTI['Gene'].unique()
DTI_drugs = DTI['STITCH'].unique()
orig_genes_dti = len(DTI_genes)
orig_drugs_dti = len(DTI_drugs)

### Node intersection of datasets
1. DTI and DF total intersection <br>
    a. DTI and DF gene intersection (I1g)<br>
    b. DTI and DF drug intersection (I1d)<br>
2. Formation of DTI network
3. PPI and I1 partial intersection (Formation of PPI network)
4. DSE and DDI intersection (I2)
5. I2 and DTI(new) total intersection (I3)
6. Formation of DDI network with DSE

In [7]:
# 1. Genes (I1g) and drugs (I1d) shared by the DTI and docking tables.
#    Both inputs come from `unique`, which is what `assume_unique=True` relies on.
I1g = np.intersect1d(DTI_genes, DF_genes, assume_unique=True)
I1d = np.intersect1d(DTI_drugs, DF_drugs, assume_unique=True)

# 2. Forming DTI network: a row survives only if its gene AND its drug are shared
df_keep = DF['Gene'].isin(I1g).values & DF['Drug'].isin(I1d).values
dti_keep = DTI['Gene'].isin(I1g).values & DTI['STITCH'].isin(I1d).values
DF = DF[df_keep]
DTI = DTI[dti_keep]

# Refresh the DTI node lists from the filtered table.
# NOTE: I1g/I1d above are computed from DTI_genes/DTI_drugs, which are
# overwritten here, so re-running this cell alone starts from the already
# filtered lists instead of the original ones.
DTI_genes = DTI['Gene'].unique()
DTI_drugs = DTI['STITCH'].unique()
new_genes_dti = len(DTI_genes)
new_drugs_dti = len(DTI_drugs)

In [8]:
# 3. Partial intersection for PPI: an edge is kept when at least one of its
#    two genes belongs to I1g (hence logical OR rather than AND)
ppi_keep = np.logical_or(PPI['Gene 1'].isin(I1g).values,
                         PPI['Gene 2'].isin(I1g).values)
PPI = PPI[ppi_keep]
# Genes remaining in the PPI network. Both endpoint columns must be read:
# selecting "Gene 1" twice would drop every gene that only occurs as "Gene 2".
PPI_genes = pd.unique(PPI[["Gene 1", "Gene 2"]].values.ravel())
new_genes_ppi = len(PPI_genes)
# Genes remaining in the docking table
new_genes_df = len(pd.unique(DF['Gene'].values))

In [9]:
# 4. Drugs present in both the single-drug (DSE) and the drug-pair (DDI) tables
I2 = np.intersect1d(DSE_drugs,DDI_drugs,assume_unique=True)
# 5. Of those, the drugs that also appear in the filtered DTI table
I3 = np.intersect1d(I2,DTI_drugs,assume_unique=True)
# I3 is a subset of DTI_drugs, so equal sizes mean that every DTI drug is also
# in I2. On failure the message lists the DTI drugs that are missing from I2.
assert new_drugs_dti==len(I3), \
    'DTI has different drugs than I2: {}'.format(np.setdiff1d(DTI_drugs, I2))

In [10]:
# Total intersection for DDI: both drugs of a pair must belong to I3
ddi_keep = DDI['STITCH 1'].isin(I3).values & DDI['STITCH 2'].isin(I3).values
DDI = DDI[ddi_keep]
# Single-drug side effects: keep the rows whose drug belongs to I3
DSE = DSE[DSE['STITCH'].isin(I3)]

# Drug counts after filtering
ddi_endpoints_kept = DDI[["STITCH 1", "STITCH 2"]].values.ravel()
new_drugs_ddi = len(pd.unique(ddi_endpoints_kept))
new_drugs_dse = len(DSE['STITCH'].unique())
new_drugs_df = len(DF['Drug'].unique())
# Side effect counts after filtering
new_se_combo = len(DDI['Polypharmacy Side Effect'].unique())
new_se_mono = len(DSE['Side Effect Name'].unique())

In [11]:
# Before/after summary. Each entry is a pair of `print` argument tuples:
# (label, count) before filtering and (label, count) after filtering.

# Interactions (edges)
edge_counts = [
    (('Original number of PPI interactions', orig_ppi),
     ('New number of PPI interactions', len(PPI.index))),
    (('Original number of DTI interactions', orig_dti),
     ('New number of DTI interactions', len(DTI.index))),
    (('Original number of DDI interactions', orig_ddi),
     ('New number of DDI interactions', len(DDI.index))),
    (('Original number of DSE interactions', orig_dse),
     ('New number of DSE interactions', len(DSE.index))),
    (('Original number of DF interactions', orig_df),
     ('New number of DF interactions', len(DF.index))),
]
# Drugs and genes (nodes)
node_counts = [
    (("Original number of drugs in DSE:", orig_drug_dse),
     ("New number of drugs in DSE:", new_drugs_dse)),
    (("Original number drugs in DTI", orig_drugs_dti),
     ("New number of drugs in DTI", new_drugs_dti)),
    (('Original number of genes in DTI:', orig_genes_dti),
     ('New number of genes in DTI:', new_genes_dti)),
    (('Original number of genes in PPI:', orig_genes_ppi),
     ('New number of genes in PPI:', new_genes_ppi)),
    (('Original number of drugs in DDI:', orig_drugs_ddi),
     ('New number of drugs in DDI:', new_drugs_ddi)),
    (('Original number of genes in DF:', orig_genes_df),
     ('New number of genes in DF:', new_genes_df)),
    (("Original number drugs in DF", orig_drugs_df),
     ("New number of drugs in DF", new_drugs_df)),
]

# Every before/after pair in these two sections is followed by a blank gap
for section_title, section_rows in (('Interactions (edges)', edge_counts),
                                    ('Drugs and genes (nodes)', node_counts)):
    print(section_title)
    for before, after in section_rows:
        print(*before)
        print(*after)
        print('\n')

# Side effects (no gap after the final pair)
print('Side effects')
print('Original number of joint side effects:', orig_se_combo)
print('New number of joint side effects:', new_se_combo)
print('\n')
print('Original number of single side effects:', orig_se_mono)
print('New number of single side effects:', new_se_mono)

Interactions (edges)
Original number of PPI interactions 715612
New number of PPI interactions 20565


Original number of DTI interactions 131034
New number of DTI interactions 2974


Original number of DDI interactions 4649441
New number of DDI interactions 97694


Original number of DSE interactions 174977
New number of DSE interactions 19701


Original number of DF interactions 7680
New number of DF interactions 7552


Drugs and genes (nodes)
Original number of drugs in DSE: 639
New number of drugs in DSE: 59


Original number drugs in DTI 1774
New number of drugs in DTI 59


Original number of genes in DTI: 7795
New number of genes in DTI: 104


Original number of genes in PPI: 19081
New number of genes in PPI: 5298


Original number of drugs in DDI: 645
New number of drugs in DDI: 59


Original number of genes in DF: 128
New number of genes in DF: 128


Original number drugs in DF 60
New number of drugs in DF 59


Side effects
Original number of joint side effects: 1317
New number

## Export to csv

In [12]:
# Write every filtered table next to the docking data, without the index column.
# Comma is already the default delimiter of `to_csv`, so it is not passed explicitly.
export_targets = (
    (PPI, 'amir_docking_data/small-decagon-ppi.csv'),
    (DTI, 'amir_docking_data/small-decagon-targets.csv'),
    (DDI, 'amir_docking_data/small-decagon-combo.csv'),
    (DSE, 'amir_docking_data/small-decagon-mono.csv'),
    (DF, 'amir_docking_data/small-decagon-docking.csv'),
)
for filtered_table, out_path in export_targets:
    filtered_table.to_csv(out_path, index=False)