# Remove Outliers
This notebook filters the original databases used in DECAGON, plus the protein feature database, to remove any unlinked nodes from the network. In short, it keeps only the elements present in all the databases so that the output databases are consistent with each other.<br>
This code is a `pandas` adaptation of the script `remove_outliers.sh`.

Author: Juan Sebastian Diaz Boada, May 2020

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
# Only for Python 2
from __future__ import print_function

Import DECAGON Data as `pandas` dataframes

In [3]:
# Read the five raw tables in one pass. Each entry pairs a file with its
# delimiter; only the protein-feature file is semicolon-separated.
PPI, PF, DTI, DDI, DSE = (
    pd.read_csv(csv_path, sep=delimiter)
    for csv_path, delimiter in (
        ('original_data/bio-decagon-ppi.csv', ','),
        ('original_data/proteins.csv', ';'),
        ('original_data/bio-decagon-targets-all.csv', ','),
        ('original_data/bio-decagon-combo.csv', ','),
        ('original_data/bio-decagon-mono.csv', ','),
    )
)
# 'Mass' holds text values that use a comma as thousands separator:
# strip the commas, then cast the column to integers.
PF['Mass'] = PF['Mass'].map(lambda mass: mass.replace(',', '')).astype('int')

In [4]:
# Row count of every table before any filtering; kept as the baseline
# for the before/after report printed later in the notebook.
orig_ppi, orig_dti, orig_pf, orig_ddi, orig_dse = (
    len(table.index) for table in (PPI, DTI, PF, DDI, DSE)
)

### Common genes between PPI network and protein features

In [5]:
# Unique genes appearing at either end of a PPI edge
PPI_genes = pd.unique(PPI[["Gene 1", "Gene 2"]].values.ravel())
orig_genes_ppi = len(PPI_genes)  # Original number of unique genes in PPI
# Unique genes in the protein-feature table. The column is deduplicated so
# that this count is a number of genes (not of rows), like every other gene
# count in this notebook, and so the array is safe to use in set operations.
PF_genes = pd.unique(PF['GeneID'].values)
orig_genes_pf = len(PF_genes)  # Original number of unique genes with features

In [6]:
# Calculate the intersection of the PPI and PF genes
# (i.e., the genes in the interaction network that code proteins with features).
# Uniqueness of the inputs is not asserted here (no `assume_unique=True`):
# NumPy only guarantees a correct intersection under that flag when both
# arrays are truly duplicate-free, and letting it deduplicate costs little.
inter_genes = np.intersect1d(PPI_genes, PF_genes)

In [7]:
# Keep only the PPI edges whose two endpoints are both in the intersection
PPI = PPI[PPI['Gene 1'].isin(inter_genes) & PPI['Gene 2'].isin(inter_genes)]
# A gene in the intersection may have interacted only with genes that were
# just removed (outsiders), so it no longer appears in any edge. Recompute the
# set of surviving genes and filter the protein features a second time with it.
# The IDs keep their native dtype: casting them to `numpy.string_` (byte
# strings on Python 3, and removed in NumPy 2.0) would make `isin` compare
# bytes against the values actually stored in the columns.
PPI_genes = pd.unique(PPI[["Gene 1", "Gene 2"]].values.ravel())
PF = PF[PF['GeneID'].isin(PPI_genes)]
# Gene and row counts after filtering
new_genes_ppi = len(PPI_genes)
new_genes_pf = len(pd.unique(PF['GeneID'].values))
new_ppi = len(PPI.index)
new_pf = len(PF.index)

### Common drugs between DDI network and drug single side effects

In [8]:
# Every drug that takes part in at least one DDI pair (either column)
ddi_endpoints = DDI[["STITCH 1", "STITCH 2"]].values.ravel()
DDI_drugs = np.array(pd.unique(ddi_endpoints).tolist())
# Every drug that has at least one single side effect entry
DSE_drugs = np.array(pd.unique(DSE['STITCH'].values).tolist())
# Number of distinct drugs in each table before filtering
orig_drugs_ddi, orig_drugs_dse = len(DDI_drugs), len(DSE_drugs)

In [9]:
# Calculate the intersection of the DDI and DSE drugs
# (i.e., the drugs in the interaction network that have single side effects).
# Both inputs were built with pd.unique, so assume_unique=True is safe here.
inter_drugs = np.intersect1d(DDI_drugs, DSE_drugs, assume_unique=True)
# Keep only the DDI pairs whose two drugs are both in the intersection
DDI = DDI[DDI['STITCH 1'].isin(inter_drugs) & DDI['STITCH 2'].isin(inter_drugs)]
# A drug in the intersection may have interacted only with drugs that were
# just removed (outsiders), so it no longer appears in any pair. Recompute the
# set of surviving drugs and filter the single side effects with it.
# The IDs keep their native dtype: casting them to `numpy.string_` (byte
# strings on Python 3, and removed in NumPy 2.0) would make `isin` compare
# bytes against the values actually stored in the columns.
DDI_drugs = pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
DSE = DSE[DSE['STITCH'].isin(DDI_drugs)]
# Drug and row counts after filtering
new_drugs_ddi = len(DDI_drugs)
new_drugs_dse = len(pd.unique(DSE['STITCH'].values))
new_ddi = len(DDI.index)
new_dse = len(DSE.index)

### Selection of entries of DTI database

In [10]:
# Distinct genes and drugs referenced by DTI before filtering
orig_genes_dti, orig_drugs_dti = (
    len(pd.unique(DTI[column].values)) for column in ('Gene', 'STITCH')
)
# A drug-target row survives only if its drug is among the retained DDI drugs
# AND its gene is among the retained PPI genes.
drug_is_kept = DTI['STITCH'].isin(DDI_drugs)
gene_is_kept = DTI['Gene'].isin(PPI_genes)
DTI = DTI[drug_is_kept & gene_is_kept]
# Row, gene and drug counts after filtering
new_dti = len(DTI.index)
new_genes_dti, new_drugs_dti = (
    len(pd.unique(DTI[column].values)) for column in ('Gene', 'STITCH')
)

In [11]:
# Before/after report. Each inner list is one section of (caption, value)
# rows; sections are separated by print('\n'), with none after the last one.
report_sections = [
    [('Original number of PPI interactions', orig_ppi),
     ('New number of PPI interactions', new_ppi)],
    [('Original number of DTI interactions', orig_dti),
     ('New number of DTI interactions', new_dti)],
    [('Original number of DDI interactions', orig_ddi),
     ('New number of DDI interactions', new_ddi)],
    [('Original number of proteins with features', orig_pf),
     ('New number of proteins with features', new_pf)],
    [('Original number of single side effect interactions', orig_dse),
     ('New number of single side effect interactions', new_dse)],
    [("Original number of unique genes in PPI:", orig_genes_ppi),
     ("New number of unique genes in PPI:", new_genes_ppi),
     ("Original number of genes whose proteins have features:", orig_genes_pf),
     ("New number of genes whose proteins have features:", new_genes_pf),
     ("Original number of unique genes in DTI", orig_genes_dti),
     ("New number of unique genes in DTI", new_genes_dti)],
    [("Original number of unique drugs in DDI:", orig_drugs_ddi),
     ("New number of unique drugs in DDI:", new_drugs_ddi),
     ("Original number of drugs with single side effects:", orig_drugs_dse),
     ("New number of drugs with single side effects:", new_drugs_dse),
     ("Original number of unique drugs in DTI", orig_drugs_dti),
     ("New number of unique drugs in DTI", new_drugs_dti)],
]
for section_idx, section_rows in enumerate(report_sections):
    if section_idx:
        # Separator between consecutive sections
        print('\n')
    for caption, n_items in section_rows:
        print(caption, n_items)

Original number of PPI interactions 715612
New number of PPI interactions 693353


Original number of DTI interactions 131034
New number of DTI interactions 18291


Original number of DDI interactions 4649441
New number of DDI interactions 4615522


Original number of proteins with features 18991
New number of proteins with features 17929


Original number of single side effect interactions 174977
New number of single side effect interactions 174977


Original number of unique genes in PPI: 19081
New number of unique genes in PPI: 17929
Original number of genes whose proteins have features: 18991
New number of genes whose proteins have features: 17929
Original number of unique genes in DTI 7795
New number of unique genes in DTI 3587


Original number of unique drugs in DDI: 645
New number of unique drugs in DDI: 639
Original number of drugs with single side effects: 639
New number of drugs with single side effects: 639
Original number of unique drugs in DTI 1774
New number of unique dr

## Export to csv

In [12]:
# Write every filtered table to its comma-separated output file,
# leaving the DataFrame index out of the file.
for frame, out_path in (
    (PPI, './modif_data/new-decagon-ppi.csv'),
    (DTI, './modif_data/new-decagon-targets.csv'),
    (DDI, './modif_data/new-decagon-combo.csv'),
    (DSE, './modif_data/new-decagon-mono.csv'),
    (PF, './modif_data/new-decagon-genes.csv'),
):
    frame.to_csv(out_path, index=False, sep=',')