# Remove Outliers
This notebook filters the original databases used in DECAGON plus the protein feature databases to remove any unlinked node in the network. In short, it keeps only the elements present in all the databases so that the output database is consistent.<br>
This code is the adaptation in `pandas` of the script `remove_outliers.sh`.

Author: Juan Sebastian Diaz Boada, May 2020

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Only for Python 2
from __future__ import print_function

import numpy as np
import pandas as pd

Import DECAGON Data as `pandas` dataframes

In [15]:
# The four DECAGON tables are comma-separated
PPI, DTI, DDI, DSE = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'orig_data/bio-decagon-ppi.csv',
        'orig_data/bio-decagon-targets-all.csv',
        'orig_data/bio-decagon-combo.csv',
        'orig_data/bio-decagon-mono.csv',
    )
)
# The protein feature table is semicolon-separated
PF = pd.read_csv('orig_data/proteins.csv', sep=';')

In [16]:
# Strip the thousands separator from 'Mass' and store the column as integers.
# Casting to str first keeps this cell re-runnable: after one execution the
# column already holds ints, which have no `.replace` method.
PF['Mass'] = (
    PF['Mass']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .astype('int')
)

## Genes
Form arrays of unique genes in every dataset. Print their lengths

In [17]:
# Row counts of every table before any filtering
orig_ppi, orig_dti, orig_pf, orig_ddi, orig_dse = (
    len(table.index) for table in (PPI, DTI, PF, DDI, DSE)
)

# Unique genes found on either side of a protein-protein interaction
ppi_gene_pairs = PPI[["Gene 1", "Gene 2"]].values
PPI_genes = pd.unique(ppi_gene_pairs.ravel())
orig_genes_ppi = len(PPI_genes)

# Genes listed in the protein feature table (taken as-is, not de-duplicated)
PF_genes = np.array(PF['GeneID'].tolist())
orig_genes_pf = len(PF_genes)

# Unique genes that appear in the drug-target table
DTI_genes = pd.unique(DTI["Gene"].values)
orig_genes_dti = len(DTI_genes)

Calculate the intersection of the 3 datasets (i.e. find the genes common to all 3 datasets)

In [18]:
# Genes shared by PPI and PF first, then narrowed down to those also in DTI
ppi_pf_genes = np.intersect1d(PPI_genes, PF_genes)
inter_genes = np.intersect1d(DTI_genes, ppi_pf_genes)

Form new datasets with the common genes of the 3 datasets

In [19]:
# Keep only the interactions whose two endpoints are both common genes
both_common = PPI['Gene 1'].isin(inter_genes) & PPI['Gene 2'].isin(inter_genes)
PPI = PPI[both_common]

# Some genes in PPI that are common to all 3 datasets may only interact with genes
# that are non-common (outsiders), so all of their rows were just dropped. That is
# why the gene list is re-derived from the filtered PPI and used for a second pass
# over the other two tables.
# The IDs keep their native dtype: the former cast to `numpy.string_` (an alias
# removed in NumPy 2.0) produced byte strings, which do not match non-bytes IDs
# in `isin` on Python 3.
# NOTE(review): assumes gene IDs share one dtype across PPI, DTI and PF - confirm.
PPI_genes = pd.unique(PPI[["Gene 1", "Gene 2"]].values.ravel())
DTI = DTI[DTI['Gene'].isin(PPI_genes)]
PF = PF[PF['GeneID'].isin(PPI_genes)]

# Unique gene counts after filtering
new_genes_ppi = len(PPI_genes)
new_genes_pf = len(pd.unique(PF['GeneID'].values))
new_genes_dti = len(pd.unique(DTI['Gene'].values))

# Row counts after filtering; DTI is filtered again by drug later, hence 'mid'
new_ppi = len(PPI.index)
mid_dti = len(DTI.index)
new_pf = len(PF.index)

## Drugs
Form arrays of unique drugs in every dataset. Print their lengths

In [20]:
# DTI drugs
def _unique_array(values):
    """Return the distinct entries of `values` as a newly built NumPy array."""
    return np.array(pd.unique(values).tolist())


# Drugs in the drug-target table (DTI as it stands when this cell runs)
DTI_drugs = _unique_array(DTI["STITCH"].values)
# Drugs on either side of a drug-drug interaction
DDI_drugs = _unique_array(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
# Drugs that have at least one individual side effect
DSE_drugs = _unique_array(DSE['STITCH'].values)

orig_drugs_dti = len(DTI_drugs)
orig_drugs_ddi = len(DDI_drugs)
orig_drugs_dse = len(DSE_drugs)

Create new rows in DSE for drugs in DDI that are absent from DSE. Add the value 'no_se' as their side effect.<br>
(This is done to avoid ambiguity between the number of drugs in DDI and DSE. The DECAGON code assigns a vector of zeros to these drugs anyway.)

In [21]:
# Drugs present in DDI that have no individual side effect record (outliers)
no_feat = np.setdiff1d(DDI_drugs, DSE_drugs)
n_nf = no_feat.shape[0]  # number of outliers

# One placeholder row per outlier, marked with the 'no_se' side effect
se_outliers = {
    'STITCH': no_feat,
    'Individual Side Effect': n_nf * ['no_se'],
    'Side Effect Name': n_nf * ['no_se'],
}
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` takes the same
# `sort` / `ignore_index` options and yields the same frame.
DSE = pd.concat([DSE, pd.DataFrame(se_outliers)], sort=False, ignore_index=True)

# Drugs with single side effects, now including the placeholder rows.
# `np.array` is spelled out instead of the bare `array` injected by `%pylab`.
DSE_drugs = np.array(pd.unique(DSE['STITCH'].values).tolist())

Calculate the intersection of the 3 datasets (i.e. the drugs common to all 3 datasets)

In [22]:
# Drugs found in both the drug-target and the drug-drug tables
inter_drugs = np.intersect1d(DTI_drugs, DDI_drugs)
# intersect1d returns a flat array, so its size is the number of common drugs
n_drugs = inter_drugs.size

Form new datasets with the common drugs of the 3 datasets

In [23]:

# A drug-drug interaction is kept only when BOTH drugs are common drugs.
# Without this filter DDI is exported untouched and `new_ddi` equals `orig_ddi`.
both_common_drugs = (DDI['STITCH 1'].isin(inter_drugs)
                     & DDI['STITCH 2'].isin(inter_drugs))
DDI = DDI[both_common_drugs]
DTI = DTI[DTI['STITCH'].isin(inter_drugs)]
DSE = DSE[DSE['STITCH'].isin(inter_drugs)]
# NOTE(review): a common drug whose DDI partners are all non-common drops out of
# DDI but stays in DTI/DSE. The gene section handles the analogous case with a
# second pass; consider whether the same is wanted here.

# Unique drug counts after filtering
new_drugs_ddi = len(pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel()))
new_drugs_dti = len(pd.unique(DTI['STITCH'].values))
new_drugs_dse = len(pd.unique(DSE['STITCH'].values))

# Row counts after filtering
new_ddi = len(DDI.index)
new_dti = len(DTI.index)
new_dse = len(DSE.index)

In [24]:
# Each inner tuple is one paragraph of the report: (label, value) pairs that are
# printed one per line. Paragraphs are separated by print('\n').
summary = (
    (
        ('Original number of PPI interactions', orig_ppi),
        ('New number of PPI interactions', new_ppi),
    ),
    (
        ('Original number of DTI interactions', orig_dti),
        ('Number of DTI after gene outlier removal', mid_dti),
        ('New number of DTI interactions', new_dti),
    ),
    (
        ('Original number of DDI interactions', orig_ddi),
        ('New number of DDI interactions', new_ddi),
    ),
    (
        ('Original number of Protein Features', orig_pf),
        ('New number of Protein Features', new_pf),
    ),
    (
        ('Original number of single side effect interactions', orig_dse),
        ('New number of single side effect interactions', new_dse),
    ),
    (
        ("Original number of unique genes in PPI:", orig_genes_ppi),
        ("New number of unique genes in PPI:", new_genes_ppi),
        ("Original number of genes whose proteins have features:", orig_genes_pf),
        ("New number of genes whose proteins have features:", new_genes_pf),
        ("Original number of unique genes in DTI", orig_genes_dti),
        ("New number of unique genes in DTI", new_genes_dti),
    ),
    (
        ("Original number of unique drugs in DTI", orig_drugs_dti),
        ("New number of unique drugs in DTI", new_drugs_dti),
        ("Original number of unique drugs in DDI:", orig_drugs_ddi),
        ("New number of unique drugs in DDI:", new_drugs_ddi),
        ("Original number of drugs with single side effects:", orig_drugs_dse),
        ("New number of drugs with single side effects:", new_drugs_dse),
        ('New number of drugs:', n_drugs),
    ),
)

for position, paragraph in enumerate(summary):
    # Separator goes between paragraphs only, never before the first one
    if position:
        print('\n')
    for label, count in paragraph:
        print(label, count)


Original number of PPI interactions 715612
New number of PPI interactions 271197


Original number of DTI interactions 131034
Number of DTI after gene outlier removal 127755
New number of DTI interactions 18253


Original number of DDI interactions 4649441
New number of DDI interactions 1208279


Original number of Protein Features 18991
New number of Protein Features 7586


Original number of single side effect interactions 174977
New number of single side effect interactions 81286


Original number of unique genes in PPI: 19081
New number of unique genes in PPI: 7586
Original number of genes whose proteins have features: 18991
New number of genes whose proteins have features: 7586
Original number of unique genes in DTI 7795
New number of unique genes in DTI 7586


Original number of unique drugs in DTI 1774
New number of unique drugs in DTI 284
Original number of unique drugs in DDI: 645
New number of unique drugs in DDI: 283
Original number of drugs with single side effects: 639
New

## Export to csv

In [None]:
# Write every filtered table to its own comma-separated file, without the index
exports = (
    (PPI, './modif_data/new-decagon-ppi.csv'),
    (DTI, './modif_data/new-decagon-targets.csv'),
    (DDI, './modif_data/new-decagon-combo.csv'),
    (DSE, './modif_data/new-decagon-mono.csv'),
    (PF, './modif_data/new-decagon-genes.csv'),
)
for table, out_path in exports:
    table.to_csv(out_path, index=False, sep=',')