# Remove Outliers
This notebook filters the original databases used in DECAGON plus the protein feature databases to remove any unlinked node in the network. In short, it keeps only the elements present in all the databases so that the output database is consistent.<br>
This code is the adaptation in `pandas` of the script `remove_outliers.sh`.

Author: Juan Sebastian Diaz Boada, May 2020

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
# Only for Python 2
from __future__ import print_function

Import DECAGON Data as `pandas` dataframes

In [3]:
# Load the five source tables. Every file is comma-separated (the pandas
# default) except the protein feature table, which is ';'-delimited.
PPI = pd.read_csv('orig_data/bio-decagon-ppi.csv')          # protein-protein interactions
DTI = pd.read_csv('orig_data/bio-decagon-targets-all.csv')  # drug-target interactions
DDI = pd.read_csv('orig_data/bio-decagon-combo.csv')        # drug-drug interactions
DSE = pd.read_csv('orig_data/bio-decagon-mono.csv')         # single-drug side effects
PF = pd.read_csv('orig_data/proteins.csv', sep=';')         # protein features

In [4]:
# 'Mass' is read as text because it carries a thousands separator
# (e.g. "12,345"); strip the commas and convert the column to integers.
mass_without_commas = PF['Mass'].str.replace(',', '', regex=False)
PF['Mass'] = mass_without_commas.astype('int')

## Genes
Form arrays of unique genes in every dataset and record their lengths (they are printed in the summary further below).

In [5]:
# Row counts of the tables as loaded, kept for the summary printed later.
orig_ppi, orig_dti, orig_pf = len(PPI), len(DTI), len(PF)
orig_ddi, orig_dse = len(DDI), len(DSE)

# PPI genes: flatten both endpoint columns into one array, then de-duplicate.
col_genes = PPI[["Gene 1", "Gene 2"]].values.ravel()
PPI_genes = pd.unique(col_genes)
orig_genes_ppi = PPI_genes.shape[0]

# PF genes: taken as listed in 'GeneID' (no de-duplication is applied here),
# so this count equals the number of rows in PF.
PF_genes = np.array(PF['GeneID'].tolist())
orig_genes_pf = PF_genes.shape[0]

# DTI genes: distinct values of the 'Gene' column.
DTI_genes = DTI["Gene"].unique()
orig_genes_dti = DTI_genes.shape[0]

Calculate the intersection of the 3 datasets (i.e. find the common genes of the 3 datasets)

In [6]:
# Genes present in PPI, PF and DTI at the same time. np.intersect1d returns
# the sorted, de-duplicated common values, so the order of the pairwise
# intersections does not matter.
genes_ppi_pf = np.intersect1d(PPI_genes, PF_genes)
inter_genes = np.intersect1d(DTI_genes, genes_ppi_pf)
n_genes = inter_genes.shape[0]

Form new datasets with the common genes of the 3 datasets

In [7]:
# Keep only DTI and PF rows whose gene belongs to the common gene set.
dti_gene_kept = DTI['Gene'].isin(inter_genes)
pf_gene_kept = PF['GeneID'].isin(inter_genes)
DTI = DTI[dti_gene_kept]
PF = PF[pf_gene_kept]

# A PPI edge is kept when AT LEAST ONE of its two genes is in the common set.
# NOTE(review): with this "or", kept edges can still reference a gene that is
# outside inter_genes - confirm this is intended rather than requiring both.
ppi_edge_kept = PPI['Gene 1'].isin(inter_genes) | PPI['Gene 2'].isin(inter_genes)
PPI = PPI[ppi_edge_kept]

# Row counts after gene filtering (DTI is filtered again by drug later on,
# hence "mid").
new_ppi, mid_dti, new_pf = len(PPI), len(DTI), len(PF)

## Drugs
Form arrays of unique drugs in every dataset and record their lengths (they are printed in the summary further below).

In [8]:
# Distinct drug identifiers per table. Each result goes through a Python list
# before being wrapped in np.array so that NumPy infers the array dtype from
# the values themselves.

# Drugs that still have a target after the gene filtering of DTI.
DTI_drugs = np.array(DTI["STITCH"].unique().tolist())
orig_drugs_dti = DTI_drugs.shape[0]

# Drugs taking part in any drug-drug interaction (either column).
col_drugs = DDI[["STITCH 1", "STITCH 2"]].values.ravel()
DDI_drugs = np.array(pd.unique(col_drugs).tolist())
orig_drugs_ddi = DDI_drugs.shape[0]

# Drugs that have at least one single-drug side effect row.
DSE_drugs = np.array(DSE['STITCH'].unique().tolist())
orig_drugs_dse = DSE_drugs.shape[0]

Create new rows in DSE for drugs in DDI absent in DSE. Add value 'no_se' in side effects.<br>
(This is done to avoid ambiguity with the number of drugs in DDI and DSE. The code of DECAGON adds a vector of zero to these drugs anyway)

In [9]:
# Drugs that appear in DDI but have no row in DSE (the "outliers").
no_feat = np.setdiff1d(DDI_drugs, DSE_drugs)
n_nf = no_feat.shape[0]  # Number of outliers

# One placeholder row per outlier drug, with 'no_se' in both side effect
# columns, so that every DDI drug is represented in DSE.
se_outliers = {'STITCH': no_feat,
               'Individual Side Effect': n_nf*['no_se'],
               'Side Effect Name': n_nf*['no_se']}

# pd.concat is used instead of DataFrame.append: append was deprecated in
# pandas 1.4 and removed in pandas 2.0, so the old call fails on current
# pandas. The arguments keep the same meaning (renumbered index, no column
# sorting).
DSE = pd.concat([DSE, pd.DataFrame(se_outliers)], sort=False, ignore_index=True)

# Refresh the list of drugs in DSE now that the placeholders are included.
# np.array is spelled out explicitly instead of the bare `array` name, which
# only exists because of the `%pylab` star-import.
DSE_drugs = np.array(pd.unique(DSE['STITCH'].values).tolist())

Calculate the intersection of the drug sets (i.e. the drugs common to DTI and DDI; after the previous step DSE already contains every drug in DDI)

In [10]:
# Drugs that have both a known target (DTI) and a drug-drug interaction (DDI).
# np.intersect1d returns them sorted and without duplicates.
inter_drugs = np.intersect1d(DTI_drugs, DDI_drugs)
n_drugs = inter_drugs.shape[0]

Form new datasets with the common drugs of the 3 datasets

In [11]:
# Keep only DTI and DSE rows whose drug belongs to the common drug set.
DTI = DTI[DTI['STITCH'].isin(inter_drugs)]
DSE = DSE[DSE['STITCH'].isin(inter_drugs)]

# A DDI row is kept when AT LEAST ONE of its two drugs is in the common set.
# np.logical_or is qualified explicitly (as in the PPI gene filter) instead of
# the bare `logical_or` name, which only exists because of the `%pylab`
# star-import.
# NOTE(review): with this "or", kept rows can still reference a drug that is
# outside inter_drugs - confirm this is intended rather than requiring both.
DDI = DDI[np.logical_or(DDI['STITCH 1'].isin(inter_drugs).values,
                        DDI['STITCH 2'].isin(inter_drugs).values)]

# Final row counts, used by the summary below.
new_ddi = len(DDI.index)
new_dti = len(DTI.index)
new_dse = len(DSE.index)

In [12]:
# Before/after summary, organised as groups of (label, value) pairs.
# Each group is printed as consecutive lines; groups are separated by
# print('\n'), with no separator after the last group.
summary_groups = [
    [('Original number of PPI interactions', orig_ppi),
     ('New number of PPI interactions', new_ppi)],
    [('Original number of DTI interactions', orig_dti),
     ('Number of DTI after gene outlier removal', mid_dti),
     ('New number of DTI interactions', new_dti)],
    [('Original number of DDI interactions', orig_ddi),
     ('New number of DDI interactions', new_ddi)],
    [('Original number of Protein Features', orig_pf),
     ('New number of Protein Features', new_pf)],
    [('Original number of single side effect interactions', orig_dse),
     ('New number of single side effect interactions', new_dse)],
    [("Original number of unique genes in PPI:", orig_genes_ppi),
     ("Original number of genes whose proteins have features:", orig_genes_pf),
     ("Original number of unique genes in DTI", orig_genes_dti),
     ('New number of genes:', n_genes)],
    [("Total number of unique drugs in DTI", orig_drugs_dti),
     ("Total number of unique drugs in DDI:", orig_drugs_ddi),
     ("Total number of drugs with single side effects:", orig_drugs_dse),
     ('New number of drugs:', n_drugs)],
]

for group_idx, group in enumerate(summary_groups):
    if group_idx > 0:
        print('\n')
    for label, value in group:
        print(label, value)


Original number of PPI interactions 715612
New number of PPI interactions 581429


Original number of DTI interactions 131034
Number of DTI after gene outlier removal 128149
New number of DTI interactions 18293


Original number of DDI interactions 4649441
New number of DDI interactions 3504271


Original number of Protein Features 18991
New number of Protein Features 7628


Original number of single side effect interactions 174977
New number of single side effect interactions 81286


Original number of unique genes in PPI: 19081
Original number of genes whose proteins have features: 18991
Original number of unique genes in DTI 7795
New number of genes: 7628


Total number of unique drugs in DTI 1774
Total number of unique drugs in DDI: 645
Total number of drugs with single side effects: 639
New number of drugs: 284


## Export to csv

In [13]:
# Write every filtered table as a comma-separated file without the index
# column, in the same order as before: PPI, DTI, DDI, DSE, PF.
exports = [
    (PPI, './modif_data/new-decagon-ppi.csv'),
    (DTI, './modif_data/new-decagon-targets.csv'),
    (DDI, './modif_data/new-decagon-combo.csv'),
    (DSE, './modif_data/new-decagon-mono.csv'),
    (PF, './modif_data/new-decagon-genes.csv'),
]
for table, out_path in exports:
    table.to_csv(out_path, index=False, sep=',')