# Remove Outliers MIPPIE

## Python 3

In [11]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [1]:
import pandas as pd
import numpy as np

Import DECAGON Data as `pandas` dataframes

In [2]:
PPI = pd.read_csv('mippie_ppi_v1_0.tsv',sep='\t')
DTI = pd.read_csv('../original_data/bio-decagon-targets-all.csv',sep=',')
DDI = pd.read_csv('../original_data/bio-decagon-combo.csv',sep=',')
DSE = pd.read_csv('../original_data/bio-decagon-mono.csv',sep=',')
M2H = pd.read_csv('mouse2hum.csv', sep=',')

In [3]:
PPI = PPI[['entrezA','entrezB']]

In [4]:
PPI = PPI.rename(columns={"entrezA": "Gene 1", "entrezB": "Gene 2"})

In [5]:
print(len(PPI.index))
PPI.head()

42610


Unnamed: 0,Gene 1,Gene 2
0,66445,12391
1,66445,211651
2,66445,54401
3,66445,22627
4,66445,15495


In [6]:
m2h = {gene[0]:gene[1] for _,gene in M2H.iterrows()}

In [7]:
m2h

{66445: 1537,
 69253: 3316,
 12391: 859,
 13626: 8726,
 72169: 23650,
 211651: 2177,
 21423: 6929,
 56222: 163732,
 71241: 63946,
 13716: 8178,
 117198: 10625,
 18390: 4988,
 170484: 7827,
 54401: 7529,
 30960: 9218,
 72433: 23682,
 22629: 7533,
 22627: 7531,
 101502: 80270,
 65945: 22883,
 21937: 7132,
 319740: 118813,
 67052: 10403,
 21388: 6910,
 20983: 6860,
 78891: 57410,
 98376: 92344,
 17202: 4160,
 12385: 1495,
 237523: 374462,
 18545: 126006,
 18222: 8650,
 217331: 85451,
 12550: 999,
 27221: 10036,
 14359: 8087,
 58869: 51555,
 12387: 1499,
 16480: 3728,
 11789: 324,
 20191: 6262,
 22330: 7414,
 22631: 7534,
 244585: 23322,
 11306: 22,
 14370: 8325,
 22408: 7471,
 22416: 89780,
 16006: 3484,
 16012: 3489,
 16008: 3485,
 16010: 3487,
 94221: 57120,
 20187: 6259,
 72780: 84870,
 16210: 55364,
 81879: 29842,
 19821: 6045,
 214669: 83746,
 71950: 79923,
 56353: 23429,
 52609: 23492,
 17187: 4149,
 71041: 84108,
 214133: 54790,
 52463: 80312,
 209318: 2873,
 18999: 5460,
 67588: 1

In [13]:
gene1_list = []
gene2_list = []
for _,row in PPI.iterrows():
    try:
        gene1_list.append(m2h[row[0]])
    except KeyError:
        gene1_list.append(None)
    try:
        gene2_list.append(m2h[row[1]])
    except KeyError:
        gene2_list.append(None)

In [14]:
print(len(gene1_list),len(gene2_list))

42610 42610


In [27]:
data = {'Gene1':gene1_list,'Gene2':gene2_list}
nPPI = pd.DataFrame(data=data)
nPPI = nPPI[np.logical_and(~nPPI['Gene1'].isnull(),~nPPI['Gene2'].isnull())]
nPPI['Gene1'] = nPPI['Gene1'].astype(int)
nPPI['Gene2'] = nPPI['Gene2'].astype(int)
PPI = nPPI

In [29]:
print(len(PPI.index))
PPI.head()

36303


Unnamed: 0,Gene1,Gene2
0,1537,859
1,1537,2177
2,1537,7529
3,1537,7531
5,1537,7534


In [30]:
# Original number of interactions
orig_ppi = len(PPI.index)
orig_dti = len(DTI.index)
orig_ddi = len(DDI.index)
orig_dse = len(DSE.index)

### PPI genes

In [31]:
# PPI genes
PPI_genes = pd.unique(np.hstack((PPI['Gene1'].values,PPI['Gene2'].values))) #int
orig_genes_ppi = len(PPI_genes) # Original number of genes

### Common drugs between DDI network and drug single side effects

In [32]:
# DDI drugs
DDI_drugs = pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
orig_drugs_ddi = len(DDI_drugs) # Original number of drugs
orig_se_combo = len(pd.unique(DDI['Polypharmacy Side Effect'].values)) # number of side effects
# Drugs with single side effects
DSE_drugs = pd.unique(DSE['STITCH'].values)
orig_drug_dse = len(DSE_drugs) # Original number of drugs
orig_se_mono = len(pd.unique(DSE['Side Effect Name'])) # number of side effects

In [33]:
# Calculate the instersection of the DDI and DSE
# (i.e., the drugs in the interaction network that have single side effect)
inter_drugs = np.intersect1d(DDI_drugs,DSE_drugs,assume_unique=True)
# Choose only the entries in DDI that are in the intersection
DDI = DDI[np.logical_and(DDI['STITCH 1'].isin(inter_drugs).values,
                     DDI['STITCH 2'].isin(inter_drugs).values)]
# Some drugs in DDI that are common to all 3 datasets may only interact with genes that are
# non-common (outsiders). That is why we need to filter a second time using this array.
DDI_drugs = pd.unique(DDI[["STITCH 1", "STITCH 2"]].values.ravel())
DSE = DSE[DSE['STITCH'].isin(DDI_drugs)]
new_drugs_ddi = len(pd.unique(DDI[['STITCH 1','STITCH 2']].values.ravel()))
new_drugs_dse = len(pd.unique(DSE['STITCH'].values))
new_se_combo = len(pd.unique(DDI['Polypharmacy Side Effect'].values))
new_se_mono = len(pd.unique(DSE['Side Effect Name']))

### Selection of entries of DTI database

In [34]:
orig_genes_dti = len(pd.unique(DTI['Gene'].values))
orig_drugs_dti = len(pd.unique(DTI['STITCH'].values))
DTI = DTI[np.logical_and(DTI['STITCH'].isin(DDI_drugs),DTI['Gene'].isin(PPI_genes))]
DTI_genes = pd.unique(DTI['Gene'].values)
new_genes_dti = len(DTI_genes)
new_drugs_dti = len(pd.unique(DTI['STITCH'].values))
PPI = PPI[np.logical_or(PPI['Gene1'].isin(DTI_genes),PPI['Gene2'].isin(DTI_genes))]

In [35]:
# Interactions (edges)
print('Interactions (edges)')
print ('Original number of PPI interactions',orig_ppi)
print ('New number of PPI interactions',len(PPI.index))
print('\n')
print ('Original number of DTI interactions',orig_dti)
print ('New number of DTI interactions',len(DTI.index))
print('\n')
print ('Original number of DDI interactions',orig_ddi)
print ('New number of DDI interactions', len(DDI.index))
print('\n')
print ('Original number of DSE interactions',orig_dse)
print('New number of DSE interactions',len(DSE.index))
print('\n')
# Drugs and genes (nodes)
print('Drugs and genes (nodes)')
print("Original number of drugs in DSE:",orig_drug_dse)
print("New number of drugs in DSE:",new_drugs_dse)
print('\n')
print("Original number drugs in DTI",orig_drugs_dti)
print("New number of drugs in DTI",new_drugs_dti)
print('\n')
print('Original number of genes in DTI:',orig_genes_dti)
print('New number of genes in DTI:',new_genes_dti)
print('\n')
print('Original number of genes in PPI:',orig_genes_ppi)
print('New number of genes in PPI:',orig_genes_ppi)
print('\n')
print('Original number of drugs in DDI:',orig_drugs_ddi)
print('New number of drugs in DDI:',new_drugs_ddi)
print('\n')
# Side effects
print('Side effects')
print('Original number of joint side effects:',orig_se_combo)
print('New number of joint side effects:', new_se_combo)
print('\n')
print('Original number of single side effects:', orig_se_mono)
print('New number of single side effects:', new_se_mono)

Interactions (edges)
Original number of PPI interactions 36303
New number of PPI interactions 13455


Original number of DTI interactions 131034
New number of DTI interactions 8022


Original number of DDI interactions 4649441
New number of DDI interactions 4615522


Original number of DSE interactions 174977
New number of DSE interactions 174977


Drugs and genes (nodes)
Original number of drugs in DSE: 639
New number of drugs in DSE: 639


Original number drugs in DTI 1774
New number of drugs in DTI 249


Original number of genes in DTI: 7795
New number of genes in DTI: 1895


Original number of genes in PPI: 9299
New number of genes in PPI: 9299


Original number of drugs in DDI: 645
New number of drugs in DDI: 639


Side effects
Original number of joint side effects: 1317
New number of joint side effects: 1317


Original number of single side effects: 9702
New number of single side effects: 9702


## Export to csv

In [None]:
PPI.to_csv('./clean_data/decagon-ppi.csv',index=False,sep=',')
DTI.to_csv('./clean_data/decagon-targets.csv',index=False,sep=',')
DDI.to_csv('./clean_data/decagon-combo.csv',index=False,sep=',')
DSE.to_csv('./clean_data/decagon-mono.csv',index=False,sep=',')