## Adjust DDI

In [3]:
# Move the kernel's working directory up one level so that the relative
# 'data/...' paths used by later cells resolve.
# NOTE: this cell is not idempotent -- every re-run moves up one more level.
# Run it exactly once per kernel session.
%cd ..
# Alternative location kept for reference (presumably a different machine/mount -- confirm):
# %cd media/capstone-project

D:\OneDrive\w210\capstone-project


In [1]:
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import itertools
import re

%matplotlib inline
sns.set_theme(color_codes=True)

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [4]:
# Load the training interactions; the relative path is resolved against the
# current working directory.
raw_train = pd.read_csv('data/archive/ddi_training.csv')

# Ordered key: keeps the d1 -> d2 direction, e.g. "['DB04571', 'DB00460']".
# Built with zip instead of a row-wise apply(axis=1): same strings, far less
# per-row overhead, and it also works on an empty frame.
raw_train['d1_d2'] = [
    str([d1, d2]) for d1, d2 in zip(raw_train['d1'], raw_train['d2'])
]

# Unordered key: identical for (A, B) and (B, A).
# The previous str(set(sorted(...))) relied on set iteration order, which for
# strings depends on per-process hash randomisation, so the text of the key
# could change between kernel restarts. The elements are now emitted in
# sorted order while keeping the same "{'A', 'B'}" shape (and "{'A'}" when
# d1 == d2), so the key is reproducible across sessions.
raw_train['drug_pair'] = [
    '{' + ', '.join(map(repr, sorted({d1, d2}))) + '}'
    for d1, d2 in zip(raw_train['d1'], raw_train['d2'])
]

print('Raw Train data dimension:', raw_train.shape)
print('Number of distinct drugs in d1:', raw_train['d1'].nunique(dropna=False))
print('Number of distinct drugs in d2:', raw_train['d2'].nunique(dropna=False))
print('Number of distinct d1-d2 sequence:', raw_train['d1_d2'].nunique(dropna=False))
print('Number of distinct drug-pair:', raw_train['drug_pair'].nunique(dropna=False))
raw_train.head()

Raw Train data dimension: (115185, 10)
Number of distinct drugs in d1: 1545
Number of distinct drugs in d2: 1553
Number of distinct d1-d2 sequence: 115064
Number of distinct drug-pair: 115044


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,d1,d2,type,split,smiles1,smiles2,d1_d2,drug_pair
0,0,0,DB04571,DB00460,74,training,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C(=C\2)/C(CCC(O)=O)=C5C)C(C=C)=C4C)C2=CC=C([C@@H](C(=O)OC)[C@@]32C)C(=O)OC)=C1C,"['DB04571', 'DB00460']","{'DB00460', 'DB04571'}"
1,1,1,DB00855,DB00460,74,training,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C(=C\2)/C(CCC(O)=O)=C5C)C(C=C)=C4C)C2=CC=C([C@@H](C(=O)OC)[C@@]32C)C(=O)OC)=C1C,"['DB00855', 'DB00460']","{'DB00460', 'DB00855'}"
2,2,2,DB09536,DB00460,74,training,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C(=C\2)/C(CCC(O)=O)=C5C)C(C=C)=C4C)C2=CC=C([C@@H](C(=O)OC)[C@@]32C)C(=O)OC)=C1C,"['DB09536', 'DB00460']","{'DB00460', 'DB09536'}"
3,7,8,DB01878,DB00460,74,training,O=C(C1=CC=CC=C1)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C(=C\2)/C(CCC(O)=O)=C5C)C(C=C)=C4C)C2=CC=C([C@@H](C(=O)OC)[C@@]32C)C(=O)OC)=C1C,"['DB01878', 'DB00460']","{'DB01878', 'DB00460'}"
4,8,9,DB00140,DB00460,74,training,CC1=C(C)C=C2N(C[C@H](O)[C@H](O)[C@H](O)CO)C3=NC(=O)NC(=O)C3=NC2=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C(=C\2)/C(CCC(O)=O)=C5C)C(C=C)=C4C)C2=CC=C([C@@H](C(=O)OC)[C@@]32C)C(=O)OC)=C1C,"['DB00140', 'DB00460']","{'DB00140', 'DB00460'}"


In [5]:
# Tally how many rows share each ordered d1-d2 sequence.
count_pair = dict(raw_train['d1_d2'].value_counts())

# Attach the tally to every row so that repeated sequences can be filtered.
raw_train['count_pair'] = raw_train['d1_d2'].map(count_pair)

# Split the sequences into those seen exactly once and those seen repeatedly.
seen_once = raw_train['count_pair'] == 1
seen_repeatedly = raw_train['count_pair'] > 1

pair_1count = len(raw_train.loc[seen_once, 'd1_d2'].unique())
pair_2count = len(raw_train.loc[seen_repeatedly, 'd1_d2'].unique())
print('Number of drug-pairs have 1 interaction:', pair_1count)
print('Number of drug-pairs have >1 interactions:', pair_2count)

# Show the repeated sequences, grouped together by sorting on the key.
raw_train.loc[seen_repeatedly].sort_values('d1_d2')

Number of drug-pairs have 1 interaction: 114943
Number of drug-pairs have >1 interactions: 121


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,d1,d2,type,split,smiles1,smiles2,d1_d2,drug_pair,count_pair
23245,38857,38858,DB00176,DB00997,9,training,COCCCCC(=NOCCN)C1=CC=C(C=C1)C(F)(F)F,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB00176', 'DB00997']","{'DB00997', 'DB00176'}",2
23244,38856,38857,DB00176,DB00997,5,training,COCCCCC(=NOCCN)C1=CC=C(C=C1)C(F)(F)F,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB00176', 'DB00997']","{'DB00997', 'DB00176'}",2
18541,31102,31103,DB00213,DB00884,12,training,COC1=C(OC)C(CS(=O)C2=NC3=C(N2)C=C(OC(F)F)C=C3)=NC=C1,OC(CC1=CN=CC=C1)(P(O)(O)=O)P(O)(O)=O,"['DB00213', 'DB00884']","{'DB00884', 'DB00213'}",2
18542,31103,31104,DB00213,DB00884,9,training,COC1=C(OC)C(CS(=O)C2=NC3=C(N2)C=C(OC(F)F)C=C3)=NC=C1,OC(CC1=CN=CC=C1)(P(O)(O)=O)P(O)(O)=O,"['DB00213', 'DB00884']","{'DB00884', 'DB00213'}",2
23250,38862,38863,DB00215,DB00997,9,training,CN(C)CCCC1(OCC2=C1C=CC(=C2)C#N)C1=CC=C(F)C=C1,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB00215', 'DB00997']","{'DB00215', 'DB00997'}",2
...,...,...,...,...,...,...,...,...,...,...,...
23189,38769,38770,DB09167,DB00997,5,training,CN(C)CC\C=C1/C2=CC=CC=C2CSC2=CC=CC=C12,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB09167', 'DB00997']","{'DB00997', 'DB09167'}",2
23334,39017,39018,DB09238,DB00997,5,training,COC(=O)C1=C(C)NC(C)=C(C1C1=CC(=CC=C1)N(=O)=O)C(=O)OCCN1CCN(CC1)C(C1=CC=CC=C1)C1=CC=CC=C1,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB09238', 'DB00997']","{'DB00997', 'DB09238'}",2
23335,39018,39019,DB09238,DB00997,9,training,COC(=O)C1=C(C)NC(C)=C(C1C1=CC(=CC=C1)N(=O)=O)C(=O)OCCN1CCN(CC1)C(C1=CC=CC=C1)C1=CC=CC=C1,COC1=CC=CC2=C1C(=O)C1=C(O)C3=C(C[C@](O)(C[C@@H]3O[C@H]3C[C@H](N)[C@H](O)[C@H](C)O3)C(=O)CO)C(O)=C1C2=O,"['DB09238', 'DB00997']","{'DB00997', 'DB09238'}",2
32213,53628,53630,DB09241,DB00327,66,training,[Cl-].CN(C)C1=CC2=[S+]C3=C(C=CC(=C3)N(C)C)N=C2C=C1,[H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H])(C4)[C@]1([H])CCC2=O,"['DB09241', 'DB00327']","{'DB00327', 'DB09241'}",2


In [None]:
# Interaction data scraped from the API (tab-separated); exact duplicate rows
# are dropped straight after loading.
raw_data = pd.read_csv('data/ddis_from_api.csv', delimiter='\t')
# Optional filter, currently disabled: drop drug pairs without SMILES data.
# raw_data = raw_data.dropna(subset=['smiles1', 'smiles2'], how='any')
raw_data = raw_data.drop_duplicates()

# Ordered key: keeps the d1 -> d2 direction.
# Built with zip instead of a row-wise apply(axis=1): same strings, far less
# per-row overhead, and it also works on an empty frame. List assignment is
# positional, so the gaps left in the index by drop_duplicates() do not matter.
raw_data['d1_d2'] = [
    str([d1, d2]) for d1, d2 in zip(raw_data['d1'], raw_data['d2'])
]

# Unordered key: identical for (A, B) and (B, A).
# The previous str(set(sorted(...))) relied on set iteration order, which for
# strings depends on per-process hash randomisation, so the text of the key
# could change between kernel restarts. The elements are now emitted in
# sorted order while keeping the same "{'A', 'B'}" shape (and "{'A'}" when
# d1 == d2), so the key is reproducible across sessions.
raw_data['drug_pair'] = [
    '{' + ', '.join(map(repr, sorted({d1, d2}))) + '}'
    for d1, d2 in zip(raw_data['d1'], raw_data['d2'])
]

print('Raw data dimension:', raw_data.shape)
print('Number of distinct drugs in d1:', raw_data['d1'].nunique(dropna=False))
print('Number of distinct drugs in d2:', raw_data['d2'].nunique(dropna=False))
print('Number of distinct d1-d2 sequence:', raw_data['d1_d2'].nunique(dropna=False))
print('Number of distinct drug-pair:', raw_data['drug_pair'].nunique(dropna=False))
raw_data.head()