In [58]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.optimize as opt
import seaborn as sns
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from tabulate import tabulate


ddi_fp = "drugbank.csv"

ddi = pd.read_csv(ddi_fp, sep='\t')

kaggle_fp = "smiles.csv"

smiles = pd.read_csv(kaggle_fp, sep='\t')

drug_names_fp = "drugs.txt"

drug_names = pd.read_csv(drug_names_fp, sep='\t')

# structure and metadata


In [59]:
ddi.head()

Unnamed: 0,ID1,ID2,Y,Map,X1,X2
0,DB04571,DB00460,1,#Drug1 may increase the photosensitizing activ...,CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
1,DB00855,DB00460,1,#Drug1 may increase the photosensitizing activ...,NCC(=O)CCC(O)=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
2,DB09536,DB00460,1,#Drug1 may increase the photosensitizing activ...,O=[Ti]=O,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
3,DB01600,DB00460,1,#Drug1 may increase the photosensitizing activ...,CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...
4,DB09000,DB00460,1,#Drug1 may increase the photosensitizing activ...,CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N,COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...


In [60]:
ddi.describe()

Unnamed: 0,Y
count,191808.0
mean,49.830346
std,19.01359
min,1.0
25%,47.0
50%,49.0
75%,67.0
max,86.0


In [61]:
ddi['Y'] = ddi['Y'].astype('category')
print(ddi.dtypes)

ID1      object
ID2      object
Y      category
Map      object
X1       object
X2       object
dtype: object


In [62]:
ddi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191808 entries, 0 to 191807
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   ID1     191808 non-null  object  
 1   ID2     191808 non-null  object  
 2   Y       191808 non-null  category
 3   Map     191808 non-null  object  
 4   X1      191808 non-null  object  
 5   X2      191808 non-null  object  
dtypes: category(1), object(5)
memory usage: 7.5+ MB


In [63]:
smiles.describe()

Unnamed: 0,CCO,CHEMBL545
count,1576903,1576903
unique,1503671,1576903
top,CSCCC(NC(=O)C(CC(C)C)NC(=O)CNC(=O)C(Cc1ccccc1)...,CHEMBL17564
freq,510,1


In [64]:
smiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576903 entries, 0 to 1576902
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   CCO        1576903 non-null  object
 1   CHEMBL545  1576903 non-null  object
dtypes: object(2)
memory usage: 24.1+ MB


In [65]:
smiles.head()

Unnamed: 0,CCO,CHEMBL545
0,C,CHEMBL17564
1,CO,CHEMBL14688
2,NCCS,CHEMBL602
3,NCCN,CHEMBL816
4,CN,CHEMBL43280


In [66]:
header = ['SMILES','CHEMBL']
smiles.columns = header
smiles.to_csv("smiles_updated.csv", header=header, index=False, sep='\t')
updated_file = pd.read_csv("smiles.csv", sep='\t')
print('\nModified file:')
print(updated_file)


Modified file:
                                                       CCO      CHEMBL545
0                                                        C    CHEMBL17564
1                                                       CO    CHEMBL14688
2                                                     NCCS      CHEMBL602
3                                                     NCCN      CHEMBL816
4                                                       CN    CHEMBL43280
...                                                    ...            ...
1576898  CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...  CHEMBL1077161
1576899  CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...  CHEMBL1077162
1576900  n1(cnc2c1N=C(N)NC2=O)C1OC(COP(O)(=O)OC2C(COP(O...  CHEMBL1077165
1576901  CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...  CHEMBL1077164
1576902  CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...  CHEMBL1077163

[1576903 rows x 2 columns]


In [67]:
update_smiles = "smiles_updated.csv"

updated_smile = pd.read_csv(update_smiles, sep='\t')

In [68]:
updated_smile.head()

Unnamed: 0,SMILES,CHEMBL
0,C,CHEMBL17564
1,CO,CHEMBL14688
2,NCCS,CHEMBL602
3,NCCN,CHEMBL816
4,CN,CHEMBL43280


In [69]:
updated_smile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576903 entries, 0 to 1576902
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   SMILES  1576903 non-null  object
 1   CHEMBL  1576903 non-null  object
dtypes: object(2)
memory usage: 24.1+ MB


In [70]:
updated_smile.describe()

Unnamed: 0,SMILES,CHEMBL
count,1576903,1576903
unique,1503671,1576903
top,CSCCC(NC(=O)C(CC(C)C)NC(=O)CNC(=O)C(Cc1ccccc1)...,CHEMBL17564
freq,510,1


In [71]:
drug_names.head()

Unnamed: 0,generic_name,cns_drug,smiles
0,Abacavir,False,NC1=NC2=C(N=CN2[C@@H]2C[C@H](CO)C=C2)C(NC2CC2)=N1
1,Abarelix,False,
2,Abatacept,False,
3,Abciximab,False,
4,Abiraterone,False,CC(=O)O[C@H]1CC[C@]2(C)C3CC[C@@]4(C)C(CC=C4C4=...


In [72]:
drug_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1691 entries, 0 to 1690
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   generic_name  1691 non-null   object
 1   cns_drug      1691 non-null   bool  
 2   smiles        1497 non-null   object
dtypes: bool(1), object(2)
memory usage: 28.2+ KB


In [73]:
drug_names.describe()

Unnamed: 0,generic_name,cns_drug,smiles
count,1691,1691,1497
unique,1691,2,1495
top,Abacavir,False,NC[C@@H]1O[C@H](O[C@@H]2[C@@H](CO)O[C@@H](O[C@...
freq,1,1470,2


In [74]:
missing_ddi = ddi.isnull().sum()
missing_smiles = updated_smile.isnull().sum()
missing_drug_names = drug_names.isnull().sum()
print("\nMissing values in datasets:")
print(tabulate(pd.DataFrame({"DDI Missing": missing_ddi, "SMILES Missing": missing_smiles, "Drug Names Missing": missing_drug_names}), headers="keys"))


Missing values in datasets:
                DDI Missing    SMILES Missing    Drug Names Missing
------------  -------------  ----------------  --------------------
CHEMBL                  nan                 0                   nan
ID1                       0               nan                   nan
ID2                       0               nan                   nan
Map                       0               nan                   nan
SMILES                  nan                 0                   nan
X1                        0               nan                   nan
X2                        0               nan                   nan
Y                         0               nan                   nan
cns_drug                nan               nan                     0
generic_name            nan               nan                     0
smiles                  nan               nan                   194


In [84]:
duplicates_ddi = ddi.duplicated().sum()
duplicates_smiles = smiles.duplicated().sum()
duplicates_drug_names = drug_names.duplicated().sum()

print("\nDuplicate rows in datasets:")
print(f"DDI: {duplicates_ddi}, SMILES: {duplicates_smiles}, Drug Names: {duplicates_drug_names}")


Duplicate rows in datasets:
DDI: 0, SMILES: 0, Drug Names: 0


In [77]:
scaler = RobustScaler()
numerical_cols = ddi.select_dtypes(include='number').columns
if len(numerical_cols) > 0:
    ddi[numerical_cols] = scaler.fit_transform(ddi[numerical_cols])
else:
    print("No numerical columns to apply RobustScaler.")

No numerical columns to apply RobustScaler.


In [78]:
scaler = RobustScaler()
numerical_cols = updated_smile.select_dtypes(include='number').columns
if len(numerical_cols) > 0:
    ddi[numerical_cols] = scaler.fit_transform(updated_smile[numerical_cols])
else:
    print("No numerical columns to apply RobustScaler.")

No numerical columns to apply RobustScaler.


In [79]:
scaler = RobustScaler()
numerical_cols = drug_names.select_dtypes(include='number').columns
if len(numerical_cols) > 0:
    ddi[numerical_cols] = scaler.fit_transform(drug_names[numerical_cols])
else:
    print("No numerical columns to apply RobustScaler.")

No numerical columns to apply RobustScaler.


In [80]:
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
if len(numerical_cols) > 0:
    discretized_data = discretizer.fit_transform(ddi[numerical_cols])
    ddi[numerical_cols] = discretized_data
else:
    print("No numerical columns to apply KBinsDiscretizer.")

No numerical columns to apply KBinsDiscretizer.


In [81]:
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
if len(numerical_cols) > 0:
    discretized_data = discretizer.fit_transform(updated_smile[numerical_cols])
    ddi[numerical_cols] = discretized_data
else:
    print("No numerical columns to apply KBinsDiscretizer.")

No numerical columns to apply KBinsDiscretizer.


In [82]:
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
if len(numerical_cols) > 0:
    discretized_data = discretizer.fit_transform(drug_names[numerical_cols])
    ddi[numerical_cols] = discretized_data
else:
    print("No numerical columns to apply KBinsDiscretizer.")

No numerical columns to apply KBinsDiscretizer.


In [83]:
X = ddi.drop(columns=['target_variable']) if 'target_variable' in ddi.columns else ddi
y = ddi['target_variable'] if 'target_variable' in ddi.columns else None
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data split into training and testing sets.")

    # Verify no overlap between training and testing sets
    overlap = set(X_train.index).intersection(set(X_test.index))
    overlap_series = pd.Series(list(overlap))

    # Visualize overlap
    if len(overlap) == 0:
        print("No data leakage: Training and testing sets are distinct.")
    else:
        print(f"Data leakage detected: Overlap of {len(overlap)} samples between training and testing sets.")
        overlap_df = pd.DataFrame(list(overlap), columns=["Overlapping Indices"])
        sns.histplot(overlap_df, x="Overlapping Indices", bins=10, color='orange', kde=False, edgecolor='black')
        plt.title("Histogram of Overlapping Samples")
        plt.xlabel("Sample Indices")
        plt.ylabel("Frequency")
        plt.show()


In [85]:
X = updated_smile.drop(columns=['target_variable']) if 'target_variable' in updated_smile.columns else ddi
y = updated_smile['target_variable'] if 'target_variable' in updated_smile.columns else None
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data split into training and testing sets.")

    # Verify no overlap between training and testing sets
    overlap = set(X_train.index).intersection(set(X_test.index))
    overlap_series = pd.Series(list(overlap))

    # Visualize overlap
    if len(overlap) == 0:
        print("No data leakage: Training and testing sets are distinct.")
    else:
        print(f"Data leakage detected: Overlap of {len(overlap)} samples between training and testing sets.")
        overlap_df = pd.DataFrame(list(overlap), columns=["Overlapping Indices"])
        sns.histplot(overlap_df, x="Overlapping Indices", bins=10, color='orange', kde=False, edgecolor='black')
        plt.title("Histogram of Overlapping Samples")
        plt.xlabel("Sample Indices")
        plt.ylabel("Frequency")
        plt.show()