In [2]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.5


In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# Raw DrugBank drug-drug interaction table. The file carries a .csv
# extension but is parsed as tab-delimited.
ddi_fp = "drugbank.csv"
ddi = pd.read_csv(ddi_fp, delimiter='\t')

def compute_features(smiles):
    """Compute five RDKit molecular descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    list or None
        ``[MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA]`` in that order,
        or ``None`` when ``smiles`` is not a non-blank string, when RDKit
        cannot build a molecule from it, or when a descriptor call raises.
    """
    # Missing values (e.g. NaN produced by pandas) and blank strings are not
    # molecules; reject them up front instead of handing them to RDKit and
    # relying on the exception handler below.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        # No molecule object means the SMILES could not be parsed.
        if mol is None:
            return None

        # Order matters: callers map these positions onto named columns.
        return [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.TPSA(mol),
        ]

    except Exception as e:
        # Best-effort featurisation: report the problem and let the caller
        # drop the row rather than aborting the whole run.
        print(f"Error computing features for {smiles}: {e}")
        return None  # Skipping invalid SMILES

# Compute descriptors once per distinct SMILES string rather than once per
# row: the same molecule appears in many interaction pairs, so this avoids
# re-parsing it (and re-printing the same parse error) for every occurrence.
unique_smiles = pd.unique(pd.concat([ddi['X1'], ddi['X2']], ignore_index=True))
descriptor_lookup = {smiles: compute_features(smiles) for smiles in unique_smiles}

# Attach the descriptor list for each drug of the pair. A failed lookup or a
# molecule that could not be featurised yields None.
ddi['features_X1'] = ddi['X1'].map(descriptor_lookup.get)
ddi['features_X2'] = ddi['X2'].map(descriptor_lookup.get)

# Drop pairs where either molecule has no descriptors.
ddi = ddi.dropna(subset=['features_X1', 'features_X2'])

# Expand the per-row descriptor lists into one named column per descriptor.
descriptor_names = ['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA']
features_X1_df = pd.DataFrame(
    ddi['features_X1'].tolist(),
    columns=[f'{name}_X1' for name in descriptor_names],
)
features_X2_df = pd.DataFrame(
    ddi['features_X2'].tolist(),
    columns=[f'{name}_X2' for name in descriptor_names],
)

# The list-valued helper columns are no longer needed.
ddi = ddi.drop(columns=['features_X1', 'features_X2'])

# Concatenate the original dataset with the descriptor columns. The descriptor
# frames carry a fresh 0..n-1 index, so the filtered table's index is reset
# first to keep rows aligned.
ddi_combined = pd.concat([ddi.reset_index(drop=True), features_X1_df, features_X2_df], axis=1)
assert len(ddi_combined) == len(ddi), "row misalignment while attaching descriptors"

ddi_combined.to_csv("drugbank_with_descriptors.csv", sep='\t', index=False)
print(f"Saved {len(ddi_combined)} valid entries.")
print(ddi_combined.head())


[17:02:47] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[17:02:47] SMILES Parse Error: check for mistakes around position 76:
[17:02:47] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[17:02:47] ~~~~~~~~~~~~~~~~~~~~^
[17:02:47] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'
[17:04:26] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[17:04:26] SMILES Parse Error: check for mistakes around position 76:
[17:04:26] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[17:04:26] ~~~~~~~~~~~~~~~~~~~~

Saved 191797 valid entries.
       ID1      ID2  Y                                                Map  \
0  DB04571  DB00460  1  #Drug1 may increase the photosensitizing activ...   
1  DB00855  DB00460  1  #Drug1 may increase the photosensitizing activ...   
2  DB09536  DB00460  1  #Drug1 may increase the photosensitizing activ...   
3  DB01600  DB00460  1  #Drug1 may increase the photosensitizing activ...   
4  DB09000  DB00460  1  #Drug1 may increase the photosensitizing activ...   

                                           X1  \
0         CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1   
1                             NCC(=O)CCC(O)=O   
2                                    O=[Ti]=O   
3       CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1   
4  CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N   

                                                  X2  MolWt_X1  LogP_X1  \
0  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...   228.247  3.46446   
1  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...   131.131 -

In [7]:
pip install torch torch_geometric pandas scikit-learn


Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data

# Load the descriptor-augmented interaction table (tab-delimited).
ddi_fp = "drugbank_with_descriptors.csv"
ddi = pd.read_csv(ddi_fp, sep='\t')

# Map every distinct interaction description in the 'Map' column to an
# integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(ddi['Map'])
ddi['category_encoded'] = label_encoder.transform(ddi['Map'])

# Model inputs: the five descriptors of drug X1 followed by the same five
# descriptors of drug X2.
feature_columns = [
    f"{descriptor}_{drug}"
    for drug in ('X1', 'X2')
    for descriptor in ('MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'TPSA')
]
X = ddi[feature_columns].to_numpy()
y = ddi['category_encoded'].to_numpy()

# Hold out 20% of the pairs for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#Using PyTorch Geometric Graph Format
def create_graph_features(features, labels):
    """Wrap every sample as a single-node PyTorch Geometric graph.

    Parameters
    ----------
    features : sequence of feature vectors
        One vector per sample (assumed 1-D); it becomes the sole node's
        feature row.
    labels : sequence of int
        Class id for each sample, in the same order as ``features``.

    Returns
    -------
    list of Data
        One graph per sample with ``x`` (the feature vector with a leading
        node axis of length 1), ``y`` (length-1 long tensor) and
        ``edge_index`` (a single self-loop on node 0).
    """
    graphs = []
    for i in range(len(features)):
        x = torch.tensor(features[i], dtype=torch.float).unsqueeze(0)
        y = torch.tensor([labels[i]], dtype=torch.long)
        # Each graph has exactly one node (index 0), so the only valid edge is
        # the self-loop 0 -> 0. Building the edge list from the number of
        # samples would reference nodes that do not exist in the graph and
        # would copy an edge list as long as the whole dataset into every
        # batch element.
        edge_index = torch.zeros((2, 1), dtype=torch.long)
        graphs.append(Data(x=x, edge_index=edge_index, y=y))
    return graphs

# Number of graphs per mini-batch for both loaders.
batch_size = 32

# Convert each split into a list of graph objects.
train_graphs = create_graph_features(features=X_train, labels=y_train)
test_graphs = create_graph_features(features=X_test, labels=y_test)

# Mini-batch loaders: only the training loader is shuffled.
train_loader = DataLoader(
    train_graphs,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_graphs,
    batch_size=batch_size,
    shuffle=False,
)

#Creating GNN Model
class GNN(torch.nn.Module):
    """Two-layer graph convolutional network that scores every node.

    Parameters
    ----------
    input_dim : int
        Number of features per node.
    hidden_dim : int
        Width of the hidden representation after the first convolution.
    output_dim : int
        Number of classes, i.e. the width of the output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return per-node class log-probabilities for ``data``.

        ``data`` must expose ``x`` (node features) and ``edge_index``. The
        result has one row per node and ``output_dim`` columns. Because the
        rows are already log-softmax normalised, they are log-probabilities
        rather than raw logits.
        """
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

#Initialising the Model
# Seed torch's global RNG before any weights are created so that weight
# initialisation (and loader shuffling that draws from the same RNG) is
# repeatable across runs; 42 matches the seed used for the train/test split.
torch.manual_seed(42)

input_dim = len(feature_columns)   # one input per descriptor column
hidden_dim = 16
output_dim = len(np.unique(y))     # one output per distinct class id

model = GNN(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# GNN.forward already returns log-probabilities (log_softmax), so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a second time,
# which is redundant on already-normalised inputs.
criterion = torch.nn.NLLLoss()

#Training the GNN Model
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer`` and ``criterion``.

    Returns
    -------
    float
        Sum of the per-batch losses divided by the number of batches.
    """
    cpu = torch.device("cpu")
    model.train()

    batch_losses = []
    for batch in train_loader:
        batch = batch.to(cpu)  # keep the batch on the same device as the model
        optimizer.zero_grad()
        predictions = model(batch)
        batch_loss = criterion(predictions.squeeze(1), batch.y)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_loader)

#Evaluating the Model
def test(loader):
    """Return the module-level ``model``'s accuracy over ``loader``.

    Accuracy is the number of predictions whose arg-max class equals the
    target, divided by the number of targets seen. Gradients are disabled
    during the pass.
    """
    cpu = torch.device("cpu")
    model.eval()

    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(cpu)  # keep the batch on the same device as the model
            predicted = model(batch).argmax(dim=1)
            n_correct += (predicted == batch.y).sum().item()
            n_seen += batch.y.size(0)

    return n_correct / n_seen

#Running Training
# Number of full passes over the training loader.
epochs = 20

for epoch in range(epochs):
    # One training pass, then an accuracy check on the held-out loader.
    loss, acc = train(), test(test_loader)
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Test Accuracy={acc:.4f}")

# Persist only the learned parameters (state dict) for later reuse.
torch.save(model.state_dict(), "gnn_model.pth")

Epoch 1: Loss=2.7171, Test Accuracy=0.3158
Epoch 2: Loss=2.4995, Test Accuracy=0.3177
Epoch 3: Loss=2.5214, Test Accuracy=0.3177
Epoch 4: Loss=2.5216, Test Accuracy=0.3177
Epoch 5: Loss=2.5214, Test Accuracy=0.3177
