In [5]:
import numpy as np
import pandas as pd
from likeness import check_smiles_validity
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from ddi import process_drugbank_to_dataframe, extract_smiles_from_drugbank

In [6]:
# Shared hex colour palette; the word-cloud colour function further down samples from it.
color_palette = ['#006B3C', '#D3AF36', '#CD9B06', '#682861', '#4A265A']

## Read XML

In [None]:
# Build the `ddi` frame from the DrugBank XML with the project helper and turn its index into a column.
file_path = '../../data/drugbank/drugbank.xml'
ddi = process_drugbank_to_dataframe(file_path).reset_index()

In [5]:
# Extract the SMILES table from the same DrugBank XML and turn its index into a column.
file_path = '../../data/drugbank/drugbank.xml'
db_smiles = extract_smiles_from_drugbank(file_path).reset_index()

In [6]:
# Persist both extracted tables. to_csv is called with its defaults, so the row index is written
# as an unnamed first column (later cells drop 'Unnamed: 0' after reading CSVs back).
ddi.to_csv('../../data/ddi.csv')
db_smiles.to_csv('../../data/db_smiles.csv')

## BioBert for Labeling

In [11]:
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans

In [None]:
# Read back the interaction table exported earlier in this notebook and preview it.
ddi = pd.read_csv('../../data/ddi.csv')
ddi.head()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# NOTE(review): get_biobert_embeddings is not defined or imported anywhere in the visible part of
# this notebook; it has to exist in the kernel before this cell runs — confirm where it comes from.
embeddings = get_biobert_embeddings(ddi, device)

In [8]:
# Cluster the embeddings into three groups; random_state is fixed for the KMeans run.
n_clusters = 3 
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings)

In [9]:
# Names assigned to the KMeans cluster ids.
# NOTE(review): KMeans numbers its clusters arbitrarily, so this id -> severity assignment is
# presumably based on manual inspection of the clusters — confirm, and re-check if the clustering changes.
interaction_types = {0: "major", 1: "moderate", 2: "minor"} 

In [10]:
# Store the numeric cluster id and its name (looked up in interaction_types) on every row.
ddi['labels'] = labels
ddi['label_meaning'] = [interaction_types[i] for i in labels]

In [11]:
# Save the labelled table; the next section reads this file back.
ddi.to_csv('../../data/ddi_labels.csv')

## Preprocess Labeled Data

In [8]:
from likeness import check_smiles_validity
from wordcloud import WordCloud

In [None]:
# Load the labelled pairs and discard the 'Unnamed: 0' and 'primary_description' columns if present.
ddi = pd.read_csv('../../data/ddi_labels.csv').drop(
    columns=['Unnamed: 0', 'primary_description'], errors='ignore'
)
ddi.head()

In [None]:
# Display order and colour for each severity label.
order = ['minor', 'moderate', 'major']
color_mapping = {'minor': '#006B3C', 'moderate': '#D3AF36', 'major': '#682861'}

# Rows per label, sorted in the display order above via an ordered categorical.
label_counts = ddi['label_meaning'].value_counts().reset_index()
label_counts.columns = ['Label', 'Frequency']
label_counts['Label'] = pd.Categorical(label_counts['Label'], categories=order, ordered=True)
label_counts = label_counts.sort_values(by='Label')

bar_colors = [color_mapping[label] for label in label_counts['Label']]

fig = go.Figure(
    data=[
        go.Bar(
            x=label_counts['Label'],
            y=label_counts['Frequency'],
            marker=dict(color=bar_colors, line=dict(width=1.5)),
            name="Labels",
        )
    ]
)
fig.update_layout(
    title='Label Counts',
    xaxis_title='Label',
    yaxis_title='Frequency',
    template='plotly_white',
    legend=dict(title='Labels'),
    xaxis=dict(tickangle=45),
    transition=dict(duration=500)
)
fig.write_html("plots/label_bar_chart.html")
fig.show()

In [None]:

def color_func(*args, **kwargs):
    """Return a random colour from the shared palette; the arguments passed by WordCloud are ignored."""
    return color_palette[np.random.randint(len(color_palette))]


# One long text per label: all interaction descriptions of that label joined by spaces.
grouped = ddi.groupby('label_meaning')['interaction_description'].apply(' '.join).reset_index()

fig = go.Figure()

# One image trace per label; only the 'minor' cloud is visible initially.
for label, text in zip(grouped['label_meaning'], grouped['interaction_description']):
    cloud = WordCloud(
        width=1600,
        height=800,
        background_color='white',
        color_func=color_func
    ).generate(text)
    fig.add_trace(
        go.Image(z=cloud.to_array(), name=label, visible=(label == 'minor'), hoverinfo='skip')
    )

# (dropdown entry, figure title) pairs; each button shows exactly the trace of its label.
menu_entries = [
    ("minor", "Word Cloud for Label: Minor"),
    ("moderate", "Word Cloud for Label: Moderate"),
    ("major", "Word Cloud for Label: Major"),
]
trace_labels = list(grouped['label_meaning'])
buttons = [
    dict(
        label=entry,
        method="update",
        args=[{"visible": [name == entry for name in trace_labels]},
              {"title": {"text": title_text, "x": 0.5}}],
    )
    for entry, title_text in menu_entries
]

fig.update_layout(
    updatemenus=[dict(type="dropdown", showactive=True, buttons=buttons)],
    title={"text": "Word Cloud for Label: Minor", "x": 0.5},
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    template='plotly_white',
    margin=dict(l=20, r=20, t=40, b=20)
)

fig.write_html("plots/wordcloud_dropdown.html")
fig.show()

In [None]:
# Drop duplicated pairs
# A-B and its mirror B-A describe the same pair, so duplicates are detected on an order-independent
# key (the two IDs sorted within each row). The key lives in its own frame: overwriting only the two
# ID columns with their sorted values would leave the other per-drug columns (e.g. primary_name /
# interacting_drug_name, which later cells read) attached to the other drug's ID.
print(f'Shape before removing duplicates: {ddi.shape}')
pair_key = pd.DataFrame(
    np.sort(ddi[['primary_id', 'interacting_drug_id']].to_numpy(), axis=1),
    index=ddi.index,
)
# Keep the first occurrence of every unordered pair, as drop_duplicates does by default.
ddi = ddi.loc[~pair_key.duplicated()].copy()
print(f'Shape after removing duplicates: {ddi.shape}')

In [None]:
# Load the SMILES table and discard the 'Unnamed: 0' and 'index' columns if present.
db_smiles = pd.read_csv('../../data/db_smiles.csv').drop(
    columns=['Unnamed: 0', 'index'], errors='ignore'
)
db_smiles.head()

In [12]:
# Attach SMILES to both sides of each pair. `rename` is used without inplace so it returns a renamed
# copy: `db_smiles` keeps its original column names, and re-running this cell still finds
# 'DrugBank ID' / 'SMILES' instead of failing on the already-renamed columns.
primary_smiles = db_smiles.rename(
    columns={'DrugBank ID': 'primary_id', 'SMILES': 'primary_smiles'}
)
ddi = pd.merge(ddi, primary_smiles, how='left', on='primary_id')

interacting_smiles = db_smiles.rename(
    columns={'DrugBank ID': 'interacting_drug_id', 'SMILES': 'interaction_drug_smiles'}
)
ddi = pd.merge(ddi, interacting_smiles, how='left', on='interacting_drug_id')

In [None]:
# Missing values per column after the two left merges.
ddi.isna().sum()

In [14]:
# Drop every row that has a missing value in any column (not only in the SMILES columns).
ddi.dropna(inplace=True)

In [None]:
# Run the project's check_smiles_validity helper once per distinct SMILES string on each side.
# Later cells read the 'SMILES' and 'Valid' columns of the returned tables.
primary_isvalid = check_smiles_validity(ddi['primary_smiles'].unique())
interaction_drug_isvalid = check_smiles_validity(ddi['interaction_drug_smiles'].unique())

In [None]:
# Per-column non-null counts of the primary SMILES rows whose 'Valid' flag is not True.
primary_isvalid[primary_isvalid['Valid'] != True].count()

In [None]:
# Same check for the interacting-drug SMILES.
interaction_drug_isvalid[interaction_drug_isvalid['Valid'] != True].count()

In [18]:
# Rename the validity tables (in place) so their columns match each side of the pair,
# then left-join the validity flags onto ddi by SMILES string.
primary_isvalid.rename(columns={'SMILES': 'primary_smiles', 'Valid': 'primary_isvalid'}, inplace=True)
interaction_drug_isvalid.rename(columns={'SMILES': 'interaction_drug_smiles', 'Valid': 'interaction_drug_isvalid'}, inplace=True)
ddi = pd.merge(ddi, primary_isvalid, how='left', on='primary_smiles')
ddi = pd.merge(ddi, interaction_drug_isvalid, how='left', on='interaction_drug_smiles')

In [None]:
# Per-column non-null counts of the pairs in which at least one side is not flagged valid.
ddi[(ddi['primary_isvalid'] != True) | (ddi['interaction_drug_isvalid'] != True)].count()

In [None]:
# Keep only the pairs in which both SMILES are flagged valid.
ddi = ddi[(ddi['primary_isvalid'] == True) & (ddi['interaction_drug_isvalid'] == True)]
print(f'Final shape: {ddi.shape}')

In [21]:
# Save the cleaned table; the modelling section reads this file back.
ddi.to_csv('../../data/ddi_processed.csv')

## Develop Models

In [3]:
from rdkit.Chem import Draw, AllChem
from rdkit import Chem
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchview import draw_graph
from PIL import Image
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib.colors import ListedColormap
import plotly.figure_factory as ff

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Load the processed pairs; the index column and the two validity flags are dropped if present.
ddi = pd.read_csv('../../data/ddi_processed.csv').drop(
    columns=['Unnamed: 0', 'primary_isvalid', 'interaction_drug_isvalid'], errors='ignore'
)
ddi.head()

In [None]:
# Render the primary SMILES of the first row as a 1000x1000 molecule image.
smile = ddi.iloc[0]['primary_smiles']
m = Chem.MolFromSmiles(smile)
img = Draw.MolToImage(m, size=(1000, 1000))
img

In [7]:
def create_fingerprints(smiles_list, prefix, radius=2, n_bits=1024):
    """Compute a Morgan bit-vector fingerprint for every SMILES string.

    Args:
        smiles_list: Iterable of SMILES strings.
        prefix: Column-name prefix; the result has the columns
            ``f'{prefix}_smiles'`` and ``f'{prefix}_fingerprint'``.
        radius: Morgan radius passed to RDKit (default 2, the value previously hard-coded).
        n_bits: Fingerprint length passed to RDKit as ``nBits`` (default 1024, as before).

    Returns:
        DataFrame with one row per input string, holding the string and its
        fingerprint converted with ``np.array``.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    smiles_list = list(smiles_list)
    fingerprint_arrays = []
    for smile in smiles_list:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles signals a parse failure by returning None; fail here with the offending
        # string instead of passing None on to the fingerprint call.
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smile!r}")
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        fingerprint_arrays.append(np.array(bit_vect))
    return pd.DataFrame({
        f'{prefix}_smiles': smiles_list,
        f'{prefix}_fingerprint': fingerprint_arrays
    })

In [None]:
# Fingerprint each distinct SMILES string once per side of the pair.
primary_fingerprint_df = create_fingerprints(ddi['primary_smiles'].unique(), 'primary')
interaction_drug_fingerprint_df = create_fingerprints(ddi['interaction_drug_smiles'].unique(), 'interaction_drug')

In [None]:
# Draw the Morgan bits that are set for the first row's primary molecule.
smile = ddi.iloc[0]['primary_smiles']
mol = Chem.MolFromSmiles(smile)
bi1 = {}  # passed as bitInfo so RDKit can record per-bit information used by DrawMorganBits
fp1 = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024, bitInfo=bi1)
print(len(list(fp1.GetOnBits())))  # number of set bits
tpls1 = [(mol, x, bi1) for x in fp1.GetOnBits()]
Draw.DrawMorganBits(tpls1[:], molsPerRow=8, legends=[str(x) for x in fp1.GetOnBits()][:])

In [None]:
# Join each side's fingerprint onto the pairs by SMILES string.
ddi = pd.merge(ddi, primary_fingerprint_df, how='left', on='primary_smiles')
ddi = pd.merge(ddi, interaction_drug_fingerprint_df, how='left', on='interaction_drug_smiles')

# Feature vector of a pair: primary fingerprint followed by the interacting drug's fingerprint.
ddi['concat_fingerprints'] = ddi.apply(
    lambda row: np.concatenate([row['primary_fingerprint'], row['interaction_drug_fingerprint']]), axis=1
)
ddi.head()

In [None]:
# Sanity check: list the distinct lengths of the concatenated fingerprints.
ddi['len_fingerprints'] = ddi['concat_fingerprints'].apply(len)
print(f'Length of combined fingerprints: {ddi["len_fingerprints"].unique()}')

### First Basic Model

In [None]:
# File-backed arrays for the concatenated fingerprints and the labels.
features = np.memmap('features.dat', dtype='float16', mode='w+', shape=(len(ddi), 2048))
labels = np.memmap('labels.dat', dtype='float16', mode='w+', shape=(len(ddi),))

for i, row in enumerate(ddi['concat_fingerprints']):
    features[i] = row
labels[:] = ddi['labels']

# 80 % train, then the remaining 20 % halved into validation and test. Both splits are stratified
# on the label, matching the splits used for the later models in this notebook, so every subset
# keeps the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

X_val, X_test_final, y_val, y_test_final = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42, stratify=y_test
)

In [14]:
# Convert each split to tensors: float32 features, int64 class indices.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_final_tensor = torch.tensor(X_test_final, dtype=torch.float32)
y_test_final_tensor = torch.tensor(y_test_final, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_final_tensor, y_test_final_tensor)

batch_size = 2048

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
class DDI_NN(nn.Module):
    """Fully connected classifier: input -> hidden -> hidden -> output with ReLU between layers.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw logits (no softmax) for a batch ``x`` of shape (batch, input_dim)."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Draw the architecture of the baseline network with torchview and display the saved PNG.
input_dim = X_train.shape[1]
hidden_dim = 256
output_dim = 3

# A throw-away, untrained instance is enough for drawing the graph.
dummy_model = DDI_NN(input_dim, hidden_dim, output_dim)
dummy_input = torch.randn(1, 2048)

model_graph = draw_graph(
    dummy_model,
    input_data=(dummy_input),
    expand_nested=True,
    save_graph=True,
    filename="drug_interaction_model_basic", 
    directory="./",
)

image = Image.open("drug_interaction_model_basic.png")
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
plt.show()

In [None]:
# Model, loss and optimiser for the baseline network.
input_dim = X_train.shape[1]
hidden_dim = 256
output_dim = 3
model = DDI_NN(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training budget and the history that is plotted afterwards.
num_epochs = 100
early_stop_threshold = 0.01
train_losses, val_accuracies = [], []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    total_val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_val_loss += loss.item()

            predicted = outputs.argmax(dim=1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    val_accuracies.append(val_accuracy)

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, "
          f"Val Accuracy: {val_accuracy:.2f}%")

    # Stop once validation accuracy moved by less than the threshold since the previous epoch.
    accuracy_stalled = (
        epoch > 0
        and abs(val_accuracies[-1] - val_accuracies[-2]) < early_stop_threshold
    )
    if accuracy_stalled:
        print(f"Validation accuracy converged (Δ < {early_stop_threshold}%). Stopping early at epoch {epoch+1}.")
        break


# Training loss (left axis) and validation accuracy (right axis) per completed epoch.
epochs_run = range(1, min(epoch + 2, num_epochs + 1))

fig, ax1 = plt.subplots()
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Loss', color='tab:red')
ax1.plot(epochs_run, train_losses, marker='o', linestyle='-', color='tab:red', label='Train Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

ax2 = ax1.twinx()
ax2.set_ylabel('Validation Accuracy (%)', color='tab:blue')
ax2.plot(epochs_run, val_accuracies, marker='s', linestyle='--', color='tab:blue', label='Val Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')
fig.tight_layout()
fig.suptitle('Train Loss and Validation Accuracy', y=1.05)
plt.show()

In [None]:
# Evaluate the trained baseline on the held-out test split (test_loader).
model.eval()
total_test_loss = 0
correct = 0
total = 0

with torch.no_grad(): 
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        total_test_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

# These numbers come from the test split, so they are reported as test metrics (not validation),
# and accuracy is given in percent like in the other evaluation cells of this notebook.
test_accuracy = 100 * correct / total
print(f"Test Loss: {total_test_loss / len(test_loader):.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [23]:
# Save only the learned parameters (state_dict) of the baseline model.
torch.save(model.state_dict(), "drug_interaction_model_basic.pth")

### More Complex Model

In [17]:
# File-backed arrays: one fingerprint matrix per drug of the pair, plus the labels.
features_primary = np.memmap('features_primary.dat', dtype='float16', mode='w+', shape=(len(ddi), 1024))
features_interaction = np.memmap('features_interaction.dat', dtype='float16', mode='w+', shape=(len(ddi), 1024))
labels = np.memmap('labels.dat', dtype='float16', mode='w+', shape=(len(ddi),))

for i, (fp1, fp2) in enumerate(zip(ddi['primary_fingerprint'], ddi['interaction_drug_fingerprint'])):
    features_primary[i] = fp1
    features_interaction[i] = fp2
labels[:] = ddi['labels']

# 80 % train, then the remaining 20 % halved into validation and test; both splits are stratified.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    features_primary, features_interaction, labels, test_size=0.2, random_state=42, stratify=labels
)

X1_val, X1_test_final, X2_val, X2_test_final, y_val, y_test_final = train_test_split(
    X1_test, X2_test, y_test, test_size=0.5, random_state=42, stratify=y_test
)

In [18]:
# Convert each split to tensors: float32 features for both drugs, int64 class indices.
X1_train_tensor = torch.tensor(X1_train, dtype=torch.float32)
X2_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X1_val_tensor = torch.tensor(X1_val, dtype=torch.float32)
X2_val_tensor = torch.tensor(X2_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

X1_test_final_tensor = torch.tensor(X1_test_final, dtype=torch.float32)
X2_test_final_tensor = torch.tensor(X2_test_final, dtype=torch.float32)
y_test_final_tensor = torch.tensor(y_test_final, dtype=torch.long)

# Each sample is (primary features, interacting-drug features, label).
train_dataset = TensorDataset(X1_train_tensor, X2_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X1_val_tensor, X2_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X1_test_final_tensor, X2_test_final_tensor, y_test_final_tensor)

batch_size = 2048

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
class DrugInteractionModel(nn.Module):
    """Two-input classifier that encodes both fingerprints with one shared encoder.

    Args:
        fingerprint_dim: Length of each drug's fingerprint vector (default 1024,
            the value that was previously hard-coded).
        num_classes: Number of output logits (default 3, as before).
    """

    def __init__(self, fingerprint_dim=1024, num_classes=3):
        super(DrugInteractionModel, self).__init__()
        
        # Encoder of each fingerprint (the same weights are applied to both drugs)
        self.encoder = nn.Sequential(
            nn.Linear(fingerprint_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        
        # Merging Layer + Classifier (input: the two 256-wide encodings side by side)
        self.fc_merge = nn.Sequential(
            nn.Linear(256 * 2, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, drug1, drug2):
        """Return raw logits of shape (batch, num_classes) for two (batch, fingerprint_dim) inputs."""
        encoded_drug1 = self.encoder(drug1)
        encoded_drug2 = self.encoder(drug2)
        merged = torch.cat((encoded_drug1, encoded_drug2), dim=1)
        out = self.fc_merge(merged)
        return out

In [None]:
# Draw the architecture of the two-input model with torchview and display the saved PNG.
dummy_model = DrugInteractionModel()
dummy_input_1 = torch.randn(1, 1024)
dummy_input_2 = torch.randn(1, 1024)

model_graph = draw_graph(
    dummy_model,
    input_data=(dummy_input_1, dummy_input_2),
    expand_nested=True,
    save_graph=True,
    filename="drug_interaction_model", 
    directory="./",
)

image = Image.open("drug_interaction_model.png")
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
plt.show()

In [None]:
# Train the two-input model with Adam and cross-entropy loss.
model = DrugInteractionModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
train_losses = []
val_accuracies = []
# Unlike the other training cells, this one stops on the change of the mean *training loss*.
early_stop_threshold = 0.005 


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch_X1, batch_X2, batch_y in train_loader:
        batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X1, batch_X2, batch_y in val_loader:
            batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
            outputs = model(batch_X1, batch_X2)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)  # index of the largest logit = predicted class
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Per-epoch history: mean of the batch losses and validation accuracy in percent.
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)
    epoch_accuracy = 100 * correct / total
    val_accuracies.append(epoch_accuracy)


    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {epoch_loss:.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, "
          f"Val Accuracy: {epoch_accuracy:.2f}%")
    
    # Stop once the training loss changed by less than the threshold since the previous epoch.
    if epoch > 0 and abs(train_losses[-1] - train_losses[-2]) < early_stop_threshold:
        print(f"Training Loss converged (Δ < {early_stop_threshold}). Stopping early at epoch {epoch+1}.")
        break


# Training loss (left axis) and validation accuracy (right axis) per completed epoch.
fig, ax1 = plt.subplots()
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Loss', color='tab:red')
ax1.plot(range(1, min(epoch + 2, num_epochs + 1)), train_losses, marker='o', linestyle='-', color='tab:red', label='Train Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

ax2 = ax1.twinx()
ax2.set_ylabel('Validation Accuracy (%)', color='tab:blue')
ax2.plot(range(1, min(epoch + 2, num_epochs + 1)), val_accuracies, marker='s', linestyle='--', color='tab:blue', label='Val Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')
fig.tight_layout()
fig.suptitle('Train Loss and Validation Accuracy', y=1.05)
plt.show()

In [None]:
# Evaluate on the test split and collect true / predicted labels for the confusion matrix.
model.eval()
test_loss = 0
correct = 0
total = 0
all_labels, all_predictions = [], []

with torch.no_grad():
    for batch_X1, batch_X2, batch_y in test_loader:
        batch_X1 = batch_X1.to(device)
        batch_X2 = batch_X2.to(device)
        batch_y = batch_y.to(device)

        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        test_loss += loss.item()

        predicted = outputs.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

        all_labels.extend(batch_y.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss / len(test_loader):.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Two-colour scale for the heatmap. Kept in its own name so the shared `color_palette`
# (used by the word-cloud colour function) is not overwritten by this cell.
cm_colorscale = ['#CD9B06', '#682861']

# Row / column k of the matrix is class id k (labels=[0, 1, 2]). The tick names therefore follow
# the id -> name mapping used when the labels were created: 0 = major, 1 = moderate, 2 = minor.
# They are stored as `class_names` so the numeric `labels` array is not replaced by strings.
class_names = ["Major", "Moderate", "Minor"]
cm = confusion_matrix(all_labels, all_predictions, labels=[0, 1, 2])

# Normalise each row by its number of true samples; a class with no samples gets a row of zeros
# instead of a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Create heatmap using Plotly
fig = ff.create_annotated_heatmap(
    z=cm_normalized,
    x=class_names,
    y=class_names,
    colorscale=cm_colorscale,
    annotation_text=np.round(cm_normalized, 2),
    showscale=True
)

# Customize layout
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label",
    yaxis=dict(autorange="reversed"),
    template="plotly",
    width=600, 
    height=600  
)

fig.write_html("plots/nn_cm.html")
fig.show()

In [28]:
# Save only the learned parameters (state_dict) of the two-input model.
torch.save(model.state_dict(), "drug_interaction_model.pth")

### Embedding Locations Into The Model

In [None]:
# Load the location table and discard the CSV index column.
bd_location = pd.read_csv('../../data/ddi_locations.csv').drop(columns='Unnamed: 0')
bd_location.head()

In [None]:
# Attach location columns to both sides of each pair. `rename` is used without inplace so it
# returns a renamed copy: `bd_location` keeps its original column names, and re-running this cell
# still finds 'DrugBank ID of Ligand' / 'Latitude' / 'Longitude' / 'Institution'.
primary_location = bd_location.rename(
    columns={
        'DrugBank ID of Ligand': 'primary_id', 
        'Latitude': 'primary_latitude', 
        'Longitude': 'primary_longitude', 
        'Institution': 'primary_institution'}
)
ddi = pd.merge(ddi, primary_location, how='left', on='primary_id')

interacting_location = bd_location.rename(
    columns={
        'DrugBank ID of Ligand': 'interacting_drug_id', 
        'Latitude': 'interacting_drug_latitude', 
        'Longitude': 'interacting_drug_longitude', 
        'Institution': 'interacting_drug_institution'}
)
ddi = pd.merge(ddi, interacting_location, how='left', on='interacting_drug_id')
ddi.head()

In [None]:
# Missing values per column after the two left merges with the location table.
ddi.isna().sum()

In [23]:
# Drop every row that has a missing value in any column.
ddi.dropna(inplace=True)

In [51]:
# Export the drug names, label and coordinates of both drugs to their own CSV.
df = ddi[['primary_name', 'interacting_drug_name', 'label_meaning', 'primary_latitude', 'primary_longitude', 'interacting_drug_latitude', 'interacting_drug_longitude']]
df.to_csv('../../data/ddi_locations_preprocessed.csv')

In [None]:
# Bar chart of the number of rows per label after the location merge.
label_counts = ddi['label_meaning'].value_counts()
plt.figure(figsize=(8, 4))
plt.bar(label_counts.index, label_counts, color='#aad7d4')
plt.title('Label Counts', fontsize=16)
plt.xlabel('Label', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(rotation=45)
# No plt.legend() call: the bars carry no label, so a legend would have nothing to show and
# matplotlib would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [24]:
# Each drug's row is its 1024-bit fingerprint followed by latitude and longitude.
num_features = 1024 + 2
# The feature matrices are stored as float32 rather than float16: float16 has an 11-bit
# significand, so coordinates with magnitude above 64 would be rounded to steps of 1/16 degree
# or coarser. The downstream tensors are created as float32 anyway.
features_primary = np.memmap('features_primary.dat', dtype='float32', mode='w+', shape=(len(ddi), num_features))
features_interaction = np.memmap('features_interaction.dat', dtype='float32', mode='w+', shape=(len(ddi), num_features))
labels = np.memmap('labels.dat', dtype='float16', mode='w+', shape=(len(ddi),))

for i, (fp1, fp2, lat1, lon1, lat2, lon2) in enumerate(zip(
    ddi['primary_fingerprint'], 
    ddi['interaction_drug_fingerprint'], 
    ddi['primary_latitude'], 
    ddi['primary_longitude'],
    ddi['interacting_drug_latitude'], 
    ddi['interacting_drug_longitude']
)):
    features_primary[i] = np.concatenate([fp1, [lat1, lon1]])
    features_interaction[i] = np.concatenate([fp2, [lat2, lon2]])
labels[:] = ddi['labels']

# 80 % train, then the remaining 20 % halved into validation and test.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    features_primary, features_interaction, labels, test_size=0.2, random_state=42
)

X1_val, X1_test_final, X2_val, X2_test_final, y_val, y_test_final = train_test_split(
    X1_test, X2_test, y_test, test_size=0.5, random_state=42
)

In [47]:
# Convert each split to tensors: float32 features for both drugs, int64 class indices.
X1_train_tensor = torch.tensor(X1_train, dtype=torch.float32)
X2_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X1_val_tensor = torch.tensor(X1_val, dtype=torch.float32)
X2_val_tensor = torch.tensor(X2_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

X1_test_final_tensor = torch.tensor(X1_test_final, dtype=torch.float32)
X2_test_final_tensor = torch.tensor(X2_test_final, dtype=torch.float32)
y_test_final_tensor = torch.tensor(y_test_final, dtype=torch.long)

# Each sample is (primary features, interacting-drug features, label).
train_dataset = TensorDataset(X1_train_tensor, X2_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X1_val_tensor, X2_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X1_test_final_tensor, X2_test_final_tensor, y_test_final_tensor)

batch_size = 256

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [48]:
class DDI_LocationModel(nn.Module):
    """Two-input classifier whose inputs end with two extra (location) columns.

    All but the last two columns of each input go through a shared encoder;
    the last two columns of both inputs bypass it and are appended to the
    encodings before the classifier.
    """

    def __init__(self):
        super().__init__()

        # Shared encoder for the fingerprint part (1024 features) of each input.
        self.encoder = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )

        # Classifier over both 256-wide encodings plus the 2 + 2 pass-through columns.
        self.fc_merge = nn.Sequential(
            nn.Linear(256 * 2 + 4, 128),
            nn.ReLU(),
            nn.Linear(128, 3),
        )

    def forward(self, drug1, drug2):
        """Return raw logits of shape (batch, 3) for the two input batches."""
        encodings = []
        extras = []
        for drug in (drug1, drug2):
            encodings.append(self.encoder(drug[:, :-2]))
            extras.append(drug[:, -2:])

        # Order: encoding 1, encoding 2, extras 1, extras 2.
        merged = torch.cat(encodings + extras, dim=1)
        return self.fc_merge(merged)

In [None]:
# Draw the architecture of the location model with torchview and display the saved PNG.
dummy_model = DDI_LocationModel()
dummy_input_1 = torch.randn(1, 1026)
dummy_input_2 = torch.randn(1, 1026)

model_graph = draw_graph(
    dummy_model,
    input_data=(dummy_input_1, dummy_input_2),
    expand_nested=True,
    save_graph=True,
    filename="drug_interaction_locations_model", 
    directory="./",
)
image = Image.open("drug_interaction_locations_model.png")
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
plt.show()

In [None]:
# Train the location model with Adam and cross-entropy loss.
model = DDI_LocationModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
train_losses = []
val_accuracies = []
# Smallest epoch-to-epoch change in validation accuracy (percentage points) that keeps training going.
early_stop_threshold = 0.01 


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch_X1, batch_X2, batch_y in train_loader:
        batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X1, batch_X2, batch_y in val_loader:
            batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
            outputs = model(batch_X1, batch_X2)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)  # index of the largest logit = predicted class
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Per-epoch history: mean of the batch losses and validation accuracy in percent.
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)
    epoch_accuracy = 100 * correct / total
    val_accuracies.append(epoch_accuracy)


    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {epoch_loss:.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, "
          f"Val Accuracy: {epoch_accuracy:.2f}%")
    
    # Stop once validation accuracy changed by less than the threshold since the previous epoch.
    if epoch > 0 and abs(val_accuracies[-1] - val_accuracies[-2]) < early_stop_threshold:
        print(f"Validation accuracy converged (Δ < {early_stop_threshold}%). Stopping early at epoch {epoch+1}.")
        break


# Training loss (left axis) and validation accuracy (right axis) per completed epoch.
fig, ax1 = plt.subplots()
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Loss', color='tab:red')
ax1.plot(range(1, min(epoch + 2, num_epochs + 1)), train_losses, marker='o', linestyle='-', color='tab:red', label='Train Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

ax2 = ax1.twinx()
ax2.set_ylabel('Validation Accuracy (%)', color='tab:blue')
ax2.plot(range(1, min(epoch + 2, num_epochs + 1)), val_accuracies, marker='s', linestyle='--', color='tab:blue', label='Val Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')
fig.tight_layout()
fig.suptitle('Train Loss and Validation Accuracy', y=1.05)
plt.show()

In [None]:
# Evaluate the location model on the test split.
model.eval()
test_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for batch_X1, batch_X2, batch_y in test_loader:
        batch_X1 = batch_X1.to(device)
        batch_X2 = batch_X2.to(device)
        batch_y = batch_y.to(device)

        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        test_loss += loss.item()

        predicted = outputs.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss / len(test_loader):.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

### Augment Data with 'no-interaction' DDI Pairs

In [13]:
from itertools import product

In [14]:
def create_random_non_existent_pairs(df, num_pairs, seed=None):
    """Sample drug pairs that do not occur in ``df`` in either orientation.

    Candidates are (primary_name, interacting_drug_name) combinations of the
    names present in ``df``. A candidate is rejected when

    * it is listed in ``df`` as given or mirrored (A-B listed => B-A is rejected too),
    * both names are the same drug, or
    * its mirror image has already been accepted as a candidate.

    Args:
        df: Frame with the columns 'primary_name' and 'interacting_drug_name'.
        num_pairs: Number of pairs to draw (without replacement).
        seed: Optional seed for a reproducible draw. With the default ``None``
            the global NumPy random state is used, as before.

    Returns:
        List of ``num_pairs`` (primary_name, interacting_drug_name) tuples.

    Raises:
        ValueError: If fewer than ``num_pairs`` candidates exist.
    """
    primary_names = df['primary_name'].unique()
    interacting_drugs = df['interacting_drug_name'].unique()

    # Known interactions in both orientations, so a mirrored known pair is never a candidate.
    existing_pairs = set(zip(df['primary_name'], df['interacting_drug_name']))
    existing_pairs.update([(second, first) for first, second in existing_pairs])

    # Candidates are collected in product order, which keeps a seeded draw reproducible.
    non_existent_pairs = []
    accepted = set()
    for pair in product(primary_names, interacting_drugs):
        first, second = pair
        if first == second or pair in existing_pairs or (second, first) in accepted:
            continue
        accepted.add(pair)
        non_existent_pairs.append(pair)

    if len(non_existent_pairs) < num_pairs:
        raise ValueError("Not enough non-existent pairs available to satisfy the requested number.")

    if seed is None:
        selected = np.random.choice(len(non_existent_pairs), size=num_pairs, replace=False)
    else:
        selected = np.random.default_rng(seed).choice(len(non_existent_pairs), size=num_pairs, replace=False)

    return [non_existent_pairs[i] for i in selected]

In [15]:
# Draw as many 'no-interaction' pairs as the most frequent existing label has rows.
num_pairs = ddi['labels'].value_counts().max()
new_pairs = create_random_non_existent_pairs(ddi, num_pairs)

In [None]:
# Turn the sampled name pairs into rows of a fourth class (label 3).
no_interaction = pd.DataFrame(new_pairs, columns=['primary_name', 'interacting_drug_name'])

no_interaction['labels'] = 3
no_interaction['label_meaning'] = 'no-interaction'

# The appended rows only carry names and labels; every other ddi column is NaN for them here.
ddi = pd.concat([ddi, no_interaction], ignore_index=True)
ddi.tail()

In [None]:
# Fill the per-drug columns of the appended 'no-interaction' rows from other rows of the same drug.
#
# GroupBy.ffill / GroupBy.bfill return frames indexed like `ddi`, so every value is written back to
# the row it was computed for. (groupby.apply(...).reset_index() followed by ddi.update(...) is not
# safe here: when apply() prepends the group key to the index, reset_index() renumbers the rows in
# group order, and update() aligns on index labels — rows would then receive another drug's values.)
primary_cols = ['primary_id', 'primary_smiles', 'primary_fingerprint']
primary_columns = ddi.groupby('primary_name')[primary_cols].ffill()
primary_columns = primary_columns.groupby(ddi['primary_name'])[primary_cols].bfill()
ddi[primary_cols] = primary_columns

interaction_cols = ['interacting_drug_id', 'interaction_drug_smiles', 'interaction_drug_fingerprint']
interaction_drug_columns = ddi.groupby('interacting_drug_name')[interaction_cols].ffill()
interaction_drug_columns = interaction_drug_columns.groupby(ddi['interacting_drug_name'])[interaction_cols].bfill()
ddi[interaction_cols] = interaction_drug_columns

ddi.tail()

In [None]:
# Missing values per column after filling the per-drug columns of the new rows.
ddi.isna().sum()

In [None]:
# Bar chart of the number of rows per label after adding the 'no-interaction' class.
label_counts = ddi['label_meaning'].value_counts()
plt.figure(figsize=(8, 4))
plt.bar(label_counts.index, label_counts, color='#aad7d4')
plt.title('Label Counts', fontsize=16)
plt.xlabel('Label', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.xticks(rotation=45)
# No plt.legend() call: the bars carry no label, so a legend would have nothing to show and
# matplotlib would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [20]:
# Save the augmented table.
# NOTE(review): the fingerprint columns hold NumPy arrays, which a CSV can only store as text;
# NumPy abbreviates the text form of long arrays, so the fingerprints presumably cannot be
# restored from this file — confirm before relying on it as a model input.
ddi.to_csv('../../data/ddi_augmented.csv')

#### Model with 4 outputs

In [21]:
# File-backed arrays for the augmented data: one fingerprint matrix per drug, plus the labels (0-3).
features_primary = np.memmap('features_primary.dat', dtype='float16', mode='w+', shape=(len(ddi), 1024))
features_interaction = np.memmap('features_interaction.dat', dtype='float16', mode='w+', shape=(len(ddi), 1024))
labels = np.memmap('labels.dat', dtype='float16', mode='w+', shape=(len(ddi),))

for i, (fp1, fp2) in enumerate(zip(ddi['primary_fingerprint'], ddi['interaction_drug_fingerprint'])):
    features_primary[i] = fp1
    features_interaction[i] = fp2
labels[:] = ddi['labels']

# 80 % train, then the remaining 20 % halved into validation and test; both splits are stratified.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    features_primary, features_interaction, labels, test_size=0.2, random_state=42, stratify=labels
)

X1_val, X1_test_final, X2_val, X2_test_final, y_val, y_test_final = train_test_split(
    X1_test, X2_test, y_test, test_size=0.5, random_state=42, stratify=y_test
)

In [22]:
# Convert each split to tensors: float32 features for both drugs, int64 class indices.
X1_train_tensor = torch.tensor(X1_train, dtype=torch.float32)
X2_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X1_val_tensor = torch.tensor(X1_val, dtype=torch.float32)
X2_val_tensor = torch.tensor(X2_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

X1_test_final_tensor = torch.tensor(X1_test_final, dtype=torch.float32)
X2_test_final_tensor = torch.tensor(X2_test_final, dtype=torch.float32)
y_test_final_tensor = torch.tensor(y_test_final, dtype=torch.long)

# Each sample is (primary features, interacting-drug features, label).
train_dataset = TensorDataset(X1_train_tensor, X2_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X1_val_tensor, X2_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X1_test_final_tensor, X2_test_final_tensor, y_test_final_tensor)

batch_size = 4096

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [23]:
class DrugInteractionModel_4outputs(nn.Module):
    """Pairwise drug-interaction classifier over two fingerprint vectors.

    Both fingerprints go through the same encoder (shared weights), the two
    encodings are concatenated in (drug1, drug2) order and a small MLP head
    produces one logit per class.

    Args:
        input_dim: Length of each fingerprint vector. Defaults to 1024.
        num_classes: Number of output logits. Defaults to 4.

    The layer layout and parameter names (``encoder.0``, ``encoder.2``,
    ``fc_merge.0``, ``fc_merge.2``) do not depend on the arguments, so state
    dicts saved with the default sizes still load.
    """

    def __init__(self, input_dim=1024, num_classes=4):
        super().__init__()

        # Encoder applied to each fingerprint independently (weights are shared).
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )

        # Merging layer + classifier; input is the two 256-d encodings concatenated.
        self.fc_merge = nn.Sequential(
            nn.Linear(256 * 2, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, drug1, drug2):
        """Return raw class logits of shape (batch, num_classes).

        Args:
            drug1: Tensor of shape (batch, input_dim) for the first drug.
            drug2: Tensor of shape (batch, input_dim) for the second drug.

        The output is not symmetric in its arguments: the encodings are
        concatenated in the order given. No softmax is applied.
        """
        encoded_drug1 = self.encoder(drug1)
        encoded_drug2 = self.encoder(drug2)
        merged = torch.cat((encoded_drug1, encoded_drug2), dim=1)
        out = self.fc_merge(merged)
        return out

In [None]:
# Train the 4-output model with Adam (lr=0.001) and cross-entropy loss,
# evaluate on the validation loader after every epoch, stop early once the
# validation accuracy stops moving, then plot both curves.
model = DrugInteractionModel_4outputs().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
train_losses = []
val_accuracies = []
# Expressed in percentage points: it is compared against the difference of two
# accuracies that are already scaled by 100.
early_stop_threshold = 0.01 


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch_X1, batch_X2, batch_y in train_loader:
        batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        # Gradients are cleared right before backward, so nothing accumulates across batches.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X1, batch_X2, batch_y in val_loader:
            batch_X1, batch_X2, batch_y = batch_X1.to(device), batch_X2.to(device), batch_y.to(device)
            outputs = model(batch_X1, batch_X2)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Epoch loss is the mean of the per-batch mean losses.
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)
    epoch_accuracy = 100 * correct / total
    val_accuracies.append(epoch_accuracy)


    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {epoch_loss:.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, "
          f"Val Accuracy: {epoch_accuracy:.2f}%")
    
    # Stop as soon as two consecutive epochs differ by less than the threshold.
    # There is no patience window and no best-weights checkpoint: the model
    # keeps the weights of the epoch at which the loop exits.
    if epoch > 0 and abs(val_accuracies[-1] - val_accuracies[-2]) < early_stop_threshold:
        print(f"Validation accuracy converged (Δ < {early_stop_threshold}%). Stopping early at epoch {epoch+1}.")
        break


# Plot train loss (left axis) and validation accuracy (right axis) per epoch.
# `epoch` keeps its last loop value, so the x-range covers exactly the
# epoch + 1 completed epochs whether or not the loop stopped early.
fig, ax1 = plt.subplots()
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Loss', color='tab:red')
ax1.plot(range(1, min(epoch + 2, num_epochs + 1)), train_losses, marker='o', linestyle='-', color='tab:red', label='Train Loss')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
ax2.set_ylabel('Validation Accuracy (%)', color='tab:blue')
ax2.plot(range(1, min(epoch + 2, num_epochs + 1)), val_accuracies, marker='s', linestyle='--', color='tab:blue', label='Val Accuracy')
ax2.tick_params(axis='y', labelcolor='tab:blue')
fig.tight_layout()
fig.suptitle('Train Loss and Validation Accuracy', y=1.05)
plt.show()

### Play with loaded models

In [None]:
# Rebuild the architecture and restore the trained weights from disk.
# - map_location remaps the stored tensors onto `device`, so a checkpoint saved
#   on a GPU still loads on a CPU-only machine.
# - .to(device) puts the model on the same device the evaluation cell moves its
#   batches to; a CPU model fed CUDA batches raises a device-mismatch error.
# Note: this rebinds the notebook-level `model`, replacing any model trained above.
model = DrugInteractionModel().to(device)
model.load_state_dict(torch.load("drug_interaction_model.pth", map_location=device))
model.eval()
print("Model loaded successfully from 'drug_interaction_model.pth'")

In [None]:
# Score the current `model` on the final test split: accumulate the summed batch
# loss and the hit count, and keep every true / predicted label for the
# confusion matrix built in a later cell.
model.eval()
test_loss = 0
correct = 0
total = 0

all_labels = []
all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        # Move the (fingerprint 1, fingerprint 2, label) triple to the compute device.
        batch_X1, batch_X2, batch_y = (tensor.to(device) for tensor in batch)

        outputs = model(batch_X1, batch_X2)
        loss = criterion(outputs, batch_y)
        test_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs, 1)
        total += len(batch_y)
        correct += predicted.eq(batch_y).sum().item()

        all_labels.extend(batch_y.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

# Accuracy is in percent; the reported loss is the mean over batches.
test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss / len(test_loader):.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Row-normalised confusion matrix of the test predictions, drawn as an annotated
# Plotly heatmap and exported to HTML.
#
# Note: this cell rebinds the notebook-level `color_palette` (a longer list at the
# top of the notebook) and `labels` (the label array built for the train/test
# split); re-run those cells before reusing either name for its earlier purpose.
color_palette = ['#CD9B06', '#682861']
cm = confusion_matrix(all_labels, all_predictions, labels=[0, 1, 2])

# Divide each row by its number of true samples. A class with no true samples
# has a zero row total; keep that row at 0 instead of producing NaN (0/0) cells
# and a runtime warning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# NOTE(review): the clustering cell near the top of the notebook maps
# 0 -> "major", 1 -> "moderate", 2 -> "minor". Confirm that the tick order below
# matches the encoding of the labels the evaluated model was trained on.
# NOTE(review): only classes 0-2 are tabulated; samples of any other class
# (e.g. a fourth class) are left out of the matrix — confirm this is intended.
labels = ["Minor", "Moderate", "Major"]

# Create heatmap using Plotly
fig = ff.create_annotated_heatmap(
    z=cm_normalized,
    x=labels,
    y=labels,
    # The palette has two colours, so the slice simply yields both of them.
    colorscale=color_palette[:len(labels)],
    annotation_text=np.round(cm_normalized, 2),
    showscale=True
)

# Customize layout
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label",
    yaxis=dict(autorange="reversed"),  # first class at the top, like a printed matrix
    template="plotly",
    width=600,
    height=600
)

fig.write_html("plots/nn_cm.html")
fig.show()

## Likeness and Model

### Analyse DDI for Most Likable Drugs

In [7]:
# Load the "most likable" drug table and keep only the likeness descriptors and
# the DrugBank ID; the saved index and the BindingDB / plotting columns are dropped.
_columns_to_discard = [
    'Unnamed: 0',
    'BindingDB Reactant_set_id',
    'Ligand SMILES',
    'BindingDB Ligand Name',
    'Target Name',
    'Color',
]
most_like_df = (
    pd.read_csv('../../data/most_likable_drugs_withDB.csv')
    .drop(columns=_columns_to_discard)
)
most_like_df.head()

Unnamed: 0,Molecular Weight,Log P,H Donors,H Acceptors,Rotatable Bonds,PSA,QED Score,Lipinski Pass,Veber Pass,DrugBank ID of Ligand
0,281.38,3.39872,0.0,2.0,1.0,20.31,0.798242,True,True,DB07472
1,266.304,2.65122,1.0,4.0,1.0,58.12,0.861716,True,True,DB00238
2,266.304,2.65122,1.0,4.0,1.0,58.12,0.861716,True,True,DB00238
3,266.304,2.65122,1.0,4.0,1.0,58.12,0.861716,True,True,DB00238
4,266.304,2.65122,1.0,4.0,1.0,58.12,0.861716,True,True,DB00238


In [8]:
# Reload the processed interaction table; the saved index and the validity flags
# are removed when present (absent columns are ignored).
ddi = (
    pd.read_csv('../../data/ddi_processed.csv')
    .drop(
        columns=['Unnamed: 0', 'primary_isvalid', 'interaction_drug_isvalid'],
        errors='ignore',
    )
)

In [24]:
# Keep the interactions in which at least one of the two drugs appears in the
# "most likable" table.
likable_ids = most_like_df['DrugBank ID of Ligand']
primary_is_likable = ddi['primary_id'].isin(likable_ids)
partner_is_likable = ddi['interacting_drug_id'].isin(likable_ids)
ddi_most_liked = ddi[primary_is_likable | partner_is_likable]

In [20]:
# (rows, columns) of the full interaction table.
ddi.shape

(1163415, 9)

In [21]:
# (rows, columns) of the subset involving at least one "most likable" drug.
ddi_most_liked.shape

(17341, 9)

In [None]:
# Grouped bar chart comparing the severity-label distribution of all DDIs with
# that of the DDIs involving at least one "most likable" drug. Each likable bar
# is annotated with its relative drop versus the all-DDI count.
label_counts = ddi_most_liked['label_meaning'].value_counts().reset_index()
label_counts.columns = ['Label', 'Frequency']
ddi_label_counts = ddi['label_meaning'].value_counts().reset_index()
ddi_label_counts.columns = ['Label', 'Frequency']

order = ['minor', 'moderate', 'major']
ddi_color_mapping = {'minor': '#006B3C', 'moderate': '#D3AF36', 'major': '#682861'}
color_mapping = {'minor': '#03a960', 'moderate': '#ffd546', 'major': '#9439d9'}
# An ordered categorical makes sort_values follow `order` instead of the alphabet.
label_counts['Label'] = pd.Categorical(label_counts['Label'], categories=order, ordered=True)
ddi_label_counts['Label'] = pd.Categorical(ddi_label_counts['Label'], categories=order, ordered=True)
ddi_label_counts = ddi_label_counts.sort_values(by='Label')
label_counts = label_counts.sort_values(by='Label')

# Relative drop per label, keyed by the label itself. Subtracting the two
# 'Frequency' columns directly would align rows on the positional index left
# over from value_counts(), i.e. on frequency rank, which can pair different
# labels whenever the two frames rank them differently.
all_freq_by_label = dict(zip(ddi_label_counts['Label'], ddi_label_counts['Frequency']))
percentage_diff = pd.Series({
    label: (all_freq_by_label[label] - likable_freq) / all_freq_by_label[label] * 100
    for label, likable_freq in zip(label_counts['Label'], label_counts['Frequency'])
})

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=ddi_label_counts['Label'],
        y=ddi_label_counts['Frequency'],
        marker=dict(color=[ddi_color_mapping[label] for label in ddi_label_counts['Label']]),
        name="All DDI",
        showlegend=False
    )
)
fig.add_trace(
    go.Bar(
        x=label_counts['Label'],
        y=label_counts['Frequency'],
        marker=dict(color=[color_mapping[label] for label in label_counts['Label']]),
        name="Likable DDI",
        showlegend=False
    )
)

for label, likable_freq in zip(label_counts['Label'], label_counts['Frequency']):
    fig.add_annotation(
        x=label,
        y=1.2 * likable_freq,  # 20 % above the top of the 'Likable DDI' bar
        text=f"{percentage_diff[label]:.1f}% drop",
        showarrow=False,
        font=dict(size=12, color='black'),
        align='center'
    )

# Update the plot layout
fig.update_traces(marker_line_width=1.5)
fig.update_layout(
    title='Label Counts for Most Likable Drugs (vs. All Labels)',
    xaxis_title='Label',
    yaxis_title='Frequency',
    template='plotly_white',
    xaxis=dict(tickangle=45),
    barmode='group',  # Ensure bars are grouped next to each other
    transition=dict(duration=500)
)

fig.write_html("plots/label_bar_chart_with_ddi_frequencies.html")
fig.show()


In [9]:
# Likeness descriptor columns and the suffix each one gets once it is prefixed
# with the role of the drug in the pair ('primary' / 'interacting').
_LIKENESS_COLUMN_SUFFIXES = {
    'Molecular Weight': 'mol_weight',
    'Log P': 'log_p',
    'H Donors': 'h_donors',
    'H Acceptors': 'h_acceptors',
    'Rotatable Bonds': 'rotatable_bonds',
    'PSA': 'psa',
    'QED Score': 'qed_score',
    'Lipinski Pass': 'lipinski_pass',
    'Veber Pass': 'veber_pass',
}


def _prefixed_likeness(likeness_df, prefix):
    """Return `likeness_df` with every descriptor column renamed to '<prefix>_<suffix>'.

    The 'DrugBank ID of Ligand' key column is left untouched.
    """
    return likeness_df.rename(
        columns={
            column: f'{prefix}_{suffix}'
            for column, suffix in _LIKENESS_COLUMN_SUFFIXES.items()
        }
    )


# most_like_df repeats the same DrugBank ID on several rows (its head() shows
# DB00238 four times). Merging on a non-unique right key would emit one copy of
# every matching interaction per repeat, so keep a single row per drug first.
# NOTE(review): keeps the first row per ID — assumes the descriptors are
# identical across repeats of the same DrugBank ID; confirm.
most_like_unique = most_like_df.drop_duplicates(subset='DrugBank ID of Ligand')

# Left merges keep every interaction; pairs whose drug is not in the likeness
# table get NaN descriptors. validate= fails loudly if the key is ever
# non-unique again.
most_like_df_primary = _prefixed_likeness(most_like_unique, 'primary')
most_liked_ddi_primary = pd.merge(
    ddi,
    most_like_df_primary,
    left_on='primary_id',
    right_on='DrugBank ID of Ligand',
    how='left',
    validate='many_to_one',
)

most_like_df_interacting = _prefixed_likeness(most_like_unique, 'interacting')
most_liked_ddi_interacting = pd.merge(
    ddi,
    most_like_df_interacting,
    left_on='interacting_drug_id',
    right_on='DrugBank ID of Ligand',
    how='left',
    validate='many_to_one',
)

In [None]:
# Bar chart of how often each severity label occurs, ordered minor -> major.
# NOTE(review): the title says "Most Likable Drugs" but the counts are taken
# from the full `ddi` table — confirm which one is intended.
order = ['minor', 'moderate', 'major']
color_mapping = {'minor': '#006B3C', 'moderate': '#D3AF36', 'major': '#682861'}

label_counts = (
    ddi['label_meaning']
    .value_counts()
    .rename_axis('Label')
    .reset_index(name='Frequency')
    # Ordered categorical so the sort follows `order` rather than the alphabet.
    .assign(Label=lambda counts: pd.Categorical(counts['Label'], categories=order, ordered=True))
    .sort_values(by='Label')
)

bar_colors = [color_mapping[label] for label in label_counts['Label']]
fig = go.Figure(
    data=[
        go.Bar(
            x=label_counts['Label'],
            y=label_counts['Frequency'],
            marker=dict(color=bar_colors),
            name="Labels",
        )
    ]
)
fig.update_traces(marker_line_width=1.5)
fig.update_layout(
    title='Label Counts for Most Likable Drugs',
    xaxis_title='Label',
    yaxis_title='Frequency',
    template='plotly_white',
    legend=dict(title='Labels'),
    xaxis=dict(tickangle=45),
    transition=dict(duration=500),
)
fig.write_html("plots/label_bar_chart.html")
fig.show()

## Visualize DDI on a Map

In [56]:
import pandas as pd
import folium
from folium.plugins import AntPath
from IPython.display import display, HTML

In [None]:
# Load the interaction table with per-drug coordinates and discard the saved index column.
df = pd.read_csv('../../data/ddi_locations_preprocessed.csv').drop(columns='Unnamed: 0')
df.head()

In [66]:
# Draw a 1 % random sample of the interactions as thin lines on a world map,
# one line per drug pair, coloured by severity label.
sampled_df = df.sample(frac=0.01, random_state=42)
label_colors = {'major': 'red', 'moderate': 'yellow', 'minor': 'green'}

# Centre the map on the mean location of the sampled primary drugs.
map_center = [
    sampled_df['primary_latitude'].mean(),
    sampled_df['primary_longitude'].mean(),
]
interaction_map = folium.Map(location=map_center, zoom_start=2, tiles='cartodbpositron')

segments = zip(
    sampled_df['primary_latitude'],
    sampled_df['primary_longitude'],
    sampled_df['interacting_drug_latitude'],
    sampled_df['interacting_drug_longitude'],
    sampled_df['label_meaning'],
)
for start_lat, start_lon, end_lat, end_lon, severity in segments:
    folium.PolyLine(
        locations=[[start_lat, start_lon], [end_lat, end_lon]],
        color=label_colors.get(severity, 'gray'),  # unknown labels fall back to gray
        weight=0.2,
        opacity=0.7,
    ).add_to(interaction_map)

interaction_map.save('ddi_interaction_map.html')


In [None]:
# Embed the map's HTML representation directly in the notebook output.
display(HTML(interaction_map._repr_html_()))