In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 10)     # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full column width
ddi_path = '../../data/ddi.csv'

In [None]:
import xml.etree.ElementTree as ET

"""
The data provided by DrugBank is given in the format of XML so we need to first read and parse the .xml file to extract the 
relevant information to DDI.
"""

def process_drugbank_to_dataframe(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {'ns': 'http://www.drugbank.ca'}
    data = []
    for drug in root.findall('ns:drug', namespace):
        primary_id = None
        for drugbank_id in drug.findall('ns:drugbank-id', namespace):
            # We take primary drugbank-id as our 1st Drug ID because it is in the correct format to be mapped to BindingDB
            if drugbank_id.attrib.get('primary') == 'true': 
                primary_id = drugbank_id.text
                break

        # If no primary ID is found, skip this drug entry
        if not primary_id:
            continue

        # Extract name and description
        name = drug.find('ns:name', namespace)
        description = drug.find('ns:description', namespace)
        primary_name = name.text if name is not None else 'N/A'
        primary_description = description.text if description is not None else 'N/A'

        # For each drug there is a list of drug-interaction pairings each with different description
        interactions = drug.find('ns:drug-interactions', namespace)

        # We skip drug entries that have no DDI records
        if interactions:
            for interaction in interactions.findall('ns:drug-interaction', namespace):
                interacting_drug_id = interaction.find('ns:drugbank-id', namespace).text
                interacting_drug_name = interaction.find('ns:name', namespace).text
                interaction_description = interaction.find('ns:description', namespace).text

                data.append({
                    'primary_id': primary_id,
                    'primary_name': primary_name,
                    'primary_description': primary_description,
                    'interacting_drug_id': interacting_drug_id,
                    'interacting_drug_name': interacting_drug_name,
                    'interaction_description': interaction_description
                })

    df = pd.DataFrame(data)
    multi_index_df = df.set_index(['primary_id', 'interacting_drug_id'])

    return multi_index_df

file_path = '../../data/drugbank.xml'
ddi = process_drugbank_to_dataframe(file_path)
ddi = ddi.reset_index()

In [None]:
ddi.to_csv(ddi_path) # Save DDI 

In [3]:
ddi = pd.read_csv(ddi_path)
ddi

Unnamed: 0.1,Unnamed: 0,primary_id,interacting_drug_id,primary_name,primary_description,interacting_drug_name,interaction_description
0,0,DB00001,DB06605,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Apixaban,Apixaban may increase the anticoagulant activities of Lepirudin.
1,1,DB00001,DB06695,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Dabigatran etexilate,Dabigatran etexilate may increase the anticoagulant activities of Lepirudin.
2,2,DB00001,DB01254,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Dasatinib,The risk or severity of bleeding and hemorrhage can be increased when Dasatinib is combined with Lepirudin.
3,3,DB00001,DB01609,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Deferasirox,The risk or severity of gastrointestinal bleeding can be increased when Lepirudin is combined with Deferasirox.
4,4,DB00001,DB01586,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Ursodeoxycholic acid,The risk or severity of bleeding and bruising can be increased when Lepirudin is combined with Ursodeoxycholic acid.
...,...,...,...,...,...,...,...
2839605,2839605,DB18716,DB13328,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Butanilicaine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Butanilicaine.
2839606,2839606,DB18716,DB13578,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Metabutethamine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Metabutethamine.
2839607,2839607,DB18716,DB13683,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Quinisocaine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Quinisocaine.
2839608,2839608,DB18716,DB00565,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Cisatracurium,Enmetazobactam may increase the neuromuscular blocking activities of Cisatracurium.


In [19]:
len(ddi['primary_id'].unique())

4532

In [None]:
from tqdm import tqdm
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans

"""
Experimental approach to cluster the interaction description of DDI.
We use BioBert (a pretrained BERT model on biological texts) to extract the context appropriate embeddings for out descriptions.
Then we apply K-Means to cluster the embeddings into three clusters which later are interpretated as 'Major', 'Moderate' or 'Minor' 
sensitivity.
"""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
texts = ddi['interaction_description'].to_list()
def get_embeddings_in_batches(texts, batch_size=32):
    embeddings = []
    # Proceed in batches to minimize time of execution.
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**tokens)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

embeddings = get_embeddings_in_batches(texts, batch_size=32)

In [None]:
n_clusters = 3 
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings) # Perform k-means clustering

In [None]:
interaction_types = {0: "major", 1: "moderate", 2: "minor"} # Cluster label interpretation map

In [14]:
ddi['labels'] = labels
ddi['label_meaning'] = [interaction_types[i] for i in labels]

In [None]:
ddi.to_csv(ddi_path) # Save DDI 

In [None]:
ddi = pd.read_csv(ddi_path)
labels = ddi['labels'].to_list()
ddi

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,primary_id,interacting_drug_id,primary_name,primary_description,interacting_drug_name,interaction_description,labels,label_meaning
0,0,0,DB00001,DB06605,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Apixaban,Apixaban may increase the anticoagulant activities of Lepirudin.,2,antagonistic
1,1,1,DB00001,DB06695,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Dabigatran etexilate,Dabigatran etexilate may increase the anticoagulant activities of Lepirudin.,2,antagonistic
2,2,2,DB00001,DB01254,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Dasatinib,The risk or severity of bleeding and hemorrhage can be increased when Dasatinib is combined with Lepirudin.,1,additive
3,3,3,DB00001,DB01609,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Deferasirox,The risk or severity of gastrointestinal bleeding can be increased when Lepirudin is combined with Deferasirox.,1,additive
4,4,4,DB00001,DB01586,Lepirudin,"Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] \r\n\r\nLepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]",Ursodeoxycholic acid,The risk or severity of bleeding and bruising can be increased when Lepirudin is combined with Ursodeoxycholic acid.,1,additive
...,...,...,...,...,...,...,...,...,...,...
2839605,2839605,2839605,DB18716,DB13328,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Butanilicaine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Butanilicaine.,1,additive
2839606,2839606,2839606,DB18716,DB13578,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Metabutethamine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Metabutethamine.,1,additive
2839607,2839607,2839607,DB18716,DB13683,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Quinisocaine,The risk or severity of methemoglobinemia can be increased when Enmetazobactam is combined with Quinisocaine.,1,additive
2839608,2839608,2839608,DB18716,DB00565,Enmetazobactam,"Enmetazobactam is a penicillanic acid sulfone extended-spectrum beta (β)-lactamase (ESBL) inhibitor.[A263266] Because ESBL enzymes can hydrolyze important antibiotics such as penicillins, broad-spectrum cephalosporins and monobactams, ESBL-producing bacteria poses challenges in the treatment of serious infections.[A263276] \r\n\r\nThe combination product of enmetazobactam and [cefepime] was first approved by the FDA on February 23, 2024, for the treatment of complicated urinary tract infections.[L50126] Enmetazobactam is used as cefepime-sparing therapy by preventing its breakdown by ESBL.",Cisatracurium,Enmetazobactam may increase the neuromuscular blocking activities of Cisatracurium.,2,antagonistic


In [None]:
# Manually check the correctness of sample's labeling
output_file = '../../data/interactions.txt'

with open(output_file, 'w') as file:
    for interaction in ddi[['interaction_description', 'label_meaning']].values[::10000]:
        file.write(f"{interaction[0]} - {interaction[1]}\n")