In [None]:
#Data Processing for the Drug-Drug Interaction (DDI) Dataset
import os
import pandas as pd
import matplotlib.pyplot as plt

# Path to the raw drug-drug interaction table (relative to the notebook).
file_path = r"DDI_data.csv"

# Fail fast when the file is missing. Printing a message and carrying on
# would leave `df` undefined (NameError on the display line below) or, worse,
# silently show a stale `df` left over from an earlier kernel session.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the CSV file and display it as the cell's output.
df = pd.read_csv(file_path)
df

Unnamed: 0,# STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,CID000002173,CID000003345,C0151714,hypermagnesemia
1,CID000002173,CID000003345,C0035344,retinopathy of prematurity
2,CID000002173,CID000003345,C0004144,atelectasis
3,CID000002173,CID000003345,C0002063,alkalosis
4,CID000002173,CID000003345,C0004604,Back Ache
...,...,...,...,...
4649436,CID000003461,CID000003954,C0149871,deep vein thromboses
4649437,CID000003461,CID000003954,C0035410,rhabdomyolysis
4649438,CID000003461,CID000003954,C0043096,loss of weight
4649439,CID000003461,CID000003954,C0003962,ascites


In [None]:
import os
import pandas as pd

# === Step 1: Build the drug ID -> drug name lookup ===
# The lookup file is read with header=None, so column 0 is treated as the ID
# and column 1 as the name.
drug_name_path = r"drug_names.csv"
if not os.path.exists(drug_name_path):
    raise FileNotFoundError(f"Drug name file not found: {drug_name_path}")

drug_df = pd.read_csv(drug_name_path, header=None)
n_lookup_columns = len(drug_df.columns)
if n_lookup_columns != 2:
    raise ValueError("drug_names.csv must have exactly two columns: [ID, Name]")

# When an ID appears more than once, the last occurrence wins.
drug_mapping = {
    drug_id: drug_name for drug_id, drug_name in zip(drug_df[0], drug_df[1])
}

print("Loaded drug mapping (preview):")
print(drug_df.head())

# === Step 2: Load the DDI table and swap IDs for names ===
ddi_path = r"DDI_data.csv"
if not os.path.exists(ddi_path):
    raise FileNotFoundError(f"DDI data file not found: {ddi_path}")

df = pd.read_csv(ddi_path)
if len(df.columns) < 2:
    raise ValueError("DDI data must contain at least two columns (drug1 ID, drug2 ID)")

# The first two columns hold the drug IDs. IDs without a usable name in the
# lookup are kept unchanged (fillna falls back to the original value).
for drug_col in (0, 1):
    raw_ids = df.iloc[:, drug_col]
    df.iloc[:, drug_col] = raw_ids.map(drug_mapping).fillna(raw_ids)

# === Step 3: Report values that are still not a known drug name ===
first_drug, second_drug = df.iloc[:, 0], df.iloc[:, 1]
unmatched_1 = first_drug[~first_drug.isin(drug_mapping.values())]
unmatched_2 = second_drug[~second_drug.isin(drug_mapping.values())]

everything_replaced = unmatched_1.empty and unmatched_2.empty
if everything_replaced:
    print("All drug IDs successfully replaced with names.")
else:
    print("Some drug IDs were not replaced:")
    leftovers_by_heading = (
        (" - Unreplaced in first column:", unmatched_1),
        (" - Unreplaced in second column:", unmatched_2),
    )
    for heading, leftovers in leftovers_by_heading:
        if leftovers.empty:
            continue
        print(heading)
        print(leftovers.unique())


In [14]:
# Display the current DDI table (pandas truncates the middle rows of large frames).
df

Unnamed: 0,# STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,ampicillin,fentanyl,C0151714,hypermagnesemia
1,ampicillin,fentanyl,C0035344,retinopathy of prematurity
2,ampicillin,fentanyl,C0004144,atelectasis
3,ampicillin,fentanyl,C0002063,alkalosis
4,ampicillin,fentanyl,C0004604,Back Ache
...,...,...,...,...
4649436,gemcitabine,loperamide,C0149871,deep vein thromboses
4649437,gemcitabine,loperamide,C0035410,rhabdomyolysis
4649438,gemcitabine,loperamide,C0043096,loss of weight
4649439,gemcitabine,loperamide,C0003962,ascites


In [None]:
# Flag rows that repeat an earlier row across all columns
# (the first occurrence of each row is not flagged).
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]

if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate rows found:")
    print(duplicate_rows)

In [None]:
# Reshape the DDI table into (head, relation, tail) triples.
TRIPLE_COLUMNS = ['head', 'relation', 'tail']

# Guard so the cell can be re-run safely: once the frame is already in triple
# form, dropping and reordering columns by position again would fail.
if list(df.columns) != TRIPLE_COLUMNS:
    # Positional selection: 0 = first drug, 3 = side-effect name, 1 = second
    # drug. This is equivalent to dropping column 2 (the side-effect code) and
    # then reordering the remaining columns as [0, 2, 1].
    # .copy() makes the result an independent frame, so later column
    # assignments do not operate on a slice of the loaded table.
    df = df.iloc[:, [0, 3, 1]].copy()
    df.columns = TRIPLE_COLUMNS

    # The saved outputs of this notebook show lower-cased side-effect names
    # (e.g. "back ache" for the raw "Back Ache"), but no cell performed that
    # step. It is made explicit here so a fresh run reproduces those outputs.
    df['relation'] = df['relation'].str.lower()

# Print the first few rows
print(df.head())

         head                    relation      tail
0  ampicillin             hypermagnesemia  fentanyl
1  ampicillin  retinopathy of prematurity  fentanyl
2  ampicillin                 atelectasis  fentanyl
3  ampicillin                   alkalosis  fentanyl
4  ampicillin                   back ache  fentanyl


In [None]:
# Turn each side-effect name into a relation phrase, e.g.
# "hypermagnesemia" -> "causes hypermagnesemia with".
# Rows that already carry the wrapper are left untouched, so re-running this
# cell does not produce "causes causes ... with with".
relation = df['relation']
already_wrapped = (
    relation.str.startswith('causes ', na=False)
    & relation.str.endswith(' with', na=False)
)
df['relation'] = relation.where(already_wrapped, 'causes ' + relation + ' with')
df

Unnamed: 0,head,relation,tail
0,ampicillin,causes hypermagnesemia with,fentanyl
1,ampicillin,causes retinopathy of prematurity with,fentanyl
2,ampicillin,causes atelectasis with,fentanyl
3,ampicillin,causes alkalosis with,fentanyl
4,ampicillin,causes back ache with,fentanyl
...,...,...,...
4649436,gemcitabine,causes deep vein thromboses with,loperamide
4649437,gemcitabine,causes rhabdomyolysis with,loperamide
4649438,gemcitabine,causes loss of weight with,loperamide
4649439,gemcitabine,causes ascites with,loperamide


In [None]:
# Generating prompts for LLMs
import json
import random
from collections import defaultdict

def is_json(string):
    """Return True if ``string`` can be parsed as JSON, False otherwise.

    Args:
        string: The candidate JSON document. Anything that ``json.loads``
            rejects, including non-text values such as ``None`` or an int,
            yields False instead of raising.

    Returns:
        bool: True when ``json.loads(string)`` succeeds.
    """
    try:
        json.loads(string)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON (JSONDecodeError is a subclass);
        # TypeError is what json.loads raises for non str/bytes input.
        return False
    return True

# Seed for negative sampling so that re-running the cell yields the same file.
NEGATIVE_SAMPLING_SEED = 42
# Random draws attempted before falling back to an exact candidate scan.
MAX_REJECTION_TRIES = 20


def _load_id_to_text(path):
    """Read a tab-separated ``id<TAB>text`` file into a dict, skipping blank lines."""
    id_to_text = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == [""]:
                continue
            id_to_text[fields[0]] = fields[1]
    return id_to_text


def _read_triples(path):
    """Yield (head, relation, tail) from a tab-separated file, skipping blank lines."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == [""]:
                continue
            yield fields[0], fields[1], fields[2]


def _sample_corrupted_tail(rng, entities, known_tails):
    """Pick an entity that is not a known true tail, or None if there is none."""
    if not entities:
        return None
    # Rejection sampling avoids rebuilding a set of all entities per triple.
    for _ in range(MAX_REJECTION_TRIES):
        candidate = rng.choice(entities)
        if candidate not in known_tails:
            return candidate
    # Rare fallback: almost every entity is a true tail, so scan exactly.
    remaining = [entity for entity in entities if entity not in known_tails]
    return rng.choice(remaining) if remaining else None


def _append_sample(prompt, answer):
    """Append one prompt/answer pair to both output lists (GLM and LLaMA format)."""
    tmp_str = json.dumps({"prompt": prompt, "response": answer})
    if is_json(tmp_str):
        lines_to_write_glm.append(tmp_str + "\n")

    llama_format = {
        "instruction": prompt,
        "input": "",
        "output": answer
    }
    lines_to_write_llama_lora.append(json.dumps(llama_format, indent=2))


# Step 1: Load entity and relation mappings
ent2txt = _load_id_to_text("entity2text.txt")
rel2txt = _load_id_to_text("relation2text.txt")

ent_list = list(ent2txt.keys())

# Optional override for relation texts.
# Merged into the loaded mapping rather than replacing it: replacing it would
# make every relation other than these two raise KeyError below.
rel2txt.update({
    'avoid eating': 'avoid eating',
    'good to eat': 'good to eat'
})

# Step 2: Build head-relation to tails mapping from all known triples.
# The training triples must be included: negatives are drawn for training
# lines, and filtering against the test set alone could pick a true tail
# (even the positive's own tail) and label it "No".
head_rel_to_tails = defaultdict(set)
for triples_path in ("train.tsv", "test.tsv"):
    for head, rel, tail in _read_triples(triples_path):
        head_rel_to_tails[(head, rel)].add(tail)

# Step 3: Generate prompts
lines_to_write_glm = []
lines_to_write_llama_lora = []

# Dedicated generator: reproducible without touching the global random state.
rng = random.Random(NEGATIVE_SAMPLING_SEED)

for head, rel, tail in _read_triples("train.tsv"):
    statement_start = f"Is the following statement correct: {ent2txt[head]} {rel2txt[rel]}"

    # Positive sample
    _append_sample(f"{statement_start} {ent2txt[tail]}?", "Yes, this is true.")

    # Generate negative sample by corrupting tail
    corrupted_tail = _sample_corrupted_tail(rng, ent_list, head_rel_to_tails[(head, rel)])
    if corrupted_tail is not None:
        _append_sample(f"{statement_start} {ent2txt[corrupted_tail]}?", "No, this is not true.")

# Report the total once; printing inside the loop emits one line per triple.
print(len(lines_to_write_glm))

# Save data to file
# with open("test_instructions_glm.json", "w") as f:
#     f.writelines(lines_to_write_glm)

with open("train_instructions_llama.json", "w", encoding="utf-8") as f:
    json_array = "[\n" + ",\n".join(lines_to_write_llama_lora) + "\n]"
    f.write(json_array)