# LLM Project – Feasibility & Prep 

This notebook prepares data for a project that explores whether drugs with similar descriptions also share similar side effects. The idea is to use a language model to embed drug descriptions from DrugBank and compare those embeddings to known side effect data from SIDER. If similar descriptions align with shared side effects, it could help uncover drug relationships using only text.

What's done in this notebook:
- Parsed DrugBank XML to extract drug descriptions
- Loaded and cleaned SIDER side effect data
- Merged both sources by drug name
- Grouped known side effects per drug
- Embedded drug descriptions using `all-MiniLM-L6-v2`
- Saved descriptions, side effects, and embeddings to `drug_descriptions_embeddings.csv`

-------------

In [1]:
# load imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the DrugBank XML export into an ElementTree and keep its root element.
drugbank_path = 'full_database.xml'
tree = ET.ElementTree(file=drugbank_path)
root = tree.getroot()

# Prefix -> namespace URI map, so queries can be written as 'db:<tag>'.
ns = dict(db='http://www.drugbank.ca')

In [3]:
# Extract (id, name, description) for every <drug> child of the root.
#
# A drug is skipped only when one of the three elements is absent from its
# entry. An element that is present but has no text is kept with a None
# value, so it appears as a missing value in drug_df.
# The checks are explicit (instead of a bare `except:`) so that unrelated
# errors and kernel interrupts are not silently swallowed.
drug_records = []
skipped_drugs = 0  # entries dropped because a required element was absent
for drug in root.findall('db:drug', ns):
    id_el = drug.find('db:drugbank-id', ns)  # first <drugbank-id> child
    name_el = drug.find('db:name', ns)
    description_el = drug.find('db:description', ns)
    if id_el is None or name_el is None or description_el is None:
        skipped_drugs += 1
        continue
    drug_records.append((id_el.text, name_el.text, description_el.text))

if skipped_drugs:
    print("Skipped drugs with missing id/name/description element:", skipped_drugs)

# Tabulate the records and add a lower-cased name to use as a join key.
drug_df = pd.DataFrame(drug_records, columns=['drugbank_id', 'name', 'description'])
drug_df['name_lower'] = drug_df['name'].str.lower()
# check
print("DrugBank shape:", drug_df.shape)
print(drug_df.head())

DrugBank shape: (17430, 4)
  drugbank_id                 name  \
0     DB00001            Lepirudin   
1     DB00002            Cetuximab   
2     DB00003         Dornase alfa   
3     DB00004  Denileukin diftitox   
4     DB00005           Etanercept   

                                         description           name_lower  
0  Lepirudin is a recombinant hirudin formed by 6...            lepirudin  
1  Cetuximab is a recombinant chimeric human/mous...            cetuximab  
2  Dornase alfa is a biosynthetic form of human d...         dornase alfa  
3  Denileukin diftitox is an IL2-receptor-directe...  denileukin diftitox  
4  Dimeric fusion protein consisting of the extra...           etanercept  


In [4]:
# SIDER drug names: tab-separated file without a header row, so the columns
# are labelled 0 and 1 until they are renamed.
sider_names = pd.read_csv(
    'drug_names.tsv',
    sep='\t',
    header=None,
)
sider_names.head()

Unnamed: 0,0,1
0,CID100000085,carnitine
1,CID100000119,gamma-aminobutyric
2,CID100000137,5-aminolevulinic
3,CID100000143,leucovorin
4,CID100000146,5-methyltetrahydrofolate


In [5]:
# Name the two raw SIDER columns and add a lower-cased name to use as a join key.
# Start from the first two columns only: on a re-run of this cell the frame
# already carries 'drug_name_lower', and assigning two labels to three
# columns would raise a length-mismatch error.
sider_names = sider_names.iloc[:, :2].copy()
sider_names.columns = ['stitch_id', 'drug_name']
sider_names['drug_name_lower'] = sider_names['drug_name'].str.lower()
sider_names.head()

Unnamed: 0,stitch_id,drug_name,drug_name_lower
0,CID100000085,carnitine,carnitine
1,CID100000119,gamma-aminobutyric,gamma-aminobutyric
2,CID100000137,5-aminolevulinic,5-aminolevulinic
3,CID100000143,leucovorin,leucovorin
4,CID100000146,5-methyltetrahydrofolate,5-methyltetrahydrofolate


In [6]:
# SIDER side-effect table: tab-separated, no header row; previewed here with
# its raw positional column labels.
sider_effects = pd.read_csv(
    'meddra_all_se.tsv',
    sep='\t',
    header=None,
)
sider_effects.head()

Unnamed: 0,0,1,2,3,4,5
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [7]:
# Load the raw side-effect table and give its six columns readable names.
sider_effects = pd.read_csv(
    'meddra_all_se.tsv',
    sep='\t',
    header=None,
)
# The meanings below were looked up and describe what each column likely represents.
sider_effects.columns = [
    'stitch_id',         # 0: drug ID (flat)
    'stitch_id_stereo',  # 1: drug ID with stereochemistry
    'umls_id',           # 2: UMLS concept ID (unique code for the side effect term)
    'term_type',         # 3: MedDRA term type (LLT = Lower Level Term, PT = Preferred Term)
    'umls_linked_id',    # 4: canonical UMLS ID for grouping similar terms
    'side_effect_name',  # 5: human-readable name of the side effect
]
sider_effects.head()

Unnamed: 0,stitch_id,stitch_id_stereo,umls_id,term_type,umls_linked_id,side_effect_name
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [8]:
# Keep only rows that carry a side-effect name, and only the two columns
# used from here on.
sider_effects = sider_effects.loc[
    sider_effects['side_effect_name'].notna(),
    ['stitch_id', 'side_effect_name'],
]

print("SIDER drug names:", sider_names.shape)
print("SIDER side effects (cleaned):", sider_effects.shape)
sider_effects.head()

SIDER drug names: (1430, 3)
SIDER side effects (cleaned): (309849, 2)


Unnamed: 0,stitch_id,side_effect_name
0,CID100000085,Abdominal cramps
1,CID100000085,Abdominal pain
2,CID100000085,Abdominal pain
3,CID100000085,Gastrointestinal pain
4,CID100000085,Abdominal pain


In [10]:
# Build one row per drug holding the set of its side-effect names.

# 1) Link DrugBank entries to SIDER entries through the lower-cased names.
merged = drug_df.merge(
    sider_names,
    how='inner',
    left_on='name_lower',
    right_on='drug_name_lower',
)

# 2) Attach the side-effect rows that share the matched stitch_id.
merged = merged.merge(sider_effects, how='inner', on='stitch_id')

# 3) Discard rows whose description is missing.
merged = merged.loc[merged['description'].notna()].copy()

# 4) Collapse to one row per (id, name, description); the set removes
#    repeated side-effect names. reset_index() turns the group keys back
#    into columns and leaves a fresh 0..n-1 index.
grouped = (
    merged
    .groupby(['drugbank_id', 'name', 'description'])['side_effect_name']
    .apply(set)
    .reset_index()
)

print("Final grouped shape:", grouped.shape)
print(grouped.head())

Final grouped shape: (1028, 4)
  drugbank_id          name  \
0     DB00006   Bivalirudin   
1     DB00014     Goserelin   
2     DB00035  Desmopressin   
3     DB00040      Glucagon   
4     DB00049   Rasburicase   

                                         description  \
0  Bivalirudin is a synthetic 20 residue peptide ...   
1  Goserelin is a synthetic hormone. In men, it s...   
2  Desmopressin (dDAVP), a synthetic analogue of ...   
3  Glucagon is a 29 amino acid hormone used as a ...   
4  Rasburicase is a recombinant urate-oxidase enz...   

                                    side_effect_name  
0  {Ecchymosis, Urinary retention, Atelectasis, P...  
1  {Pulmonary embolism, Incontinence, Cough, Ecch...  
2  {Wheezing, Vertigo, Dehydration, Water retenti...  
3  {Hypoglycaemic coma, Drug interaction, Blood p...  
4  {Oedema peripheral, Hypophosphataemia, Cough, ...  


------

## Embed Descriptions with SentenceTransformer

In [12]:
# Encode every drug description with the all-MiniLM-L6-v2 sentence-transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One vector per description; the recorded run produced a (1028, 384) array.
embeddings = model.encode(
    grouped['description'].tolist(),
    show_progress_bar=True,
)

# Keep each row of the embedding matrix as its own entry in an object column.
grouped['embedding'] = [vector for vector in embeddings]
print(f"Embedding shape: {grouped.loc[0, 'embedding'].shape}")

Batches: 100%|██████████████████████████████████| 33/33 [00:02<00:00, 14.33it/s]

Embedding shape: (384,)





In [13]:
# Export descriptions, side effects and embeddings to a single CSV file.


def _sorted_set_literal(values):
    """Render a set like str(set) does, but with its members in sorted order.

    Sets have no stable iteration order across interpreter runs, so writing
    them directly makes the CSV differ from run to run. The returned text is
    still a valid Python set literal (an empty set is written as 'set()').
    """
    if not values:
        return 'set()'
    return '{' + ', '.join(repr(value) for value in sorted(values)) + '}'


export_df = grouped[['drugbank_id', 'name', 'description', 'side_effect_name']].copy()

# embeddings are arrays: stack them into a 2-D matrix and give every
# dimension its own emb_<i> column
embeddings_matrix = np.vstack(grouped['embedding'].tolist())
embedding_df = pd.DataFrame(
    embeddings_matrix,
    columns=[f"emb_{i}" for i in range(embeddings_matrix.shape[1])],
)
final_df = pd.concat([export_df.reset_index(drop=True), embedding_df], axis=1)

# Serialise the side-effect sets deterministically before writing.
final_df['side_effect_name'] = final_df['side_effect_name'].map(_sorted_set_literal)

final_df.to_csv('drug_descriptions_embeddings.csv', index=False)
print("Saved: drug_descriptions_embeddings.csv")

Saved: drug_descriptions_embeddings.csv
