# Prerequisites

In [1]:
import os
import json
import random
from datetime import datetime
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data

import networkx as nx
from node2vec import Node2Vec

In [None]:
# --- Reproducibility ---
# Single seed shared by every random number generator in the notebook.
SEED = 42

# --- Signal filters ---
# Rows are kept only when PRR and mean reporting frequency both exceed these values.
PRR_MIN = 1.5
MEAN_FREQ_MIN = 0.01

# --- Label space ---
# Number of most frequent side effects retained as prediction targets.
TOP_K_SE = 50

In [2]:
# Location of the TWOSIDES CSV. Set the TWOSIDES_CSV environment variable to point
# at the file on another machine; when it is unset or empty, the original local
# path is used so existing runs behave as before.
file_path = os.environ.get("TWOSIDES_CSV") or r"C:\Users\Asus\Downloads\TWOSIDES\TWOSIDES.csv"

In [5]:
def set_seeds(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch.
    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Seed value handed to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing GPU-specific to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [6]:
# Seed all random number generators once, using the notebook-wide SEED, before any data is processed.
set_seeds(SEED)

# Preprocessing

In [7]:
# Load the raw CSV with every column read as a string (dtype=str);
# the numeric columns are converted later with pd.to_numeric.
df = pd.read_csv(file_path, dtype=str)

In [8]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,drug_1_rxnorn_id,drug_1_concept_name,drug_2_rxnorm_id,drug_2_concept_name,condition_meddra_id,condition_concept_name,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
0,10355,Temazepam,136411,sildenafil,10003239,Arthralgia,7,149,24,1536,2.91667,0.421275,0.0448718
1,1808,Bumetanide,7824,Oxytocin,10003239,Arthralgia,1,13,2,138,5.0,1.19224,0.0714286
2,221147,POLYETHYLENE GLYCOL 3350,5521,Hydroxychloroquine,10003239,Arthralgia,6,103,20,1070,3.0,0.454505,0.0550459
3,10324,Tamoxifen,8640,Prednisone,10012735,Diarrhoea,18,123,35,1375,5.14286,0.276271,0.12766
4,10355,Temazepam,136411,sildenafil,10012735,Diarrhoea,2,154,37,1523,0.540541,0.721093,0.0128205


In [9]:
# (rows, columns) of the raw table.
df.shape

(42920391, 13)

In [10]:
# Count missing values in each column.
df.isnull().sum()

drug_1_rxnorn_id            0
drug_1_concept_name         0
drug_2_rxnorm_id            0
drug_2_concept_name         0
condition_meddra_id         0
condition_concept_name      0
A                           0
B                           0
C                           0
D                           0
PRR                         0
PRR_error                   0
mean_reporting_frequency    0
dtype: int64

In [11]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

1000

In [12]:
# A record is identified by its (drug 1, drug 2, side effect) triple;
# keep the first row seen for each triple and discard the rest.
df = df.drop_duplicates(
    subset=['drug_1_rxnorn_id', 'drug_2_rxnorm_id', 'condition_meddra_id'],
    keep='first',
)

In [13]:
# Confirm that no fully duplicated rows remain after de-duplication.
df.duplicated().sum()

0

In [14]:
# (rows, columns) after removing duplicates.
df.shape

(42919391, 13)

In [15]:
# Normalize drug pair order so that (a, b) and (b, a) are stored as the same pair:
# the smaller id goes to drug_1 and the larger to drug_2. The ids are still strings
# here (the CSV was read with dtype=str), so "smaller" means lexicographically smaller.
# The concept-name columns are swapped together with the ids; reordering the ids
# alone would leave each name next to the other drug's id.
swap_pair = (df['drug_1_rxnorn_id'] > df['drug_2_rxnorm_id']).to_numpy()

for first_col, second_col in (
    ('drug_1_rxnorn_id', 'drug_2_rxnorm_id'),
    ('drug_1_concept_name', 'drug_2_concept_name'),
):
    first_values = df[first_col].to_numpy()
    second_values = df[second_col].to_numpy()
    # Build both replacement columns before assigning either one, so the second
    # is never computed from an already overwritten first column.
    reordered_first = np.where(swap_pair, second_values, first_values)
    reordered_second = np.where(swap_pair, first_values, second_values)
    df[first_col] = reordered_first
    df[second_col] = reordered_second

# NOTE(review): duplicates were dropped before this reordering, so rows that only
# become identical on (drug_1, drug_2, condition) after the swap are not removed
# here - confirm whether the dataset can contain a pair in both orders.

In [12]:
# Column dtypes and memory footprint of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42919391 entries, 0 to 42920390
Data columns (total 13 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   drug_1_rxnorn_id          object
 1   drug_1_concept_name       object
 2   drug_2_rxnorm_id          object
 3   drug_2_concept_name       object
 4   condition_meddra_id       object
 5   condition_concept_name    object
 6   A                         object
 7   B                         object
 8   C                         object
 9   D                         object
 10  PRR                       object
 11  PRR_error                 object
 12  mean_reporting_frequency  object
dtypes: object(13)
memory usage: 4.5+ GB


In [16]:
# Cast the identifier, count and statistic columns from strings to numbers.
# Values that cannot be parsed become NaN (errors='coerce').
numeric_cols = [
    'drug_1_rxnorn_id', 'drug_2_rxnorm_id', 'condition_meddra_id',
    'A', 'B', 'C', 'D',
    'PRR', 'PRR_error', 'mean_reporting_frequency',
]
for numeric_col in numeric_cols:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [17]:
# Drop low-signal interactions: a row survives only if its PRR and its mean
# reporting frequency are both strictly above their thresholds.
has_strong_prr = df['PRR'].gt(PRR_MIN)
is_frequent_enough = df['mean_reporting_frequency'].gt(MEAN_FREQ_MIN)
df = df.loc[has_strong_prr & is_frequent_enough].copy()

In [None]:
# Rank side effects by the number of remaining rows that mention them and
# keep only rows belonging to the TOP_K_SE most frequent ones.
side_effect_counts = df['condition_meddra_id'].value_counts()
top_side_effects = side_effect_counts.index[:TOP_K_SE].tolist()
is_top_side_effect = df['condition_meddra_id'].isin(top_side_effects)
df = df.loc[is_top_side_effect].copy()

# Creating the Graph

In [16]:
# Collapse the table to one entry per (drug 1, drug 2) pair whose value is the
# set of side-effect ids recorded for that pair (multi-label setup).
pair_columns = ['drug_1_rxnorn_id', 'drug_2_rxnorm_id']
side_effects_by_pair = df.groupby(pair_columns)['condition_meddra_id']
edge_df = side_effects_by_pair.apply(set)
edge_dict = edge_df.to_dict()

In [17]:
# Give every drug id and every side-effect id a consecutive integer index,
# assigned in sorted order of the original ids.
all_drugs = {drug for pair in edge_dict for drug in pair}
drug2idx = dict(zip(sorted(all_drugs), range(len(all_drugs))))

all_side_effects = {se for se_set in edge_dict.values() for se in se_set}
se2idx = dict(zip(sorted(all_side_effects), range(len(all_side_effects))))

In [18]:
# Build one edge per drug pair together with a multi-hot label vector that
# marks which side effects were recorded for that pair.
num_side_effects = len(se2idx)
edge_index = []
labels = []

for drug_pair, pair_side_effects in edge_dict.items():
    edge_index.append([drug2idx[drug] for drug in drug_pair])

    label = np.zeros(num_side_effects, dtype=np.float32)
    label[[se2idx[se] for se in pair_side_effects]] = 1.0
    labels.append(label)

# Transpose the list of pairs into the (2, num_edges) layout; the "_tensor"
# suffix keeps these names distinct from the Python lists above.
edge_index_tensor = torch.tensor(edge_index).t().contiguous()
labels_tensor = torch.tensor(np.array(labels))

In [19]:
# Mirror the edge list in an undirected NetworkX graph, adding edges in the
# same order as they appear in edge_index_tensor.
nx_graph = nx.Graph()
nx_graph.add_edges_from(edge_index_tensor.t().tolist())

# Make sure every drug index is present as a node, even one without any edge;
# nodes that already exist are left unchanged.
nx_graph.add_nodes_from(range(len(drug2idx)))

In [21]:
# Learn 128-dimensional Node2Vec embeddings from random walks over the drug graph.
# The notebook-wide SEED is passed to both the walk generator and the skip-gram
# fit, so this stochastic step no longer ignores the reproducibility setting.
# NOTE(review): with workers=4 the result may still vary between runs; gensim
# documents that fully deterministic training also requires a single worker.
# Confirm whether that trade-off is wanted before changing `workers`.
node2vec = Node2Vec(
    nx_graph,
    dimensions=128,
    walk_length=30,
    num_walks=20,
    p=1,
    q=1,
    workers=4,
    quiet=True,
    seed=SEED,
)
model = node2vec.fit(window=10, min_count=1, batch_words=4, seed=SEED)

# Vocabulary keys are converted back to integer node ids for lookup by index.
node_embeddings_dict = {int(node): model.wv[node] for node in model.wv.index_to_key}

In [22]:
# Assemble the node feature matrix in node-index order (row i belongs to node i).
# The fallback vector takes its width from the trained model instead of a
# hard-coded 128, so it always matches the learned embeddings.
embedding_dim = model.wv.vector_size
zero_embedding = np.zeros(embedding_dim, dtype=np.float32)

# Nodes missing from the embedding model (e.g. isolated nodes, if Node2Vec
# skips them) fall back to an all-zero vector.
embeddings_list = [
    node_embeddings_dict.get(node_id, zero_embedding)
    for node_id in range(len(drug2idx))
]

x_features = torch.tensor(np.array(embeddings_list), dtype=torch.float)

In [23]:
# Report the shape of the node feature matrix built from the embeddings.
print(f"Generated Node2Vec embeddings with shape: {x_features.shape}")

Generated Node2Vec embeddings with shape: torch.Size([1909, 128])


In [24]:
# Bundle node features, the edge list and the per-edge multi-hot labels into a
# single PyTorch Geometric Data object.
data = Data(
    x=x_features,
    edge_index=edge_index_tensor,
    y=labels_tensor,
    num_nodes=len(drug2idx),
)

In [25]:
# Summarize the assembled graph object and its main dimensions.
print(data)
print(f"Shape of initial drug features (data.x): {data.x.shape}")
print(f"Number of nodes (data.num_nodes): {data.num_nodes}")
# data.y holds one label row per edge; its second dimension is the number of side-effect labels.
print(f"Number of DDI labels: {data.y.shape[1]}")

Data(x=[1909, 128], edge_index=[2, 211510], y=[211510, 50], num_nodes=1909)
Shape of initial drug features (data.x): torch.Size([1909, 128])
Number of nodes (data.num_nodes): 1909
Number of DDI labels: 50


In [26]:
# Persist the graph to <parent of the working directory>/processed/ddi_graph.pt,
# creating the "processed" folder if it does not exist yet.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
processed_dir = os.path.join(base_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)

graph_path = os.path.join(processed_dir, 'ddi_graph.pt')
torch.save(data, graph_path)
