In [1]:
# Switch the kernel's working directory to the project folder, then print the
# resulting directory so the captured output records where the notebook ran.
# NOTE(review): the target is a relative path, so this presumably only succeeds
# when the kernel starts in the parent directory -- confirm before re-running.
%cd graph-enhanced-retrieval-qa
!pwd

/home/sslab/24m0786/graph-enhanced-retrieval-qa
/home/sslab/24m0786/graph-enhanced-retrieval-qa


In [5]:
import sys
sys.path.append('..')

import torch
from sentence_transformers import SentenceTransformer, util
from torch_geometric.data import Data
from pprint import pprint
import numpy as np

from src.data_loader import load_dataset, process_sample

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Load the embedding model from a local snapshot directory (the path points at
# a BAAI/bge-m3 snapshot inside the Hugging Face cache) onto the chosen device.
model_path = '/home/sslab/24m0786/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181'
model = SentenceTransformer(model_path, device=device)

# Read the training file and push its first entry through the project's
# preprocessing helper; later cells read `processed_data`.
train_dataset = load_dataset('data/raw/train.json')
sample = train_dataset[0]
processed_data = process_sample(sample)

# Quick look at the processed example: its question and how many passages it has.
print("\n--- Sample Data ---")
print("Question:", processed_data['question'])
passage_count = len(processed_data['passages'])
print(f"Number of passages: {passage_count}")

Using device: cuda

--- Sample Data ---
Question: Are director of film Move (1970 Film) and director of film Méditerranée (1963 Film) from the same country?
Number of passages: 10


In [6]:
# Split the passages mapping into two parallel lists: keys are kept as titles
# and values as the texts to embed. Index i in both lists refers to the same
# passage, and therefore to node i of the graph built in later cells.
passages_map = processed_data['passages']
passage_titles = list(passages_map)
passage_texts = [text for text in passages_map.values()]

# Embed every passage text in one call; the result is returned as a tensor
# whose rows are the node feature vectors.
encode_options = dict(
    convert_to_tensor=True,
    batch_size=32,
    show_progress_bar=True,
)
node_features = model.encode(passage_texts, **encode_options)

print("\n--- Node Features ---")
print("Shape of node feature matrix (x):", node_features.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)



--- Node Features ---
Shape of node feature matrix (x): torch.Size([10, 1024])


In [None]:
# Pairwise cosine similarity between every pair of passage embeddings.
similarity_matrix = util.cos_sim(node_features, node_features)
SIMILARITY_THRESHOLD = 0.6

# (row, column) index tensors of every entry strictly above the threshold.
edge_indices_similarity = torch.where(similarity_matrix > SIMILARITY_THRESHOLD)

# Pull both index tensors to Python lists in one bulk transfer each, rather
# than calling `.item()` once per index (each such call forces a separate
# host/device sync when the tensor lives on the GPU).
source_nodes, target_nodes = (indices.tolist() for indices in edge_indices_similarity)

# Keep each (u, v) pair in the order torch.where produced it, dropping
# self-loops: a passage compared with itself is on the diagonal and is not an edge.
similarity_edges = [
    (u, v)
    for u, v in zip(source_nodes, target_nodes)
    if u != v
]

print("\n--- Semantic Edges ---")
print(f"Found {len(similarity_edges)} edges based on similarity > {SIMILARITY_THRESHOLD}")
print("Example edges:", similarity_edges[:5])


--- Semantic Edges ---
Found 0 edges based on similarity > 0.6
Example edges: []


In [None]:
# Connect each passage to the one that follows it in list order, emitting the
# edge in both directions so the chain can be traversed either way.
# For n passages this yields 2 * (n - 1) directed edges (none when n <= 1).
num_passages = len(passage_titles)
sequential_edges = [
    edge
    for left in range(num_passages - 1)
    for edge in ((left, left + 1), (left + 1, left))
]

print("\n--- Sequential Edges ---")
print(f"Found {len(sequential_edges)} sequential edges.")
print("Example edges:", sequential_edges[:18])


--- Sequential Edges ---
Found 18 sequential edges.
Example edges: [(0, 1), (1, 0), (1, 2), (2, 1), (2, 3), (3, 2), (3, 4), (4, 3), (4, 5), (5, 4), (5, 6), (6, 5), (6, 7), (7, 6), (7, 8), (8, 7), (8, 9), (9, 8)]


In [None]:
# Merge both edge sources and drop duplicates. Sorting the de-duplicated pairs
# gives edge_index a canonical column order instead of whatever order set
# iteration happens to produce, so the tensor is reproducible run to run.
all_edges = similarity_edges + sequential_edges
unique_edges = sorted(set(all_edges))

print("\n--- Combined Edges ---")
print(f"Total unique edges: {len(unique_edges)}")

# Build a (2, num_edges) tensor: row 0 holds source nodes, row 1 target nodes.
# An empty edge list is handled separately so the result still has shape (2, 0)
# rather than the shape-(0,) tensor that torch.tensor([]) would give.
if unique_edges:
    edge_index = torch.tensor(unique_edges, dtype=torch.long).t().contiguous()
else:
    edge_index = torch.empty((2, 0), dtype=torch.long)

print("Shape of edge_index tensor:", edge_index.shape)


--- Combined Edges ---
Total unique edges: 18
Shape of edge_index tensor: torch.Size([2, 18])


In [None]:
# Assemble the graph object from the node feature matrix and the edge list.
# node_features comes back from the encoder on the model's device, whereas
# edge_index was built from Python lists and therefore sits on the CPU. Move
# edge_index onto the same device as the features so the graph does not mix
# devices (a no-op when both are already on the same one).
graph_data = Data(
    x=node_features,
    edge_index=edge_index.to(node_features.device),
)

print("\n--- Final Graph Object ---")
print(graph_data)


--- Final Graph Object ---
Data(x=[10, 1024], edge_index=[2, 18])
