In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
import json
import networkx as nx
import requests
from bs4 import BeautifulSoup
!pip install transformers torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
!pip install accelerate
import accelerate
import os, json, glob
# Configure device: prefer CUDA, then Apple-Silicon MPS, otherwise plain CPU.
# MPS is probed explicitly because torch.device('mps') can be constructed on
# any machine but fails at first use when the backend is not available.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Set seeds for reproducibility (torch, numpy and the stdlib RNG each keep their own state)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


# Load model once outside the function (so it doesn't reload on every call)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# NOTE(review): trust_remote_code=True allows the checkpoint repo to run its own
# Python code at load time - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision on an accelerator, float32 on CPU.
    # (Recent transformers releases warn that `torch_dtype` is renamed to `dtype`.)
    torch_dtype=torch.float16 if device.type != "cpu" else torch.float32,
    # Follow the device selected in the configuration step instead of pinning
    # the weights to one specific backend.
    device_map=device,
)


# Initialize the pretrained sentence embedder used for chunk and question embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")


Using device: mps


`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


## Fetching, Chunking, and Building the Graph

`fetch_and_chunk` returns a vector of words parsed from a given URL.


`build_graph` uses the embedder to create a similarity matrix of all the chunks generated from `fetch_and_chunk`. For every pair of chunks whose cosine similarity is greater than `sim_threshold`, an edge is added between the two chunks in the graph representation.

In [2]:
REPO_PATH = "./WildGraphBench"  # path to cloned repo


def load_questions(domain):
    """Load every question record for one benchmark domain.

    Reads ``{REPO_PATH}/QA/{domain}/questions.jsonl`` and parses each
    non-empty line as a JSON document.

    Args:
        domain: Name of the domain sub-folder under ``QA``.

    Returns:
        A list with one parsed JSON object per non-blank line, in file order.

    Raises:
        FileNotFoundError: If the questions file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = f"{REPO_PATH}/QA/{domain}/questions.jsonl"
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        return [json.loads(line) for line in f if line.strip()]


def load_reference_pages(domain, topic, chunk_size=300):
    """Read a topic's reference pages and split them into fixed-size word chunks.

    Every ``*.txt`` file in
    ``{REPO_PATH}/corpus/{domain}/{topic}/reference_pages/`` is split on
    whitespace and cut into consecutive windows of ``chunk_size`` words.

    Args:
        domain: Name of the domain sub-folder under ``corpus``.
        topic: Name of the topic sub-folder inside the domain.
        chunk_size: Maximum number of words per chunk (must be >= 1).

    Returns:
        A list of ``(node_id, chunk_text)`` tuples. ``node_id`` is
        ``"<file name>__<index of the chunk's first word>"``. Files are
        visited in sorted path order so the list is identical on every run.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    folder = f"{REPO_PATH}/corpus/{domain}/{topic}/reference_pages/"
    # Escape the directory part so glob metacharacters in a topic name
    # (e.g. '[' or '?') are matched literally; only '*.txt' is a pattern.
    pattern = glob.escape(folder) + "*.txt"

    all_chunks = []
    # glob returns paths in arbitrary order; sort so chunk indices line up with
    # embeddings computed or saved in another session.
    for filepath in sorted(glob.glob(pattern)):
        # Explicit UTF-8; undecodable bytes are dropped rather than aborting the load.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            words = f.read().split()
        file_name = os.path.basename(filepath)
        for start in range(0, len(words), chunk_size):
            chunk = " ".join(words[start:start + chunk_size])
            all_chunks.append((f"{file_name}__{start}", chunk))
    return all_chunks


def build_graph(all_chunks, sim_threshold):
    """Build a similarity graph over text chunks.

    Each chunk becomes a node (keyed by its node id, with the chunk stored in
    the ``text`` attribute). Two nodes are connected when the cosine
    similarity of their embeddings is strictly greater than ``sim_threshold``;
    the similarity is stored as the edge ``weight``.

    Args:
        all_chunks: Sequence of ``(node_id, chunk_text)`` pairs.
        sim_threshold: Minimum (exclusive) cosine similarity for an edge.

    Returns:
        ``(G, embeddings)`` - the ``networkx.Graph`` and the chunk embeddings
        as returned by the embedder, row ``i`` belonging to ``all_chunks[i]``.
    """
    G = nx.Graph()
    for node_id, chunk in all_chunks:
        G.add_node(node_id, text=chunk)

    embeddings = embedder.encode([text for _, text in all_chunks], convert_to_tensor=True)
    # Move the matrix to the CPU once; reading elements one by one from a
    # tensor inside a Python double loop is extremely slow for many chunks.
    sim_matrix = util.cos_sim(embeddings, embeddings).cpu()

    # Select all pairs i < j above the threshold in one vectorised step.
    # nonzero() yields them in row-major order, i.e. the same order as a
    # nested "for i / for j > i" loop.
    above = torch.triu(sim_matrix > sim_threshold, diagonal=1)
    rows, cols = above.nonzero(as_tuple=True)
    weights = sim_matrix[rows, cols]

    for i, j, weight in zip(rows.tolist(), cols.tolist(), weights.tolist()):
        G.add_edge(all_chunks[i][0], all_chunks[j][0], weight=weight)

    return G, embeddings

## Retrieval and Responding to Prompts

In [3]:
def retrieve(question, graph, all_chunks, chunk_embeddings, top_k, hopping_number):
    """Retrieve context chunks for a question via embedding search plus graph expansion.

    The ``top_k`` chunks most similar to the question are used as seeds; each
    seed is then expanded with up to ``hopping_number`` of its neighbours,
    taking the neighbours with the highest edge ``weight`` first.

    Args:
        question: The question text.
        graph: Similarity graph whose nodes carry a ``text`` attribute.
        all_chunks: Sequence of ``(node_id, chunk_text)`` pairs; position ``i``
            must correspond to row ``i`` of ``chunk_embeddings``.
        chunk_embeddings: Embeddings of the chunks.
        top_k: Number of seed chunks to select by similarity.
        hopping_number: Maximum number of neighbours added per seed.

    Returns:
        De-duplicated chunk texts in a deterministic order: seeds by
        decreasing similarity first, then neighbours in discovery order.
    """
    q_emb = embedder.encode([question], convert_to_tensor=True)
    scores = util.cos_sim(q_emb, chunk_embeddings)[0]
    top_indices = scores.argsort(descending=True)[:top_k].tolist()
    seed_nodes = [all_chunks[idx][0] for idx in top_indices]

    # A dict keeps insertion order, so it de-duplicates without the
    # run-to-run ordering differences a set of strings would introduce.
    selected = dict.fromkeys(seed_nodes)
    for node in seed_nodes:
        if node not in graph:
            continue
        # Use the graph passed in by the caller, not a notebook-level global.
        neighbors = sorted(
            graph[node].items(),
            key=lambda item: item[1].get("weight", 0),
            reverse=True,
        )[:hopping_number]
        for neighbor, _ in neighbors:
            selected.setdefault(neighbor, None)

    return [graph.nodes[n]["text"] for n in selected if n in graph.nodes]


def respond(question, context_chunks):
    """Answer a question with the chat model, grounded in the given context chunks.

    Args:
        question: The question text.
        context_chunks: Iterable of context strings; they are joined with
            blank lines and placed in front of the question in the user turn.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped), with special tokens removed.
    """
    max_prompt_tokens = 1800
    system_prompt = "Answer concisely using only the provided context. If the answer isn't in the context, say so."
    context = "\n\n".join(context_chunks)

    # Trim the *context* before building the prompt. The question sits at the
    # end of the prompt, so relying only on right-side truncation of the whole
    # prompt would cut off the question whenever the context is long.
    question_len = len(tokenizer(question, add_special_tokens=False)["input_ids"])
    system_len = len(tokenizer(system_prompt, add_special_tokens=False)["input_ids"])
    template_margin = 64  # rough allowance for chat-template markup and the "Context:/Question:" labels
    context_budget = max(max_prompt_tokens - question_len - system_len - template_margin, 0)
    context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) > context_budget:
        context = tokenizer.decode(context_ids[:context_budget], skip_special_tokens=True)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
    ]
    # Tokenize using the model's chat template; truncation stays on as a safety net
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        truncation=True,
        max_length=max_prompt_tokens
    ).to(model.device)

    # With return_dict=True the result is a mapping, not a tensor, so the
    # prompt length has to be read from the input_ids entry.
    prompt_len = inputs["input_ids"].shape[-1]

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=False,       # greedy decoding - deterministic, good for QA
            temperature=1.0,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (not the prompt)
    generated = outputs[0][prompt_len:]
    return tokenizer.decode(generated, skip_special_tokens=True)
    

## The Whole Shebang

In [4]:
def graphrag(question, ref_urls, sim_threshold=0.8, top_k=3, hopping_number=2):
    """Run the full pipeline: build the graph, retrieve context, generate an answer.

    Args:
        question: The question text.
        ref_urls: Despite its historical name, this must be the list of
            ``(node_id, chunk_text)`` pairs to index (for example the output
            of ``load_reference_pages``), because ``build_graph`` consumes
            chunks, not URLs.
        sim_threshold: Cosine-similarity threshold passed to ``build_graph``.
        top_k: Number of seed chunks passed to ``retrieve``.
        hopping_number: Neighbours per seed passed to ``retrieve``.

    Returns:
        The answer string produced by ``respond``.
    """
    all_chunks = ref_urls
    # build_graph returns exactly (graph, embeddings); the chunk list is the input.
    graph, embeddings = build_graph(all_chunks, sim_threshold)
    context = retrieve(question, graph, all_chunks, embeddings, top_k, hopping_number)
    return respond(question, context)

# Benchmarking

In [5]:
# Benchmark target: the WildGraphBench domain and the topic whose pages are indexed.
DOMAIN = "culture"
TOPIC = "Marvel Cinematic Universe"

# Load the domain's questions and the topic's chunked reference pages.
questions = load_questions(DOMAIN)
all_chunks = load_reference_pages(DOMAIN, TOPIC)
print("checkpoint 1")

# Embed every chunk and connect pairs whose cosine similarity exceeds 0.8.
G, embeddings = build_graph(all_chunks, sim_threshold=0.8)
print("checkpoint 2")

checkpoint 1
checkpoint 2


In [8]:
# Persist the similarity graph (GML format) to the working directory.
nx.write_gml(G, "graph.gml")
# Persist the chunk embeddings returned by build_graph alongside the graph.
torch.save(embeddings, "embeddings.pt")

In [1]:
# Answer every benchmark question and collect the results.
# Requires `questions`, `G`, `all_chunks` and `embeddings` from the earlier cells.
predictions = []
for i, q in enumerate(questions):
    print("question number ", i)
    question_text = q["question"]
    context = retrieve(question_text, G, all_chunks, embeddings, top_k=3, hopping_number=2)
    answer = respond(question_text, context)
    record = {"question": question_text, "prediction": answer}
    predictions.append(record)
    print(f"Q: {q['question']}\nA: {answer}\n")

print('checkpoint 3')
# Save to file: one JSON object per line
with open("predictions.jsonl", "w") as f:
    f.writelines(json.dumps(p) + "\n" for p in predictions)

NameError: name 'questions' is not defined