In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
import json
import networkx as nx
import requests
from bs4 import BeautifulSoup
!pip install transformers torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
!pip install accelerate
import accelerate
import os, json, glob
# Configure device
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
print(f"Using device: {device}")

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


# Load model once outside the function (so it doesn't reload on every call)
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # use float32 if on CPU
    device_map=torch.device('cuda'),           # automatically uses GPU if availa
)


# intialize pretrained thing, for now just the embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Google drive utils etc. in the event of running in CoLab

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
import os
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'CS6540/' #
GOOGLE_DRIVE_PATH = os.path.join('drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive/
['gstorer-A1-MLP.ipynb', 'gstorer-A1-CNN.ipynb', 'content', 'gstorer-A3-RNN.ipynb', 'WildGraphBench', 'transfer_learning.ipynb']


## Fetching, Chunking, and Building the Graph

`fetch_and_chunk` returns a vector of words from parsed from a given url


`build_graph` uses the embedder to create a similarity matrix of all the chunks generated from `fetch_and_chunk`. If entries in the similarity matrix have cosine similarity > `sim_threshold`, then add an edge between the chunks in the graph representation.

In [15]:
REPO_PATH = '/content/drive/MyDrive/CS6540/WildGraphBench'


def load_questions(domain):
    path = f"{REPO_PATH}/QA/{domain}/questions.jsonl"
    with open(path) as f:
        return [json.loads(line) for line in f]


def load_reference_pages(domain, topic, chunk_size=300):
    folder = f"{REPO_PATH}/corpus/{domain}/{topic}/reference_pages/"
    all_chunks = []
    for filepath in glob.glob(folder + "*.txt"):
        with open(filepath, "r", errors="ignore") as f:
            text = f.read()
        words = text.split()
        for i in range(0, len(words), chunk_size):
            chunk = " ".join(words[i:i+chunk_size])
            node_id = f"{os.path.basename(filepath)}__{i}"
            all_chunks.append((node_id, chunk))
    return all_chunks


def build_graph(all_chunks, sim_threshold):
    G = nx.Graph()
    for node_id, chunk in all_chunks:
        G.add_node(node_id, text=chunk)

    embeddings = embedder.encode([c[1] for c in all_chunks], convert_to_tensor=True)
    sim_matrix = util.cos_sim(embeddings, embeddings)

    for i in range(len(all_chunks)):
        for j in range(i + 1, len(all_chunks)):
            if sim_matrix[i][j] > sim_threshold:
                G.add_edge(all_chunks[i][0], all_chunks[j][0], weight=float(sim_matrix[i][j]))

    return G, embeddings

## Retrieval and Responding to Prompts

In [23]:
def retrieve(question, graph, all_chunks, chunk_embeddings, top_k, hopping_number):
    q_emb = embedder.encode([question], convert_to_tensor=True)
    scores = util.cos_sim(q_emb, chunk_embeddings)[0]
    top_results = scores.argsort(descending=True)[:top_k].tolist()
    seed_nodes = [all_chunks[i][0] for i in top_results]
    expanded = set(seed_nodes)
    for node in seed_nodes:
        neighbors = sorted(G[node].items(), key = lambda x: x[1].get("weight", 0), reverse=True)[:hopping_number]
        expanded.update([n for n, _ in neighbors])
    return [graph.nodes[n]["text"] for n in expanded if n in graph.nodes]

def respond(question, context_chunks):
    context = "\n\n".join(context_chunks)
    messages = [
        {"role": "system", "content": "Answer concisely using only the provided context. If the answer isn't in the context, say so."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
    ]
    # Tokenize using the model's chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict = True,
        truncation = True,
        max_length = 1800
    ).to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=False,       # greedy decoding — deterministic, good for QA
            temperature=1.0,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (not the prompt)
    generated = outputs[0][inputs['input_ids'].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

## The Whole Shabang

In [17]:
def graphrag(question, ref_urls):
    G, all_chunks, embeddings = build_graph(ref_urls)
    context = retrieve(question, G, all_chunks, embeddings)
    return respond(question, context)

# Benchmarking

In [18]:
DOMAIN = "culture"
TOPIC = 'Marvel Cinematic Universe'

questions = load_questions(DOMAIN)
all_chunks = load_reference_pages(DOMAIN, TOPIC)
print('checkpoint 1')
G, embeddings = build_graph(all_chunks, sim_threshold = 0.8)
print('checkpoint 2')

checkpoint 1
checkpoint 2


In [20]:
nx.write_gml(G, "/content/drive/MyDrive/CS6540/graph.gml")
torch.save(embeddings, "/content/drive/MyDrive/CS6540/embeddings.pt")

In [24]:
predictions = []
i = 0
for q in questions:
    print("question number ", i)
    i += 1
    context = retrieve(q["question"], G, all_chunks, embeddings, top_k = 3, hopping_number = 2)
    answer = respond(q["question"], context)
    predictions.append({
        "question": q["question"],
        "prediction": answer
    })
    print(f"Q: {q['question']}\nA: {answer}\n")

print('checkpoint 3')
# Save to file
with open("predictions.jsonl", "w") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

question number  0
Q: For the 'A Mini Marvel' commercial, what distinct visual effects processes did Luma Pictures utilize for creating the Hulk versus the technique used for Ant-Man?
A: 11](http://i0.wp.com/www.lumapictures.com/wp-content/uploads/2016/02/LumaPictures_a-mini-marvel_7-1.jpg?fit=2048%2C853?fit=1170,450) ![Image 12](http://i1.wp.com/www.lumapictures.com/wp-content/uploads/2016/02/LumaPictures_a-mini-marvel_8-1.jpg?fit=2048%2C853?fit=1170,450) ![Image 13](http://i0.wp.com/www.lumapictures.com/wp-content/uploads/2016/02/LumaPictures_a-mini-marvel_9-1.jpg?fit=2048%2C853?fit=1170,450) ![Image 14](http://i1.wp.com/www.lumapictures.com/wp-content/uploads/2016/02/LumaPictures_a-mini-marvel_10

question number  1
Q: What was the premiere date for _Marvel Studios: Assembled_ on Disney+, and what was the release pattern for its specials, beginning with the first one?
A: /stan-lee-marvel-movie-cameos-list-complete/)

# Here are all the new Marvel, Star Wars, and other projects Disne