# SETUP ENVIRONMENT

In [6]:
import os
import sys
import subprocess

# Set environment variables (for downstream retrieval)
%env KGQA_DATASET=webqsp
%env MODEL_NAME=gte-large-en-v1.5

# Constants
ROOT = "/kaggle/working"
REPO_URL = "https://github.com/Graph-COM/SubgraphRAG.git"
DATASET = os.environ.get("KGQA_DATASET", "cwq")
MODEL_NAME = os.environ.get("MODEL_NAME", "gte-large-en-v1.5")
BASEDIR = os.path.join(ROOT, "SubgraphRAG")

# Print status
print("📁 Environment configured")
print(f"   Dataset : {DATASET}")
print(f"   Model   : {MODEL_NAME}")
print(f"   RepoDir : {BASEDIR}")

import torch
print(f"🚀 Torch version: {torch.__version__}")
print(f"🧠 CUDA available: {torch.cuda.is_available()}")
print(f"💻 CUDA device count: {torch.cuda.device_count()}")
print(f"🖥  Current device: {torch.cuda.current_device() if torch.cuda.is_available() else 'CPU'}")


env: KGQA_DATASET=webqsp
env: MODEL_NAME=gte-large-en-v1.5
📁 Environment configured
   Dataset : webqsp
   Model   : gte-large-en-v1.5
   RepoDir : /kaggle/working/SubgraphRAG
🚀 Torch version: 2.6.0+cu124
🧠 CUDA available: True
💻 CUDA device count: 2
🖥  Current device: 0


# INSTALL DEPENDENCIES

In [2]:
import os
import subprocess
import sys

# ────────────────────────────────────────────────────────────────────────────────
# Constants
# ────────────────────────────────────────────────────────────────────────────────
REPO_URL = "https://github.com/Graph-COM/SubgraphRAG.git"
ROOT     = "/kaggle/working"
BASEDIR  = os.path.join(ROOT, "SubgraphRAG")

# ────────────────────────────────────────────────────────────────────────────────
# Helper to install via pip without failing the cell
# ────────────────────────────────────────────────────────────────────────────────
def pip_install(packages, force=False, find_links=None, index_url=None):
    cmd = [sys.executable, "-m", "pip", "install"]
    if force:
        cmd += ["--force-reinstall", "--no-cache-dir"]
    cmd += ["-q"] + packages
    if find_links:
        cmd += ["-f", find_links]
    if index_url:
        cmd += ["-f", index_url]
    try:
        subprocess.run(cmd, check=True)
        print(f"✅ Installed: {' '.join(packages)}")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Skipped install of {' '.join(packages)} (error: {e})")

# ────────────────────────────────────────────────────────────────────────────────
# 1. Clone SubgraphRAG if needed
# ────────────────────────────────────────────────────────────────────────────────
if not os.path.exists(BASEDIR):
    print(f"📦 Cloning SubgraphRAG into {BASEDIR}…")
    try:
        subprocess.run(["git", "clone", "--depth", "1", REPO_URL, BASEDIR], check=True)
        print("✅ Repo cloned\n")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Could not clone repo (continuing): {e}")
else:
    print("✅ Repo already exists\n")

os.chdir(BASEDIR)

# ────────────────────────────────────────────────────────────────────────────────
# 2. Pin core numerical stack
# ────────────────────────────────────────────────────────────────────────────────
print("🔧 Pinning core numerical stack (numpy, scipy)")
pip_install(["numpy==1.26.4"], force=True)
pip_install(["scipy==1.11.4"], force=True)

# ────────────────────────────────────────────────────────────────────────────────
# 3. Install/Verify PyTorch + CUDA (cu118)
# ────────────────────────────────────────────────────────────────────────────────
print("🧠 Checking existing torch installation")
try:
    import torch
    print(f"   torch {torch.__version__}, cuda:{torch.version.cuda}, available:{torch.cuda.is_available()}")
    need_torch_install = not (torch.cuda.is_available() and torch.version.cuda.startswith("11.8"))
except ImportError:
    need_torch_install = True

if need_torch_install:
    print("🔄 Installing torch/cu118 stack")
    pip_install(
        ["torch==2.1.0+cu118", "torchvision==0.15.0+cu118"],
        force=True,
        index_url="https://download.pytorch.org/whl/torch_stable.html"
    )

# ────────────────────────────────────────────────────────────────────────────────
# 4. Install torch-geometric stack
# ────────────────────────────────────────────────────────────────────────────────
print("📦 Installing torch-geometric stack")
pip_install(
    ["torch-scatter", "torch-sparse", "torch-geometric"],
    force=True,
    find_links="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
)

# ────────────────────────────────────────────────────────────────────────────────
# 5. Install SubgraphRAG requirements
# ────────────────────────────────────────────────────────────────────────────────
print("📦 Installing SubgraphRAG requirements")
pip_install(["-r", "retrieve/requirements/gte_large_en_v1-5.txt"])
pip_install(["-r", "retrieve/requirements/retriever.txt"])

# ────────────────────────────────────────────────────────────────────────────────
# 6. Install xformers
# ────────────────────────────────────────────────────────────────────────────────
print("⚙️ Installing xformers")
pip_install(["xformers==0.0.23.post1"], force=True)


# ────────────────────────────────────────────────────────────────────────────────
# 7. Ensure torch_geometric is available
# ────────────────────────────────────────────────────────────────────────────────
import torch
try:
    import torch_geometric
    print("✅ torch_geometric available\n")
except ImportError:
    print("⚙️ Installing torch-geometric stack")
    version = torch.__version__.split("+")[0]        # e.g. "2.6.0"
    cuda_tag = torch.version.cuda.replace(".", "")  # e.g. "124"
    pyg_link = f"https://data.pyg.org/whl/torch-{version}+cu{cuda_tag}.html"
    subprocess.run([
        sys.executable, "-m", "pip", "install", "-q",
        "torch-scatter", "torch-sparse", "torch-geometric",
        "-f", pyg_link
    ], check=True)
    import torch_geometric
    print("✅ torch_geometric installed\n")

# ────────────────────────────────────────────────────────────────────────────────
# 8. Validate critical libraries
# ────────────────────────────────────────────────────────────────────────────────
print("\n🔍 Validating critical libraries")
errors = []
try:
    import torch
    print(f"✅ torch:     {torch.__version__}")
    print(f"🧠 CUDA avail:{torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"🖥️  Device:    {torch.cuda.get_device_name(0)}")
except Exception as e:
    errors.append(f"torch import error: {e}")

try:
    import numpy
    print(f"✅ numpy:     {numpy.__version__}")
except Exception as e:
    errors.append(f"numpy import error: {e}")

try:
    import scipy
    print(f"✅ scipy:     {scipy.__version__}")
except Exception as e:
    errors.append(f"scipy import error: {e}")

try:
    import xformers
    print(f"✅ xformers:  {xformers.__version__}")
except Exception as e:
    errors.append(f"xformers import error: {e}")

if errors:
    print("\n⚠️ Validation errors:")
    for err in errors:
        print("  -", err)
else:
    print("\n🎉 All dependencies installed and verified!")


✅ Repo already exists

🔧 Pinning core numerical stack (numpy, scipy)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.0/61.0 kB 4.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 186.6 MB/s eta 0:00:00


ERROR: Operation cancelled by user


KeyboardInterrupt: 

TRAIN THE MODEL

In [1]:
import os
import subprocess
import sys

# ────────────────────────────────────────────────────────────────────────────────
# 0. Disable WANDB so training doesn’t pause for an API key
# ────────────────────────────────────────────────────────────────────────────────
os.environ["WANDB_MODE"] = "disabled"

# ────────────────────────────────────────────────────────────────────────────────
# 1. Kickoff
# ────────────────────────────────────────────────────────────────────────────────
print("🚀 Starting SubgraphRAG pipeline")
print(f"   Dataset : {DATASET}")
print(f"   Model   : {MODEL_NAME}")
print(f"   BaseDir : {BASEDIR}\n")

# ────────────────────────────────────────────────────────────────────────────────
# 2. Verify config files exist
# ────────────────────────────────────────────────────────────────────────────────
emb_cfg = os.path.join(BASEDIR, "retrieve", "configs", "emb", MODEL_NAME, f"{DATASET}.yaml")
ret_cfg = os.path.join(BASEDIR, "retrieve", "configs", "retriever", f"{DATASET}.yaml")
if not os.path.isfile(emb_cfg):
    raise FileNotFoundError(f"❌ Missing embedding config: {emb_cfg}")
if not os.path.isfile(ret_cfg):
    raise FileNotFoundError(f"❌ Missing retriever config: {ret_cfg}")
print("✅ Configs found\n")

# ────────────────────────────────────────────────────────────────────────────────
# 3. Precompute Embeddings (skip if already cached)
# ────────────────────────────────────────────────────────────────────────────────
emb_cache = os.path.join(
    BASEDIR, "retrieve", "data_files", DATASET, "emb", MODEL_NAME, "train.pth"
)
if not os.path.isfile(emb_cache):
    os.chdir(os.path.join(BASEDIR, "retrieve"))
    print(f"🧠 Precomputing embeddings for dataset '{DATASET}'…")
    subprocess.run([sys.executable, "emb.py", "-d", DATASET], check=True)
    print("✅ Embedding complete\n")
else:
    print(f"✅ Found cached embeddings at {emb_cache} — skipping embedding\n")

# ────────────────────────────────────────────────────────────────────────────────
# 4. Train Retriever
# ────────────────────────────────────────────────────────────────────────────────
os.chdir(os.path.join(BASEDIR, "retrieve"))
print(f"🚀 Training retriever on '{DATASET}'…")
subprocess.run([sys.executable, "train.py", "-d", DATASET], check=True)
print("✅ Training complete\n")




# ────────────────────────────────────────────────────────────────────────────────
# 5. Locate the most recent checkpoint under retrieve_dir
# ────────────────────────────────────────────────────────────────────────────────

retrieve_dir = os.path.join(BASEDIR, "retrieve")

ckpt_dirs = [
    d for d in os.listdir(retrieve_dir)
    if d.startswith(f"{DATASET}_") and os.path.isdir(os.path.join(retrieve_dir, d))
]
if not ckpt_dirs:
    raise FileNotFoundError(f"❌ No checkpoint directories found under {retrieve_dir}")
ckpt_dirs.sort(
    key=lambda d: os.path.getmtime(os.path.join(retrieve_dir, d)),
    reverse=True
)
latest_ckpt = ckpt_dirs[0]
ckpt_path = os.path.join(retrieve_dir, latest_ckpt, "cpt.pth")
if not os.path.isfile(ckpt_path):
    raise FileNotFoundError(f"❌ Checkpoint file not found at {ckpt_path}")
print(f"🔖 Found checkpoint: {ckpt_path}\n")



# ────────────────────────────────────────────────────────────────────────────────
# 6. Run inference, pointing to the checkpoint path relative to retrieve_dir
# ────────────────────────────────────────────────────────────────────────────────
os.chdir(retrieve_dir)
ckpt_arg = os.path.join(latest_ckpt, "cpt.pth")
print(f"🔍 Running inference with '{ckpt_arg}'…")
subprocess.run(
    [sys.executable, "inference.py", "-p", ckpt_arg],
    check=True
)
print("✅ Inference complete\n")

# ────────────────────────────────────────────────────────────────────────────────
# 7. Locate retrieval result
# ────────────────────────────────────────────────────────────────────────────────
result_path = os.path.join(retrieve_dir, latest_ckpt, "retrieval_result.pth")
if not os.path.isfile(result_path):
    raise FileNotFoundError(f"❌ retrieval_result.pth not found at {result_path}")
print(f"📂 Retrieval result: {result_path}\n")

# ────────────────────────────────────────────────────────────────────────────────
# 8. Evaluate
# ────────────────────────────────────────────────────────────────────────────────
res_arg = os.path.join(latest_ckpt, "retrieval_result.pth")
print("📊 Evaluating retrieval…")
subprocess.run(
    [sys.executable, "eval.py", "-d", DATASET, "-p", res_arg],
    check=True
)
print("🎉 Pipeline completed successfully!\n")

🚀 Starting SubgraphRAG pipeline


NameError: name 'DATASET' is not defined