In [1]:
# Mount Google Drive at /content/drive. Later cells keep the dataset, cache,
# config and results under /content/drive/MyDrive, so this cell must finish
# successfully before they run.
from google.colab import drive
drive.mount('/content/drive')


KeyboardInterrupt: 

# InsightSpike-AI: Large Scale RAG (Colab Pro / A100)

This notebook runs a large-scale RAG experiment with the Exp2-4 lite pipeline.

Notes:
- Runtime > Change runtime type > GPU (A100 if available).
- Results, cache, and dataset are stored on Google Drive.


In [None]:
# Print the NVIDIA driver/GPU status for this runtime so the attached GPU can be
# checked against the recommendation in the notes (A100 if available).
!nvidia-smi


## 1. Setup Environment
Clone the repo and install dependencies.


In [None]:
import os

REPO_URL = 'https://github.com/miyauchikazuyoshi/InsightSpike-AI.git'
REPO_DIR = '/content/InsightSpike-AI'

if not os.path.exists(REPO_DIR):
    !git clone --depth 1 {REPO_URL}

%cd /content/InsightSpike-AI
!pip -q install -e .
!pip -q install sentence-transformers scikit-learn


## 2. Configure Scale
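Adjust the sizes below for larger or smaller runs. `TOTAL_DOCS` must be divisible by `TOTAL_QUERIES`; each query is assigned `TOTAL_DOCS / TOTAL_QUERIES` documents.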


In [None]:
import os

# Mountpoint used by the drive.mount(...) call in the first cell.
DRIVE_MOUNT = '/content/drive'
DRIVE_ROOT = '/content/drive/MyDrive/insightspike/rag_large_scale'

# Fail fast when the Drive mount is missing. Otherwise os.makedirs would quietly
# create a local directory tree under /content/drive, and every artifact would
# be written to the ephemeral Colab disk instead of Google Drive.
# The check only applies while DRIVE_ROOT actually lives under the mountpoint,
# so pointing DRIVE_ROOT at some other location still works.
if DRIVE_ROOT.startswith(DRIVE_MOUNT + '/') and not os.path.ismount(DRIVE_MOUNT):
    raise RuntimeError(
        f'Google Drive is not mounted at {DRIVE_MOUNT}; '
        'run the drive.mount cell at the top of the notebook first.'
    )
os.makedirs(DRIVE_ROOT, exist_ok=True)

# Experiment scale: total corpus size and number of queries to generate.
TOTAL_DOCS = 50000
TOTAL_QUERIES = 5000
# Upper bound on queries passed to the experiment config (defaults to all of them).
MAX_QUERIES = TOTAL_QUERIES

# Reject non-positive sizes explicitly: a zero TOTAL_QUERIES would otherwise
# surface as a bare ZeroDivisionError, and a zero TOTAL_DOCS would pass the
# divisibility check and only fail later during dataset generation.
if TOTAL_DOCS <= 0 or TOTAL_QUERIES <= 0:
    raise ValueError('TOTAL_DOCS and TOTAL_QUERIES must be positive integers')
if TOTAL_DOCS % TOTAL_QUERIES != 0:
    raise ValueError('TOTAL_DOCS must be divisible by TOTAL_QUERIES')
DOCS_PER_QUERY = TOTAL_DOCS // TOTAL_QUERIES

# All artifacts live under DRIVE_ROOT; the dataset file name encodes the scale
# so differently sized runs do not overwrite each other.
DATASET_PATH = os.path.join(DRIVE_ROOT, f'synthetic_{TOTAL_DOCS}_docs_{TOTAL_QUERIES}_queries.jsonl')
RESULTS_DIR = os.path.join(DRIVE_ROOT, 'results')
CACHE_DIR = os.path.join(DRIVE_ROOT, 'cache')
CONFIG_PATH = os.path.join(DRIVE_ROOT, 'rag_large_scale_config.yaml')

os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

print('Dataset:', DATASET_PATH)
print('Results:', RESULTS_DIR)
print('Cache:', CACHE_DIR)


## 3. Generate Large Dataset
Creates a synthetic dataset on Drive (skipped if the file already exists).


In [None]:
import json
import random

# Generate the synthetic JSONL dataset once; later runs reuse the file on Drive.
# Each line is one JSON object with a query, its ground-truth text and the
# DOCS_PER_QUERY documents generated for that query.
if not os.path.exists(DATASET_PATH):
    # The generation below does not draw random numbers; the seed call is kept
    # so the global RNG state after this cell matches earlier runs.
    random.seed(42)
    print(f'Generating {TOTAL_QUERIES} queries and {TOTAL_DOCS} documents...')

    # Write to a temporary sibling file and move it into place only after the
    # whole dataset has been written. If the cell is interrupted part-way,
    # DATASET_PATH never exists in a truncated state, so the "exists, skipping"
    # branch cannot pick up a partial file on the next run.
    tmp_path = DATASET_PATH + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        doc_counter = 0
        for q_idx in range(TOTAL_QUERIES):
            batch_docs = []
            for _ in range(DOCS_PER_QUERY):
                doc_id = f'doc_{doc_counter}'
                text = (
                    f'This is the content of document {doc_counter}. '
                    f'It contains information relevant to query {q_idx} if selected as ground truth.'
                )
                metadata = {'id': doc_id, 'source': 'synthetic'}
                batch_docs.append({'id': doc_id, 'text': text, 'metadata': metadata})
                doc_counter += 1

            # The first document of each batch is the one the query asks about.
            target_doc = batch_docs[0]
            query_text = 'What is the content of document {}?'.format(target_doc['id'])
            ground_truth = target_doc['text']

            entry = {
                'query': query_text,
                'ground_truth': ground_truth,
                'documents': batch_docs,
            }
            f.write(json.dumps(entry) + '\n')

    # Publish the finished file under its final name in a single rename.
    os.replace(tmp_path, DATASET_PATH)
    print('Created:', DATASET_PATH)
else:
    print('Dataset exists, skipping:', DATASET_PATH)


## 4. Create Configuration
Writes a config file pointing to the Drive dataset and output directory.


In [None]:
import json


def _yaml_quoted(value):
    """Return ``value`` as a double-quoted YAML scalar.

    A JSON string literal is also a valid YAML double-quoted scalar, so paths
    containing characters that are special in plain YAML scalars (such as
    ``: `` or `` #``) are preserved verbatim. ``ensure_ascii=False`` keeps
    non-ASCII path components as-is; the file is written as UTF-8 below.
    """
    return json.dumps(str(value), ensure_ascii=False)


# Render the experiment configuration. Only the Drive paths and the scale
# values are interpolated; every other setting is a fixed literal.
config_text = f'''
experiment:
  name: exp23_large_scale_{TOTAL_DOCS}
  output_dir: {_yaml_quoted(RESULTS_DIR)}
  seed: 42
  target_ag_rate: 0.08
  target_dg_rate: 0.04

dataset:
  path: {_yaml_quoted(DATASET_PATH)}
  max_queries: {MAX_QUERIES}

embedding:
  model: sentence-transformers/all-MiniLM-L6-v2
  normalize: true
  cache_dir: {_yaml_quoted(CACHE_DIR)}

retrieval:
  top_k: 10
  bm25_weight: 0.5
  embedding_weight: 0.5
  expansion_hops: 1

gedig:
  lambda: 0.6
  use_multihop: true
  max_hops: 3
  decay_factor: 0.7
  sp_beta: 0.2
  theta_ag: 2.0
  theta_dg: 0.05
  ig_mode: raw
  spike_mode: and

psz:
  acceptance_threshold: 0.6
  fmr_threshold: 0.02
  latency_p50_threshold_ms: 200

baselines:
  - name: static_rag
    type: static
  - name: gedig_ag_dg
    type: gedig

logging:
  save_step_logs: false
  save_memory_snapshots: false
  snapshot_interval: 200
'''

# Overwrite the config on every run so it always reflects the current settings.
with open(CONFIG_PATH, 'w', encoding='utf-8') as f:
    f.write(config_text)

print('Wrote config:', CONFIG_PATH)


## 5. Run Experiment
This may take time at scale. Progress and results will be saved to Drive.


In [None]:
# Run the experiment module in a subprocess, passing the config file written to
# CONFIG_PATH. The path is quoted so the shell treats it as a single argument.
# NOTE(review): the module is addressed by its package path, so this presumably
# relies on the working directory still being the repo root set by the earlier
# %cd — confirm before running cells out of order.
!python -m experiments.exp2to4_lite.src.run_experiment --config "{CONFIG_PATH}"


## 6. Inspect Results


In [None]:
# Show the last 20 lines of a long-format, human-readable listing of RESULTS_DIR.
!ls -lh "{RESULTS_DIR}" | tail -n 20
