# RAG Pipeline - Experiment Notebook

Notebook for running RAG-based NER extraction experiments.

## Setup and Imports

In [None]:
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List

from loguru import logger
from tqdm import tqdm
import pandas as pd

# Put the parent of the current working directory on the import path; the
# `src.*` imports below presumably depend on it (assumes the notebook is run
# from a directory one level below the project root — confirm).
sys.path.append("..")

from src.config import PROCESSED_DATA_DIR, RESULTS_DIR, NERRagConfig
from src.data_processor import DataProcessor
from src.rag_pipeline import RAGNERExtractor
from src.utils import (
    calculate_metrics,
    display_metrics,
    save_experiment_results,
    compare_experiments
)

logger.info("Setup complete")

## Experiment Configuration

Configure your RAG experiment here. Change these settings to try different configurations.

In [None]:
# ---------------------------------------------------------------------------
# Active experiment.
# EXPERIMENT_NAME is reused later as the metrics title and as the name under
# which results are saved, so each configuration should get its own name.
# ---------------------------------------------------------------------------
EXPERIMENT_NAME = "rag_qwen3_4b_top3"

config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    embedding_model="BAAI/bge-small-en-v1.5",
    top_k_retrieval=3,
    temperature=0.1,
)

# ---------------------------------------------------------------------------
# Alternative presets. To switch, uncomment one name/config pair below and
# comment out the active pair above.
# ---------------------------------------------------------------------------

# EXPERIMENT_NAME = "rag_qwen3_4b_top5"

# config = NERRagConfig(
#     model_name="Qwen/Qwen3-4B",
#     embedding_model="BAAI/bge-small-en-v1.5",
#     top_k_retrieval=5,
#     temperature=0.1
# )

# EXPERIMENT_NAME = "rag_qwen3_4b_top1"

# config = NERRagConfig(
#     model_name="Qwen/Qwen3-4B",
#     embedding_model="BAAI/bge-small-en-v1.5",
#     top_k_retrieval=1,
#     temperature=0.1
# )

# Echo the effective settings. Attributes are read one at a time, in display
# order, so the output matches a sequence of individual print calls.
print("RAG Configuration:")
for _field_label, _attr_name in (
    ("Model", "model_name"),
    ("Embedding model", "embedding_model"),
    ("Top-k retrieval", "top_k_retrieval"),
    ("Temperature", "temperature"),
    ("Max tokens", "max_new_tokens"),
):
    print(f"  {_field_label}: {getattr(config, _attr_name)}")
print("=" * 80)

## Load Datasets

Load the training data as the retrieval corpus and the test data for evaluation.

In [None]:
# Load the training split. It is not used for training in this notebook; it is
# later passed to RAGNERExtractor as `corpus`.
train_dataset_path = PROCESSED_DATA_DIR / "train.json"
# Fail fast with the offending path if the file is missing.
if not train_dataset_path.exists():
    raise FileNotFoundError(f"Train dataset not found: {train_dataset_path}")

logger.info(f"Loading train dataset as corpus from {train_dataset_path}")
corpus = DataProcessor.load_dataset(train_dataset_path)
logger.success(f"Loaded {len(corpus)} corpus documents")

# Load the test split; these samples are the ones evaluated in later cells.
test_dataset_path = PROCESSED_DATA_DIR / "test.json"
# Same fail-fast check as for the training split.
if not test_dataset_path.exists():
    raise FileNotFoundError(f"Test dataset not found: {test_dataset_path}")

logger.info(f"Loading test dataset from {test_dataset_path}")
test_dataset = DataProcessor.load_dataset(test_dataset_path)
logger.success(f"Loaded {len(test_dataset)} test samples")

# Preview the first test sample: the first 300 characters of its text and its
# entity annotations as pretty-printed JSON (non-ASCII characters kept as-is).
# NOTE(review): index 0 is accessed unconditionally, so an empty test set
# would fail here rather than in the evaluation cells.
print("\n" + "=" * 80)
print("DATASET EXAMPLE")
print("=" * 80)
print(f"Text:\n{test_dataset[0]['text'][:300]}...\n")
print(f"Entities:\n{json.dumps(test_dataset[0]['entities'], indent=2, ensure_ascii=False)}")
print("=" * 80)

## Initialize RAG Extractor

Load the language model and the embedding model, then build the FAISS index.

In [None]:
# Build the extractor from the experiment config and the training-split corpus.
# What the constructor does internally (model loading, index construction) is
# defined in src.rag_pipeline and is not visible from this notebook.
extractor = RAGNERExtractor(config=config, corpus=corpus)

## Test Single Sample

Test on one sample to verify everything works before running the full evaluation.

In [None]:
# Smoke test on the first test sample before committing to the full run.
test_text = test_dataset[0]["text"]
test_label = test_dataset[0]["entities"]

print("=" * 80, "SINGLE SAMPLE TEST", "=" * 80, sep="\n")
print(f"\nInput text:\n{test_text[:300]}...\n")
print(f"Ground truth:\n{json.dumps(test_label, indent=2, ensure_ascii=False)}\n")

# Step 1: retrieval only, to inspect which corpus documents come back.
logger.info("Testing retrieval...")
retrieved = extractor.retrieve(test_text)
print(f"\nRetrieved {len(retrieved)} documents:")
for rank, hit in enumerate(retrieved, start=1):
    score = hit["retrieval_score"]
    print(f"\nDocument {rank} (score: {score:.4f}):")
    snippet = hit["text"][:150]
    print(f"  Text: {snippet}...")
    hit_entities = hit.get("entities", {})
    print(f"  Entities: {hit_entities}")

# Step 2: the full extract_entities call on the same text.
logger.info("\nRunning extraction on test sample...")
prediction = extractor.extract_entities(test_text)

prediction_json = json.dumps(prediction, indent=2, ensure_ascii=False)
print(f"\nPrediction:\n{prediction_json}")
print("=" * 80)

## Run Full Extraction

Extract entities from all test samples using RAG.

In [None]:
# Split the test set into two parallel lists: model inputs and ground truth.
texts = [sample["text"] for sample in test_dataset]
labels = [sample["entities"] for sample in test_dataset]

logger.info(f"Starting RAG extraction on {len(test_dataset)} samples...")
print("=" * 80)
print(f"Running RAG extraction on {len(test_dataset)} samples")
print(f"Corpus size: {len(corpus)} documents")
print("=" * 80)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during a long run.
start_time = time.perf_counter()
predictions = []

# One extract_entities call per sample; predictions stay index-aligned with
# `texts` and `labels`, which the metrics and analysis cells rely on.
progress = tqdm(zip(texts, labels), desc="Extracting entities", total=len(texts))
for i, (text, label) in enumerate(progress):
    prediction = extractor.extract_entities(text)
    predictions.append(prediction)

    # Log the first few predictions next to their labels for a quick sanity check.
    if i < 3:
        logger.debug(f"Sample {i+1}:")
        logger.debug(f"  Ground truth: {label}")
        logger.debug(f"  Prediction  : {prediction}")

elapsed_time = time.perf_counter() - start_time
# Both rates are guarded so an empty test set or a zero-length interval
# reports 0 instead of raising ZeroDivisionError.
throughput = len(test_dataset) / elapsed_time if elapsed_time > 0 else 0
avg_time = elapsed_time / len(texts) if texts else 0.0

print("\n" + "=" * 80)
print("EXTRACTION COMPLETE")
print("=" * 80)
print(f"Total samples: {len(test_dataset)}")
print(f"Total time: {elapsed_time:.2f}s")
print(f"Throughput: {throughput:.2f} samples/s")
print(f"Avg time per sample: {avg_time:.2f}s")
print("=" * 80)

## Calculate Metrics

In [None]:
# Score the predictions against the ground-truth labels. Both lists were built
# in the same order by the extraction cell, so they are index-aligned.
logger.info("Calculating metrics...")
metrics = calculate_metrics(predictions, labels)

# Hand the metrics to the display helper, titled with the experiment name.
display_metrics(metrics, title=f"METRICS - {EXPERIMENT_NAME}")

## Analyze Sample Results

Look at some examples to understand model performance.

In [None]:
# Qualitative check: print the first few samples with label and prediction.
print("\n" + "=" * 80)
print("SAMPLE PREDICTIONS ANALYSIS")
print("=" * 80)

# At most three samples, fewer if the test set is smaller.
for i in range(min(3, len(test_dataset))):
    print(f"\nSample {i+1}:")
    print("-" * 80)
    print(f"Text: {texts[i][:200]}...")
    print("\nGround Truth:")
    print(json.dumps(labels[i], indent=2, ensure_ascii=False))
    print("\nPrediction:")
    print(json.dumps(predictions[i], indent=2, ensure_ascii=False))
    print("-" * 80)

# Quantitative check: exact-match error rate over the whole test set.
print("\n" + "=" * 80)
print("ERROR ANALYSIS")
print("=" * 80)

# Entity keys compared per sample. Values are compared as sets, so order and
# duplicates within a key are ignored.
# NOTE(review): a key missing from both prediction and label falls back to an
# empty list and therefore counts as a match, so a misspelled key here would
# go unnoticed. Confirm these names (singular "person", plural
# "organizations") against the dataset schema.
entity_types = ("person", "organizations", "address")

# A sample is an error as soon as any entity type differs from the label.
error_count = sum(
    any(
        set(pred.get(entity_type, [])) != set(truth.get(entity_type, []))
        for entity_type in entity_types
    )
    for pred, truth in zip(predictions, labels)
)

total_samples = len(test_dataset)
# Guarded so an empty test set reports 0% instead of raising ZeroDivisionError.
accuracy = (total_samples - error_count) / total_samples * 100 if total_samples else 0.0
print(f"Samples with errors: {error_count} / {len(test_dataset)}")
print(f"Perfect match accuracy: {accuracy:.2f}%")
print("=" * 80)

## Save Experiment Results

In [None]:
# Snapshot of the settings that produced this run, stored with the results.
config_summary = {
    "model_name": config.model_name,
    "embedding_model": config.embedding_model,
    "top_k_retrieval": config.top_k_retrieval,
    "temperature": config.temperature,
    "max_new_tokens": config.max_new_tokens,
    "corpus_size": len(corpus),
}

# Timing figures from the extraction cell, rounded to two decimals for storage.
performance = {
    "total_samples": len(test_dataset),
    "elapsed_time": round(elapsed_time, 2),
    "throughput": round(throughput, 2),
    "avg_time_per_sample": round(avg_time, 2),
}

# Persist config, metrics, timings and the raw predictions together with their
# input texts and labels; the helper returns the location it wrote to.
exp_dir = save_experiment_results(
    experiment_name=EXPERIMENT_NAME,
    config=config_summary,
    metrics=metrics,
    performance=performance,
    predictions=predictions,
    texts=texts,
    ground_truth=labels,
)

# The check mark was previously mis-encoded (UTF-8 read as cp1252).
print(f"\n✅ Experiment results saved to: {exp_dir}")

## Cleanup

Free GPU memory and clean up resources.

In [None]:
# Ask the extractor to release what it holds. The cell description says this
# frees GPU memory; the actual behavior lives in RAGNERExtractor.cleanup().
extractor.cleanup()
logger.success("Cleanup complete")

## Compare Multiple Experiments

Load and compare results from multiple RAG experiments.

In [None]:
# Directory whose sub-directories are treated as individual experiment runs.
# NOTE(review): this is the RAG notebook, yet the directory scanned is
# "prompt_pipeline_experiments". Where save_experiment_results() writes is not
# visible here; confirm this is the directory that actually holds RAG runs.
experiments_dir = RESULTS_DIR / "prompt_pipeline_experiments"

# is_dir() also covers the case where the path exists but is a regular file,
# which would otherwise make iterdir() raise.
if experiments_dir.is_dir():
    exp_dirs = [d for d in experiments_dir.iterdir() if d.is_dir()]

    if exp_dirs:
        print(f"Found {len(exp_dirs)} experiments")
        comparison_df = compare_experiments(exp_dirs)

        # Skip the summary when there is no table or it has no rows;
        # .iloc[0] on an empty table would raise IndexError.
        if comparison_df is not None and not comparison_df.empty:
            # Presumably compare_experiments returns rows sorted best-first,
            # so row 0 is the winner. Verify against src.utils.
            best = comparison_df.iloc[0]
            print(f"\nBest experiment: {best['experiment']}")
            print(f"   F1 Score: {best['f1_overall']:.4f}")
            print(f"   Throughput: {best['throughput']:.2f} samples/s")
    else:
        print("No experiments found yet")
else:
    print("Experiments directory doesn't exist yet")

---

## Experiment Templates

Copy these configurations into the "Experiment Configuration" cell to try different setups:

### 1. Top-3 Retrieval (Default)
```python
EXPERIMENT_NAME = "rag_top3_default"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    top_k_retrieval=3,
    temperature=0.1
)
```

### 2. Top-5 Retrieval
```python
EXPERIMENT_NAME = "rag_top5"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    top_k_retrieval=5,
    temperature=0.1
)
```

### 3. Top-1 Retrieval (Single Example)
```python
EXPERIMENT_NAME = "rag_top1"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    top_k_retrieval=1,
    temperature=0.1
)
```

### 4. Different Embedding Model
```python
EXPERIMENT_NAME = "rag_bge_large"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    embedding_model="BAAI/bge-large-en-v1.5",
    top_k_retrieval=3,
    temperature=0.1
)
```

### 5. Higher Temperature
```python
EXPERIMENT_NAME = "rag_temp_0.7"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    top_k_retrieval=3,
    temperature=0.7
)
```

### 6. Greedy Decoding
```python
EXPERIMENT_NAME = "rag_greedy"
config = NERRagConfig(
    model_name="Qwen/Qwen3-4B",
    top_k_retrieval=3,
    temperature=0.0,
    do_sample=False
)
```