# Adaptive RAG Dataset Builder - Kaggle GPU Version

This notebook builds domain relevance and hallucination risk datasets using weak supervision labeling techniques.

**Prerequisites:**
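- Upload the preprocessed data files (the AI/ML corpus and the question file) as a Kaggle dataset input; the paths in the Configuration section expect them under `/kaggle/input/arag-data/`
- Upload `ai_ml_lexicon.txt` to the same Kaggle dataset input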
- Enable GPU acceleration in Kaggle settings

## 1. Environment Setup and Dependencies

In [None]:
# Install required packages (quiet mode). The import cell of this notebook
# uses transformers (pipeline), sentence-transformers (SentenceTransformer,
# CrossEncoder) and rank-bm25 (BM25Okapi).
!pip install -q transformers sentence-transformers accelerate tokenizers rank-bm25

In [None]:
import os
import json
import warnings
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import math

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Check GPU availability; the device name and CUDA version are printed only
# when torch reports a usable CUDA device.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

## 2. Configuration

In [None]:
# Kaggle paths configuration
CONFIG = {
    "paths": {
        "ai_corpus": "/kaggle/input/arag-data/ai_ml_corpus.jsonl",
        # Either a single .json/.jsonl file (as here) or a directory; for a
        # directory, load_questions reads every *.json file inside it.
        "questions": "/kaggle/input/arag-data/hallucination_dataset_01.json",
        "output_dir": "/kaggle/working/processed",
        "artifacts_dir": "/kaggle/working/artifacts",
        "lexicon_path": "/kaggle/input/arag-data/ai_ml_lexicon.txt"
    },
    "retrieval": {
        "top_k": 5,
        "bm25": {"use_title": True},
        "vector": {
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
            # NOTE(review): "normalize" and "batch_size" are not read by the
            # cells shown in this notebook - confirm whether they are still needed.
            "normalize": True,
            "batch_size": 128
        }
    },
    "models": {
        "cross_encoder": "cross-encoder/stsb-distilroberta-base",
        "text_gen_model": "google/flan-t5-base"
    },
    # Each threshold is the signal value at which the matching vote equals 0.5
    # (it is the midpoint passed to sigmoid_score by the weak_votes methods).
    "weak_labeling": {
        "hallucination": {
            "thresholds": {
                "factual_precision_risk_high_risk": 0.3,
                "obscurity_risk_high_risk": 0.3,
                "complexity_risk_high_risk": 0.3,
                # Currently unused: the answer-deviation vote is disabled in
                # HallucinationLabelers.weak_votes.
                "answer_deviation_high_risk": 0.7
            }
        },
        "domain": {
            "thresholds": {
                "lexicon_high": 0.01,
                "embedding_sim_high": 0.25,
                "bm25_ratio_high": 40,
                "cross_encoder_high": 0.3
            }
        }
    },
    # The seed is forwarded to aggregate_probabilities.
    "aggregation": {"seed": 42}
}

# Create output directories
os.makedirs(CONFIG["paths"]["output_dir"], exist_ok=True)
os.makedirs(CONFIG["paths"]["artifacts_dir"], exist_ok=True)

print("Configuration loaded successfully!")

## 3. Data Structures and Utilities

In [None]:
@dataclass
class Example:
    """One question record loaded from the question file(s).

    ``answer`` may be ``None``; hallucination signals are only computed for
    examples whose answer is non-empty.
    """

    ex_id: str  # record id as text (the loader passes the JSON "id" through str())
    question: str
    # Optional[...] rather than ``str | None``: consistent with the rest of the
    # file and safe on Python < 3.10, where ``str | None`` raises at class creation.
    answer: Optional[str]
    source: str

@dataclass
class Document:
    """A corpus document: identifier, body text and an optional title."""
    doc_id: str
    text: str
    # When set (and use_title is enabled) BM25Index prepends the title to the text.
    title: Optional[str] = None

def _example_from_record(obj: Dict[str, Any]) -> Example:
    """Build an Example from one decoded JSON record."""
    return Example(
        ex_id=str(obj.get("id")),
        question=obj.get("question"),
        answer=obj.get("answer"),
        source=str(obj.get("source")),
    )


def _read_json_examples(file_path: str) -> List[Example]:
    """Read one JSON file holding either a list of records or a single record."""
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    records = data if isinstance(data, list) else [data]
    return [_example_from_record(obj) for obj in records]


def load_questions(path: str) -> List[Example]:
    """Load question records from a directory or a single file.

    Args:
        path: A directory (every ``*.json`` file directly inside it is read,
            in sorted filename order) or a file. A file ending in ``.jsonl``
            is parsed as one JSON object per line, skipping blank lines; any
            other file is parsed as a single JSON document containing either
            a list of records or one record.

    Returns:
        One ``Example`` per record, in read order. ``id`` and ``source`` are
        passed through ``str()`` (so a missing key becomes ``"None"``);
        ``question`` and ``answer`` are ``None`` when missing. An empty list
        is returned when ``path`` is neither a directory nor a file.
    """
    items: List[Example] = []

    if os.path.isdir(path):
        # os.listdir order is arbitrary; sorting makes the example order
        # (and therefore the output datasets) reproducible.
        json_files = sorted(f for f in os.listdir(path) if f.endswith('.json'))
        print(f"Found {len(json_files)} JSON files in {path}")

        for json_file in json_files:
            loaded = _read_json_examples(os.path.join(path, json_file))
            items.extend(loaded)
            print(f"Loaded {len(loaded)} questions from {json_file}")

    elif os.path.isfile(path):
        if path.endswith('.jsonl'):
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    # Blank lines (e.g. a trailing newline) are not records.
                    if not line.strip():
                        continue
                    items.append(_example_from_record(json.loads(line)))
        else:
            items.extend(_read_json_examples(path))

    return items

def load_corpus(corpus_path: str) -> List[Document]:
    """Load corpus documents from a JSONL file (one JSON object per line).

    Blank lines are skipped. Each object may carry ``id``, ``text`` and
    ``title`` keys.

    Args:
        corpus_path: Path to the UTF-8 encoded JSONL corpus file.

    Returns:
        One ``Document`` per non-blank line, in file order. ``doc_id`` is
        always a string: the record's ``id`` or, when that is missing or
        null, the zero-based line number. ``text`` falls back to ``""``.
    """
    docs: List[Document] = []
    with open(corpus_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            if not line.strip():
                continue
            obj = json.loads(line)
            raw_id = obj.get("id")
            docs.append(Document(
                # str() keeps doc_id textual even when the JSON id is numeric.
                doc_id=str(line_num if raw_id is None else raw_id),
                # A missing/null text becomes "" so tokenization never receives None.
                text=obj.get("text") or "",
                title=obj.get("title"),
            ))
    return docs

## 4. BM25 Retrieval System

In [None]:

def simple_tokenize(text: str) -> List[str]:
    """Lowercase ``text``, replace punctuation with spaces and split on whitespace."""
    lowered = text.lower()
    # Every character that is neither a word character nor whitespace becomes a space.
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    tokens = without_punctuation.split()
    return tokens

class BM25Index:
    """BM25 (Okapi) index over a list of documents.

    A document is indexed as ``"<title> <text>"`` when ``use_title`` is true
    and the document has a non-empty title, otherwise as its text alone.
    """

    def __init__(self, docs: List[Document], use_title: bool = True):
        self.docs = docs
        self.use_title = use_title

        # Text to index for each document, then its token list.
        indexed_texts = [
            f"{d.title} {d.text}" if use_title and d.title else d.text
            for d in docs
        ]
        token_lists = list(map(simple_tokenize, indexed_texts))

        self.bm25 = BM25Okapi(token_lists)
        print(f"Built BM25 index with {len(docs)} documents")

    def query(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
        """Return up to ``top_k`` (document, BM25 score) pairs, highest score first."""
        scores = self.bm25.get_scores(simple_tokenize(query))
        # argsort is ascending, so reverse it before taking the first top_k.
        best_first = np.argsort(scores)[::-1]
        return [(self.docs[idx], scores[idx]) for idx in best_first[:top_k]]

## 5. Labeling Functions

In [None]:

@dataclass
class DomainSignals:
    """Raw domain-relevance signals for one question (built by DomainLabelers.compute_signals)."""
    lexicon_ratio: float  # number of lexicon terms found in the question / lexicon size
    embedding_sim: float  # cosine similarity of the question embedding to the AI centroid; 0.0 without a centroid
    bm25_ratio: float  # mean BM25 score of the top-k retrieved documents (a mean, not a ratio)
    cross_encoder_score: float  # highest cross-encoder score over the fixed AI topic templates

@dataclass
class HallucinationSignals:
    """Hallucination-risk signals for one question (built by HallucinationLabelers.compute_signals)."""
    factual_precision_risk: float  # cross-encoder score for the "factual_precision" hypothesis, clamped to [0, 1]
    obscurity_risk: float  # obscure / (obscure + common) over the clamped scores; 0.4 when both are 0
    complexity_risk: float  # cross-encoder score for the "complexity" hypothesis, clamped to [0, 1]
    # Disabled signal; its computation in compute_signals is commented out as well.
    #answer_deviation: float

def sigmoid_score(signal: float, threshold: float, scaling: float = 5.0) -> float:
    """Map a signal to a score in [0, 1] with a logistic curve centred on ``threshold``.

    Args:
        signal: Raw signal value.
        threshold: Signal value that maps to exactly 0.5.
        scaling: Steepness of the curve; larger values give a sharper step.

    Returns:
        ``1 / (1 + exp(-(signal - threshold) * scaling))``. Saturates to 0.0
        instead of raising when the exponent is too large for a float.
    """
    try:
        return 1.0 / (1.0 + math.exp(-(signal - threshold) * scaling))
    except OverflowError:
        # exp() overflowed, i.e. the signal is far below the threshold; the
        # true value is smaller than any normal float, so return 0.0.
        return 0.0
        
class DomainLabelers:
    """Compute AI/ML domain-relevance signals for a question and turn them into soft votes.

    Four signals are produced per question: lexicon hit ratio, cosine
    similarity to a corpus centroid, mean BM25 score of the top-k retrieved
    documents, and the best cross-encoder score against fixed topic templates.
    """

    # Fixed topic descriptions each question is scored against by the cross-encoder.
    AI_TEMPLATES = (
        "artificial intelligence and machine learning concepts",
        "data science and statistical methods",
        "neural networks and deep learning",
        "data analysis and predictive modeling",
        "pattern recognition and deep learning algorithms",
        "reinforcement learning and decision making",
        "computer vision and image processing",
        "natural language processing and text analysis",
    )

    def __init__(self, embedding_model: str, cross_encoder_model: str, lexicon: List[str]):
        """Load both models (on GPU when available) and store the lowercased lexicon."""
        # Use GPU if available
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.embedding_model = SentenceTransformer(embedding_model, device=device)
        self.cross_encoder = CrossEncoder(cross_encoder_model, device=device)
        # Lowercased once here so matching in compute_signals is case-insensitive.
        self.lexicon = {term.lower() for term in lexicon}
        self.ai_centroid = None

        print(f"Loaded domain labelers on device: {device}")

    def build_ai_centroid(self, ai_texts: List[str], batch_size: int = 32):
        """Set ``self.ai_centroid`` to the mean of the normalized embeddings of ``ai_texts``.

        Args:
            ai_texts: Texts representing the AI domain.
            batch_size: Encoding batch size (defaults to the previously hard-coded 32).
        """
        print(f"Building AI centroid from {len(ai_texts)} texts...")
        embeddings = self.embedding_model.encode(
            ai_texts,
            batch_size=batch_size,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        self.ai_centroid = np.mean(embeddings, axis=0)
        print("AI centroid built successfully")

    def compute_signals(self, question: str, ai_idx: BM25Index, top_k: int = 5) -> DomainSignals:
        """Compute the four domain-relevance signals for ``question``.

        Args:
            question: Question text.
            ai_idx: BM25 index over the AI corpus.
            top_k: Number of BM25 results averaged for the BM25 signal.

        Returns:
            A ``DomainSignals`` instance holding the raw (unthresholded) values.
        """
        # Lexicon ratio: fraction of lexicon terms occurring as a substring of
        # the lowercased question (lowercased once, not once per term).
        question_lower = question.lower()
        hits = sum(1 for term in self.lexicon if term in question_lower)
        lexicon_ratio = hits / max(len(self.lexicon), 1)

        # Embedding similarity: cosine between the question and the centroid.
        embedding_sim = 0.0
        if self.ai_centroid is not None:
            q_emb = self.embedding_model.encode(
                [question],
                convert_to_numpy=True,
                show_progress_bar=False,
                normalize_embeddings=True,
            )[0]
            # The centroid is a mean of unit vectors and is generally not unit
            # length itself, so the norms are still needed.
            denom = np.linalg.norm(q_emb) * np.linalg.norm(self.ai_centroid)
            # A zero (or NaN) denominator leaves the similarity at 0.0 instead of NaN.
            if denom > 0:
                embedding_sim = float(np.dot(q_emb, self.ai_centroid) / denom)

        # BM25 signal: mean score of the top-k retrieved documents.
        ai_results = ai_idx.query(question, top_k=top_k)
        if ai_results:
            avg_ai_score = np.mean([score for _, score in ai_results])
            bm25_ratio = float(avg_ai_score)
        else:
            bm25_ratio = 0.0

        # Cross-encoder score: best match of the question against all templates.
        # NOTE(review): the templates do not depend on the retrieval results;
        # the `ai_results` gate is kept only to preserve existing behaviour.
        if ai_results:
            pairs = [(question, template) for template in self.AI_TEMPLATES]
            ce_scores = self.cross_encoder.predict(pairs)
            cross_encoder_score = float(np.max(ce_scores))
        else:
            cross_encoder_score = 0.0

        return DomainSignals(
            lexicon_ratio=lexicon_ratio,
            embedding_sim=embedding_sim,
            bm25_ratio=bm25_ratio,
            cross_encoder_score=cross_encoder_score
        )

    def weak_votes(self, signals: DomainSignals, thresholds: Dict[str, float]) -> Dict[str, float]:
        """Convert raw signals into soft votes in [0, 1].

        Each vote is ``sigmoid_score(signal, threshold, scaling)``; a signal
        equal to its threshold yields 0.5. The scaling differs per signal
        because the signals have very different numeric ranges.

        Args:
            signals: Raw signals from ``compute_signals``.
            thresholds: Optional overrides keyed ``lexicon_high``,
                ``embedding_sim_high``, ``bm25_ratio_high``, ``cross_encoder_high``.

        Returns:
            Votes keyed ``lexicon``, ``embed``, ``bm25_ratio``, ``cross_encoder``.
        """
        votes: Dict[str, float] = {}
        votes["lexicon"] = sigmoid_score(signals.lexicon_ratio, thresholds.get("lexicon_high", 0.01), scaling=400)
        votes["embed"] = sigmoid_score(signals.embedding_sim, thresholds.get("embedding_sim_high", 0.25), scaling=4)
        votes["bm25_ratio"] = sigmoid_score(signals.bm25_ratio, thresholds.get("bm25_ratio_high", 40), scaling=0.04)  # Lower scaling for wide range
        votes["cross_encoder"] = sigmoid_score(signals.cross_encoder_score, thresholds.get("cross_encoder_high", 0.3), scaling=5)
        return votes

class HallucinationLabelers:
    """Estimate hallucination risk of a question from cross-encoder scores against fixed hypotheses."""

    # Hypothesis statements scored against each question. The dict order is
    # the order of the cross-encoder input pairs.
    HYPOTHESES = {
        "factual_precision": "This question demands exact facts, specific numbers, precise dates, or detailed information that could easily be fabricated if unknown.",
        "complexity": "This question involves multi-step reasoning, complex analysis, or connecting multiple concepts together.",
        "obscurity_high": "This question covers specialized, technical, or niche topics that most people wouldn't know.",
        "obscurity_low": "This question covers common knowledge, basic facts, or widely known information.",
    }

    # Obscurity risk used when both obscurity scores clamp to 0.
    DEFAULT_OBSCURITY_RISK = 0.4

    def __init__(self, cross_encoder: str, text_gen_model: str, embedding_model: str):
        """Load the cross-encoder and the text generator (on GPU when available).

        Args:
            cross_encoder: Cross-encoder model name.
            text_gen_model: Text2text generation model name.
            embedding_model: Currently unused; only the disabled
                answer-deviation signal needs an embedding model.
        """
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.cross_encoder = CrossEncoder(cross_encoder, device=device)
        # NOTE(review): the generator is only referenced by the disabled
        # answer-deviation code in compute_signals; it is still loaded so the
        # `text_gen` attribute keeps existing.
        self.text_gen = pipeline("text2text-generation", model=text_gen_model, device=0 if torch.cuda.is_available() else -1)
        #self.embedding_model = SentenceTransformer(embedding_model, device=device)

        print(f"Loaded hallucination labelers on device: {device}")

    def compute_signals(self, question: str, answer: str) -> HallucinationSignals:
        """Score ``question`` against every hypothesis and derive the risk signals.

        Args:
            question: Question text.
            answer: Reference answer; unused while the answer-deviation signal
                is disabled.

        Returns:
            ``HallucinationSignals`` with each value in [0, 1].
        """
        pairs = [(question, hypothesis) for hypothesis in self.HYPOTHESES.values()]
        scores = self.cross_encoder.predict(pairs)
        # Clamp every raw cross-encoder score into [0, 1].
        risk_scores = {
            risk_type: max(0.0, min(1.0, float(score)))
            for risk_type, score in zip(self.HYPOTHESES, scores)
        }

        # Obscurity risk as a ratio: obscure_similarity / (obscure + common)
        obscure_sim = risk_scores["obscurity_high"]
        common_sim = risk_scores["obscurity_low"]
        if obscure_sim + common_sim > 0:
            obscurity_risk = obscure_sim / (obscure_sim + common_sim)
        else:
            obscurity_risk = self.DEFAULT_OBSCURITY_RISK

        # Disabled: answer deviation (generate an alternative answer and compare)
        #generated = self.text_gen(question[:200], max_length=100, do_sample=True, temperature=0.7)
        #generated_text = generated[0]['generated_text'].strip()

        #gen_emb = self.embedding_model.encode([generated_text])
        #actual_emb = self.embedding_model.encode([answer])

        #similarity = float(gen_emb @ actual_emb.T)
        #deviation = 1.0 - max(0.0, similarity)  # Higher deviation = more risk

        #answer_deviation = min(1.0, max(0.0, deviation))

        return HallucinationSignals(
            factual_precision_risk=risk_scores["factual_precision"],
            obscurity_risk=obscurity_risk,
            complexity_risk=risk_scores["complexity"],
            #answer_deviation=answer_deviation
        )

    def weak_votes(self, signals: HallucinationSignals, thresholds: Dict[str, float]) -> Dict[str, float]:
        """Convert raw risk signals into soft votes in [0, 1].

        Each vote is ``sigmoid_score(signal, threshold, scaling=5)``; a
        threshold missing from ``thresholds`` defaults to 0.4.

        Returns:
            Votes keyed ``factual_precision``, ``obscurity``, ``complexity``.
        """
        votes: Dict[str, float] = {}
        votes["factual_precision"] = sigmoid_score(signals.factual_precision_risk, thresholds.get("factual_precision_risk_high_risk", 0.4), scaling=5)
        votes["obscurity"] = sigmoid_score(signals.obscurity_risk, thresholds.get("obscurity_risk_high_risk", 0.4), scaling=5)
        votes["complexity"] = sigmoid_score(signals.complexity_risk, thresholds.get("complexity_risk_high_risk", 0.4), scaling=5)
        #votes["answer_dev"] = sigmoid_score(signals.answer_deviation, thresholds.get("answer_deviation_high_risk", 0.5), scaling=5)
        return votes


## 6. Label Aggregation

In [None]:

def build_label_matrix(votes_list: List[Dict[str, int]], labeler_names: List[str]) -> np.ndarray:
    """Arrange per-example votes into an (examples x labelers) float matrix.

    Column ``j`` holds the vote of ``labeler_names[j]``; a labeler missing
    from an example's vote dict is marked with -1.
    """
    missing = float(-1)
    matrix = np.full((len(votes_list), len(labeler_names)), missing, dtype=float)
    for matrix_row, votes in zip(matrix, votes_list):
        # matrix_row is a view, so writing to it fills the matrix in place.
        matrix_row[:] = [
            float(votes[name]) if name in votes else missing
            for name in labeler_names
        ]
    return matrix

def aggregate_probabilities(L: np.ndarray, seed: int = 42, labeler_names: Optional[List[str]] = None) -> np.ndarray:
    """Collapse each row of a label matrix into one score in [0, 1].

    Entries equal to -1 are treated as missing. A row with no votes scores
    0.5. When ``labeler_names`` is given and has one name per column, the
    score is the weighted mean of the present votes using the fixed weights
    below (unknown names get ``1 / n_columns``); otherwise it is the plain
    mean of the present votes.

    Args:
        L: Label matrix of shape (examples, labelers).
        seed: Accepted for interface compatibility; not used by the computation.
        labeler_names: Optional column names used to look up weights.

    Returns:
        1-D array with one clipped score per row of ``L``.
    """
    signal_weights = {
        # Domain relevance
        "cross_encoder": 0.35,  # Highest weight
        "bm25_ratio": 0.3,
        "embed": 0.25,
        "lexicon": 0.2,         # Lowest weight
        # Hallucination
        "factual_precision": 0.3,
        "obscurity": 0.25,
        "complexity": 0.45,
        #"answer_dev": 0.05
    }
    fallback_weight = 1.0 / L.shape[1] if L.shape[1] > 0 else 1.0

    aggregated = []
    for row in L:
        present = [(col, value) for col, value in enumerate(row.tolist()) if value != -1]
        if not present:
            aggregated.append(0.5)
            continue

        if labeler_names and len(labeler_names) == len(row):
            # Weighted mean over the votes that are present.
            numerator = 0.0
            denominator = 0.0
            for col, value in present:
                weight = signal_weights.get(labeler_names[col], fallback_weight)
                denominator += weight
                numerator += value * weight
            score = float(numerator / denominator) if denominator > 0 else 0.5
        else:
            # Plain mean when no usable labeler names were supplied.
            score = float(sum(value for _, value in present) / len(present))
        aggregated.append(score)

    return np.clip(np.array(aggregated), 0.0, 1.0)


## 7. Main Dataset Building Pipeline

In [None]:
# Load corpus and build BM25 index
print("Loading AI corpus...")
ai_docs = load_corpus(CONFIG["paths"]["ai_corpus"])
print(f"Loaded {len(ai_docs)} AI documents")

print("Building BM25 index...")
ai_idx = BM25Index(ai_docs, use_title=CONFIG["retrieval"]["bm25"]["use_title"])

# Load lexicon: one term per line, surrounding whitespace stripped, blank lines dropped
print("Loading lexicon...")
with open(CONFIG["paths"]["lexicon_path"], "r", encoding="utf-8") as f:
    lexicon = [ln.strip() for ln in f if ln.strip()]
print(f"Loaded {len(lexicon)} lexicon terms")

# Load questions (the configured path may be a single file or a directory)
print("Loading questions...")
examples = load_questions(CONFIG["paths"]["questions"])
print(f"Loaded {len(examples)} questions")

In [None]:
# Initialize labelers
# NOTE(review): both labelers are given the same cross-encoder model name and
# each constructs its own CrossEncoder instance.
print("Initializing labelers...")

hall = HallucinationLabelers(
    cross_encoder=CONFIG["models"]["cross_encoder"],
    text_gen_model=CONFIG["models"]["text_gen_model"],
    embedding_model=CONFIG["retrieval"]["vector"]["embedding_model"]
)

dom = DomainLabelers(
    embedding_model=CONFIG["retrieval"]["vector"]["embedding_model"],
    cross_encoder_model=CONFIG["models"]["cross_encoder"],
    lexicon=lexicon
)

# Build AI centroid (mean embedding of the corpus texts) used by the
# embedding-similarity signal
print("Building AI domain centroid...")
dom.build_ai_centroid([d.text for d in ai_docs[:5000]])  # Use first 5000 for efficiency

In [None]:
# Compute signals and build datasets
top_k = CONFIG["retrieval"]["top_k"]
rows_domain: List[Dict[str, Any]] = []  # one row per example
rows_hallu: List[Dict[str, Any]] = []  # one row per example that has a non-empty answer

print("Computing signals for all examples...")
for ex in tqdm(examples, desc="Building signals"):
    # Domain relevance signals
    dom_signals = dom.compute_signals(ex.question, ai_idx, top_k)
    dom_votes = dom.weak_votes(dom_signals, CONFIG["weak_labeling"]["domain"]["thresholds"])

    # Raw signals and the derived votes are both stored on the row.
    rows_domain.append({
        "id": ex.ex_id,
        "question": ex.question,
        "answer": ex.answer,
        "source": ex.source,
        "signals": {
            "lexicon_ratio": dom_signals.lexicon_ratio,
            "embedding_sim": dom_signals.embedding_sim,
            "bm25_ratio": dom_signals.bm25_ratio,
            "cross_encoder_score": dom_signals.cross_encoder_score,
        },
        "votes": dom_votes,
    })

    # Hallucination signals (only if answer exists); a None or empty answer
    # produces no hallucination row.
    if ex.answer:
        hall_signals = hall.compute_signals(ex.question, ex.answer)
        hall_votes = hall.weak_votes(hall_signals, CONFIG["weak_labeling"]["hallucination"]["thresholds"])

        rows_hallu.append({
            "id": ex.ex_id,
            "question": ex.question,
            "answer": ex.answer,
            "source": ex.source,
            "signals": {
                "factual_precision_risk": hall_signals.factual_precision_risk,
                "obscurity_risk": hall_signals.obscurity_risk,
                "complexity_risk": hall_signals.complexity_risk,
                #"answer_deviation": hall_signals.answer_deviation,
            },
            "votes": hall_votes,
        })

print(f"Generated {len(rows_domain)} domain relevance examples")
print(f"Generated {len(rows_hallu)} hallucination risk examples")

In [None]:
# Combine the per-labeler votes of every example into a single score
print("Aggregating domain relevance labels...")
dom_labelers = ["lexicon", "embed", "bm25_ratio", "cross_encoder"]
L_dom = build_label_matrix([row["votes"] for row in rows_domain], dom_labelers)
# Labeler names are passed, so the domain score is a weighted mean of the votes.
p_dom = aggregate_probabilities(
    L_dom,
    seed=CONFIG["aggregation"]["seed"],
    labeler_names=dom_labelers,
)

for domain_row, domain_prob in zip(rows_domain, p_dom):
    domain_row["score"] = round(float(domain_prob), 2)

print("Aggregating hallucination risk labels...")
hall_labelers = ["factual_precision", "obscurity", "complexity"] #"answer_dev"
L_h = build_label_matrix([row["votes"] for row in rows_hallu], hall_labelers)
# NOTE(review): no labeler names are passed here, so the hallucination score is
# the plain mean of the votes - confirm this is intended.
p_h = aggregate_probabilities(L_h, seed=CONFIG["aggregation"]["seed"])

for hallu_row, hallu_prob in zip(rows_hallu, p_h):
    hallu_row["score"] = round(float(hallu_prob), 2)

print("Label aggregation completed!")

## 8. Save Results

In [None]:
# Create DataFrames; with no hallucination rows a placeholder frame holding
# only an "id" column is written instead.
df_dom = pd.DataFrame(rows_domain)
df_hall = pd.DataFrame(rows_hallu) if rows_hallu else pd.DataFrame(columns=["id"])

# Save datasets
output_dir = CONFIG["paths"]["output_dir"]
artifacts_dir = CONFIG["paths"]["artifacts_dir"]

# Output paths for both datasets (each is written as Parquet and as JSON)
dom_out_parquet = os.path.join(output_dir, "domain_relevance_dataset.parquet")
dom_out_json = os.path.join(output_dir, "domain_relevance_dataset.json")
hall_out_parquet = os.path.join(output_dir, "hallucination_risk_dataset.parquet")
hall_out_json = os.path.join(output_dir, "hallucination_risk_dataset.json")

df_hall.to_parquet(hall_out_parquet, index=False)
df_hall.to_json(hall_out_json, orient="records", indent=2)
df_dom.to_parquet(dom_out_parquet, index=False)
df_dom.to_json(dom_out_json, orient="records", indent=2)

print(f"Saved domain relevance dataset to {dom_out_parquet} and {dom_out_json}")
print(f"Domain relevance dataset shape: {df_dom.shape}")

print(f"Saved hallucination risk dataset to {hall_out_parquet} and {hall_out_json}")
print(f"Hallucination risk dataset shape: {df_hall.shape}")

# Save labeler info: one labeler name per line, in label-matrix column order
with open(os.path.join(artifacts_dir, "domain_labelers.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(dom_labelers))

with open(os.path.join(artifacts_dir, "hallucination_labelers.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(hall_labelers))

print("\nDataset building completed successfully!")
print(f"Output files saved to: {output_dir}")
print(f"Artifact files saved to: {artifacts_dir}")

## 9. Dataset Analysis and Statistics

In [None]:
# Display dataset statistics
print("=== DOMAIN RELEVANCE DATASET STATISTICS ===")
print(f"Total examples: {len(df_dom)}")
# An empty frame has no 'score'/'source' columns, so guard it the same way as
# the hallucination section instead of failing with KeyError.
if not df_dom.empty:
    print("Score distribution:")
    print(df_dom['score'].describe())
    print("\nSource distribution:")
    print(df_dom['source'].value_counts())

if not df_hall.empty:
    print("\n=== HALLUCINATION RISK DATASET STATISTICS ===")
    print(f"Total examples: {len(df_hall)}")
    print("Score distribution:")
    print(df_hall['score'].describe())
    print("\nSource distribution:")
    print(df_hall['source'].value_counts())

# Sample examples (at most three per dataset, fixed seed for repeatability)
if not df_dom.empty:
    print("\n=== SAMPLE DOMAIN RELEVANCE EXAMPLES ===")
    sample_domain = df_dom.sample(n=min(3, len(df_dom)), random_state=42)
    for _, row in sample_domain.iterrows():
        print(f"ID: {row['id']}")
        # str() so a missing (None) question prints instead of raising on the slice.
        print(f"Question: {str(row['question'])[:100]}...")
        print(f"Score: {row['score']}")
        print(f"Source: {row['source']}")
        print("-" * 50)

if not df_hall.empty:
    print("\n=== SAMPLE HALLUCINATION RISK EXAMPLES ===")
    sample_hall = df_hall.sample(n=min(3, len(df_hall)), random_state=42)
    for _, row in sample_hall.iterrows():
        print(f"ID: {row['id']}")
        print(f"Question: {str(row['question'])[:100]}...")
        print(f"Answer: {str(row['answer'])[:100]}...")
        print(f"Score: {row['score']}")
        print(f"Source: {row['source']}")
        print("-" * 50)

## 10. Prepare Files for Download

All output files are saved under `/kaggle/working/` (datasets in `processed/`, labeler lists in `artifacts/`) and will be available for download after the notebook completes.