# Multi-Stage RAG Pipeline for Wikipedia QA

```
Stage 0: Query2Doc Expansion   → LLM generates contextual keywords
Stage 1: BM25 Retrieval        → Document retrieval + contextual chunking + passage filtering
Stage 2: Bi-Encoder Reranking  → Semantic filtering (200 → 25 passages)
Stage 3: Cross-Encoder Rerank  → Precision reranking (25 → 5 passages)
Stage 4: LLM Generation        → Answer extraction with Llama-3.2-1B
Stage 5: Post-Processing       → Self-consistency voting + F1 optimization
```

## 1. Setup & Installation

In [None]:
# Install a headless Java 21 JDK (required for Pyserini).
# NOTE: all apt output, errors included, is redirected to /dev/null, so a
# failed install is silent at this point.
!apt-get update -qq && apt-get install -y -qq openjdk-21-jdk-headless > /dev/null 2>&1

# Install Python dependencies (only pyserini is pinned to an exact version)
!pip install -q pyserini==0.36.0 torch transformers accelerate sentence-transformers \
    rank-bm25 langchain-text-splitters pandas numpy tqdm huggingface_hub datasets gdown

# Optional: Flash Attention for faster LLM inference on A100/H100.
# pip's stderr is discarded; if the install fails, the echo fallback runs instead.
!pip install -q flash-attn --no-build-isolation 2>/dev/null || echo 'Flash Attention not available (CPU/T4)'

# NOTE: printed unconditionally; it does not check that the installs above succeeded.
print('\n✓ All dependencies installed')

In [None]:
# Clone the repo (or mount Google Drive with the repo).
# NOTE: git's stderr is discarded and the echo runs on ANY clone failure, so
# 'Repo already cloned' can also appear for e.g. a network error.
!git clone https://github.com/er1009/wikipedia-qa-rag-pipeline.git 2>/dev/null || echo 'Repo already cloned'

import sys
# Put the checkout first on the import path so the `src.*` imports below are
# looked up there. The path is relative to the current working directory.
sys.path.insert(0, 'wikipedia-qa-rag-pipeline')

# Configure logging: INFO level, "LEVEL | logger name | message" format
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s | %(name)s | %(message)s')

# Verify imports (these raise immediately if the checkout is missing)
from src.pipeline import RAGPipeline
from src.config import FAST_CONFIG, BALANCED_CONFIG, COMPETITION_CONFIG
from src.metrics import f1_score, max_over_ground_truths
print('✓ Modules imported successfully')

## 2. Authentication

In [None]:
import os
from huggingface_hub import login

# Resolve the Hugging Face token. A token set explicitly in this cell takes
# precedence; the HF_TOKEN environment variable is only a fallback. The
# fallback must not be an unconditional assignment, otherwise it would
# overwrite a token supplied through Option 1.
HF_TOKEN = None

# Option 1: Uncomment and set your token here
# HF_TOKEN = 'hf_your_token_here'

# Option 2: Use environment variable (recommended)
if not HF_TOKEN:
    HF_TOKEN = os.environ.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    print('✓ Logged in to Hugging Face')
else:
    print('⚠ Set HF_TOKEN or uncomment Option 1 above')
    print('  Get a token at: https://huggingface.co/settings/tokens')

## 3. Load Data

In [None]:
import json
import pandas as pd

# Download data (update this link to your data source)
DATA_URL = 'https://drive.google.com/drive/folders/1N2r7REIvn6pmnHyx_PSrERJhqMIF736S?usp=drive_link'
!gdown --folder $DATA_URL -O ./data/ -q 2>/dev/null || echo 'Download data manually into ./data/'

DATA_PATH = './data/rag_course'
df_train = pd.read_csv(f'{DATA_PATH}/train.csv', converters={'answers': json.loads})
df_test = pd.read_csv(f'{DATA_PATH}/test.csv', converters={'answers': json.loads})

print(f'✓ Train: {len(df_train)} samples')
print(f'✓ Test:  {len(df_test)} samples')
print(f'\nExample: {df_train["question"].iloc[0]}')
print(f'Answers: {df_train["answers"].iloc[0]}')

## 4. Download BM25 Index

In [None]:
import os
from google.colab import drive

# Mount Google Drive for persistent index storage
drive.mount('/content/drive')

INDEX_NAME = 'wikipedia-kilt-doc'
DRIVE_DIR = '/content/drive/MyDrive/pyserini'
LOCAL_INDEX = f'/content/{INDEX_NAME}'

# Point Pyserini's download cache at Drive so a prebuilt index fetched below
# survives the Colab session instead of landing in the ephemeral home
# directory. The variable is set before Pyserini is imported so it is in
# place whenever the library resolves its cache location.
# NOTE(review): the index is then read through the Drive mount, which may be
# slower than local disk, and it needs ~30GB of Drive quota — confirm this
# trade-off is acceptable.
os.makedirs(DRIVE_DIR, exist_ok=True)
os.environ['PYSERINI_CACHE'] = DRIVE_DIR

from pyserini.search.lucene import LuceneSearcher

# Prefer an index placed manually at LOCAL_INDEX; otherwise let Pyserini
# fetch the prebuilt index (reused from the Drive cache after the first run)
if not os.path.isdir(LOCAL_INDEX):
    print(f'Downloading {INDEX_NAME} index (first time only, ~30GB)...')
    searcher = LuceneSearcher.from_prebuilt_index(INDEX_NAME)
    # Use the directory Pyserini actually resolved rather than guessing it
    LOCAL_INDEX = searcher.index_dir
    print(f'✓ Index downloaded to: {LOCAL_INDEX}')
else:
    print(f'✓ Index found at: {LOCAL_INDEX}')

BM25_INDEX_PATH = LOCAL_INDEX
print(f'\nBM25 index path: {BM25_INDEX_PATH}')

## 5. Initialize Pipeline

In [None]:
import importlib.util

# The setup cell installs flash-attn on a best-effort basis and may report
# that it is unavailable, so only request it when the package can be found.
FLASH_ATTN_AVAILABLE = importlib.util.find_spec('flash_attn') is not None

# Build the pipeline: BM25 index path, bi-encoder and cross-encoder
# rerankers, and the LLM, all on the GPU
pipeline = RAGPipeline(
    bm25_index_path=BM25_INDEX_PATH,
    bi_encoder_model='sentence-transformers/multi-qa-mpnet-base-dot-v1',
    cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-12-v2',
    llm_model='meta-llama/Llama-3.2-1B-Instruct',
    device='cuda',
    use_flash_attn=FLASH_ATTN_AVAILABLE,
)

print('\n✓ Pipeline ready!')

## 6. Quick Test (5 samples)

In [None]:
# Smoke test: run the first five training questions through FAST_CONFIG
sample = df_train.head(5)

results = pipeline.evaluate(
    questions=sample['question'].tolist(),
    ground_truths=sample['answers'].tolist(),
    config=FAST_CONFIG,
)

# Print each question next to its reference answers, prediction and F1.
# Questions and answers are walked in lockstep; results are looked up by position.
for i, (q, gt) in enumerate(zip(sample['question'], sample['answers'])):
    pred = results['predictions'][i]
    f1 = results['f1_scores'][i]
    print(f'\nQ: {q[:80]}...')
    print(f'  GT:   {gt}')
    print(f'  Pred: {pred}')
    print(f'  F1:   {f1:.2%}')

## 7. Full Evaluation (Competition Config)

In [None]:
# Evaluate on training set with full pipeline
EVAL_SAMPLES = 100  # Set to None for full training set

# A falsy EVAL_SAMPLES selects the whole training set
sample = df_train.head(EVAL_SAMPLES) if EVAL_SAMPLES else df_train

results = pipeline.evaluate(
    questions=sample['question'].tolist(),
    ground_truths=sample['answers'].tolist(),
    config=COMPETITION_CONFIG,
)

# Report the outcome; without this the evaluation result is computed and
# then dropped. Uses the per-sample scores under 'f1_scores'.
eval_scores = results['f1_scores']
n_scored = len(eval_scores)
if n_scored:
    print(f'\nMean F1 over {n_scored} samples: {sum(eval_scores) / n_scored:.2%}')
else:
    print('\nNo samples were evaluated')

## 8. Generate Test Predictions

In [None]:
# Answer every test question with the competition configuration
test_queries = df_test['question'].tolist()
test_ids = df_test['id'].tolist()

answers = pipeline.answer(test_queries, config=COMPETITION_CONFIG)

# Each prediction cell is a JSON-encoded single-element list; non-ASCII
# characters are written as-is rather than escaped
encoded_predictions = [
    json.dumps([answer], ensure_ascii=False) for answer in answers
]

# Write the submission file, one row per test id
submission = pd.DataFrame({'id': test_ids, 'prediction': encoded_predictions})
submission.to_csv('submission.csv', index=False)

print(f'\n✓ Saved {len(submission)} predictions to submission.csv')
submission.head()