In [1]:
! pip install transformers sentence-transformers faiss-cpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
! pip install datasets==3.6.0 --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch
from transformers import (
    RagConfig,
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagTokenForGeneration
)
import warnings
warnings.filterwarnings('ignore')

# Confirm the import cell finished and report which device torch can see.
# (This only reports the device; nothing in this section moves tensors to it.)
device_label = "cuda" if torch.cuda.is_available() else "cpu"
print("✓ All imports successful")
print(f"Using device: {device_label}")

# ============================================================================
# PART 2: Build tokenizer, retriever and config from a pretrained checkpoint
# ============================================================================

# Option A: everything comes from the same pretrained checkpoint (quick start).
components_rule = "=" * 60
print(f"\n{components_rule}")
print("Loading RAG Components...")
print(components_rule)

rag_checkpoint = "facebook/rag-token-nq"

# Tokenizer used to encode questions and decode generated answers.
tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)
print("✓ Tokenizer loaded")

# Retriever backed by the dummy dataset: per the original author's note this
# allows quick testing without downloading large indices; switch
# use_dummy_dataset to False for production with real data.
# NOTE(review): the recorded run shows this call stopping on an interactive
# "Do you wish to run the custom code? [y/N]" prompt for wiki_dpr, so the cell
# does not run unattended as written.
retriever_options = {
    "index_name": "exact",
    "use_dummy_dataset": True,
}
retriever = RagRetriever.from_pretrained(rag_checkpoint, **retriever_options)
print("✓ Retriever loaded")

# Configuration; n_docs is reported as the number of documents retrieved per query.
config = RagConfig.from_pretrained(rag_checkpoint)
print(f"✓ Config loaded - Retrieved docs per query: {config.n_docs}")
# Section banner for the token-level RAG model; emitted as one print call with
# newline separators so the output matches five consecutive prints.
token_section_rule = "=" * 60
print(
    "\n" + token_section_rule,
    "RagTokenForGeneration",
    token_section_rule,
    "Token-level generation: generates answers token by token",
    "",
    sep="\n",
)

# Load the generator from the pretrained checkpoint and attach the retriever
# built earlier, so later generate() calls do not handle retrieval explicitly.
model_token = RagTokenForGeneration.from_pretrained(
    "facebook/rag-token-nq", retriever=retriever
)
print("✓ RagTokenForGeneration model loaded")


# Sample questions used to exercise the token-level RAG model.
questions_token = [
    "What is the capital of France?",
    "Who invented the telephone?",
    "When was Python programming language created?"
]


def generate_answer(question, model, tokenizer, num_beams=2, max_length=50):
    """Generate a single answer string for one question.

    Args:
        question: The question text to encode.
        model: A model exposing ``generate(...)`` and a ``device`` attribute
            (here the loaded ``RagTokenForGeneration`` instance).
        tokenizer: The tokenizer used both to encode the question and to
            decode the generated ids (here the loaded ``RagTokenizer``).
        num_beams: Beam width passed to ``model.generate``.
        max_length: Maximum generated length passed to ``model.generate``.

    Returns:
        The first decoded sequence, with special tokens skipped and
        surrounding whitespace stripped.
    """
    encoded = tokenizer(
        question,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    # Inputs must live on the same device as the model weights.
    target_device = model.device
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(target_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=num_beams,  # beam search
            max_length=max_length,
            early_stopping=True
        )

    # Decoded text carries a leading space (visible in earlier recorded output
    # as "A:  amsterdam"); strip it so the printed answer is clean.
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0].strip()


# Actually use the accelerator when one is available; Module.to() moves the
# weights in place, so re-running this cell is harmless.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_token.to(device)

print("\nGenerating answers with RagTokenForGeneration...")
print("-" * 60)
for question in questions_token:
    answer = generate_answer(question, model_token, tokenizer)

    print(f"Q: {question}")
    print(f"A: {answer}")
    print()

✓ All imports successful
Using device: cpu

Loading RAG Components...
✓ Tokenizer loaded
The repository for wiki_dpr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wiki_dpr.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


data/psgs_w100/dummy.nq/train-00000-of-0(…):   0%|          | 0.00/40.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/10 [00:00<?, ?it/s]

✓ Retriever loaded
✓ Config loaded - Retrieved docs per query: 5

RagTokenForGeneration
Token-level generation: generates answers token by token



pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Please make sure the generation config includes `forced_bos_token_id=0`. 


Loading weights:   0%|          | 0/711 [00:00<?, ?it/s]

RagTokenForGeneration LOAD REPORT from: facebook/rag-token-nq
Key                                                                      | Status     |  | 
-------------------------------------------------------------------------+------------+--+-
rag.question_encoder.question_encoder.bert_model.pooler.dense.bias       | UNEXPECTED |  | 
rag.question_encoder.question_encoder.bert_model.pooler.dense.weight     | UNEXPECTED |  | 
rag.question_encoder.question_encoder.bert_model.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✓ RagTokenForGeneration model loaded

Generating answers with RagTokenForGeneration...
------------------------------------------------------------
Q: What is the capital of France?
A:  amsterdam

Q: Who invented the telephone?
A:  alexander graham bell

Q: When was Python programming language created?
A:  1966

