In [10]:
pip install tf-keras sentence-transformers transformers faiss-cpu datasets


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
import os

def load_documents(directory):
    """Read every ``.txt`` file directly inside ``directory``.

    Files are visited in sorted filename order so that the position of a
    document in the returned lists is stable across runs and platforms
    (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A ``(corpus, filenames)`` tuple of two parallel lists: the UTF-8
        decoded file contents and the corresponding file names.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    corpus = []
    filenames = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip other extensions and anything that is not a regular file
        # (e.g. a sub-directory that happens to be named "something.txt").
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            corpus.append(file.read())
        filenames.append(filename)
    return corpus, filenames

# Folder (relative to the notebook's working directory) holding the .txt sources.
docs_path = 'docs'

# Read the documents; `corpus` and `filenames` are parallel lists.
corpus, filenames = load_documents(directory=docs_path)

# Report how many documents were picked up.
print(f"Loaded {len(corpus)} documents.")

Loaded 10 documents.


In [33]:
from sentence_transformers import SentenceTransformer

# The RAG retriever searches the FAISS index with the 768-dimensional vectors
# produced by RAG's DPR question encoder, so the passages must be embedded by
# the matching DPR *context* encoder. 'all-MiniLM-L6-v2' yields 384-dimensional
# vectors from an unrelated embedding space and cannot be queried by RAG.
model = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

# One embedding row per document, in the same order as `corpus`.
doc_embeddings = model.encode(corpus)
print("Document embeddings created.")

Document embeddings created.


In [35]:
from datasets import Dataset
import os

# Columns of the passage dataset: a placeholder title, the raw text, and the
# embedding of each document (numpy rows turned into plain lists so they
# can be serialized).
dataset_dict = dict(
    title=[f"Document {i}" for i in range(len(corpus))],
    text=corpus,
    embeddings=doc_embeddings.tolist(),
)

dataset = Dataset.from_dict(dataset_dict)

# Output locations for the passages and for the FAISS index.
dataset_path = "dataset/rag_dataset"
index_path = "dataset/rag_index"
os.makedirs(dataset_path, exist_ok=True)
os.makedirs(index_path, exist_ok=True)

# Persist the passages first, then build the FAISS index over the
# "embeddings" column and write it to its own file.
dataset.save_to_disk(dataset_path)
dataset.add_faiss_index(column="embeddings")
dataset.get_index("embeddings").save(
    os.path.join(index_path, "faiss_index")
)

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 912.22 examples/s] 
100%|██████████| 1/1 [00:00<00:00, 1001.51it/s]


In [37]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

# Reload the persisted passages and re-attach the FAISS index saved next to them.
dataset = Dataset.load_from_disk(dataset_path)
dataset.load_faiss_index("embeddings", os.path.join(index_path, "faiss_index"))

# Initialize RAG components
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Passing `indexed_dataset` makes the retriever use a custom index backed by
# the dataset's "embeddings" FAISS index; "custom" is the index name the
# library uses for that mode.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="custom",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
)

# Load the generator from the local cache instead of forcing a fresh download
# of the full checkpoint on every run, and attach the retriever so the model
# can also retrieve on its own when only `input_ids` are supplied.
# NOTE(review): "facebook/rag-token-nq" is the RAG-Token checkpoint, while the
# class used here is RagSequenceForGeneration; confirm this pairing is intended
# (the sequence-level checkpoint is "facebook/rag-sequence-nq").
rag_model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-token-nq",
    retriever=retriever,
)
print("RAG model components initialized.")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [26]:
def rag_retrieve_and_generate(query, n_docs=5):
    """Answer ``query`` with RAG: retrieve passages, then generate a response.

    Uses the notebook-level ``tokenizer``, ``retriever`` and ``rag_model``
    objects, which must be initialized before this function is called.

    Args:
        query: The question to answer, as a string.
        n_docs: Number of passages to retrieve and condition the generator on.

    Returns:
        The first generated sequence decoded to text, without special tokens.

    Raises:
        ValueError: If ``n_docs`` is smaller than 1.
    """
    if n_docs < 1:
        raise ValueError(f"n_docs must be >= 1, got {n_docs}")

    # Encode the query
    input_ids = tokenizer(query, return_tensors="pt").input_ids

    # Embed the question; detach because no gradients are needed for retrieval.
    question_hidden_states = rag_model.question_encoder(input_ids)[0].detach()

    # Retrieve documents. The retriever expects numpy inputs and returns a
    # mapping (not a tuple) holding the already tokenized "passage + question"
    # contexts together with the embeddings of the retrieved passages.
    retrieved = retriever(
        input_ids.numpy(),
        question_hidden_states.numpy(),
        n_docs=n_docs,
        return_tensors="pt",
    )

    # Relevance score of each passage = inner product between the question
    # embedding and the passage embedding, giving shape (batch, n_docs).
    doc_scores = (
        question_hidden_states.unsqueeze(1)
        @ retrieved["retrieved_doc_embeds"].float().transpose(1, 2)
    ).squeeze(1)

    # Generate the response from the retrieved contexts. When contexts are
    # supplied explicitly, their attention mask and the document scores must
    # be supplied as well, and `n_docs` must match the number retrieved.
    generated_ids = rag_model.generate(
        context_input_ids=retrieved["context_input_ids"],
        context_attention_mask=retrieved["context_attention_mask"],
        doc_scores=doc_scores,
        n_docs=n_docs,
        num_return_sequences=1,
    )

    # Decode the generated response
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Example question put to the RAG pipeline.
query = "What is the content of the documents?"

# Retrieve supporting passages and generate the answer (default n_docs).
response = rag_retrieve_and_generate(query=query)
print(response)