In [5]:
# Install python-dotenv
!pip install python-dotenv couchbase datasets langchain_core langchain_couchbase langchain_voyageai langchain_openai pyarrow requests fsspec



# Importing Necessary Libraries
This block imports all the required libraries and modules used in the notebook. These include libraries for environment management, data handling, natural language processing, interaction with Couchbase, and embeddings generation. Each library serves a specific function, such as managing environment variables, handling datasets, or interacting with the Couchbase database.

In [6]:
import json
import logging
import os
import time
import warnings
from datetime import timedelta
from uuid import uuid4

import numpy as np
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import (CouchbaseException,
                                  InternalServerFailureException,
                                  QueryIndexAlreadyExistsException)
from couchbase.management.search import SearchIndex
from couchbase.options import ClusterOptions
from datasets import load_dataset
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_core.globals import set_llm_cache
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_couchbase.cache import CouchbaseCache
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from langchain_openai import ChatOpenAI
from langchain_voyageai import VoyageAIEmbeddings
from tqdm import tqdm


# Setting Up Logging
Logging is configured to track and record the script's execution, making it easier to debug and understand the flow of operations. Each log entry will include a timestamp and the severity level (INFO, WARNING, ERROR).

In [7]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Loading Environment Variables
 Environment variables, such as API keys and database credentials, are loaded using dotenv to securely manage sensitive information. We prompt the user for these values if they are not set.



In [8]:
import getpass

VOYAGE_API_KEY = getpass.getpass('Enter your VoyageAI API key: ')
OPENAI_API_KEY = getpass.getpass('Enter your OpenAI API key: ')
CB_HOST = input('Enter your Couchbase host (default: couchbase://localhost): ') or 'couchbase://localhost'
CB_USERNAME = input('Enter your Couchbase username (default: Administrator): ') or 'Administrator'
CB_PASSWORD = getpass.getpass('Enter your Couchbase password (default: password): ') or 'password'
CB_BUCKET_NAME = input('Enter your Couchbase bucket name (default: vector-search-testing): ') or 'vector-search-testing'
INDEX_NAME = input('Enter your index name (default: vector_search_voyage): ') or 'vector_search_voyage'
SCOPE_NAME = input('Enter your scope name (default: shared): ') or 'shared'
COLLECTION_NAME = input('Enter your collection name (default: voyage): ') or 'voyage'
CACHE_COLLECTION = input('Enter your cache collection name (default: cache): ') or 'cache'

# Verifying that essential environment variables are set
if not VOYAGE_API_KEY:
    raise ValueError("VOYAGE_API_KEY is required.")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is required.")


# Connect to Couchbase
The script connects to a Couchbase cluster using the provided connection string, username, and password. It waits until the connection is fully established before proceeding, ensuring that the database is ready for operations.

In [9]:
try:
    auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)
    options = ClusterOptions(auth)
    cluster = Cluster(CB_HOST, options)
    cluster.wait_until_ready(timedelta(seconds=5))
    logging.info("Successfully connected to Couchbase")
except Exception as e:
    raise ConnectionError(f"Failed to connect to Couchbase: {str(e)}")

#  Loading and Managing Search Index
The script loads a search index definition from a JSON file, checks if the index already exists, and either updates it or creates a new one. The search index optimizes data retrieval in the Couchbase database.

In [10]:
# index_definition_file_path = "/content/voyage_index.json"  # Update this path as needed

# try:
#     with open(index_definition_file_path, 'r') as file:
#         index_definition = json.load(file)
# except Exception as e:
#     raise ValueError(f"Error loading index definition from {index_definition_file_path}: {str(e)}")

# try:
#     scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()

#     # Check if index already exists
#     existing_indexes = scope_index_manager.get_all_indexes()
#     index_name = index_definition["name"]

#     if index_name in [index.name for index in existing_indexes]:
#         logging.info(f"Index '{index_name}' already exists. Updating...")
#     else:
#         logging.info(f"Creating new index '{index_name}'...")

#     # Create SearchIndex object
#     search_index = SearchIndex(
#         name=index_definition["name"],
#         source_type=index_definition.get("sourceType", "couchbase"),
#         idx_type=index_definition["type"],
#         source_name=index_definition["sourceName"],
#         params=index_definition["params"],
#         source_params=index_definition.get("sourceParams", {}),
#         plan_params=index_definition.get("planParams", {})
#     )

#     # Upsert the index (create if not exists, update if exists)
#     scope_index_manager.upsert_index(search_index)
#     logging.info(f"Index '{index_name}' successfully created/updated.")

# except QueryIndexAlreadyExistsException:
#     logging.info(f"Index '{index_name}' already exists. Skipping creation/update.")
# except InternalServerFailureException as e:
#     error_message = str(e)
#     logging.error(f"InternalServerFailureException raised: {error_message}")

#     try:
#         error_context = e.context
#         response_body = error_context.response_body
#         if response_body:
#             error_details = json.loads(response_body)
#             error_message = error_details.get('error', '')

#             if "collection: 'voyage' doesn't belong to scope: 'shared'" in error_message:
#                 raise ValueError("Collection 'voyage' does not belong to scope 'shared'. Please check the collection and scope names.")

#     except ValueError as ve:
#         logging.error(str(ve))
#         raise

#     except Exception as json_error:
#         logging.error(f"Failed to parse the error message: {json_error}")
#         raise RuntimeError(f"Internal server error while creating/updating search index: {error_message}")

# except Exception as e:
#     raise RuntimeError(f"Unexpected error creating/updating search index: {str(e)}")


# Loading the TREC Dataset
The TREC dataset, which contains text samples for information retrieval tasks, is loaded. This dataset will be used to create embeddings and perform semantic search.

In [11]:
try:
    trec = load_dataset('trec', split='train[:2000]')
    logging.info(f"Successfully loaded TREC dataset with {len(trec)} samples")
except Exception as e:
    raise ValueError(f"Error loading TREC dataset: {str(e)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

The repository for trec contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/trec.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

# Creating Embeddings
Embeddings are created using the VoyageAI model, which converts the textual data into high-dimensional vectors that capture the semantic meaning of the text.

In [14]:
try:
    os.environ["VOYAGEAI_API_KEY"] = VOYAGE_API_KEY
    embeddings = VoyageAIEmbeddings(voyage_api_key=VOYAGE_API_KEY,model="voyage-large-2")
    logging.info("Successfully created VoyageAIEmbeddings")
except Exception as e:
    raise ValueError(f"Error creating VoyageAIEmbeddings: {str(e)}")

batch size None


# Setting Up the Vector Store
A vector store is created in Couchbase to manage the embeddings. The vector store enables efficient search and retrieval of similar text entries based on their embeddings.

In [15]:
try:
    vector_store = CouchbaseVectorStore(
        cluster=cluster,
        bucket_name=CB_BUCKET_NAME,
        scope_name=SCOPE_NAME,
        collection_name=COLLECTION_NAME,
        embedding=embeddings,
        index_name=INDEX_NAME,
    )
    logging.info("Successfully created vector store")
except Exception as e:
    raise ValueError(f"Failed to create vector store: {str(e)}")

# Saving Data to Vector Store in Batches
The TREC dataset's text entries are saved to the vector store in batches. This is done to avoid overwhelming the system's memory by processing the data in manageable chunks.

In [16]:
try:
    batch_size = 50
    for i in tqdm(range(0, len(trec['text']), batch_size), desc="Processing Batches"):
        batch = trec['text'][i:i + batch_size]
        documents = [Document(page_content=text) for text in batch]
        uuids = [str(uuid4()) for _ in range(len(documents))]
        vector_store.add_documents(documents=documents, ids=uuids)
except Exception as e:
    raise RuntimeError(f"Failed to save documents to vector store: {str(e)}")

Processing Batches: 100%|██████████| 40/40 [02:13<00:00,  3.34s/it]


# Setting Up Cache
A cache is configured to store frequently accessed or intermediate data, improving performance by reducing the need for repeated calculations.

In [17]:
try:
    cache = CouchbaseCache(
        cluster=cluster,
        bucket_name=CB_BUCKET_NAME,
        scope_name=SCOPE_NAME,
        collection_name=CACHE_COLLECTION,
    )
    logging.info("Successfully created cache")
    set_llm_cache(cache)
except Exception as e:
    raise ValueError(f"Failed to create cache: {str(e)}")


# Creating Language Models (LLMs) and Chains
Two language models (LLMs) are created: one that combines retrieval-augmented generation (RAG) and another that is purely based on the LLM without retrieval. These models will generate responses based on the input queries.

In [18]:
try:
    llm = ChatOpenAI(
        openai_api_key=OPENAI_API_KEY,
        model="gpt-4o-2024-08-06",
        temperature=0
    )
    logging.info(f"Successfully created OpenAI LLM with model gpt-4o-2024-08-06")

    template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
    {context}
    Question: {question}"""
    prompt = ChatPromptTemplate.from_template(template)
    rag_chain = (
        {"context": vector_store.as_retriever(), "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    logging.info("Successfully created RAG chain")

    pure_llm_template = """You are a helpful bot. Answer the question as truthfully as possible.
    Question: {question}"""
    pure_llm_prompt = ChatPromptTemplate.from_template(pure_llm_template)
    pure_llm_chain = (
        {"question": RunnablePassthrough()}
        | pure_llm_prompt
        | llm
        | StrOutputParser()
    )
    logging.info("Successfully created pure LLM chain")
except Exception as e:
    raise ValueError(f"Error creating LLM chains: {str(e)}")

# Performing Semantic Search
A semantic search is performed using the vector store to retrieve documents that are semantically similar to the query. This search is more advanced than keyword matching because it considers the meaning of the text.

In [19]:
query = "What caused the 1929 Great Depression?"

try:
    # Get RAG response
    start_time = time.time()
    rag_response = rag_chain.invoke(query)
    rag_elapsed_time = time.time() - start_time
    logging.info(f"RAG response generated in {rag_elapsed_time:.2f} seconds")

    # Get pure LLM response
    start_time = time.time()
    pure_llm_response = pure_llm_chain.invoke(query)
    pure_llm_elapsed_time = time.time() - start_time
    logging.info(f"Pure LLM response generated in {pure_llm_elapsed_time:.2f} seconds")

    print(f"RAG Response: {rag_response}")
    print(f"Pure LLM Response: {pure_llm_response}")

    # Perform semantic search
    start_time = time.time()
    search_results = vector_store.similarity_search_with_score(query, k=10)
    results = [{'id': doc.metadata.get('id', 'N/A'), 'text': doc.page_content, 'distance': score}
               for doc, score in search_results]
    search_elapsed_time = time.time() - start_time
    logging.info(f"Semantic search completed in {search_elapsed_time:.2f} seconds")

    print(f"\nSemantic Search Results (completed in {search_elapsed_time:.2f} seconds):")
    for result in results:
        print(f"Distance: {result['distance']:.4f}, Text: {result['text']}")
except CouchbaseException as e:
    raise RuntimeError(f"Error performing semantic search: {str(e)}")
except Exception as e:
    raise RuntimeError(f"Unexpected error: {str(e)}")

RAG Response: The 1929 Great Depression was caused by a combination of factors, including the stock market crash of October 1929, bank failures, reduction in consumer spending and investment, and flawed economic policies. These factors led to a severe economic downturn that affected many countries worldwide.
Pure LLM Response: The Great Depression, which began in 1929, was caused by a combination of factors:

1. **Stock Market Crash of 1929**: The most immediate trigger was the stock market crash in October 1929, often referred to as "Black Tuesday." This crash wiped out a significant amount of wealth and led to a loss of confidence in the economy.

2. **Bank Failures**: Following the crash, many banks failed due to a lack of liquidity and the inability to recover loans. This led to a loss of savings for many individuals and further reduced consumer spending.

3. **Reduction in Consumer Spending and Investment**: As people lost their savings and jobs, consumer spending and investment d