In [1]:
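# Install all notebook dependencies (python-dotenv, Couchbase SDK, Voyage AI, Hugging Face datasets, LangChain integrations, and helpers)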
!pip install python-dotenv couchbase voyageai datasets langchain_core langchain_couchbase langchain_openai pyarrow requests fsspec



In [2]:
# Colab-only step: mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Necessary Libraries
This block imports all the required libraries and modules used in the notebook. These include libraries for environment management, data handling, natural language processing, interaction with Couchbase, and embeddings generation. Each library serves a specific function, such as managing environment variables, handling datasets, or interacting with the Couchbase database.

In [3]:
import os
import warnings
from datetime import timedelta

import numpy as np
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import CouchbaseException
from couchbase.options import ClusterOptions
from datasets import load_dataset
from dotenv import load_dotenv,find_dotenv
from langchain_core.documents import Document
from langchain_core.globals import set_llm_cache
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_couchbase.cache import CouchbaseCache
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

import voyageai

# Loading Environment Variables
The purpose of this block is to load environment variables from a .env file, which is a common practice for securely handling sensitive information like API keys and database credentials.

In [4]:
# Locate a .env file with find_dotenv() and load its entries as environment variables.
load_dotenv(find_dotenv())

True

# Get Environment Variables
This function is designed to retrieve environment variables. If the variable is not set, it either returns a default value or raises an error. This ensures that all necessary configurations are available before proceeding.


In [5]:
def get_env_variable(var_name, default_value=None):
    """Look up an environment variable, falling back to a default when allowed.

    Args:
        var_name: Name of the environment variable to read.
        default_value: Value to return when the variable is not set. ``None``
            means the variable is mandatory.

    Returns:
        The variable's value if it is set (an empty string counts as set),
        otherwise ``default_value``.

    Raises:
        ValueError: If the variable is not set and no default was supplied.

    A warning is emitted whenever the default is used, so a silently
    misconfigured environment is visible in the notebook output.
    """
    found = os.environ.get(var_name)

    # Fast path: the variable exists, so the default is irrelevant.
    if found is not None:
        return found

    # Mandatory variable (no fallback available).
    if default_value is None:
        raise ValueError(f"Environment variable {var_name} is missing and no default value is provided.")

    warnings.warn(f"Environment variable {var_name} is missing. Assigning default value: {default_value}")
    return default_value

# Connect to Couchbase
This function establishes a connection to a Couchbase cluster using the provided credentials. It uses the PasswordAuthenticator to authenticate the connection and waits until the cluster is ready.

In [6]:
def connect_to_couchbase(connection_string, db_username, db_password,
                         ready_timeout=timedelta(seconds=5)):
    """Open an authenticated connection to a Couchbase cluster.

    Args:
        connection_string: Cluster address passed straight to ``Cluster``.
        db_username: Username for password authentication.
        db_password: Password for password authentication.
        ready_timeout: How long to wait for the cluster to report ready.
            Defaults to the previous fixed value of 5 seconds; pass a larger
            ``timedelta`` for slow or remote clusters.

    Returns:
        The ``Cluster`` object, after ``wait_until_ready`` has returned.

    Any exception raised by the SDK while connecting or waiting is propagated
    unchanged to the caller.
    """
    auth = PasswordAuthenticator(db_username, db_password)
    options = ClusterOptions(auth)
    cluster = Cluster(connection_string, options)
    # Block until the cluster is usable so later calls do not race the bootstrap.
    cluster.wait_until_ready(ready_timeout)
    return cluster

# Fetch the Couchbase Vector Store
This function initializes and returns a CouchbaseVectorStore, which is used for storing and retrieving vectors (embeddings) in Couchbase. This is crucial for enabling vector-based operations like similarity search.

In [7]:
def get_vector_store(cluster, db_bucket, db_scope, db_collection, embedding, index_name):
    """Construct the CouchbaseVectorStore for one collection and search index.

    Args:
        cluster: Connected Couchbase cluster object.
        db_bucket: Name of the bucket.
        db_scope: Name of the scope inside the bucket.
        db_collection: Name of the collection inside the scope.
        embedding: Embedding object handed to the store.
        index_name: Name of the search index the store should use.

    Returns:
        The newly created ``CouchbaseVectorStore``.
    """
    store_settings = {
        "cluster": cluster,
        "bucket_name": db_bucket,
        "scope_name": db_scope,
        "collection_name": db_collection,
        "embedding": embedding,
        "index_name": index_name,
    }
    return CouchbaseVectorStore(**store_settings)

# Cache the Results
This function initializes and returns a CouchbaseCache, which is used for caching purposes within the application. The cache helps in reducing redundant computations and improves the performance of operations like search and retrieval.

In [8]:
def get_cache(cluster, db_bucket, db_scope, cache_collection):
    """Construct the CouchbaseCache backed by the given collection.

    Args:
        cluster: Connected Couchbase cluster object.
        db_bucket: Name of the bucket.
        db_scope: Name of the scope inside the bucket.
        cache_collection: Name of the collection that holds cache entries.

    Returns:
        The newly created ``CouchbaseCache``.
    """
    return CouchbaseCache(
        cluster=cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=cache_collection,
    )

# Save the embeddings to Vector Store
This function takes in texts and their corresponding embeddings, converts them into Document objects, and stores them in the Couchbase vector store. This is a key step for making the data searchable via semantic search.

In [9]:
def save_to_vector_store(vector_store, texts, embeddings):
    """Wrap each text in a Document and add the batch to the vector store.

    Args:
        vector_store: Store exposing ``add_documents``.
        texts: Iterable of document texts.
        embeddings: Iterable of precomputed vectors, paired positionally with
            ``texts``. Pairing stops at the shorter of the two iterables.

    Each vector is attached to its Document under the ``'embedding'`` metadata
    key. The number of stored documents is printed when done.

    NOTE(review): ``add_documents`` presumably embeds the text again with the
    store's own embedding model, so the vectors passed here would only be kept
    as metadata -- confirm this is intended.
    """
    documents = []
    for vector, content in zip(embeddings, texts):
        documents.append(Document(page_content=content, metadata={'embedding': vector}))

    vector_store.add_documents(documents)
    print(f"Stored {len(documents)} documents in Couchbase")

# Semantic Search
This function performs a semantic search on the vector store using a query. It generates an embedding for the query using the Voyage API, and then retrieves the most similar documents from the Couchbase vector store based on this embedding.

In [10]:
def semantic_search(vector_store, query, voyage_client, top_k=10):
    """Embed ``query`` with Voyage AI and return the closest stored documents.

    Args:
        vector_store: Store exposing ``similarity_search_with_score_by_vector``.
        query: Natural-language query text.
        voyage_client: Voyage AI client used to embed the query.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``'id'``, ``'text'`` and ``'distance'``, one
        per hit. ``'id'`` is taken from the document metadata when present,
        otherwise from the document's ``id`` attribute, otherwise ``None``.
        ``'distance'`` carries the score reported by the vector store.
        An empty list is returned (after printing the error) when the query
        cannot be embedded or the Couchbase search fails.
    """
    try:
        query_embed = voyage_client.embed(
            texts=[query],
            model='voyage-law-2',  # Adjust model based on your needs
            input_type='query',
            truncation=True
        ).embeddings[0]
    except Exception as e:
        print(f"Error creating query embedding: {e}")
        return []

    try:
        # The "_with_score" variant yields (Document, score) pairs; the plain
        # similarity_search_by_vector returns bare Documents, which cannot be
        # unpacked into a document and a score.
        scored_docs = vector_store.similarity_search_with_score_by_vector(
            embedding=query_embed, k=top_k
        )

        return [
            {
                # Metadata is not guaranteed to carry an 'id'; avoid a KeyError.
                'id': doc.metadata.get('id', getattr(doc, 'id', None)),
                'text': doc.page_content,
                'distance': score,
            }
            for doc, score in scored_docs
        ]
    except CouchbaseException as e:
        print(f"Error performing semantic search: {e}")
        return []

# Main Function
The main function orchestrates the entire process. It reads the configuration from environment variables, initializes the Voyage client, loads a dataset, generates embeddings, connects to Couchbase, stores the documents in the vector store, sets up the LLM cache, runs a semantic search for a sample query, and finally prints the answer of the RAG chain next to the answer of the plain LLM for comparison.

In [11]:
def main():
    """Run the end-to-end demo: embed TREC questions, store them, and query them.

    Steps, in order:
      1. Read configuration from environment variables (the two API keys are
         mandatory; the Couchbase settings fall back to placeholder defaults).
      2. Create the Voyage AI client and load the first 1000 TREC training rows.
      3. Embed the texts with Voyage AI in batches.
      4. Connect to Couchbase, build the vector store and store the documents.
      5. Install the Couchbase-backed LLM cache.
      6. Run a semantic search for a sample query and print the hits.
      7. Print the answer of the RAG chain and of the plain LLM.

    A missing mandatory environment variable raises ``ValueError`` from
    ``get_env_variable``. Every later failure is caught, printed, and ends the
    run without raising.
    """
    # Get environment variables or use default values
    VOYAGE_API_KEY = get_env_variable('VOYAGE_API_KEY')  # No default for VOYAGE_API_KEY, must be provided
    OPENAI_API_KEY = get_env_variable("OPENAI_API_KEY")
    CB_USERNAME = get_env_variable('CB_USERNAME', 'default-username')
    CB_PASSWORD = get_env_variable('CB_PASSWORD', 'default-password')
    CB_BUCKET_NAME = get_env_variable('CB_BUCKET_NAME', 'default-bucket-name')
    CB_HOST = get_env_variable('CB_HOST', 'couchbase://localhost')
    INDEX_NAME = get_env_variable('INDEX_NAME', 'default-index-name')
    CACHE_COLLECTION = get_env_variable('CACHE_COLLECTION', 'default-cache-collection')

    # Initialize Voyage AI client
    try:
        voyage_client = voyageai.Client(api_key=VOYAGE_API_KEY)
    except Exception as e:
        print(f"Error initializing Voyage AI client: {e}")
        return

    # Load the TREC dataset
    try:
        trec = load_dataset('trec', split='train[:1000]')
    except Exception as e:
        print(f"Error loading TREC dataset: {e}")
        return

    # Create embeddings
    try:
        # Read the text column once; indexing the dataset inside the loop would
        # rebuild the whole column for every batch.
        texts = trec['text']
        num_documents = len(texts)
        voyageai_batch_size = 128  # Ensure this is within the API limits
        embeds = []
        while len(embeds) < num_documents:
            batch_embeds = voyage_client.embed(
                texts=texts[len(embeds):len(embeds)+voyageai_batch_size],
                model='voyage-law-2',  # Use appropriate model based on your requirement
                input_type='document',
                truncation=True
            ).embeddings
            # Without progress the loop condition would never change.
            if not batch_embeds:
                raise RuntimeError("Voyage AI returned no embeddings for a non-empty batch")
            embeds.extend(batch_embeds)
        print(f"Embedding shape: {np.array(embeds).shape}")
    except Exception as e:
        print(f"Error creating embeddings: {e}")
        return


    try:
        # Connect to Couchbase
        cluster = connect_to_couchbase(CB_HOST, CB_USERNAME, CB_PASSWORD)
        # NOTE(review): these two handles are not used below; presumably kept so
        # a missing bucket fails early -- confirm before removing.
        bucket = cluster.bucket(CB_BUCKET_NAME)
        scope = bucket.scope("shared")

        # Use OpenAIEmbeddings as a fallback for compatibility
        # NOTE(review): the vector store (and therefore the RAG retriever) is
        # given OpenAIEmbeddings, while `embeds` above and semantic_search()
        # use Voyage 'voyage-law-2' vectors. Presumably the two live in
        # different vector spaces -- confirm the search index is meant for this.
        embeddings = OpenAIEmbeddings()

        # Initialize CouchbaseVectorStore
        vector_store = get_vector_store(cluster, CB_BUCKET_NAME, "shared", "docs", embeddings, INDEX_NAME)

        # Store embeddings and metadata in Couchbase
        save_to_vector_store(vector_store, texts, embeds)

        # Set the LLM cache
        cache = get_cache(cluster, CB_BUCKET_NAME, "shared", CACHE_COLLECTION)
        set_llm_cache(cache)

        # Build the prompt for the RAG
        template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
        {context}

        Question: {question}"""

        prompt = ChatPromptTemplate.from_template(template)

        # Use OpenAI GPT-4 as the LLM for the RAG
        llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", streaming=True)

        # RAG chain
        chain = (
            {"context": vector_store.as_retriever(), "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Pure OpenAI output without RAG
        template_without_rag = """You are a helpful bot. Answer the question as truthfully as possible.

        Question: {question}"""

        prompt_without_rag = ChatPromptTemplate.from_template(template_without_rag)
        llm_without_rag = ChatOpenAI(model="gpt-4-1106-preview", streaming=True)

        chain_without_rag = (
            {"question": RunnablePassthrough()}
            | prompt_without_rag
            | llm_without_rag
            | StrOutputParser()
        )

        # Sample query for testing
        query = "What caused the 1929 Great Depression?"
        results = semantic_search(vector_store, query, voyage_client)

        for result in results:
            print(f"Distance: {result['distance']:.4f}, Text: {result['text']}")

        # Get the response from the RAG
        rag_response = chain.invoke(query)
        print(f"RAG Response: {rag_response}")

        # Get the response from the pure LLM
        pure_llm_response = chain_without_rag.invoke(query)
        print(f"Pure LLM Response: {pure_llm_response}")

    except Exception as e:
        print(f"Error: {e}")

# Running the Main Function
This block ensures that the main function is executed when the script is run directly. It acts as the entry point for the program.

In [12]:
# Entry point: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

The repository for trec contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/trec.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5452 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Embedding shape: (1000, 1024)
Stored 1000 documents in Couchbase
RAG Response: The 1929 Great Depression was caused by a combination of factors, including:

1. Stock Market Crash of 1929: A significant contributing factor was the stock market crash in October 1929, which wiped out millions of investors and led to a loss of confidence in the economy.

2. Bank Failures: Following the crash, many banks experienced runs and ultimately failed, leading to a reduction in the money supply and a decrease in consumer spending and investment.

3. Reduction in Purchasing Across the Board: As banks failed and the stock market crashed, consumers and businesses cut back on spending, leading to a decrease in production and an increase in unemployment.

4. Overproduction: Industries such as agriculture and manufacturing were producing more goods than could be consumed, leading to falling prices and profits, which in turn led to layoffs and further reductions in consumer spending.

5. High Tariffs and W