In [7]:
# Install python-dotenv
!pip install python-dotenv couchbase datasets langchain_core langchain_cohere langchain_couchbase langchain_openai pyarrow requests fsspec tqdm



# Importing Necessary Libraries
The script starts by importing a series of libraries required for various tasks, including handling JSON, logging, time tracking, Couchbase connections, embedding generation, and dataset loading. These libraries provide essential functions for working with data, managing database connections, and processing machine learning models.

In [8]:
import json
import logging
import os
import time
import warnings
import getpass
from datetime import timedelta
from uuid import uuid4

import numpy as np
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import (CouchbaseException,
                                  InternalServerFailureException,
                                  QueryIndexAlreadyExistsException)
from couchbase.management.search import SearchIndex
from couchbase.options import ClusterOptions
from datasets import load_dataset
from dotenv import load_dotenv
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_core.documents import Document
from langchain_core.globals import set_llm_cache
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_couchbase.cache import CouchbaseCache
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from tqdm import tqdm

# Setup Logging
Logging is configured to track the progress of the script and capture any errors or warnings. This is crucial for debugging and understanding the flow of execution. The logging output includes timestamps, log levels (e.g., INFO, ERROR), and messages that describe what is happening in the script.


In [9]:
# Configure the root logger: INFO and above, with timestamp and level prefix.
# force=True (Python 3.8+) removes handlers already attached to the root logger
# before installing this configuration. Without it, basicConfig() does nothing
# when the host (some notebook environments pre-install a root handler) has
# already configured logging, and none of the INFO messages below are shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Loading Environment Variables
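Rather than hardcoding configuration, the notebook prompts for it at runtime. These values include sensitive information like API keys, database usernames, and passwords; secrets are read with `getpass` so they are not echoed to the screen. Keeping such values out of the code helps keep it clean and secure. For every setting except the Cohere API key, pressing Enter accepts the default shown in the prompt.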

In [10]:
def _prompt_with_default(message, default, *, secret=False, warn_name=None):
    """Prompt for a setting and fall back to ``default`` on empty input.

    Args:
        message: Prompt text shown to the user.
        default: Value returned when the user enters nothing.
        secret: If True, read with ``getpass.getpass`` (no echo) instead of
            ``input``.
        warn_name: If given, a warning naming this setting is emitted whenever
            the default is used.

    Returns:
        The text the user entered, or ``default`` if the entry was empty.
    """
    read = getpass.getpass if secret else input
    value = read(message)
    if value:
        return value
    # The warning has to be issued here, before the default is substituted;
    # checking the final value afterwards can never detect a missing entry.
    if warn_name is not None:
        warnings.warn(f"{warn_name} is not provided. Using default value: {default}")
    return default


# The API key has no default. Validate it immediately so the user is not asked
# for eight more values before learning that the run cannot proceed.
# strip() guards against whitespace picked up when pasting the key.
COHERE_API_KEY = getpass.getpass('Enter your Cohere API key: ').strip()
if not COHERE_API_KEY:
    raise ValueError("COHERE_API_KEY is not provided and is required.")

# Connection settings: a warning is emitted when the default is used.
CB_HOST = _prompt_with_default(
    'Enter your Couchbase host (default: couchbase://localhost): ',
    'couchbase://localhost',
    warn_name='CB_HOST',
)
CB_USERNAME = _prompt_with_default(
    'Enter your Couchbase username (default: Administrator): ',
    'Administrator',
    warn_name='CB_USERNAME',
)
CB_PASSWORD = _prompt_with_default(
    'Enter your Couchbase password (default: password): ',
    'password',
    secret=True,
    warn_name='CB_PASSWORD',
)

# Names of the Couchbase bucket/scope/collections and the search index.
CB_BUCKET_NAME = _prompt_with_default(
    'Enter your Couchbase bucket name (default: vector-search-testing): ',
    'vector-search-testing',
)
INDEX_NAME = _prompt_with_default(
    'Enter your index name (default: vector_search_cohere): ',
    'vector_search_cohere',
)
SCOPE_NAME = _prompt_with_default(
    'Enter your scope name (default: shared): ',
    'shared',
)
COLLECTION_NAME = _prompt_with_default(
    'Enter your collection name (default: cohere): ',
    'cohere',
)
CACHE_COLLECTION = _prompt_with_default(
    'Enter your cache collection name (default: cache): ',
    'cache',
)


# Connect to Couchbase
The script attempts to establish a connection to the Couchbase database using the host and credentials entered in the previous step. Couchbase is a NoSQL database known for its flexibility, scalability, and support for various data models, including document-based storage. The connection is authenticated using a username and password, and the script waits up to 5 seconds for the cluster to become ready before proceeding; if the connection cannot be established, a `ConnectionError` is raised.




In [11]:
# Build an authenticated cluster handle and wait (up to 5 seconds) for it to
# report ready. Any exception raised along the way is re-raised as a
# ConnectionError carrying the original message.
try:
    auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)
    options = ClusterOptions(auth)
    cluster = Cluster(CB_HOST, options)
    cluster.wait_until_ready(timedelta(seconds=5))
except Exception as connect_error:
    raise ConnectionError("Failed to connect to Couchbase: " + str(connect_error))
else:
    logging.info("Successfully connected to Couchbase")

# Load Index Definition
The search index definition is loaded from a JSON file. This index defines how the data in Couchbase should be indexed for fast search and retrieval. Indexing is critical for optimizing search queries, especially when dealing with large datasets. The JSON file contains details about the index, such as its name, source type, and parameters. Note: the code in the cell below is commented out by default; uncomment it and set the file path to load your own index definition.

In [12]:
# NOTE: This cell is currently disabled (every statement is commented out).
# When enabled, it reads a search index definition from a JSON file into
# `index_definition` and raises ValueError if the file cannot be read or parsed.
# To use it, uncomment the lines below and point `index_definition_file_path`
# at your own index definition file.
# index_definition_file_path = "/path/to/cohere_index.json"

# try:
#     with open(index_definition_file_path, 'r') as file:
#         index_definition = json.load(file)
# except Exception as e:
#     raise ValueError(f"Error loading index definition from {index_definition_file_path}: {str(e)}")

# Create or Update Search Index
The script checks if the search index already exists in Couchbase. If it exists, the index is updated; if not, a new index is created. This step ensures that the data is properly indexed, allowing for efficient search operations later in the script. The index is associated with a specific bucket, scope, and collection in Couchbase, which organizes the data. Note: the code in the cell below is commented out by default, so unless you enable it the search index named in the configuration step is expected to exist already.


In [13]:
# NOTE: This cell is currently disabled (every statement is commented out), so
# no visible cell in this notebook creates or updates a search index.
# When enabled, it expects `index_definition` from the index-definition cell
# (also disabled), builds a SearchIndex from it, and upserts that index through
# the scope-level search index manager of CB_BUCKET_NAME / SCOPE_NAME.
# NOTE(review): the InternalServerFailureException handler matches a hard-coded
# message for collection 'cohere' and scope 'shared' instead of using
# COLLECTION_NAME / SCOPE_NAME -- confirm this is intended before re-enabling.
# try:
#     scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()
#     existing_indexes = scope_index_manager.get_all_indexes()
#     index_name = index_definition["name"]

#     if index_name in [index.name for index in existing_indexes]:
#         logging.info(f"Index '{index_name}' already exists. Updating...")
#     else:
#         logging.info(f"Creating new index '{index_name}'...")

#     search_index = SearchIndex(
#         name=index_definition["name"],
#         source_type=index_definition.get("sourceType", "couchbase"),
#         idx_type=index_definition["type"],
#         source_name=index_definition["sourceName"],
#         params=index_definition["params"],
#         source_params=index_definition.get("sourceParams", {}),
#         plan_params=index_definition.get("planParams", {})
#     )

#     scope_index_manager.upsert_index(search_index)
#     logging.info(f"Index '{index_name}' successfully created/updated.")

# except QueryIndexAlreadyExistsException:
#     logging.info(f"Index '{index_name}' already exists. Skipping creation/update.")
# except InternalServerFailureException as e:
#     error_message = str(e)
#     logging.error(f"InternalServerFailureException raised: {error_message}")
#     try:
#         error_context = e.context
#         response_body = error_context.response_body
#         if response_body:
#             error_details = json.loads(response_body)
#             error_message = error_details.get('error', '')
#             if "collection: 'cohere' doesn't belong to scope: 'shared'" in error_message:
#                 raise ValueError("Collection 'cohere' does not belong to scope 'shared'. Please check the collection and scope names.")
#     except ValueError as ve:
#         logging.error(str(ve))
#         raise
#     except Exception as json_error:
#         logging.error(f"Failed to parse the error message: {json_error}")
#         raise RuntimeError(f"Internal server error while creating/updating search index: {error_message}")

# except Exception as e:
#     logging.error(f"Error creating/updating search index: {str(e)}")
#     raise RuntimeError(f"Unexpected error while creating/updating search index: {str(e)}")

# Load TREC Dataset
The TREC dataset is loaded using the datasets library. TREC is a well-known dataset used in information retrieval and natural language processing (NLP) tasks. In this script, the dataset will be used to generate embeddings, which are numerical representations of text that capture its meaning in a form suitable for machine learning models.


In [14]:
# Load the first 1000 rows of the TREC training split and log how many rows
# arrived. Any failure is re-raised as a ValueError with the original message.
try:
    trec = load_dataset('trec', split='train[:1000]')
    logging.info("Successfully loaded TREC dataset with %d samples", len(trec))
except Exception as load_error:
    raise ValueError("Error loading TREC dataset: " + str(load_error))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Create Embeddings
Embeddings are created using the Cohere API. Embeddings are vectors (arrays of numbers) that represent the meaning of text in a high-dimensional space. These embeddings are crucial for tasks like semantic search, where the goal is to find text that is semantically similar to a query. The script uses a pre-trained model provided by Cohere to generate embeddings for the text in the TREC dataset.

In [15]:
# Create the Cohere embedding client (model "embed-english-v3.0") used by the
# vector store. Construction errors are re-raised as ValueError.
try:
    embeddings = CohereEmbeddings(
        model="embed-english-v3.0",
        cohere_api_key=COHERE_API_KEY,
    )
except Exception as embeddings_error:
    raise ValueError("Error creating CohereEmbeddings: " + str(embeddings_error))
else:
    logging.info("Successfully created CohereEmbeddings")

# Set Up Vector Store
The vector store is set up to manage the embeddings created in the previous step. The vector store is essentially a database optimized for storing and retrieving high-dimensional vectors. In this case, the vector store is built on top of Couchbase, allowing the script to store the embeddings in a way that can be efficiently searched.


In [16]:
# Bind a CouchbaseVectorStore to the configured bucket/scope/collection and
# search index, using the Cohere embeddings created earlier. Construction
# errors are re-raised as ValueError.
try:
    vector_store = CouchbaseVectorStore(
        embedding=embeddings,
        index_name=INDEX_NAME,
        collection_name=COLLECTION_NAME,
        scope_name=SCOPE_NAME,
        bucket_name=CB_BUCKET_NAME,
        cluster=cluster,
    )
except Exception as store_error:
    raise ValueError("Failed to create vector store: " + str(store_error))
else:
    logging.info("Successfully created vector store")

# Save Data to Vector Store in Batches
To avoid overloading memory, the TREC dataset's text fields are saved to the vector store in batches. This step is important for handling large datasets, as it breaks down the data into manageable chunks that can be processed sequentially. Each piece of text is converted into a document, assigned a unique identifier, and then stored in the vector store.


In [17]:
# Store the dataset's `text` column in the vector store, `batch_size` documents
# per add_documents() call. Any failure is re-raised as RuntimeError, chained to
# the original exception so the root cause stays in the traceback.
# NOTE(review): ids are freshly generated uuid4 values, so re-running this cell
# presumably inserts the same texts again under new ids -- confirm before
# re-running against a populated collection.
try:
    batch_size = 50
    # Read the column once. `trec['text']` is loop-invariant, and depending on
    # the `datasets` version each access may materialise the full column.
    texts = trec['text']
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
        batch = texts[i:i + batch_size]
        documents = [Document(page_content=text) for text in batch]
        # One random id per document in this batch.
        uuids = [str(uuid4()) for _ in documents]
        vector_store.add_documents(documents=documents, ids=uuids)
except Exception as exc:
    raise RuntimeError(f"Failed to save documents to vector store: {str(exc)}") from exc

Processing Batches: 100%|██████████| 20/20 [00:23<00:00,  1.18s/it]


# Set Up Cache
A cache is set up using Couchbase to store intermediate results and frequently accessed data. Caching is important for improving performance, as it reduces the need to repeatedly calculate or retrieve the same data. The cache is linked to a specific collection in Couchbase, and it is used later in the script to store the results of language model queries.


In [18]:
# Create a Couchbase-backed cache in CACHE_COLLECTION and register it as the
# global LLM cache. A failure in either step is re-raised as ValueError.
try:
    cache = CouchbaseCache(
        collection_name=CACHE_COLLECTION,
        scope_name=SCOPE_NAME,
        bucket_name=CB_BUCKET_NAME,
        cluster=cluster,
    )
    logging.info("Successfully created cache")
    set_llm_cache(cache)
except Exception as cache_error:
    raise ValueError("Failed to create cache: " + str(cache_error))

# Create Language Model (LLM)
The script initializes a Cohere language model (LLM) that will be used for generating responses to queries. LLMs are powerful tools for natural language understanding and generation, capable of producing human-like text based on input prompts. The model is configured with specific parameters, such as the temperature, which controls the randomness of its outputs.


In [19]:
# Name of the Cohere chat model. Kept in one place so the constructor argument
# and the log message cannot drift apart.
COHERE_CHAT_MODEL = "command"

# Create the chat model used by the RAG chain. temperature=0 selects the least
# random output setting. Construction errors are re-raised as ValueError,
# chained to the original exception.
try:
    llm = ChatCohere(
        cohere_api_key=COHERE_API_KEY,
        model=COHERE_CHAT_MODEL,
        temperature=0,
    )
    logging.info(f"Successfully created Cohere LLM with model {COHERE_CHAT_MODEL}")
except Exception as exc:
    raise ValueError(f"Error creating Cohere LLM: {str(exc)}") from exc

# Create Retrieval-Augmented Generation (RAG) Chain
A RAG chain is created to combine the capabilities of the vector store and the language model. The RAG chain is a system that first retrieves relevant documents from the vector store and then uses the language model to generate a response based on those documents. This approach allows the model to produce more informed and contextually relevant answers to queries.

In [20]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, as
            returned by the retriever.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt with two slots: the retrieved context and the user's question.
template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
{context}

Question: {question}"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: retrieve -> format -> prompt -> LLM -> plain string.
# The retriever's output is piped through _format_docs so that {context}
# receives the documents' text; feeding the retriever in directly would render
# the Python repr of a list of Document objects into the prompt.
chain = (
    {
        "context": vector_store.as_retriever() | _format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)
logging.info("Successfully created RAG chain")

# Perform Semantic Search
The script performs a semantic search by running a sample query through the RAG chain and vector store. Semantic search goes beyond keyword matching and instead looks for the meaning of the query, returning results that are conceptually similar. This step involves calculating the similarity between the query and the documents in the vector store, ranking them based on their relevance.

In [21]:
query = "What caused the 1929 Great Depression?"

# Get RAG response.
# Durations are measured with time.perf_counter(), a monotonic clock intended
# for timing; time.time() follows the wall clock and can jump if it is adjusted.
# The call is guarded like the search below so that a failure surfaces as a
# RuntimeError chained to the original exception.
try:
    start_time = time.perf_counter()
    rag_response = chain.invoke(query)
    rag_elapsed_time = time.perf_counter() - start_time
except Exception as exc:
    raise RuntimeError(f"Error generating RAG response: {str(exc)}") from exc
logging.info(f"RAG response generated in {rag_elapsed_time:.2f} seconds")
print(f"RAG Response: {rag_response}")

# Perform semantic search: fetch the 10 best matches for the query together
# with their scores and print them.
# NOTE(review): the score is labelled "distance" here; whether a larger value
# means closer or farther depends on the index's similarity metric -- confirm.
try:
    start_time = time.perf_counter()
    search_results = vector_store.similarity_search_with_score(query, k=10)
    elapsed_time = time.perf_counter() - start_time
    results = [{'id': doc.metadata.get('id', 'N/A'), 'text': doc.page_content, 'distance': score}
               for doc, score in search_results]
    logging.info(f"Semantic search completed in {elapsed_time:.2f} seconds")
    print(f"\nSemantic Search Results (completed in {elapsed_time:.2f} seconds):")
    for result in results:
        print(f"Distance: {result['distance']:.4f}, Text: {result['text']}")
except CouchbaseException as exc:
    raise RuntimeError(f"Error performing semantic search: {str(exc)}") from exc


RAG Response: The 1929 Great Depression was caused by a combination of factors, including the stock market crash of 1929, financial institution failures, and a decline in consumer spending and investment. The stock market crash, known as "Black Tuesday", was a significant event that marked the beginning of the Great Depression. However, it was not the only factor contributing to the economic decline. 

Financial institution failures played a role in the depression as banks and other financial institutions became insolvent, which led to customers losing their deposits and a lack of lending further deepening the recession. The decline in consumer spending and investment also contributed to the depression as people lost their savings and were unable to spend money on goods and services, leading to decreased economic activity and job losses.

Other factors included the agricultural crisis of the 1920s, which led to high unemployment rates and poverty, and the failure of President Hoover to