# Install Required Libraries
First, install all the necessary Python libraries using pip:


In [None]:
!pip install couchbase datasets langchain jina tqdm python-dotenv

# Import the Necessary Libraries
Import the libraries required for database operations, AI-related tasks, and utilities for logging, data handling, and environment management:

In [None]:
import json
import logging
import os
import time
import warnings
from datetime import timedelta
from uuid import uuid4

import numpy as np
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.exceptions import CollectionNotFoundException, CouchbaseException, InternalServerFailureException, QueryIndexAlreadyExistsException
from couchbase.management.search import SearchIndex
from couchbase.options import ClusterOptions
from datasets import load_dataset
from dotenv import load_dotenv
from langchain_community.chat_models import JinaChat
from langchain_community.embeddings import JinaEmbeddings
from langchain_core.documents import Document
from langchain_core.globals import set_llm_cache
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_couchbase.cache import CouchbaseCache
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from tqdm import tqdm

# Set Up Logging
Set up logging to monitor the progress and handle any errors that may occur:

In [None]:
# Configure logging once for the notebook: INFO and above, with a
# "timestamp - level - message" layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load Environment Variables
Load environment variables from a .env file, which contains sensitive information such as API keys and database credentials. Ensure that your .env file is correctly configured:

In [None]:
# Read the .env file so the lookups below can see its values.
load_dotenv()

# API keys: no fallback, so these are None when the variable is unset.
JINA_API_KEY = os.environ.get('JINA_API_KEY')
JINACHAT_API_KEY = os.environ.get('JINACHAT_API_KEY')
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Couchbase connection settings, each with a local-development default.
CB_HOST = os.environ.get('CB_HOST', 'couchbase://localhost')
CB_USERNAME = os.environ.get('CB_USERNAME', 'Administrator')
CB_PASSWORD = os.environ.get('CB_PASSWORD', 'password')
CB_BUCKET_NAME = os.environ.get('CB_BUCKET_NAME', 'vector-search-testing')
INDEX_NAME = os.environ.get('INDEX_NAME', 'vector_search_jina')

# Keyspace names used for the documents and for the LLM cache.
SCOPE_NAME = os.environ.get('SCOPE_NAME', 'shared')
COLLECTION_NAME = os.environ.get('COLLECTION_NAME', 'jina')
CACHE_COLLECTION = os.environ.get('CACHE_COLLECTION', 'cache')

# Connect to Couchbase
Connect to the Couchbase cluster using the credentials and connection string loaded from the environment variables:


In [None]:
# Open a connection to the cluster at CB_HOST using username/password auth.
# Any failure is surfaced as a ConnectionError.
try:
    auth = PasswordAuthenticator(CB_USERNAME, CB_PASSWORD)
    options = ClusterOptions(auth)
    cluster = Cluster(CB_HOST, options)
    # Give the cluster up to 5 seconds to become ready before continuing.
    cluster.wait_until_ready(timedelta(seconds=5))
    logging.info("Successfully connected to Couchbase")
except Exception as e:
    # Chain explicitly so the SDK error is reported as the direct cause.
    raise ConnectionError(f"Failed to connect to Couchbase: {str(e)}") from e

# Set Up Collection in Couchbase
Set up a collection within a specified bucket and scope in Couchbase. If the collection doesn’t exist, create it:

In [None]:
def setup_collection(cluster, bucket_name, scope_name, collection_name, clear_existing=True):
    """Ensure a scope/collection pair exists and return the collection handle.

    Args:
        cluster: Connected Couchbase cluster object.
        bucket_name: Name of the bucket that holds the scope.
        scope_name: Scope to use; created when it is not listed in the bucket.
        collection_name: Collection to use; created when it is not listed
            in the scope.
        clear_existing: When true, create a primary index on the collection
            and delete every document in it. Both statements are best-effort:
            failures are logged as warnings and do not abort the setup.

    Returns:
        The handle obtained from ``bucket.scope(scope_name).collection(collection_name)``.

    Raises:
        RuntimeError: If looking up or creating the scope/collection fails,
            or the collection handle cannot be obtained.
    """
    try:
        bucket = cluster.bucket(bucket_name)
        bucket_manager = bucket.collections()

        # Locate the requested scope (if any) among the bucket's scopes.
        scopes = bucket_manager.get_all_scopes()
        target_scope = next((scope for scope in scopes if scope.name == scope_name), None)

        # Creating a collection in a scope that does not exist fails, so
        # create the scope first when it is missing.
        if target_scope is None:
            logging.info(f"Scope '{scope_name}' does not exist. Creating it...")
            bucket_manager.create_scope(scope_name)
            logging.info(f"Scope '{scope_name}' created successfully.")

        collection_exists = target_scope is not None and any(
            col.name == collection_name for col in target_scope.collections
        )

        if not collection_exists:
            logging.info(f"Collection '{collection_name}' does not exist. Creating it...")
            bucket_manager.create_collection(scope_name, collection_name)
            logging.info(f"Collection '{collection_name}' created successfully.")
        else:
            logging.info(f"Collection '{collection_name}' already exists.")

        # Wait for the collection to be available.
        # NOTE(review): obtaining a handle may not contact the server, in which
        # case this loop never actually waits -- confirm against the SDK.
        max_retries = 3
        retry_delay = 1
        for _ in range(max_retries):
            try:
                collection = bucket.scope(scope_name).collection(collection_name)
                break
            except CollectionNotFoundException:
                time.sleep(retry_delay)
        else:
            raise RuntimeError(f"Collection '{collection_name}' not available after {max_retries} retries")

        if clear_existing:
            keyspace = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"

            # Ensure primary index exists (the DELETE below is a query statement).
            try:
                cluster.query(f"CREATE PRIMARY INDEX ON {keyspace}").execute()
                logging.info("Primary index created successfully.")
            except QueryIndexAlreadyExistsException:
                logging.info("Primary index already exists.")
            except Exception as e:
                logging.warning(f"Error creating primary index: {str(e)}")

            # Clear all documents in the collection so each run starts clean.
            try:
                cluster.query(f"DELETE FROM {keyspace}").execute()
                logging.info("All documents cleared from the collection.")
            except Exception as e:
                logging.warning(f"Error while clearing documents: {str(e)}. The collection might be empty.")

        return collection
    except Exception as e:
        raise RuntimeError(f"Error setting up collection: {str(e)}") from e


# Document collection: created if needed, indexed, and emptied.
collection = setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)

# The cache cell later points CouchbaseCache at CACHE_COLLECTION, so make sure
# that collection exists as well. Its contents are left untouched.
setup_collection(cluster, CB_BUCKET_NAME, SCOPE_NAME, CACHE_COLLECTION, clear_existing=False)

# Load Index Definition from JSON
Load the index definition for the search index from a JSON file:

In [None]:
# Path of the search index definition file. Set INDEX_DEFINITION_PATH to
# override it; otherwise the placeholder below is used as before.
index_definition_path = os.getenv('INDEX_DEFINITION_PATH', '/path_to_your_index_file/jina_index.json')

try:
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(index_definition_path, 'r', encoding='utf-8') as file:
        index_definition = json.load(file)
except (OSError, ValueError) as e:
    # OSError covers missing/unreadable files; ValueError covers malformed
    # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    raise ValueError(f"Error loading index definition from {index_definition_path}: {str(e)}") from e

# Create or Update Search Index
Create or update the search index in Couchbase based on the loaded definition:

In [None]:
# Create the search index described by `index_definition`, or update it when
# an index with the same name already exists in the scope.
try:
    # Resolve the name before any SDK call so the exception handlers below
    # can always reference it.
    index_name = index_definition["name"]

    # The vector store is configured with INDEX_NAME, so a definition with a
    # different name would create an index the vector store never queries.
    if index_name != INDEX_NAME:
        logging.warning(
            f"Index definition is named '{index_name}' but INDEX_NAME is '{INDEX_NAME}'; "
            "the vector store is configured with INDEX_NAME."
        )

    scope_index_manager = cluster.bucket(CB_BUCKET_NAME).scope(SCOPE_NAME).search_indexes()

    # Check if index already exists
    existing_index_names = {index.name for index in scope_index_manager.get_all_indexes()}

    if index_name in existing_index_names:
        logging.info(f"Index '{index_name}' already exists. Updating...")
    else:
        logging.info(f"Creating new index '{index_name}'...")

    # Build the SearchIndex from the JSON definition; optional keys fall back
    # to the defaults given here.
    search_index = SearchIndex(
        name=index_name,
        source_type=index_definition.get("sourceType", "couchbase"),
        idx_type=index_definition["type"],
        source_name=index_definition["sourceName"],
        params=index_definition["params"],
        source_params=index_definition.get("sourceParams", {}),
        plan_params=index_definition.get("planParams", {})
    )

    # Upsert the index (create if not exists, update if exists)
    scope_index_manager.upsert_index(search_index)
    logging.info(f"Index '{index_name}' successfully created/updated.")
except QueryIndexAlreadyExistsException:
    logging.info(f"Index '{index_name}' already exists. Skipping creation/update.")
except InternalServerFailureException as e:
    logging.error(f"InternalServerFailureException raised: {str(e)}")
    raise RuntimeError(f"Internal server error while creating/updating search index: {str(e)}") from e
except Exception as e:
    raise RuntimeError(f"Unexpected error creating/updating search index: {str(e)}") from e

# Load the TREC Dataset
Load the TREC dataset using the Hugging Face Datasets library:

In [None]:
# Load the first 1000 rows of the TREC training split.
try:
    trec = load_dataset('trec', split='train[:1000]')
    logging.info(f"Successfully loaded TREC dataset with {len(trec)} samples")
except Exception as e:
    # Chain explicitly so the loader's own error is reported as the cause.
    raise ValueError(f"Error loading TREC dataset: {str(e)}") from e

# Create Jina Embeddings
Initialize the Jina Embeddings model, which will convert text into vector representations:

In [None]:
# Build the embeddings client used to vectorize documents and queries.
try:
    embeddings = JinaEmbeddings(
        jina_api_key=JINA_API_KEY, model_name="jina-embeddings-v2-base-en"
    )
    logging.info("Successfully created JinaEmbeddings")
except Exception as e:
    # Chain explicitly so the underlying error is reported as the cause.
    raise ValueError(f"Error creating JinaEmbeddings: {str(e)}") from e

# Set Up Couchbase Vector Store
Create a vector store in Couchbase using the previously created embeddings:

In [None]:
# Bind the vector store to the document collection, the embeddings client and
# the search index named by INDEX_NAME.
try:
    vector_store = CouchbaseVectorStore(
        cluster=cluster,
        bucket_name=CB_BUCKET_NAME,
        scope_name=SCOPE_NAME,
        collection_name=COLLECTION_NAME,
        embedding=embeddings,
        index_name=INDEX_NAME,
    )
    logging.info("Successfully created vector store")
except Exception as e:
    # Chain explicitly so the underlying error is reported as the cause.
    raise ValueError(f"Failed to create vector store: {str(e)}") from e


# Save Data to Vector Store in Batches
Save the TREC dataset to the vector store in batches to optimize the process:

In [None]:
# Number of documents sent to the vector store per add_documents call.
batch_size = 50
try:
    # Look the text column up once; it does not change between batches, so
    # there is no need to index the dataset again on every iteration.
    texts = trec['text']
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
        batch = texts[i:i + batch_size]
        documents = [Document(page_content=text) for text in batch]
        # One fresh random ID per document in this batch.
        uuids = [str(uuid4()) for _ in documents]
        vector_store.add_documents(documents=documents, ids=uuids)
except Exception as e:
    # Chain explicitly so the underlying error is reported as the cause.
    raise RuntimeError(f"Failed to save documents to vector store: {str(e)}") from e


# Set Up Couchbase Cache
Set up a Couchbase-based cache to speed up repetitive LLM operations:


In [None]:
# Create a cache backed by CACHE_COLLECTION and register it as the global
# LLM cache.
try:
    cache = CouchbaseCache(
        cluster=cluster,
        bucket_name=CB_BUCKET_NAME,
        scope_name=SCOPE_NAME,
        collection_name=CACHE_COLLECTION,
    )
    logging.info("Successfully created cache")
    set_llm_cache(cache)
except Exception as e:
    # Chain explicitly so the underlying error is reported as the cause.
    raise ValueError(f"Failed to create cache: {str(e)}") from e

# Create Jina Language Model (LLM)
Initialize a Jina language model to interact with the vector store:

In [None]:
# Instantiate the chat model; on failure, log a hint and re-raise the
# original exception unchanged.
try:
    # NOTE(review): "jina-clip-v1" looks like an embedding-model identifier --
    # confirm it is a valid model for JinaChat.
    llm = JinaChat(
        temperature=0,
        jinachat_api_key=JINACHAT_API_KEY,
        model="jina-clip-v1",
    )
    logging.info("Successfully created JinaChat")
except Exception as err:
    logging.error(f"Error creating JinaChat: {str(err)}. Please check your API key and network connection.")
    raise

# Create a Retrieval-Augmented Generation (RAG) Chain
Build a RAG chain that combines the LLM with the vector store to enhance responses:

In [None]:
# Raw template strings: a fixed system instruction, and a human turn that
# carries the retrieved context and the user's question.
system_template = "You are a helpful assistant that answers questions based on the provided context."
human_template = "Context: {context}\n\nQuestion: {question}"

# Wrap each template in its role-specific prompt class.
system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Final chat prompt: system message first, then the human message.
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)

def format_docs(docs):
    """Return the ``page_content`` of every item in *docs*, separated by blank lines."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Inputs for the prompt: "context" is the formatted result of a similarity
# search on the incoming value; "question" passes that value through as-is.
rag_inputs = {
    "context": lambda x: format_docs(vector_store.similarity_search(x)),
    "question": RunnablePassthrough(),
}

# Pipeline: build inputs -> fill the chat prompt -> call the LLM.
prompt_stage = rag_inputs | chat_prompt
chain = prompt_stage | llm
logging.info("Successfully created RAG chain")

# Sample Query and Perform Semantic Search
Finally, perform a semantic search using the vector store and generate a response with the RAG chain:

In [None]:
query = "What caused the 1929 Great Depression?"

# Run the RAG chain and time it on its own.
start_time = time.time()
rag_response = chain.invoke(query)
rag_elapsed_time = time.time() - start_time
logging.info(f"RAG response generated in {rag_elapsed_time:.2f} seconds")

print(f"RAG Response: {rag_response}")

# Perform semantic search
try:
    # Use a separate timer: reusing start_time would add the RAG latency
    # above to the reported search time.
    search_start_time = time.time()
    search_results = vector_store.similarity_search_with_score(query, k=10)
    # NOTE(review): 'distance' holds whatever score the vector store returns;
    # confirm whether a higher or a lower value means a closer match.
    results = [{'id': doc.metadata.get('id', 'N/A'), 'text': doc.page_content, 'distance': score}
               for doc, score in search_results]
    elapsed_time = time.time() - search_start_time
    logging.info(f"Semantic search completed in {elapsed_time:.2f} seconds")
except CouchbaseException as e:
    # Chain explicitly so the SDK error is reported as the cause.
    raise RuntimeError(f"Error performing semantic search: {str(e)}") from e

print(f"\nSemantic Search Results (completed in {elapsed_time:.2f} seconds):")
for result in results:
    print(f"Distance: {result['distance']:.4f}, Text: {result['text']}")


Before running this notebook, replace any placeholders with your actual paths or credentials as needed, then run the cells sequentially.