# Prerequisites
In order to run this tutorial, you will need access to a Couchbase Cluster either through Couchbase Capella or by running it locally. 

In [1]:
couchbase_cluster_url = "couchbase://localhost"
couchbase_username = "Administrator"
couchbase_password = "password"
couchbase_bucket = "hugging_face"

## Imports

In [2]:
from pathlib import Path
from datetime import timedelta
from transformers import pipeline, AutoModel, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import (ClusterOptions, ClusterTimeoutOptions,
                               QueryOptions)
import couchbase.search as search
from couchbase.options import SearchOptions
from couchbase.vector_search import VectorQuery, VectorSearch
import uuid

## Couchbase Connection

In [3]:
auth = PasswordAuthenticator(
    couchbase_username,
    couchbase_password
)

In [4]:
cluster = Cluster(couchbase_cluster_url, ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))

bucket = cluster.bucket(couchbase_bucket);
scope = bucket.scope("_default")
collection = scope.collection("_default")

## Creating Couchbase Vector Search Index
In order to store generated with Hugging Face embeddings onto a Couchbase Cluster, a vector search index needs to be created first. We included a sample index definition that will work with this tutorial in a file named `fts_index.json` located in the folder with this tutorial. The definition can be used to create a vector index using Couchbase server web console, on more information on vector indexes, please read [Create a Vector Search Index with the Server Web Console](https://docs.couchbase.com/server/current/vector-search/create-vector-search-index-ui.html). 

In [5]:
search_index_name = couchbase_bucket + "._default.vector_test"
search_index = cluster.search_indexes().get_index(search_index_name)

## Hugging Face Initialization

In [6]:
embedding_model = HuggingFaceEmbeddings()
embedding_model.client.tokenizer.pad_token =  embedding_model.client.tokenizer.eos_token

  embedding_model = HuggingFaceEmbeddings()


## Embedding Documents

In [7]:
texts = [
    "Couchbase Server is a multipurpose, distributed database that fuses the strengths of relational databases such as SQL and ACID transactions with JSON’s versatility, with a foundation that is extremely fast and scalable.",
    "It’s used across industries for things like user profiles, dynamic product catalogs, GenAI apps, vector search, high-speed caching, and much more."
]
embeddings = []
for i in range(0, len(texts)):
    embeddings.append(embedding_model.embed_query(texts[i]))

## Storing Embeddings in Couchbase

In [21]:
for i in range(0, len(texts)):
    doc = {
        "id": str(uuid.uuid4()),
        "text": texts[i],
        "vector": embeddings[i],
    }
    collection.upsert(doc["id"], doc)

## Searching For Embeddings

In [23]:
search_embedding = embedding_model.embed_query("name a multipurpose database with distributed capability")

search_req = search.SearchRequest.create(search.MatchNoneQuery()).with_vector_search(
    VectorSearch.from_vector_query(
        VectorQuery(
            "vector", search_embedding, num_candidates=1
        )
    )
)
result = scope.search(
    "vector_test", 
    search_req, 
    SearchOptions(
        limit=13, 
        fields=["vector", "id", "text"]
    )
)
for row in result.rows():
    print("Found answer: " + row.id + "; score: " + str(row.score))
    doc = collection.get(row.id)
    print("Answer text: " + doc.value["text"])
    


Found answer: 0d07b238-b0cb-4485-8345-959b6f5ade80; score: 0.9256537010609045
Answer text: Couchbase Server is a multipurpose, distributed database that fuses the strengths of relational databases such as SQL and ACID transactions with JSON’s versatility, with a foundation that is extremely fast and scalable.
