Copyright 2024 Google LLC.<br>
SPDX-License-Identifier: Apache-2.0

In [None]:
#@title Licensed under the Apache License, Version 2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<h1>Step 0: Set variables for your Bigtable instance</h1>
If you do not yet have a Bigtable instance, see <a href="https://cloud.google.com/bigtable/docs/creating-instance">creating an instance</a>. Replace the values below with your own project, instance, and table IDs.

In [None]:
# Identifiers for the Google Cloud project, the Bigtable instance, and the
# table that the rest of this notebook reads from and writes to.
PROJECT_ID = 'google.com:cloud-bigtable-dev'
INSTANCE_ID = 'crosbie-instance'
TABLE_ID = 'knn_intro'

In [None]:
# If you are running this notebook from Colab, you also need to authenticate
# your user against the project identified by PROJECT_ID.
from google.colab import auth
auth.authenticate_user(project_id=PROJECT_ID)

<h1>Step 1: Create a table to store the text, embeddings and search phrase</h1>


In [None]:
from google.cloud import bigtable
from google.cloud.bigtable import column_family

# Handles for the project, instance, and table configured in Step 0.
client = bigtable.Client(project=PROJECT_ID, admin=True)
instance = client.instance(INSTANCE_ID)
table = instance.table(TABLE_ID)

# Every column family gets its own garbage-collection rule that keeps at most
# two versions of each cell.
column_families = {
    family_name: column_family.MaxVersionsGCRule(2)
    for family_name in ("docs", "search_phrase")
}

# Only create the table on the first run so the cell can be re-executed safely.
if table.exists():
  print("Table already exists")
else:
  table.create(column_families=column_families)

<h1>Step 2: Embed texts with a pre-trained, foundational model from Vertex</h1>

Next, generate the text and embeddings which you will store in Bigtable along with the associated keys.

For full documentation, please see <a href="https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings">get text embeddings</a> or [get multimodal embeddings](https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings).


In [None]:
from typing import List, Optional
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

# LLM used only to generate the sample text that will be embedded.
# NOTE(review): pinned model versions are retired over time - confirm this one
# is still served in your project before running.
model = GenerativeModel("gemini-1.5-pro-001")

# Number of sample text chunks to generate, and the prompt used for each one.
NUM_CHUNKS = 10
CHUNK_PROMPT = (
    "Generate a paragraph between 10 and 20 words that is about either Bigtable or Generative AI"
)

# Use generative AI to create the list of text chunks, printing each as it arrives.
# This can be replaced with a static list of text items or your own data.
chunks = []
for i in range(NUM_CHUNKS):
  response = model.generate_content(CHUNK_PROMPT)
  chunks.append(response.text)
  print(response.text)

#create embeddings for the chunks of text
def embed_text(
    texts: Optional[List[str]] = None,
    task: str = "RETRIEVAL_DOCUMENT",
    model_name: str = "text-embedding-004",
    dimensionality: Optional[int] = 128,
) -> List[List[float]]:
    """Embeds texts with a pre-trained, foundational model.

    Args:
        texts: Strings to embed. When omitted (None), the notebook-level
            ``chunks`` list is used. It is looked up at call time, so a
            regenerated ``chunks`` is picked up without redefining this
            function.
        task: Task type passed to each ``TextEmbeddingInput``.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Requested embedding size. When falsy (None or 0) the
            ``output_dimensionality`` argument is not sent at all.

    Returns:
        One list of floats (``embedding.values``) per embedding returned by
        the model.
    """
    if texts is None:
        # Resolve the default lazily instead of freezing whichever list
        # `chunks` happened to reference when the function was defined.
        texts = chunks
    # Named `embedding_model` so it is not confused with the notebook-level
    # generative `model`.
    embedding_model = TextEmbeddingModel.from_pretrained(model_name)
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    embeddings = embedding_model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]

# Embed the generated chunks using embed_text's default arguments, then
# confirm completion without printing the vectors themselves.
embeddings = embed_text()
print("embeddings created for text phrases")

Generative AI crafts realistic new content, like text or images, by learning patterns from existing data. 

Generative AI models learn patterns from data to create new content like text, images, and even music. 

Bigtable efficiently stores and manages massive datasets, enabling real-time data access and analysis. 

Generative AI models learn from existing data to create new, original content like text, images, or music. 

Generative AI models learn from vast datasets to create new, original content like text, images, and music. 

Bigtable, Google's distributed NoSQL database, handles massive datasets with high performance and scalability. 

Bigtable, Google's distributed NoSQL database, handles massive datasets with high performance and scalability. 

Generative AI models, like ChatGPT, learn from data to create new content, including text, images, and even music. 

Generative AI models learn from data to create new, original content like text, images, and even music. 

Bigtable effic

In [None]:
# Create embeddings for the chunks of text.
# embed_text is already defined in the previous cell; declaring an identical
# second copy here would only shadow it and let the two drift apart, so this
# cell simply calls it.
embeddings = embed_text()

# Show a short preview instead of dumping every vector in full.
print(f"{len(embeddings)} embeddings created")
if embeddings:
  print(f"first embedding has {len(embeddings[0])} values, starting with {embeddings[0][:5]}")

[[-0.08450032770633698, 0.02769993618130684, -0.03482632339000702, -0.017077140510082245, 0.019390733912587166, 0.04392266273498535, 0.03357764706015587, 0.015328948386013508, 0.006103283725678921, -0.014262494631111622, -0.024556264281272888, 0.016103453934192657, 0.02031383290886879, 0.011007575318217278, 0.03205610811710358, -0.07855264097452164, 0.030707791447639465, 0.0038270477671176195, -0.1282244324684143, 0.027971018105745316, 0.0429173968732357, -0.012058214284479618, -0.020504970103502274, -0.061760395765304565, -0.04263300448656082, 0.015370801091194153, 0.05843237414956093, 0.01672925055027008, 0.007277846336364746, -0.019984232261776924, 0.02035752683877945, 0.09591261297464371, 0.026950383558869362, -0.03744083270430565, -0.0034160178620368242, -0.04068847373127937, -0.007037016097456217, -0.0026350365951657295, 0.05283447727560997, -0.04651201516389847, -0.04621433466672897, 0.023651549592614174, -0.021708453074097633, 0.043830644339323044, 0.011450933292508125, -0.0112

<h1>Step 3: Define functions that let you convert into byte objects</h1>
Bigtable is optimized for key-value pairs and generally will store data as byte objects.

You will need to convert the embeddings returned by Vertex AI, which are stored as lists of floats in Python, into byte objects that can be inserted into a Bigtable table. The function below does this.


In [None]:
import struct
def floats_to_bytes(float_list):
    """
    Serialize floats into one bytes object, 4 big-endian bytes per value.

    Parameters:
    float_list (list of float): The values to serialize.

    Returns:
    bytes: The 4-byte big-endian encodings of the values, concatenated in input order.
    """
    # Compile the '>f' format once and reuse its pack method for every value.
    pack_float32 = struct.Struct('>f').pack
    return b"".join(pack_float32(value) for value in float_list)

<h1> Step 4: Write the embeddings to Bigtable</h1>

In [None]:
from google.cloud.bigtable.data import RowMutationEntry
from google.cloud.bigtable.data import SetCell

# Embed the chunks explicitly so each row written below pairs a text with the
# embedding that was computed from it.
embeddings = embed_text(texts=chunks)
if len(embeddings) != len(chunks):
  raise ValueError(
      f"Expected {len(chunks)} embeddings but received {len(embeddings)}"
  )

mutations = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):

  # Convert each embedding into a byte object.
  vector = floats_to_bytes(embedding)

  # Set the row key which will be used to pull the range of documents
  # (ex. doc type or user id).
  row_key = f"doc_{i}"

  row = table.direct_row(row_key)

  # Store the embedding, in its byte object form, in the "docs" column family.
  row.set_cell("docs", "embedding", vector)

  # Store the text associated with the vector under the same row key.
  row.set_cell("docs", "text", chunk)

  mutations.append(row)

# Write the rows to Bigtable. mutate_rows returns one status per row; a
# non-zero code means that row was not written, so report it rather than
# silently discarding the result.
statuses = table.mutate_rows(mutations)
failed = [
    (mutation.row_key, status)
    for mutation, status in zip(mutations, statuses)
    if status.code != 0
]
for failed_key, status in failed:
  print(f"Error writing row {failed_key!r}: {status.message}")
print(f"Wrote {len(statuses) - len(failed)} of {len(mutations)} rows")

[, , , , , , , , , ]

<h1> Step 5: Perform a KNN search using Bigtable SQL</h1>

From Python, we can use the GoogleSQL COSINE_DISTANCE function to find the similarity between our text embeddings and search phrases that we give it. Since this computation may take time to process, we will want to use Bigtable’s Asynchronous Data Client to execute the SQL query.<br>

Because the search phrase is "Apache HBase", an open source database that was originally modeled after Bigtable, the response returned should be one of the generated text descriptions that describes Bigtable.

In [None]:
from google.cloud.bigtable.data import BigtableDataClientAsync

#first embed the search phrase
search_embedding = embed_text(texts=["Apache HBase"])

query = """
        select _key, docs['text'] as description
        FROM knn_intro
        ORDER BY COSINE_DISTANCE(TO_VECTOR32(docs['embedding']), {search_embedding})
        LIMIT 1;
        """

async def execute_query():
  async with BigtableDataClientAsync(project=PROJECT_ID) as client:
    local_query = query
    async for row in await client.execute_query(query.format(search_embedding=search_embedding[0]), INSTANCE_ID):
      return(row["_key"],row["description"])

await execute_query()

(b'doc_1', b"Bigtable is Google's distributed, NoSQL database designed to handle massive datasets with high availability. \n")
