## code Using Vertex AI Vector Search and Vertex AI Embeddings for Text for StackOverflow Questions

In [1]:
! pip3 install --upgrade google-cloud-aiplatform \
                        google-cloud-storage \
                        'google-cloud-bigquery[pandas]'

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.44.0-py2.py3-none-any.whl.metadata (27 kB)
Collecting google-cloud-storage
  Downloading google_cloud_storage-2.15.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting google-cloud-bigquery[pandas]
  Downloading google_cloud_bigquery-3.19.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.34.1->google-cloud-aiplatform)
  Downloading google_api_core-2.17.1-py3-none-any.whl.metadata (2.7 kB)
Downloading google_cloud_aiplatform-1.44.0-py2.py3-none-any.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading google_cloud_storage-2.15.0-py2.py3-none-any.whl (123 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)


{'status': 'ok', 'restart': True}

In [1]:
# Setup the environment values for your project.
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION = "us-east4"

In [2]:
# initialize the Vertex AI Python SDK
import vertexai
vertexai.init(project = PROJECT_ID,
              location = REGION)

In [3]:
import math
from typing import Any, Generator

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project=PROJECT_ID)

In [4]:
# Define the BigQuery query for the remote dataset.
QUERY_TEMPLATE = """
        SELECT distinct q.id, q.title, q.body
        FROM (SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` where Score>0 ORDER BY View_Count desc) AS q
        LIMIT {limit} OFFSET {offset};
        """

In [5]:
#Create a function to access the BigQuery data in chunks.
def query_bigquery_chunks(
    max_rows: int, rows_per_chunk: int, start_chunk: int = 0
) -> Generator[pd.DataFrame, Any, None]:
    for offset in range(start_chunk, max_rows, rows_per_chunk):
        query = QUERY_TEMPLATE.format(limit=rows_per_chunk, offset=offset)
        query_job = client.query(query)
        rows = query_job.result()
        df = rows.to_dataframe()
        df["title_with_body"] = df.title + "\n" + df.body
        yield df

In [6]:
df = next(query_bigquery_chunks(max_rows=1000, rows_per_chunk=1000))

# Examine the data
df.head()

Unnamed: 0,id,title,body,title_with_body
0,9909619,powershell embedded c# using ObservableCollection,<p>I am trying to add a type in powershell fro...,powershell embedded c# using ObservableCollect...
1,10257596,Experiencing difficulty with a bubblesort exer...,<p>I have an exercise where I need to use a W...,Experiencing difficulty with a bubblesort exer...
2,9917638,Is there a limit on the number of followers we...,<p>Is there a <em>limit</em> on the number of ...,Is there a limit on the number of followers we...
3,9277233,Wrapper for packets generation,<p>Here's the task: \nI need to generate packe...,Wrapper for packets generation\n<p>Here's the ...
4,9318558,C++ simple iterator implementation,<p>I have arrays like:</p>\n\n<pre><code>templ...,C++ simple iterator implementation\n<p>I have ...


In [8]:
# Load the Vertex AI Embeddings for Text model.
from typing import List, Optional
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [9]:
# Define an embedding method that uses the model.
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

In [10]:
# Create a generate_batches to split results in batches of 5 to be sent to the embeddings API.
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple

import numpy as np
from tqdm.auto import tqdm


# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]

In [11]:
### Encapsulate the process of generating batches and calling the embeddings API in 
# a method called encode_text_to_embedding_batched. This method also handles rate-limiting using time.sleep. 
# For production use cases, you would want a more sophisticated rate-limiting mechanism that takes retries
# into account.
def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 10, batch_size: int = 5
) -> Tuple[List[bool], np.ndarray]:

    embeddings_list: List[List[float]] = []

    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(
                executor.submit(functools.partial(encode_texts_to_embeddings), batch)
            )
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [
        embedding is not None for sentence, embedding in zip(sentences, embeddings_list)
    ]
    embeddings_list_successful = np.squeeze(
        np.stack([embedding for embedding in embeddings_list if embedding is not None])
    )
    return is_successful, embeddings_list_successful

In [12]:
# Encode a subset of questions for validation
questions = df.title.tolist()[:500]
is_successful, question_embeddings = encode_text_to_embedding_batched(
    sentences=df.title.tolist()[:500]
)

# Filter for successfully embedded sentences
questions = np.array(questions)[is_successful]

  0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
DIMENSIONS = len(question_embeddings[0])

print(DIMENSIONS)

768


In [14]:
""" 
Sort questions in order of similarity. According to the embedding documentation,
the similarity of embeddings is calculated using the dot-product, with np.dot. Once you 
have the similarity score, sort the results and print them for inspection. 1 means very similar, 
0 means very different.
"""
import random

question_index = random.randint(0, 99)

print(f"Query question = {questions[question_index]}")

# Get similarity scores for each embedding by using dot-product.
scores = np.dot(question_embeddings[question_index], question_embeddings.T)

# Print top 20 matches
for index, (question, score) in enumerate(
    sorted(zip(questions, scores), key=lambda x: x[1], reverse=True)[:20]
):
    print(f"\t{index}: {question}: {score}")


Query question = How does Photoshop (Or drawing programs) blit?
	0: How does Photoshop (Or drawing programs) blit?: 0.9999994525137095
	1: Draw lines using Direct2D without scaling the line thickness: 0.6454603340836131
	2: creating alpha images in java: 0.6414637840086157
	3: Java Graphics Composite inconsistencies: 0.6050554842020555
	4: How to prevent open layers DrawInteraction to draw LineString if the user clicks "wrong" area?: 0.6037664083695721
	5: C# - How is this technique called?: 0.5999846486654776
	6: What are the benefits of Watershed Segmentation in digital image processing?: 0.5974293295814623
	7: How do I allocate memory by using new in C++?: 0.5915967575663932
	8: multithread running on difference processes or same process?: 0.5866380902228935
	9: c++ vector of vectors destruction: 0.5844996501018944
	10: create Wordperfect file using Corel SDK: 0.582828886107702
	11: Setting up Patches in Netlogo that decrease in a value from each other: 0.5794306820856339
	12: Why M

In [15]:
import tempfile
from pathlib import Path

# Create temporary file to write embeddings to
embeddings_file_path = Path(tempfile.mkdtemp())

print(f"Embeddings directory: {embeddings_file_path}")

Embeddings directory: /tmp/tmpgy618oyn


In [16]:
import gc
import json

BQ_NUM_ROWS = 5000
BQ_CHUNK_SIZE = 1000
BQ_NUM_CHUNKS = math.ceil(BQ_NUM_ROWS / BQ_CHUNK_SIZE)

START_CHUNK = 0

# Create a rate limit of 300 requests per minute. Adjust this depending on your quota.
API_CALLS_PER_SECOND = 300 / 60
# According to the docs, each request can process 5 instances per request
ITEMS_PER_REQUEST = 5

# Loop through each generated dataframe, convert
for i, df in tqdm(
    enumerate(
        query_bigquery_chunks(
            max_rows=BQ_NUM_ROWS, rows_per_chunk=BQ_CHUNK_SIZE, start_chunk=START_CHUNK
        )
    ),
    total=BQ_NUM_CHUNKS - START_CHUNK,
    position=-1,
    desc="Chunk of rows from BigQuery",
):
    # Create a unique output file for each chunk
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{i+START_CHUNK}.json"
    )
    with open(chunk_path, "a") as f:
        id_chunk = df.id

        # Convert batch to embeddings
        is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
            sentences=df.title_with_body.to_list(),
            api_calls_per_second=API_CALLS_PER_SECOND,
            batch_size=ITEMS_PER_REQUEST,
        )

        # Append to file
        embeddings_formatted = [
            json.dumps(
                {
                    "id": str(id),
                    "embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for id, embedding in zip(id_chunk[is_successful], question_chunk_embeddings)
        ]
        f.writelines(embeddings_formatted)

        # Delete the DataFrame and any other large data structures
        del df
        gc.collect()

Chunk of rows from BigQuery:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [18]:
#Create your Cloud Storage bucket.
BUCKET_URI = f"gs://{PROJECT_ID}-unique"
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://qwiklabs-gcp-04-3588373b3a8a-unique/...
ServiceException: 409 A Cloud Storage bucket named 'qwiklabs-gcp-04-3588373b3a8a-unique' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [19]:
remote_folder = f"{BUCKET_URI}/{embeddings_file_path.stem}/"
! gsutil -m cp -r {embeddings_file_path}/* {remote_folder}

Copying file:///tmp/tmpgy618oyn/tmpgy618oyn_1.json [Content-Type=application/json]...
Copying file:///tmp/tmpgy618oyn/tmpgy618oyn_0.json [Content-Type=application/json]...
Copying file:///tmp/tmpgy618oyn/tmpgy618oyn_2.json [Content-Type=application/json]...
Copying file:///tmp/tmpgy618oyn/tmpgy618oyn_3.json [Content-Type=application/json]...
Copying file:///tmp/tmpgy618oyn/tmpgy618oyn_4.json [Content-Type=application/json]...
\ [5/5 files][ 38.2 MiB/ 38.2 MiB] 100% Done                                    
Operation completed over 5 objects/38.2 MiB.                                     


Create an Index in Vertex AI Vector Search for your embeddings

In [20]:
DISPLAY_NAME = "stack_overflow"
DESCRIPTION = "question titles and bodies from stackoverflow"

### Create the index. Notice that the index reads the embeddings from the Cloud Storage bucket. The indexing process can take from 45 minutes up to 60 minutes. Wait for completion, and then proceed. You can open a different Google Cloud Console page, navigate to Vertex AI Vector search, and see how the index is being created.
## The code you provided creates a special index called a tree-AH index using Google Cloud AI Platform's Matching Engine. Here's a breakdown of what each part does:

tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(...): This line creates a new Matching Engine Index object using the create_tree_ah_index method. This method is specifically designed for creating a tree-AH algorithm based index.

Parameters:

- display_name (required): This is a human-readable name you give to the index for easier identification.
- contents_delta_uri (required): This specifies the location of your data (likely stored in Google Cloud Storage) in JSONL format. JSONL is a text format where each line represents a JSON object. Your data needs to be in this format for the index to process it.
- dimensions (optional): This defines the dimensionality of the embeddings in your data. If you're unsure, you can leave it as the default.
- approximate_neighbors_count (optional): This sets the number of approximate nearest neighbors to search for during queries. The default is 150, but you can adjust it based on your needs.
- distance_measure_type (optional): This specifies the method used to calculate distances between embeddings. The default is "DOT_PRODUCT_DISTANCE" which is commonly used for text embeddings.
- leaf_node_embedding_count (optional): This controls the number of embeddings stored in each leaf node of the index tree. It affects search speed and memory usage.
- leaf_nodes_to_search_percent (optional): This defines the percentage of leaf nodes to explore during a search. Higher values improve accuracy but might take longer.
- description (optional): This allows you to add a description for the index, providing more context about its purpose.


In [21]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

DIMENSIONS = 768

tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=remote_folder,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
    description=DESCRIPTION,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/388696548301/locations/us-east4/indexes/5808007446005809152/operations/2023010135642734592
MatchingEngineIndex created. Resource name: projects/388696548301/locations/us-east4/indexes/5808007446005809152
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/388696548301/locations/us-east4/indexes/5808007446005809152')


In [22]:
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME

'projects/388696548301/locations/us-east4/indexes/5808007446005809152'

In [23]:
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

In [24]:
# Create an IndexEndpoint so that it can be accessed via an API.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/388696548301/locations/us-east4/indexEndpoints/5391917061683281920/operations/6193343390587813888
MatchingEngineIndexEndpoint created. Resource name: projects/388696548301/locations/us-east4/indexEndpoints/5391917061683281920
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/388696548301/locations/us-east4/indexEndpoints/5391917061683281920')


Deploy your index to the created endpoint. This can take up to 15 minutes.

In [None]:
DEPLOYED_INDEX_ID = "deployed_index_id_unique"

DEPLOYED_INDEX_ID


my_index_endpoint = my_index_endpoint.deploy_index(
    index=tree_ah_index, deployed_index_id=DEPLOYED_INDEX_ID
)

my_index_endpoint.deployed_indexes

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/388696548301/locations/us-east4/indexEndpoints/5391917061683281920
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/388696548301/locations/us-east4/indexEndpoints/5391917061683281920/operations/5554395193454624768


Verify number of declared items matches the number of embeddings. Each IndexEndpoint can have multiple indexes deployed to it. For each index, you can retrieve the number of deployed vectors using the index_endpoint._gca_resource.index_stats.vectors_count. The numbers may not match exactly due to potential rate-limiting failures incurred when using the embedding service.

In [None]:
number_of_vectors = sum(
    aiplatform.MatchingEngineIndex(
        deployed_index.index
    )._gca_resource.index_stats.vectors_count
    for deployed_index in my_index_endpoint.deployed_indexes
)

print(f"Expected: {BQ_NUM_ROWS}, Actual: {number_of_vectors}")

# Create online queries
After you build your indexes, you may query against the deployed index to find nearest neighbors.

In [None]:
test_embeddings = encode_texts_to_embeddings(sentences=["Install GPU for Tensorflow"])

In [None]:
NUM_NEIGHBOURS = 10

response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=NUM_NEIGHBOURS,
)

response

In [None]:
or match_index, neighbor in enumerate(response[0]):
    print(f"https://stackoverflow.com/questions/{neighbor.id}")

Task 9. Clean up the Google Cloud environment
To clean up all Google Cloud resources used in this project, you can delete the Google Cloud project you used for the lab. You can also manually delete resources that you created by running the following code.

In [None]:
import os

delete_bucket = False

# Force undeployment of indexes and delete endpoint
my_index_endpoint.delete(force=True)

# Delete indexes
tree_ah_index.delete()

if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -rf {BUCKET_URI}