In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Installation

Install the Vertex AI SDK for Python (pinned to version 1.35.0) and the latest version of the Cloud Storage client library.

In [None]:
# Install the packages
! pip3 install --upgrade google-cloud-aiplatform==1.35.0 \
                         google-cloud-storage

Install the latest version of `google-cloud-vision`, which is used to filter for safe images.

In [None]:
# Install the packages
! pip install google-cloud-vision

#### Set your project ID

In [1]:
PROJECT_ID = "ai-1684952810"  # @param {type:"string"}

# Set the project id
! gcloud config set project {PROJECT_ID}
REGION = "us-central1"  # @param {type: "string"}

Updated property [core/project].


#### Defining encoding functions

Create an EmbeddingPredictionClient which encapsulates the logic to call the embedding API.

In [2]:
import base64
import time
import typing

from google.cloud import aiplatform
from google.protobuf import struct_pb2

import numpy as np

class EmbeddingResponse(typing.NamedTuple):
    """Pair of embeddings produced by one multimodal embedding request.

    Either field may be ``None``: ``EmbeddingPredictionClient.get_mm_embedding``
    leaves a field unset when the matching input (text or image) was not sent.
    """

    # Embedding of the text input, or None when no text was sent.
    text_embedding: typing.Optional[typing.Sequence[float]]
    # Embedding of the image input, or None when no image was sent.
    image_embedding: typing.Optional[typing.Sequence[float]]


def load_image_bytes(image_uri: str, timeout: float = 30.0) -> typing.Optional[bytes]:
    """Load image bytes from a remote or local URI.

    Args:
        image_uri: An ``http://`` / ``https://`` URL, or a path on the local
            filesystem.
        timeout: Seconds to wait for the remote server before giving up.
            Only used for HTTP(S) URIs.

    Returns:
        The raw bytes of the image. For HTTP(S) URIs, ``None`` is returned
        when the server does not answer with status 200.

    Raises:
        OSError: If the local file cannot be opened or read.
        urllib.error.URLError: If the remote host cannot be reached.
    """
    if image_uri.startswith(("http://", "https://")):
        # Imported locally so the function works without any module-level
        # import; only the standard library is needed for the download.
        import urllib.error
        import urllib.request

        try:
            with urllib.request.urlopen(image_uri, timeout=timeout) as response:
                if response.status == 200:
                    return response.read()
                return None
        except urllib.error.HTTPError:
            # urllib raises on 4xx/5xx responses; report them like any other
            # non-200 answer so callers keep seeing ``None``.
            return None

    # The context manager guarantees the file handle is closed after reading.
    with open(image_uri, "rb") as image_file:
        return image_file.read()

class EmbeddingPredictionClient:
    """Wrapper around Prediction Service Client.

    Sends text and/or image content to the ``multimodalembedding@001``
    publisher model and returns the embeddings from the first prediction.
    """

    # Model resource suffix shared by every request this client sends.
    _MODEL_PATH = "publishers/google/models/multimodalembedding@001"

    def __init__(
        self,
        project: str,
        location: str = "us-central1",
        api_regional_endpoint: typing.Optional[str] = None,
    ):
        """Create the underlying prediction client.

        Args:
            project: Google Cloud project ID used in the model resource name.
            location: Region used in the model resource name.
            api_regional_endpoint: API host to connect to. When omitted it is
                derived from ``location`` so that the host and the resource
                name always refer to the same region.
        """
        if api_regional_endpoint is None:
            api_regional_endpoint = f"{location}-aiplatform.googleapis.com"
        client_options = {"api_endpoint": api_regional_endpoint}
        # Initialize client that will be used to create and send requests.
        # This client only needs to be created once, and can be reused for multiple requests.
        self.client = aiplatform.gapic.PredictionServiceClient(
            client_options=client_options
        )
        self.location = location
        self.project = project

    def _endpoint(self) -> str:
        """Return the full resource name of the embedding model."""
        return (
            f"projects/{self.project}/locations/{self.location}"
            f"/{self._MODEL_PATH}"
        )

    @staticmethod
    def _build_instance(
        text: typing.Optional[str] = None,
        image_bytes: typing.Optional[bytes] = None,
    ):
        """Build the request instance holding the given text and/or image."""
        instance = struct_pb2.Struct()
        if text is not None:
            instance.fields["text"].string_value = text
        if image_bytes is not None:
            # The API expects the image as a base64-encoded string.
            encoded_content = base64.b64encode(image_bytes).decode("utf-8")
            image_struct = instance.fields["image"].struct_value
            image_struct.fields["bytesBase64Encoded"].string_value = encoded_content
        return instance

    def _predict(self, instance):
        """Send a single instance and return the first prediction."""
        response = self.client.predict(
            endpoint=self._endpoint(), instances=[instance]
        )
        return response.predictions[0]

    def get_mm_embedding(
        self,
        text: typing.Optional[str] = None,
        image_file: typing.Optional[str] = None,
    ):
        """Embed a text and/or an image in a single request.

        Args:
            text: Text to embed. Skipped when empty or ``None``.
            image_file: URL or local path of the image to embed. Skipped when
                empty or ``None``.

        Returns:
            An ``EmbeddingResponse``; the field of any input that was not sent
            is ``None``.

        Raises:
            ValueError: If neither input is given, or if no text is given and
                the image could not be loaded.
        """
        if not text and not image_file:
            raise ValueError("At least one of text or image_file must be specified.")

        # Load image file; empty or missing content is treated as "no image".
        image_bytes = None
        if image_file:
            image_bytes = load_image_bytes(image_file) or None

        if not text and image_bytes is None:
            # Without this guard an empty instance would be sent to the API.
            raise ValueError(f"No image data could be loaded from {image_file!r}.")

        prediction = self._predict(
            self._build_instance(text=text or None, image_bytes=image_bytes)
        )

        text_embedding = list(prediction["textEmbedding"]) if text else None
        image_embedding = (
            list(prediction["imageEmbedding"]) if image_bytes is not None else None
        )
        return EmbeddingResponse(
            text_embedding=text_embedding, image_embedding=image_embedding
        )

    def get_text_embedding(self, text: str):
        """Return the embedding of ``text`` as a list of values."""
        prediction = self._predict(self._build_instance(text=text))
        return list(prediction["textEmbedding"])

    def get_image_embedding(self, image_file: str):
        """Return the embedding of the image at ``image_file`` as a list of values.

        Raises:
            ValueError: If no image data could be loaded from ``image_file``.
        """
        image_bytes = load_image_bytes(image_file)
        if image_bytes is None:
            # Fail with a clear message instead of a TypeError from b64encode.
            raise ValueError(f"No image data could be loaded from {image_file!r}.")

        prediction = self._predict(self._build_instance(image_bytes=image_bytes))
        return list(prediction["imageEmbedding"])
    
def embedding_distance(
    embedding1: np.ndarray, embedding2: np.ndarray
) -> float:
    """
    Score two embeddings against each other using the dot product.

    Despite the function name, the result is a similarity score rather than a
    metric distance: a larger value means the two vectors are more alike.
    For unit-length vectors the dot product equals the cosine similarity.

    Args:
        embedding1 (np.ndarray): The first embedding vector. Any array-like
            value accepted by ``np.asarray`` (e.g. a list of floats) works.
        embedding2 (np.ndarray): The second embedding vector, same shape as
            ``embedding1``.

    Returns:
        float: The dot product of the two embeddings.

    Raises:
        ValueError: If either embedding is ``None`` or the shapes differ.
    """
    if embedding1 is None or embedding2 is None:
        raise ValueError("Both embeddings must be provided.")

    # Accept plain sequences as well as arrays; ndarray inputs are not copied.
    vector1 = np.asarray(embedding1)
    vector2 = np.asarray(embedding2)

    if vector1.shape != vector2.shape:
        raise ValueError("Embeddings must have the same shape.")

    return np.dot(vector1, vector2)


#### Test the encoding function


##### Image vs Text Comparison 

In [3]:
# Use the REGION configured above for both the resource location and the API
# host, so changing REGION in the config cell actually takes effect here.
e = EmbeddingPredictionClient(
    PROJECT_ID,
    location=REGION,
    api_regional_endpoint=f"{REGION}-aiplatform.googleapis.com",
)

In [23]:
# Embed the caption and the image in one request, then score how well they
# match (dot product: a higher value means a closer match).
r = e.get_mm_embedding("Vinho Branco", "images/screenshot-20240422-23.20.19.png")
text_embedding = r.text_embedding
image_embeddings = r.image_embedding
distances = embedding_distance(np.array(text_embedding), np.array(image_embeddings))
print(f"Distance: {distances}")

Distance: 0.0809066133683251


In [24]:
# Same image scored against a different caption (dot product: a higher value
# means a closer match).
r = e.get_mm_embedding("Vinho Tinto", "images/screenshot-20240422-23.20.19.png")
text_embedding = r.text_embedding
image_embeddings = r.image_embedding
distances = embedding_distance(np.array(text_embedding), np.array(image_embeddings))
print(f"Distance: {distances}")

Distance: 0.15198525073474012


In [None]:
# NOTE(review): this cell repeats the "Vinho Tinto" comparison from the
# previous cell verbatim; consider removing it.
r = e.get_mm_embedding("Vinho Tinto", "images/screenshot-20240422-23.20.19.png")
text_embedding = r.text_embedding
image_embeddings = r.image_embedding
distances = embedding_distance(np.array(text_embedding), np.array(image_embeddings))
print(f"Distance: {distances}")

Distance: 0.15198525073474012


In [None]:
# NOTE(review): this cell repeats the "Vinho Tinto" comparison from the
# earlier cells verbatim; consider removing it.
r = e.get_mm_embedding("Vinho Tinto", "images/screenshot-20240422-23.20.19.png")
text_embedding = r.text_embedding
image_embeddings = r.image_embedding
distances = embedding_distance(np.array(text_embedding), np.array(image_embeddings))
print(f"Distance: {distances}")

Distance: 0.15198525073474012


In [56]:
# Score a long, descriptive caption against a second screenshot (dot product:
# a higher value means a closer match).
r = e.get_mm_embedding("Criança de 1 ano, sentada em tapete, criança para estar feliz brincando, cachorro de brinquedo, balde, cone colorido, cruz de malta, ", "images/screenshot-20240423-22.24.07.png")
text_embedding = r.text_embedding
image_embeddings = r.image_embedding
distances = embedding_distance(np.array(text_embedding), np.array(image_embeddings))
print(f"Distance: {distances}")

Distance: 0.16450461565085006


##### Text vs Text Comparison 

In [53]:
# Embed two product titles separately and score them against each other.
text_embedding1 = e.get_text_embedding("Vinho Tinto Seco De Martino Premium Gallardía 2019 - Chile 750ml")
text_embedding2 = e.get_text_embedding("Vinho Tinto Seco Echo Reserva Especial 2019 - Chile 750ml")
r = text_embedding2  # `r` keeps pointing at the most recent result
first_vector = np.array(text_embedding1)
second_vector = np.array(text_embedding2)
distances = embedding_distance(first_vector, second_vector)
print(f"Distance: {distances}")

Distance: 0.7984022215800726


##### Image vs Image Comparison 

In [57]:
# Embed two screenshots separately and score them against each other.
image_embedding1 = e.get_image_embedding("images/screenshot-20240424-06.01.17.png")
image_embedding2 = e.get_image_embedding("images/screenshot-20240424-06.00.46.png")
r = image_embedding2  # `r` keeps pointing at the most recent result
first_vector = np.array(image_embedding1)
second_vector = np.array(image_embedding2)
distances = embedding_distance(first_vector, second_vector)
print(f"Distance: {distances}")

Distance: 0.6747244359544193


##### Embeddings API
[Docs](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings#sample-request)

In [None]:
!curl -X POST \
     -H "Authorization: Bearer $(gcloud auth print-access-token)" \
     -H "Content-Type: application/json; charset=utf-8" \
     -d @images/text_payload.json \
     "https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/publishers/google/models/multimodalembedding@001:predict"