In [40]:
import os

# import openai
import sys

sys.path.append("../..")

from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment;
# the return value of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())  # read local .env file
# Both keys are None when the corresponding variable is not set.
# NOTE(review): neither key is referenced again in the visible part of this
# notebook (the Weaviate headers use a gcloud access token) — confirm they
# are still needed.
GOOGLE_STUDIO_API_KEY = os.getenv("GOOGLE_STUDIO_API_KEY")
GOOGLE_OAUTH_API_KEY = os.getenv("GOOGLE_OAUTH_API_KEY")

In [46]:
import subprocess
from typing import Optional

import weaviate


def refresh_token() -> Optional[str]:
    """Fetch a fresh Google Cloud access token via the ``gcloud`` CLI.

    Runs ``gcloud auth print-access-token`` and returns what it prints on
    stdout, with surrounding whitespace stripped.

    Returns:
        The access token, or ``None`` when no token could be obtained: the
        ``gcloud`` executable could not be launched, the command exited with
        a non-zero status, or it printed nothing. A message describing the
        failure is printed in each of those cases.
    """
    try:
        result = subprocess.run(
            ["gcloud", "auth", "print-access-token"],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # Raised (e.g. FileNotFoundError) when gcloud is missing or cannot be
        # executed; report it the same way as a failed command.
        print(f"Error refreshing token: {exc}")
        return None

    if result.returncode != 0:
        print(f"Error refreshing token: {result.stderr}")
        return None

    token = result.stdout.strip()
    if not token:
        # A zero exit status with empty output still yields nothing usable.
        print("Error refreshing token: gcloud returned an empty token")
        return None
    return token


def re_instantiate_weaviate(
    version: str = "1.25.1",
    backup_path: str = "/home/chris/work/backups",
) -> weaviate.WeaviateClient:
    """Connect to an embedded Weaviate instance using a fresh gcloud token.

    Obtains a new access token with ``refresh_token()`` and passes it as both
    the ``X-Google-Vertex-Api-Key`` and ``X-Google-Api-Key`` request headers.
    The instance is started with the ``backup-filesystem`` and
    ``multi2vec-palm`` modules enabled.

    Args:
        version: Weaviate server version requested from
            ``weaviate.connect_to_embedded``.
        backup_path: Directory exported to the instance as
            ``BACKUP_FILESYSTEM_PATH``.

    Returns:
        The client returned by ``weaviate.connect_to_embedded``.

    Raises:
        RuntimeError: If no access token could be obtained. Connecting with
            a missing token would only postpone the failure to the first
            request that needs Google credentials.
    """
    token = refresh_token()
    if not token:
        raise RuntimeError(
            "Could not obtain a Google access token; "
            "check that `gcloud auth print-access-token` works"
        )

    client = weaviate.connect_to_embedded(
        version=version,
        environment_variables={
            "ENABLE_MODULES": "backup-filesystem,multi2vec-palm",
            "BACKUP_FILESYSTEM_PATH": backup_path,
        },
        headers={
            "X-Google-Vertex-Api-Key": token,
            "X-Google-Api-Key": token,
        },
    )
    return client

In [47]:
# Run this every ~60 minutes
# Each call fetches a new gcloud access token and builds a new client with it.
# NOTE(review): the 60-minute cadence presumably matches the access token's
# lifetime — confirm. This cell does not close a previously created client.
client = re_instantiate_weaviate()

Started /home/chris/.cache/weaviate-embedded: process ID 29001


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-05-27T15:17:06-04:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-05-27T15:17:06-04:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-05-27T15:17:06-04:00"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":44483},"time":"2024-05-27T15:17:06-04:00"}
{"address":"172.20.9.248:53891","level":"info","msg":"starting cloud rpc server ...","time":"2024-05-27T15:17:06-04:00"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-05-27T15:17:06-04:00"}
{"address":"172.20.9.248:44483","level":"info","msg":"tcp transport","tcpMaxP

{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:65e80a1d-4f2a-48e5-ba8e-2c86526b6fe3 Type:INIT Version:1.25.1 Modules:backup-filesystem,multi2vec-palm NumObjects:0 OS:linux Arch:amd64}","time":"2024-05-27T15:17:09-04:00"}
{"action":"bootstrap","level":"info","msg":"node reporting ready, node has probably recovered cluster from raft config. Exiting bootstrap process","time":"2024-05-27T15:17:10-04:00"}


In [48]:
# Readiness check on the client created above (the saved output is True).
client.is_ready()

True

In [33]:
# Fetch the instance metadata; the saved output lists the hostname, the
# enabled modules (backup-filesystem, multi2vec-palm) and the server version.
meta_info = client.get_meta()
print(meta_info)

{'hostname': 'http://127.0.0.1:8079', 'modules': {'backup-filesystem': {'backupsPath': '/home/chris/work/backups'}, 'multi2vec-palm': {'documentationHref': 'https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings', 'name': 'Google PaLM Multimodal Module'}}, 'version': '1.25.1'}


## Create Collection

In [35]:
from weaviate.classes.config import Configure

collection_name = "Rooms"

# Multimodal vectorizer settings: the "image" and "video" properties are the
# ones that get embedded, using the given Vertex AI project, region and model.
rooms_vectorizer = Configure.Vectorizer.multi2vec_palm(
    project_id="semi-random-dev",
    location="us-central1",
    model_id="multimodalembedding@001",
    dimensions=1408,
    image_fields=["image"],
    video_fields=["video"],
)

# Drop any existing collection of the same name so this cell can be re-run
# and always starts from an empty collection.
already_exists = client.collections.exists(collection_name)
if already_exists:
    client.collections.delete(collection_name)

# Kept as the last expression so the new collection is shown as cell output.
client.collections.create(
    name=collection_name,
    vectorizer_config=rooms_vectorizer,
)

{"level":"info","msg":"Created shard rooms_w8i9e2n3wOnN in 1.136078ms","time":"2024-05-27T15:07:20-04:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-05-27T15:07:20-04:00","took":43880}


<weaviate.collections.collection.Collection at 0x7f761a73b510>

## Helper functions

In [36]:
import base64


def toBase64(path):
    """Return the contents of the file at ``path`` as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded as
    UTF-8, so the result is a plain ``str``.
    """
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

## Insert Images into Weaviate

In [37]:
rooms = client.collections.get(collection_name)
img_path = "./pics/test/"

# os.listdir returns entries in arbitrary order and may include directories;
# sort for a reproducible insertion order and keep regular files only, since
# toBase64 would fail when asked to open a directory.
source = sorted(
    entry
    for entry in os.listdir(img_path)
    if os.path.isfile(os.path.join(img_path, entry))
)

# Rate-limited batch import: one object per file in the image directory.
with rooms.batch.rate_limit(requests_per_minute=100) as batch:
    for name in source:
        print(f"Adding {name}")

        path = os.path.join(img_path, name)

        batch.add_object(
            {
                "name": name,  # name of the file
                "path": path,  # path to the file to display result
                # this gets vectorized - "image" was configured in
                # vectorizer_config as the property holding images
                "image": toBase64(path),
                "mediaType": "image",  # a label telling us how to display the resource
            }
        )

# The batch context does not raise for objects the server rejected (for
# example when vectorization fails), so surface those failures explicitly.
failed_objects = rooms.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} of {len(source)} objects failed to import")
    print(f"First failure: {failed_objects[0]}")

Adding 0247O3-20.jpg
Adding 0247A1-20.jpg
Adding 0246O4-20.jpg
Adding 0133O2-20.jpg
Adding 0246O3-20.jpg
Adding 0134A1-20.jpg
Adding 0133A1-20.jpg
Adding 0247O4-20.jpg
Adding 0134O1-20.jpg
Adding 0132O1-20.jpg
Adding 0247O1-20.jpg
Adding 0247O2-20.jpg
Adding 0133O1-20.jpg


## Text to Media Search

In [50]:
# Get a handle on the collection again so this cell does not depend on the
# insertion cell having been run in the same session.
rooms = client.collections.get(collection_name)

# Text-to-media search: ask for the 3 nearest objects to the text "bathtub",
# returning only the listed properties (the base64 image itself is left out).
# NOTE(review): the saved output of this cell is a 403 from Google
# ("Permission 'aiplatform.endpoints.predict' denied") — confirm that the
# account behind the gcloud token has access to the configured project/model.
response = rooms.query.near_text(
    query="bathtub",
    return_properties=["name", "path", "mediaType"],
    limit=3,
)

WeaviateQueryError: Query call with protocol GRPC search failed with message explorer: get class: vectorize params: vectorize params: vectorize params: vectorize keywords: remote client vectorize: connection to Google failed with status: 403 error: Permission 'aiplatform.endpoints.predict' denied on resource '//aiplatform.googleapis.com/projects/semi-random-dev/locations/us-central1/publishers/google/models/multimodalembedding@001' (or it may not exist)..

In [None]:
# Walk over the objects returned by the query and hand each one's properties
# to the two display helpers.
# NOTE(review): json_print and display_media are not defined or imported in
# the visible part of this notebook, and this cell has never been executed
# (In [None]) — confirm where they come from before relying on this cell.
for obj in response.objects:
    json_print(obj.properties)
    display_media(obj.properties)

In [45]:
# Close the client; the saved output shows the embedded Weaviate instance
# shutting down in response.
client.close()

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-05-27T15:16:58-04:00"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-05-27T15:16:58-04:00"}
{"action":"telemetry_push","level":"info","msg":"telemetry terminated","payload":"\u0026{MachineID:10d805aa-9452-405a-ac8d-a9c82883e4e3 Type:TERMINATE Version:1.25.1 Modules:backup-filesystem,multi2vec-palm NumObjects:0 OS:linux Arch:amd64}","time":"2024-05-27T15:16:58-04:00"}
{"level":"info","msg":"shutting down raft sub-system ...","time":"2024-05-27T15:16:58-04:00"}
{"level":"info","msg":"transferring leadership to another server","time":"2024-05-27T15:16:58-04:00"}
{"error":"cannot find peer","level":"error","msg":"transferring leadership","time":"2024-05-27T15:16:58-04:00"}
{"level":"info","msg":"close raft-net ...","time":"2024-05-27T15:16:58-04:00"}
{"level":"info","msg":"closing log store ...","time":"2024-05-27T15:16:58-04:00"}
{"lev