## Build scalable SaaS AI apps with Weaviate

## Connect to Weaviate

In [1]:
import weaviate
import os

headers = {
    "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY"),
    "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")
}  # Replace with your OpenAI API key

client = weaviate.connect_to_local(
    headers=headers,
)

## Add data

In [2]:
import weaviate.classes.config as wc
import pandas as pd
import requests
from weaviate.util import generate_uuid5
from tqdm import tqdm
import json
from datetime import datetime, timezone

client.collections.delete("Movie")

client.collections.create(
    name="Movie",
    properties=[
        wc.Property(name="title", data_type=wc.DataType.TEXT),
        wc.Property(name="overview", data_type=wc.DataType.TEXT),
        wc.Property(name="vote_average", data_type=wc.DataType.NUMBER),
        wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY),
        wc.Property(name="release_date", data_type=wc.DataType.DATE),
        wc.Property(name="tmdb_id", data_type=wc.DataType.INT),
    ],
    vectorizer_config=wc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wc.Configure.Generative.openai(),
    replication_config=wc.Configure.replication(factor=3),
    vector_index_config=wc.Configure.VectorIndex.hnsw(
        quantizer=wc.Configure.VectorIndex.Quantizer.pq()
    ),
    sharding_config=wc.Configure.sharding(
        desired_count=3,
    )
)

# Get the data
data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json"
resp = requests.get(data_url)
df = pd.DataFrame(resp.json())

# Get the collection
movies = client.collections.get("Movie")

# Enter context manager
with movies.batch.rate_limit(2400) as batch:
    # Loop through the data
    for i, movie in tqdm(df.iterrows()):
        # Convert data types
        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie["release_date"], "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )
        # Convert a JSON array to a list of integers
        genre_ids = json.loads(movie["genre_ids"])

        # Build the object payload
        movie_obj = {
            "title": movie["title"],
            "overview": movie["overview"],
            "vote_average": movie["vote_average"],
            "genre_ids": genre_ids,
            "release_date": release_date,
            "tmdb_id": movie["id"],
        }

        # Add object to batch queue
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie["id"])
            # references=reference_obj  # You can add references here
        )
        # Batcher automatically sends batches

# Check for failed objects
if len(movies.batch.failed_objects) > 0:
    print(f"Failed to import {len(movies.batch.failed_objects)} objects")

680it [00:00, 13591.07it/s]


In [6]:
wc.Configure.VectorIndex.Quantizer.pq??

[0;31mSignature:[0m
[0mwc[0m[0;34m.[0m[0mConfigure[0m[0;34m.[0m[0mVectorIndex[0m[0;34m.[0m[0mQuantizer[0m[0;34m.[0m[0mpq[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbit_compression[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcentroids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoder_distribution[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mweaviate[0m[0;34m.[0m[0mcollections[0m[0;34m.[0m[0mclasses[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0mPQEncoderDistribution[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoder_type[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mweaviate[0m[0;34m.[0m[0mcollections[0m[0;34m.[0m[0mclasses[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0mPQEncoderType[0m[0;34m][0m [0;34m=[0m [0

## Searches

In [3]:
import weaviate.classes.query as wq

# Get the collection
movies = client.collections.get("Movie")

# Perform query
response = movies.query.near_text(
    query="dystopian future",
    limit=5,
    return_metadata=wq.MetadataQuery(distance=True),
    filters=wq.Filter.by_property("release_date").greater_than(datetime(2020, 1, 1))
)

# Inspect the response
for o in response.objects:
    print(
        o.properties["title"], o.properties["release_date"].year
    )  # Print the title and release year (note the release date is a datetime object)
    print(
        f"Distance to query: {o.metadata.distance:.3f}\n"
    )  # Print the distance of the object from the query


            If you want to use a different timezone, please specify it in the datetime object. For example:
            datetime.datetime(2021, 1, 1, 0, 0, 0, tzinfo=datetime.timezone(-datetime.timedelta(hours=2))).isoformat() = 2021-01-01T00:00:00-02:00
            


Dune 2021
Distance to query: 437.000

Godzilla vs. Kong 2021
Distance to query: 441.000

Tenet 2020
Distance to query: 446.000

Eternals 2021
Distance to query: 447.000

The Adam Project 2022
Distance to query: 447.000



## RAG

In [4]:
from pprint import pprint

# Perform query
response = movies.generate.near_text(
    query="dystopian future",
    limit=5,
    single_prompt="Translate this into French: {title}",
    grouped_task="What do these movies have in common?",
)

# Inspect the response
pprint(response.generated)  # Print the generated text (the commonalities between them)
for o in response.objects:
    print(o.properties["title"], "|", o.generated)  # Print the title & generated text (the title, in French)

('These movies all take place in a dystopian future society where the '
 'characters must navigate oppressive systems and fight for survival. They '
 'also explore themes of control, rebellion, and the consequences of '
 'technological advancements on humanity.')
In Time | À temps
Gattaca | Gattaca
I, Robot | Je, Robot
Mad Max: Fury Road | Mad Max: Fury Road
The Maze Runner | Le Labyrinthe
