# 01 Vector Search

In [0]:
%pip install --upgrade --force-reinstall -qqq databricks-vectorsearch
# Restart the Python process after the install above — presumably so that the
# freshly installed package version is the one imported by the following cells.
dbutils.library.restartPython()

# Create Vector Search Endpoint

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Vector Search client shared by the cells below: it lists/creates/fetches the
# endpoint and creates the Delta Sync index.
vsc = VectorSearchClient(disable_notice=True)

In [0]:
# Name of the Vector Search endpoint that will host the index created later.
vector_search_endpoint_name = "vector-search"

# Collect the names of the endpoints that already exist so this cell can be
# re-run safely. The listing response may not carry an "endpoints" key when the
# workspace has no endpoints yet, so fall back to an empty list instead of
# raising KeyError on a fresh workspace.
endpoint_names = [
    endpoint["name"]
    for endpoint in vsc.list_endpoints().get("endpoints", [])
]

# Create the endpoint only when it is missing.
if vector_search_endpoint_name not in endpoint_names:
    vsc.create_endpoint(
        name=vector_search_endpoint_name,
        endpoint_type="STANDARD"
    )
    print(f"Created endpoint {vector_search_endpoint_name}")
else:
    print(f"Endpoint {vector_search_endpoint_name} already exists")

In [0]:
# Fetch the endpoint by name; the bare expression on the last line makes the
# notebook render the returned value as the cell output.
endpoint = vsc.get_endpoint(name=vector_search_endpoint_name)
endpoint

# Create Azure OpenAI Embedding Model Endpoint

In [0]:
import mlflow.deployments

# MLflow deployments client for the "databricks" target; used below to list and
# create the embedding model serving endpoint.
client = mlflow.deployments.get_deploy_client("databricks")

In [0]:
# Name of the serving endpoint that fronts the Azure OpenAI embedding model.
embedding_model_endpoint_name = "text-embedding-3-large"

# Azure OpenAI connection settings — fill these in before running the cell.
# NOTE(review): the key is passed below as `openai_api_key_plaintext`. Consider
# storing it in a Databricks secret and referencing it instead of pasting it
# into the notebook — confirm the exact field against the external models docs.
api_base = ""
api_key = ""

# Names of the serving endpoints that already exist, so re-running is safe.
endpoint_names = [endpoint["name"] for endpoint in client.list_endpoints()]

if embedding_model_endpoint_name not in endpoint_names:
    # Fail fast with a clear message instead of submitting an endpoint
    # definition with blank credentials. Only checked on the create path, so
    # re-running against an existing endpoint still works with the placeholders.
    if not api_base or not api_key:
        raise ValueError(
            "Set api_base and api_key before creating endpoint "
            f"{embedding_model_endpoint_name}"
        )

    client.create_endpoint(
        name=embedding_model_endpoint_name,
        config={
            "served_entities": [
                {
                    "name": "text-embedding-3-large",
                    "external_model": {
                        "name": "text-embedding-3-large",
                        "provider": "openai",
                        "task": "llm/v1/embeddings",
                        "openai_config": {
                            "openai_api_type": "azure",
                            "openai_api_key_plaintext": api_key,
                            "openai_api_base": api_base,
                            "openai_deployment_name": "text-embedding-3-large",
                            "openai_api_version": "2024-10-21"
                        },
                    }
                }
            ]
        }
    )
    print(f"Created endpoint {embedding_model_endpoint_name}")
else:
    print(f"Endpoint {embedding_model_endpoint_name} already exists")

# Source Table

In [0]:
# Read the documentation dataset table into a DataFrame and preview it in the
# notebook; `source_df` is written out as the index's source table below.
dataset = "databricks_databricks_documentation_dataset.v01.docs"
source_df = spark.table(dataset)
display(source_df)

In [0]:
# Fully qualified name of the Delta table that the vector index syncs from.
source_table_name = "main.billing.databricks_documentation"

# Remove any previous copy so the table is created from scratch by the write.
spark.sql(f"DROP TABLE IF EXISTS {source_table_name}")

# Persist the dataset as a Delta table, passing the change-data-feed option on
# the writer.
(
    source_df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(source_table_name)
)

# Create Vector Index

In [0]:
# Fully qualified name of the vector index to create.
index_name = "main.billing.databricks_documentation_index"

# Create a Delta Sync index on the endpoint, keyed on "id", with embeddings
# computed from the "content" column by the embedding model endpoint.
index = vsc.create_delta_sync_index(
    endpoint_name=vector_search_endpoint_name,
    index_name=index_name,
    primary_key="id",
    source_table_name=source_table_name,
    pipeline_type="TRIGGERED",
    embedding_dimension=3072,
    embedding_source_column="content",
    embedding_model_endpoint_name=embedding_model_endpoint_name
)

# Block until the index reports ready. Without this, running the notebook
# top-to-bottom reaches the similarity search while the index is presumably
# still provisioning / doing its first sync.
index.wait_until_ready()

# Show the index status message as the cell output.
index.describe()['status']['message']

# Similarity Search

In [0]:
# Query the index for the two entries most similar to the text "databricks",
# returning the "id" and "content" columns.
results = index.similarity_search(
    query_text="databricks",
    columns=["id", "content"],
    num_results=2
)

# Tolerate a response without "result"/"data_array" (e.g. possibly when nothing
# matches) by iterating an empty list instead of raising KeyError.
rows = results.get("result", {}).get("data_array", [])

for result in rows:
    # Position 1 is "content", following the order of `columns` above.
    print(f"Content:\n{result[1]}")
    print("\n-----\n")