### Water Quality Agent Setup
Only modify inputs in the `Configure` section

This notebook does the following:
1. Creates Delta Tables from CSV files
2. Creates a Volume for PDFs
3. Creates a Vector Search Endpoint
4. Creates a Unity Catalog registered function to use as an Agent tool
5. Creates a Vector Search Index to use as an Agent tool

In [0]:
%pip install databricks-vectorsearch
# Restart the Python process so the package installed above can be imported
# by the cells that follow.
dbutils.library.restartPython()

In [0]:
import re
import pandas as pd
from pyspark.sql import SparkSession
from databricks.vector_search.client import VectorSearchClient

### Configure

In [0]:
# Unity Catalog location (catalog.schema) under which this notebook creates
# the tables, the volume and the registered function.
catalog = "users"
schema = "david_hurley"
# Volume whose *.pdf files are parsed later in the notebook.
volume = "water_quality_pdfs"

# Vector Search endpoint the index is created on.
vector_search_endpoint_name = "demo_endpoint"
# Delta table that stores the parsed PDF chunks (source table of the index).
delta_table_name = "david_water_quality_guidelines"
# Name of the Vector Search index built from that Delta table.
vector_search_index_name = "david_water_quality_guidelines_vs"

### Create Data Assets

In [0]:
# Create the Unity Catalog containers used by the rest of the notebook.
# Every statement uses IF NOT EXISTS so the cell is idempotent: re-running the
# notebook must not fail just because the volume was created on a previous run.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{schema}.{volume}")

In [0]:
# CSV files (relative to ./assets) to load. Each file becomes a Delta table
# named after the file without its ".csv" extension.
csv_to_create_tables_for = [
    "alberta_wtest_well.csv",
    "alberta_wtest_well_test.csv",
    "alberta_wtest_wqual_analysis_info.csv",
    "alberta_wtest_wqual_details.csv"
]

for filename in csv_to_create_tables_for:
    # Fully qualified name of the table this file is written to.
    table_name = f"{catalog}.{schema}.{filename.split('.csv')[0]}"

    # Read with pandas, then hand the frame to Spark to write it as a table.
    # low_memory=False makes pandas read the file in one pass instead of chunks.
    df = pd.read_csv(f"./assets/{filename}", low_memory=False)
    spark_df = spark.createDataFrame(df)
    spark_df.write.mode("overwrite").saveAsTable(table_name)

    # Report the table that was actually written (not the CSV file name).
    print(f"Created {table_name}")

**You need to manually copy `water-quality-guideline.pdf` from the `assets` directory into the newly created `water_quality_pdfs` volume.**

### Create Vector Search Endpoint
If working as a group, only one member needs to run the cell below.

In [0]:
# NOTE(review): this whole cell is commented out, so running the notebook does
# not create (or look up) the endpoint named by `vector_search_endpoint_name`.
# Presumably the shared endpoint already exists -- confirm before relying on it.
# Uncomment the lines below to reuse the endpoint if it is found and create a
# STANDARD endpoint otherwise.
# client = VectorSearchClient()

# try:
#     existing_endpoint = client.get_endpoint(name=vector_search_endpoint_name)
#     print(f"Found existing vector search endpoint: {vector_search_endpoint_name}")
# except:
#     client.create_endpoint(
#         name=vector_search_endpoint_name,
#         endpoint_type="STANDARD"
#     )

### Create Unity Catalog Registered Function Agent Tool

In [0]:
# Register a Unity Catalog SQL table function the agent can call as a tool.
# Given a user position and a search radius it returns the water-quality
# parameters measured at test wells whose distance to the user is below the
# radius, together with that distance.
#
# The function parameters are referenced as `lookup_water_well_tests.<name>`.
# SQL identifiers are case-insensitive and unqualified names resolve to table
# columns before function parameters, so a bare `longitude` inside the query
# would pick up the well's own LONGITUDE column instead of the user's value.
#
# NOTE(review): `LATTITUDE` (double T) is kept exactly as written in the
# original query; it is presumably the column name in the source CSV -- confirm.
spark.sql(f"""
          CREATE OR REPLACE FUNCTION {catalog}.{schema}.lookup_water_well_tests(
            longitude FLOAT COMMENT 'Longitude of the user',
            latitude FLOAT COMMENT 'Latitude of the user',
            radius INT COMMENT 'Radius to search within, in meters'
          )
          RETURNS TABLE
          COMMENT 'Returns results of water wells tested near user'
          RETURN
            WITH wells_near_user AS (
              SELECT
                  w.WELL_ID,
                  st_distancespheroid(
                    st_point(w.LONGITUDE, w.LATTITUDE),
                    st_point(lookup_water_well_tests.longitude, lookup_water_well_tests.latitude)
                  ) AS user_distance_to_test_well_in_meters
              FROM {catalog}.{schema}.alberta_wtest_well AS w
            )
            SELECT
              t4.PARAMETER_NAME,
              t4.PARAMETER_VALUE,
              t1.user_distance_to_test_well_in_meters
            FROM
            wells_near_user as t1
            LEFT JOIN {catalog}.{schema}.alberta_wtest_well_test as t2
            ON t1.WELL_ID = t2.WELL_ID
            LEFT JOIN {catalog}.{schema}.alberta_wtest_wqual_details as t3
            ON t2.WELL_TEST_ID = t3.WELL_TEST_ID
            LEFT JOIN {catalog}.{schema}.alberta_wtest_wqual_analysis_info as t4
            ON t4.WQUAL_DETAILS_ID = t3.WQUAL_DETAILS_ID
            WHERE t1.user_distance_to_test_well_in_meters < lookup_water_well_tests.radius
""")

In [0]:
# Smoke-test the registered function with a sample coordinate
# (longitude, latitude) and a radius of 30000.
# The function is addressed through the `catalog` / `schema` values from the
# Configure section instead of a hard-coded location, so the cell keeps working
# when those inputs are changed.
display(
    spark.sql(
        f"SELECT * FROM {catalog}.{schema}.lookup_water_well_tests(-114.0719, 51.0447, 30000)"
    )
)

### Parse PDF Document and Create Vector Search Index as Agent Tool

In [0]:
# Only the namespaced imports are kept: a star import of pyspark.sql.functions
# would shadow Python builtins such as sum, min, max and filter.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Parse every PDF in the volume with the Databricks ai_parse_document()
# function and pull the interesting parts out of the parsed result.
parsed_docs_df = spark.sql(f"""
          WITH corpus AS (
            SELECT
              path,
              ai_parse_document(content) AS parsed
            FROM
              READ_FILES(
                "/Volumes/{catalog}/{schema}/{volume}/*.pdf",
                FORMAT => 'binaryFile'
              )
          )
          SELECT
            path,
            parsed:document:pages AS pages,
            parsed:document:elements AS elements,
            parsed:error_status AS error_status,
            parsed:metadata AS metadata
          FROM corpus
          """)

# Naively chunk the text: one chunk per parsed element.
df_chunked = (
    parsed_docs_df
    .select(F.explode(F.col("elements").cast("array<struct<content:string>>")).alias("element"))
    .select(F.col("element.content").alias("content"))
    # Elements without any text would only add empty rows to the index source.
    .where(F.col("content").isNotNull() & (F.trim(F.col("content")) != ""))
    # Sequential integer id, later used as the primary key of the index.
    # A window without partitioning pulls all rows into one partition, which
    # is acceptable for a handful of documents.
    .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
)

display(df_chunked)

In [0]:
# Persist the chunks as a Delta table, replacing whatever a previous run wrote.
chunk_table_name = f"{catalog}.{schema}.{delta_table_name}"
df_chunked.write.saveAsTable(chunk_table_name, mode="overwrite")

# A Vector Search index can only be built on a table with change data feed enabled.
enable_cdf_sql = f"ALTER TABLE {chunk_table_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
spark.sql(enable_cdf_sql)


In [0]:
# Instantiate the client in this cell so it does not depend on the optional
# endpoint-creation cell having been run; without this, `client` is undefined
# on a fresh kernel.
client = VectorSearchClient()

index_full_name = f"{catalog}.{schema}.{vector_search_index_name}"

try:
    # Reuse the index when it already exists.
    existing_index = client.get_index(index_name=index_full_name)
    print(f"Found existing index: {vector_search_index_name}")
except Exception:
    # The lookup failed, so create a delta-sync index on the chunk table.
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being treated as "index not found".
    print(f"Creating index: {vector_search_index_name}")
    index = client.create_delta_sync_index(
        endpoint_name=vector_search_endpoint_name,
        source_table_name=f"{catalog}.{schema}.{delta_table_name}",
        index_name=index_full_name,
        pipeline_type="TRIGGERED",
        primary_key="id",
        embedding_source_column="content",
        embedding_model_endpoint_name="databricks-gte-large-en"
    )