In [0]:
%pip install databricks-vectorsearch
# Restart the Python process after the install above so that later cells
# import the freshly installed package.
dbutils.library.restartPython()

In [0]:
import re
from pyspark.sql import SparkSession
from databricks.vector_search.client import VectorSearchClient

### Update this Section

In [0]:
# Unity Catalog namespace used by the cells below when they create tables,
# functions, and the vector search index. Set both to YOUR catalog and schema.
catalog = 'users'
schema = 'david_hurley'

# Workspace folder that holds the cloned artifacts.
# TODO: This could instead be added to a Managed Volume and subsequent code repointed
path_to_artifacts = (
    "/Workspace/Users/hurleyldave2@gmail.com"
    "/databricks-demos/turnaround-agent/artifacts"
)

### Create Synthetic Table Data

- Create the relation between plants, equipment, and the sensors that monitor each piece of equipment
- Create fake sensor data and alarm thresholds for sensors

In [0]:
plant_equipment_relations_table_name = "plant_equipment_relations"

# One row per (plant, equipment) pair together with the list of sensor ids for it.
plant_equipment_columns = ["plantId", "equipmentName", "sensorIds"]
plant_equipment_rows = [
    ("A", "Heat Exchanger 1", [1, 2]),
    ("A", "Heat Exchanger 2", [3]),
    ("A", "Tube Bundle 1", [5]),
    ("B", "Heat Exchanger 1", [7, 8]),
]

# Three-level Unity Catalog name of the target table.
plant_equipment_table = f"{catalog}.{schema}.{plant_equipment_relations_table_name}"

# Remove any earlier copy of the table before writing the new one.
spark.sql(f"DROP TABLE IF EXISTS {plant_equipment_table}")

df = spark.createDataFrame(plant_equipment_rows, plant_equipment_columns)
df.write.mode("overwrite").saveAsTable(plant_equipment_table)

In [0]:
sensor_temperature_table_name = "sensor_temperature"

# Fake temperature readings, grouped by the sensor that produced them.
readings_by_sensor = {
    1: [32.5, 33.1, 37.9],
    2: [25.2, 26.5, 30.1],
    3: [41.5, 27.2, 30.1],
    5: [75.2, 76.1, 99.1],
    7: [10.0, 15.0, 24.4],
    8: [54.6, 56.1, 52.3],
}

# Flatten into (sensorId, temperature) rows; dict order keeps the rows in the
# same sequence as they are listed above.
temperature_rows = [
    (sensor, reading)
    for sensor, readings in readings_by_sensor.items()
    for reading in readings
]

# Three-level Unity Catalog name of the target table.
sensor_temperature_table = f"{catalog}.{schema}.{sensor_temperature_table_name}"

# Remove any earlier copy of the table before writing the new one.
spark.sql(f"DROP TABLE IF EXISTS {sensor_temperature_table}")

df = spark.createDataFrame(temperature_rows, ["sensorId", "temperature"])
df.write.mode("overwrite").saveAsTable(sensor_temperature_table)


In [0]:
sensor_alarm_threshold_temperature_table_name = "sensor_alarm_threshold_temperature"

# Alarm threshold temperature keyed by sensor id.
alarm_threshold_by_sensor = {
    1: 35,
    2: 28,
    3: 45,
    5: 100,
    7: 22,
    8: 64,
}

# (sensorId, alarmThresholdTemperature) rows in the order listed above.
alarm_threshold_rows = list(alarm_threshold_by_sensor.items())

# Three-level Unity Catalog name of the target table.
alarm_threshold_table = (
    f"{catalog}.{schema}.{sensor_alarm_threshold_temperature_table_name}"
)

# Remove any earlier copy of the table before writing the new one.
spark.sql(f"DROP TABLE IF EXISTS {alarm_threshold_table}")

df = spark.createDataFrame(
    alarm_threshold_rows,
    ["sensorId", "alarmThresholdTemperature"],
)
df.write.mode("overwrite").saveAsTable(alarm_threshold_table)

### Create Unity Catalog Functions
- Get relationship between plant, equipment, and sensors
- Get temperature data for a sensor
- Get alarm settings for a sensor

In [0]:
# Unity Catalog table function: looks up rows of the plant/equipment relations
# table by plant id (exact match) and equipment name (ILIKE match).
create_plant_equipment_fn_sql = f"""
CREATE OR REPLACE FUNCTION {catalog}.{schema}.get_plant_equipment_relationship(
  plant_id STRING COMMENT 'Id of the plant to lookup. Example: "A"',
  equipment_name STRING COMMENT 'Name of the equipment to lookup. Example: "Heat Exchanger 1"'
)
RETURNS TABLE
COMMENT 'Returns the relation of plant to equipment to sensors'
RETURN
  SELECT *
  FROM {catalog}.{schema}.{plant_equipment_relations_table_name} 
  WHERE plantId = plant_id
    AND equipmentName ILIKE equipment_name;
"""

spark.sql(create_plant_equipment_fn_sql)


In [0]:
# Unity Catalog table function: returns every row of the sensor temperature
# table whose sensorId equals the given sensor_id.
create_sensor_temperatures_fn_sql = f"""
CREATE OR REPLACE FUNCTION {catalog}.{schema}.get_sensor_temperatures(
  sensor_id INT COMMENT 'Id of the sensor to lookup. Example: 1'
)
RETURNS TABLE
COMMENT 'Returns the temperature data for a given sensor'
RETURN SELECT * 
  FROM {catalog}.{schema}.{sensor_temperature_table_name} 
  WHERE sensorId = sensor_id;
"""

spark.sql(create_sensor_temperatures_fn_sql)

In [0]:
# Unity Catalog table function: returns the alarmThresholdTemperature column
# for the rows whose sensorId equals the given sensor_id.
create_alarm_threshold_fn_sql = f"""
CREATE OR REPLACE FUNCTION {catalog}.{schema}.get_sensor_alarm_threshold_temperature(
  sensor_id INT COMMENT 'Id of the sensor to lookup; Example: 1'
)
RETURNS TABLE
COMMENT 'Returns the alarm threshold temperature for a given sensor'
RETURN SELECT alarmThresholdTemperature
  FROM {catalog}.{schema}.{sensor_alarm_threshold_temperature_table_name} 
  WHERE sensorId = sensor_id;
"""

spark.sql(create_alarm_threshold_fn_sql)

### Create Vector Search Database
- Create table of markdown text and metadata
- Create vector search endpoint
- Create vector search index

In [0]:
inspection_report_table_name = "turnaround_inspection_reports"

# TODO: This should instead be dynamic and point to a Volume
file_names = [
    "inspection_report_plantA_heat_exchanger_1.md",
    "inspection_report_plantA_heat_exchanger_2.md",
    "inspection_report_plantA_tube_bundle_1.md",
    "inspection_report_plantB_heat_exchanger_1.md"
]

# One regex per metadata column; each captures the rest of the line that
# follows the bold markdown label.
metadata_patterns = {
    "plantId": r"\*\*Plant ID\*\*: (.+)",
    "equipmentName": r"\*\*Equipment\*\*: (.+)",
    "equipmentId": r"\*\*Equipment ID\*\*: (.+)",
}


def _extract_report_metadata(markdown_text, source_name):
    """Pull the plant/equipment metadata fields out of one report's markdown.

    Args:
        markdown_text: Full text of the inspection report.
        source_name: File name of the report, used only in error messages.

    Returns:
        Dict mapping each key of ``metadata_patterns`` to the stripped value
        captured from the report.

    Raises:
        ValueError: If any metadata line is missing from the report.
    """
    metadata = {}
    for field, pattern in metadata_patterns.items():
        match = re.search(pattern, markdown_text)
        if match is None:
            # Fail with the file and field named instead of an opaque
            # AttributeError on None.
            raise ValueError(
                f"Inspection report '{source_name}' has no line matching "
                f"{pattern!r} (needed for column '{field}')"
            )
        metadata[field] = match.group(1).strip()
    return metadata


# Collect every report first so the table is written once, not once per file.
inspection_report_rows = []
for idx, file_name in enumerate(file_names):
    with open(f"{path_to_artifacts}/{file_name}", "r", encoding="utf-8") as f:
        text = f.read()

    metadata = _extract_report_metadata(text, file_name)

    # idx is the unique id used as the vector search primary key
    inspection_report_rows.append((
        idx,
        metadata["plantId"],
        metadata["equipmentName"],
        metadata["equipmentId"],
        text,
    ))

columns = ["id", "plantId", "equipmentName", "equipmentId", "markdown"]

inspection_reports_df = spark.createDataFrame(inspection_report_rows, columns)

# Overwrite (not append) so re-running this cell does not insert the same ids
# again; the `id` column must stay unique to serve as the index primary key.
inspection_reports_df.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.{inspection_report_table_name}")

# needed for vector search index
spark.sql(f"ALTER TABLE `{catalog}`.`{schema}`.`{inspection_report_table_name}` SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

In [0]:
endpoint_name = "turnaround_vector_search_endpoint"

client = VectorSearchClient()

# Names of the vector search endpoints that already exist in this workspace.
existing_endpoint_names = {
    endpoint["name"]
    for endpoint in client.list_endpoints().get("endpoints", [])
}

# Only create the endpoint when it is missing, so that re-running the notebook
# does not fail on an endpoint that already exists.
if endpoint_name not in existing_endpoint_names:
    client.create_endpoint(
        name=endpoint_name,
        endpoint_type="STANDARD"
    )

In [0]:
vector_search_index_name = "turnaround_inspection_reports_vs"

# Three-level Unity Catalog names of the source Delta table and of the index.
inspection_report_source_table = f"{catalog}.{schema}.{inspection_report_table_name}"
inspection_report_index = f"{catalog}.{schema}.{vector_search_index_name}"

# Delta sync index over the inspection report table: embeddings are computed
# from the `markdown` column by the named model endpoint, with `id` as the key.
index = client.create_delta_sync_index(
    endpoint_name=endpoint_name,
    index_name=inspection_report_index,
    source_table_name=inspection_report_source_table,
    primary_key="id",
    pipeline_type="TRIGGERED",
    embedding_source_column="markdown",
    embedding_model_endpoint_name="databricks-gte-large-en",
)