# DICOM Tags Table Creation and Vector Search Database

This notebook creates the `dicom_tags` table, builds a Vector Search index on top of it, registers the SQL function(s) that query that index, and creates a Genie space. The Genie space is configured with instructions covering how to retrieve DICOM tags, extract tag values, and accurately identify patient and scan data.

In [0]:
%pip install databricks-vectorsearch==0.64
# Restart the Python process after the install; presumably required so the
# newly installed package version is the one imported by later cells.
dbutils.library.restartPython()

In [0]:
%run ./config/proxy_prep

In [0]:
# Name of the Vector Search endpoint that later cells look up or create.
vs_endpoint = "pixels_vs_endpoint"

# Widgets first, environment second: both helpers come from the notebook
# pulled in via %run, and their relative order is kept as-is.
sql_warehouse_id, table = init_widgets(show_volume=False)
init_env()

# `table` is expected to be a three-part name: <catalog>.<schema>.<table>
catalog, schema, table_name = table.split(".")

# Create **dicom_tags** table.
This table will contain all the DICOM tags and their descriptions as defined in the DICOM standard.

In [0]:
import json

# Fully qualified table name. The same identifier is used for the write and for
# the ALTER TABLE below, so both statements always target the same table
# regardless of the session's current catalog.
dicom_tags_table = f"{catalog}.{schema}.dicom_tags"

# The resource file is newline-delimited JSON: one tag definition per line.
# Blank lines (e.g. a trailing newline at end of file) are skipped instead of
# failing in json.loads.
with open('dbx/pixels/resources/dicom_tags.ndjson', 'r', encoding='utf-8') as f:
    tags = [json.loads(line) for line in f if line.strip()]

tags_df = spark.createDataFrame(tags)

# Overwrite makes this cell safe to re-run.
tags_df.select("Tag","Name","Keyword","VR","VM","Retired").write.mode("overwrite").saveAsTable(dicom_tags_table)

# Enable Change Data Feed on the table; the delta-sync index created later in
# this notebook uses this table as its source.
spark.sql(f"ALTER TABLE {dicom_tags_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

# Create VectorSearch endpoint and vs index table
This step creates a Vector Search endpoint (if it does not already exist) and a delta-sync index over the `Name` column of the `dicom_tags` table. Together they enable efficient similarity search over DICOM tag embeddings, which powers advanced search and retrieval capabilities in your Genie space.

In [0]:
from databricks.vector_search.client import VectorSearchClient

# The following line automatically generates a PAT Token for authentication
client = VectorSearchClient()

# Reuse the endpoint when the lookup succeeds; otherwise try to create it.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate, so the cell can still be cancelled while the lookup runs.
try:
  client.get_endpoint(vs_endpoint)
except Exception:
  client.create_endpoint(
      name=vs_endpoint,
      endpoint_type="STANDARD"
  )

# Delta-sync index over dicom_tags: embeddings are computed from the "Name"
# column by the given model serving endpoint, and "Tag" is the primary key.
# TRIGGERED pipelines sync on request rather than continuously.
index = client.create_delta_sync_index(
  endpoint_name=vs_endpoint,
  source_table_name=f"{catalog}.{schema}.dicom_tags",
  index_name=f"{catalog}.{schema}.dicom_tags_vs",
  pipeline_type="TRIGGERED",
  primary_key="Tag",
  embedding_source_column="Name",
  embedding_model_endpoint_name="databricks-bge-large-en"
)


# Create VS Functions

In [0]:
import os
import os.path
from pathlib import Path
import dbx

path = Path(dbx.__file__).parent
sql_functions_path = "dbx/pixels/resources/genie/CREATE_VS_FUNCTION.sql"

# Load the SQL template that contains the {UC_SCHEMA} and {UC_DICOM_TAGS_VS}
# placeholders.
with open(sql_functions_path, "r") as f:
  sql_template = f.read()

# Substitute the placeholders with fully qualified Unity Catalog names,
# in the same order as before: schema first, then the index name.
sql_command = sql_template.replace("{UC_SCHEMA}", f"{catalog}.{schema}")
sql_command = sql_command.replace("{UC_DICOM_TAGS_VS}", f"{catalog}.{schema}.dicom_tags_vs")

spark.sql(sql_command)
print("SQL command executed")

# Create GENIE SPACE

In [0]:
import dbx
import os
import requests

# Strip a trailing slash from the host so the URL never contains "//api/...".
genie_api_endpoint = f"{os.environ['DATABRICKS_HOST'].rstrip('/')}/api/2.0/genie/spaces"
token = os.environ['DATABRICKS_TOKEN']

description = "A Genie space for exploring and querying DICOM pixel metadata, enabling natural language interaction and insights for medical imaging data."
parent_path = os.getcwd()+"/genie/"
serialized_space_file = "dbx/pixels/resources/genie/serialized_space.json"
title = "Pixels - Genie"

# Read the serialized space template and point it at the target schema.
# The file is closed before the HTTP call is made.
with open(serialized_space_file, "r") as f:
  serialized_space = f.read().replace("{UC_SCHEMA}", f"{catalog}.{schema}")

body = {
  "title": title,
  "description": description,
  "serialized_space": serialized_space,
  "warehouse_id": sql_warehouse_id,
  "parent_path": parent_path
}
dbutils.fs.mkdirs(parent_path)

# An explicit timeout keeps the notebook from hanging forever if the API
# never answers; requests applies no timeout by default.
resp = requests.post(
  genie_api_endpoint,
  headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
  json=body,
  timeout=120,
)
if resp.status_code != 200:
  raise Exception(f"Error creating Genie space: {resp.text}")
else:
  resp_body = resp.json()
  print(f"Genie space created, id: {resp_body['space_id']} title: {resp_body['title']}")