In [0]:
%pip install --upgrade --force-reinstall databricks-vectorsearch databricks-genai-inference
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Collecting databricks-vectorsearch
  Downloading databricks_vectorsearch-0.38-py3-none-any.whl (13 kB)
Collecting databricks-genai-inference
  Downloading databricks_genai_inference-0.2.3-py3-none-any.whl (17 kB)
Collecting requests>=2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 3.7 MB/s eta 0:00:00
Collecting mlflow-skinny<3,>=2.11.3
  Downloading mlflow_skinny-2.13.2-py3-none-any.whl (5.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.3/5.3 MB 32.0 MB/s eta 0:00:00
Collecting protobuf<5,>=3.12.0
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 294.6/294.6 kB 39.5 MB/s eta 0:00:00
Collecting deprecation>=2
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting httpx<1,>=0.23.0
  Downloading httpx-

In [0]:
# Catalog, schema and table name of the Delta table that feeds the vector index.
CATALOG, DB, SOURCE_TABLE_NAME = "workspace", "vs_demo", "resources"

# Fully qualified three-part name: <catalog>.<schema>.<table>
SOURCE_TABLE_FULLNAME = ".".join((CATALOG, DB, SOURCE_TABLE_NAME))

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType
# Make sure the target schema exists before creating the table inside it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{DB}")

# DDL for the source table (id, url, content); the table property switches on
# Delta's change data feed for it.
create_source_table_ddl = f"""CREATE TABLE IF NOT EXISTS {SOURCE_TABLE_FULLNAME} (
        id STRING,
        url STRING,
        content STRING
    )
    USING delta 
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
"""
spark.sql(create_source_table_ddl)

DataFrame[]

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Vector Search client built with default arguments; the endpoint and index
# calls in the rest of the notebook all go through this object.
vsc = VectorSearchClient()

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True to VectorSearchClient().


In [0]:
VS_ENDPOINT_NAME = 'vs_endpoint'

# Fetch the endpoint list once so the existence check costs a single API call,
# and treat a missing or null 'endpoints' entry as "no endpoints yet".
existing_endpoint_names = {
    endpoint.get('name')
    for endpoint in (vsc.list_endpoints().get('endpoints') or [])
}

if VS_ENDPOINT_NAME not in existing_endpoint_names:
    print(f"Creating new Vector Search endpoint named {VS_ENDPOINT_NAME}")
    vsc.create_endpoint(VS_ENDPOINT_NAME)
else:
    print(f"Endpoint {VS_ENDPOINT_NAME} already exists.")

# Wait for the endpoint before any index is created on it.
# NOTE(review): 600 is passed positionally; confirm against the installed client
# which parameter it binds to (it may not be a timeout).
vsc.wait_for_endpoint(VS_ENDPOINT_NAME, 600)

Endpoint vs_endpoint already exists.
Endpoint vs_endpoint is ONLINE.


In [0]:
VS_INDEX_NAME = 'fm_api_resources_vs_index_2'
VS_INDEX_FULLNAME = f"{CATALOG}.{DB}.{VS_INDEX_NAME}"


def _index_exists(client, endpoint_name, index_fullname):
    """Return True if `index_fullname` is listed among the endpoint's indexes.

    Entries returned by `list_indexes` are matched on their "name" field; a
    missing or null 'vector_indexes' entry is treated as an empty listing.
    """
    listed = client.list_indexes(endpoint_name).get('vector_indexes') or []
    return any(entry.get("name") == index_fullname for entry in listed)


if not _index_exists(vsc, VS_ENDPOINT_NAME, VS_INDEX_FULLNAME):
    try:
        # set up an index with managed embeddings
        print("Creating Vector Index...")
        vsc.create_delta_sync_index_and_wait(
            endpoint_name=VS_ENDPOINT_NAME,
            index_name=VS_INDEX_FULLNAME,
            source_table_name=SOURCE_TABLE_FULLNAME,
            pipeline_type="TRIGGERED",
            primary_key="id",
            embedding_source_column="content",
            embedding_model_endpoint_name="databricks-bge-large-en"
        )
    except Exception as e:
        # An INTERNAL_ERROR is tolerated only if the index turns out to exist
        # afterwards; every other failure is re-raised with its original traceback.
        if "INTERNAL_ERROR" in str(e) and _index_exists(vsc, VS_ENDPOINT_NAME, VS_INDEX_FULLNAME):
            print(f"Index {VS_INDEX_FULLNAME} has been created.")
        else:
            raise
else:
    print(f"Index {VS_INDEX_FULLNAME} already exists.")

Creating Vector Index...


In [0]:
import json

# Path to the JSON file holding the source documents
json_file_path = "./website_contents.json"

# Read the file as UTF-8 explicitly so non-ASCII text does not depend on the
# platform's default encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    json_content = file.read()

# Decode only the first JSON value in the file: raw_decode tolerates extra data
# after that value (the end offset it also returns is discarded).
decoder = json.JSONDecoder()
json_list, _ = decoder.raw_decode(json_content)

# The chunking cell iterates over this and reads each item's "url" and "content".
resources = json_list

In [0]:
import re

def chunk_text(text, chunk_size, overlap):
    """Split `text` into overlapping, sentence-aligned chunks of words.

    The text is split on whitespace. Each chunk starts at a word offset and
    covers `chunk_size + 1` words, then keeps growing until it ends on a word
    that finishes with '.', '!' or '?' (or until the text runs out).
    Consecutive chunks start `chunk_size - overlap` words apart.

    Args:
        text: The text to split.
        chunk_size: Number of words after the chunk's first word that are
            always taken before looking for a sentence end.
        overlap: How many words the next chunk's start is pulled back relative
            to `chunk_size`.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If `overlap >= chunk_size`, because the start offset would
            never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    sentence_endings = ('.', '!', '?')
    chunks = []
    index = 0

    while index < len(words):
        end = index + chunk_size
        # Extend the chunk to the next word that closes a sentence.
        while end < len(words) and not words[end].endswith(sentence_endings):
            end += 1
        chunks.append(' '.join(words[index:end + 1]))

        # Once a chunk has reached the final word, any further chunk would only
        # be a suffix of this one, so stop instead of emitting duplicates.
        if end + 1 >= len(words):
            break
        index += step

    return chunks

# Flatten every document into chunk records (content, url, id) for the source table.
chunks = []

for document in resources:
    chunks.extend(
        {
            "content": piece,
            "url": document["url"],
            # id = the document's url plus the chunk's position within that document
            "id": document["url"] + "_" + str(position),
        }
        for position, piece in enumerate(chunk_text(document["content"], 150, 25))
    )

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, DateType

schema = StructType(
    [
        StructField("id", StringType(), True),
        StructField("content", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

if chunks:
    # Keep one row per chunk id: the vector index uses `id` as its primary key,
    # and a MERGE must not see several source rows for the same target row.
    result_df = spark.createDataFrame(chunks, schema=schema).dropDuplicates(["id"])

    # Upsert instead of a blind append so that re-running the notebook refreshes
    # existing chunks rather than inserting a second copy of every row.
    result_df.createOrReplaceTempView("resource_chunks_staging")
    spark.sql(
        f"""MERGE INTO {SOURCE_TABLE_FULLNAME} AS target
        USING resource_chunks_staging AS source
        ON target.id = source.id
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
        """
    )

In [0]:
# Handle to the vector index on the endpoint; later cells query through it.
index = vsc.get_index(index_name=VS_INDEX_FULLNAME, endpoint_name=VS_ENDPOINT_NAME)

# Trigger a sync of the index.
# NOTE(review): the similarity search recorded right after this call returned
# 0 rows; presumably the sync had not finished yet - confirm, and wait for the
# sync to complete before querying if so.
index.sync()

{}

In [0]:
# Sample question used for the retrieval and chat cells that follow.
user_query = (
    "I'm a women survivor of domestic violence living in the San Francisco bay area, I'm looking for help with housing and legal help for a resctiction order."
)

# Ask the index for the 5 closest chunks, returning their content and url.
index.similarity_search(
    query_text=user_query,
    columns=["content", "url"],
    num_results=5,
)

{'manifest': {'column_count': 3,
  'columns': [{'name': 'content'}, {'name': 'url'}, {'name': 'score'}]},
 'result': {'row_count': 0},
 'debug_info': {'response_time': 243.0,
  'ann_time': 31.0,
  'embedding_gen_time': 109.0}}

In [0]:
from databricks_genai_inference import ChatSession

# Baseline session: generic system message, replies capped at 256 tokens.
chat = ChatSession(
    max_tokens=256,
    system_message="You are a helpful assistant.",
    model="databricks-meta-llama-3-70b-instruct",
)

In [0]:
# Send the bare question (no retrieved context) to the session, then print
# what the session holds as its last message.
chat.reply(user_query)
print(chat.last)

I'm so sorry to hear that you're going through this. You are not alone, and there are people who care and want to help. As a survivor of domestic violence, you deserve a safe and supportive environment.

For housing assistance, I highly recommend reaching out to:

1. **La Casa de las Madres**: A San Francisco-based organization that provides emergency shelter, transitional housing, and supportive services for survivors of domestic violence. They have a 24-hour crisis line: (877) 503-1850. You can also visit their website at [www.lacasa.org](http://www.lacasa.org).

2. **The Riley Center**: A part of the St. Vincent de Paul Society of San Francisco, this organization offers emergency shelter, transitional housing, and permanent supportive housing for survivors of domestic violence. They can be reached at (415) 435-4722. You can find more information on their website at [www.rileycenter.org](http://www.rileycenter.org).

3. **WOMAN, Inc.**: A community-based organization that provides a 

In [0]:
# System prompt for the retrieval-augmented assistant.
prompt = "You are a helpful assistant. Answer the user's question based on the provided context in the resources. The chatbot is targeted for women immigrants seaking for help in the San Francisco area. You should be empathetic and use the resources in the table to help the users. If the user makes the question in spanish, answer back in spanish. Focus on providing phone numbers and links. You should deliver the answer complete and provide different options where the users can obtain the help from with the name of the place, a short explanation, a link and the phone to contact them. Always finish the sentences and end the message with a warm quote showing belonging to a community and support. When possible give me up to 3 recommendations. "

chat = ChatSession(model="databricks-meta-llama-3-70b-instruct",
                   system_message=prompt,
                   max_tokens=512)

# get context from vector search
raw_context = index.similarity_search(columns=["content", "url"],
                        query_text= user_query,
                        num_results = 5)

# A search with no hits comes back with only a row count and no 'data_array',
# so fall back to an empty list instead of iterating over None.
retrieved_rows = (raw_context.get('result') or {}).get('data_array') or []

# Each row follows the requested column order, so row[0] is the chunk content.
context_string = "Context:\n\n" + "".join(
    f"Retrieved context {position}:\n{row[0]}\n\n"
    for position, row in enumerate(retrieved_rows, start=1)
)

# context_string already carries its own "Context:" header, so it is appended
# as-is rather than being labelled a second time.
chat.reply(f"User question: {user_query}\n\n{context_string}")
print(chat.last)

I'm so sorry to hear that you're going through this difficult time. As a survivor of domestic violence, it's essential to prioritize your safety and well-being. I'm here to help you navigate the resources available to you in the San Francisco Bay Area.

For housing assistance, I recommend reaching out to Bay Area Legal Aid, which provides legal assistance to survivors of domestic violence. They have offices in several counties, including San Francisco, San Mateo, and Alameda. You can contact them at (800) 551-5554 or visit their website to find the office nearest you.

Additionally, you may want to consider reaching out to local domestic violence organizations that provide housing assistance, such as:

1. La Casa de Las Madres (San Francisco): (877) 503-1850 - They offer emergency shelter, transitional housing, and permanent housing options.
2. CORA (San Mateo): (800) 300-1080 - They provide emergency shelter, transitional housing, and supportive services.
3. Next Door Solutions to Dom

In [0]:

# Same retrieval-augmented flow with a Spanish question and a larger reply budget.
user_query = "Soy una mujer sobreviviente de violencia domestica en el area de la bahia de san francisco y busco ayuda legal para proteger mi vida y la de mis hijos. Tabmbien necesito ayuda consiguiendo vivienda. "

chat = ChatSession(model="databricks-meta-llama-3-70b-instruct",
                   system_message=prompt,
                   max_tokens=1024)

# get context from vector search
raw_context = index.similarity_search(columns=["content", "url"],
                        query_text= user_query,
                        num_results = 5)

# A search with no hits comes back with only a row count and no 'data_array',
# so fall back to an empty list instead of iterating over None.
retrieved_rows = (raw_context.get('result') or {}).get('data_array') or []

# Each row follows the requested column order, so row[0] is the chunk content.
context_string = "Context:\n\n" + "".join(
    f"Retrieved context {position}:\n{row[0]}\n\n"
    for position, row in enumerate(retrieved_rows, start=1)
)

# context_string already carries its own "Context:" header, so it is appended
# as-is rather than being labelled a second time.
chat.reply(f"User question: {user_query}\n\n{context_string}")
print(chat.last)

Estimada sobreviviente de violencia doméstica,

Me duele saber que has pasado por una experiencia tan dolorosa. Pero quiero asegurarte que hay ayuda disponible para ti y tus hijos. Aquí te dejo algunas opciones de recursos legales y de vivienda que pueden ayudarte a proteger tus derechos y encontrar un lugar seguro para vivir.

**Recursos legales**

1. Bay Area Legal Aid (BAYLEGAL): Ofrece servicios de asesoramiento y representación legal en casos de violencia doméstica. Puedes contactarlos al (800) 551-5554 o visitar su sitio web [https://baylegal.org/](https://baylegal.org/).
2. Legal Aid at Work: Brinda asesoramiento y representación legal en casos de violencia doméstica y discriminación laboral. Puedes contactarlos al (415) 593-8000 o visitar su sitio web [https://legalaidatwork.org/](https://legalaidatwork.org/).
3. Family Violence Law Center: Ofrece servicios de asesoramiento y representación legal en casos de violencia doméstica y abuso. Puedes contactarlos al (510) 208-0220 o v