In [0]:
%pip install langchain langchain-community pypdf
dbutils.library.restartPython()

In [0]:
# Upgrade langchain and langchain-community to their latest released versions.
# NOTE(review): the preceding cell already installs both packages without
# version pins, so this upgrade looks redundant — confirm before removing.
%pip install --upgrade langchain langchain-community

In [0]:
import os
from pyspark.sql.types import StructType, StructField, StringType
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Configuration: Unity Catalog coordinates of the PDF volume and target table ---
CATALOG = "rag_demo_catalog"
SCHEMA = "default1"
VOLUME_NAME = "default"
TABLE_NAME = "rag_knowledge_base"

VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME_NAME}/"
full_table_name = f"{CATALOG}.{SCHEMA}.{TABLE_NAME}"

# --- Discover the PDF files stored directly in the volume ---
pdf_names = [name for name in os.listdir(VOLUME_PATH) if name.endswith('.pdf')]
files = [os.path.join(VOLUME_PATH, name) for name in pdf_names]

# --- Split every PDF into overlapping text chunks ---
# The splitter tries paragraph breaks first, then line breaks, then spaces,
# and finally falls back to splitting between characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)

all_chunks = []
for file_path in files:
    file_name = file_path.split("/")[-1]
    docs = PyPDFLoader(file_path).load()
    chunks = text_splitter.split_documents(docs)

    # One record per chunk: the chunk text plus the name of the file it came from.
    all_chunks.extend(
        {"page_content": chunk.page_content, "source_file": file_name}
        for chunk in chunks
    )
    print(f" Опрацьовано файл: {file_name} (Чанків: {len(chunks)})")

print(f"\nВсього  чанків: {len(all_chunks)}")

# --- Persist the chunks as a Delta table ---
# Both columns are nullable strings.
schema = StructType(
    [StructField(column, StringType(), True) for column in ("page_content", "source_file")]
)
df = spark.createDataFrame(all_chunks, schema)

# Overwrite the table on every run; the change-data-feed option is passed to the writer.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer = delta_writer.option("delta.enableChangeDataFeed", "true")
delta_writer.saveAsTable(full_table_name)

print(f"\nТаблицю  створено: {full_table_name}")
display(df)

In [0]:
from pyspark.sql import functions as F

# Fully qualified name of the knowledge-base table that is read and rewritten here.
table_name = "rag_demo_catalog.default1.rag_knowledge_base"

# Load the current table contents and attach an `id` column generated by
# F.monotonically_increasing_id(); the vector index cell uses `id` as primary key.
df = spark.table(table_name)
df_with_id = df.withColumn("id", F.monotonically_increasing_id())

# Rewrite the same table with the extended schema (overwriteSchema permits the
# new column) and pass the change-data-feed option to the writer again.
id_writer = df_with_id.write.format("delta").mode("overwrite")
id_writer = id_writer.option("overwriteSchema", "true")
id_writer = id_writer.option("delta.enableChangeDataFeed", "true")
id_writer.saveAsTable(table_name)

display(spark.table(table_name))

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Create the Vector Search client with no arguments so that it authenticates
# with the notebook's own credentials. (Passing the undefined name `f` here
# raises NameError on a fresh kernel.)
vsc = VectorSearchClient()

# Names of the serving endpoint, the index to create, and the Delta table it syncs from.
ENDPOINT_NAME = "rag_demo_endpoint"
INDEX_NAME = "rag_demo_catalog.default1.rag_knowledge_index"
SOURCE_TABLE = "rag_demo_catalog.default1.rag_knowledge_base"

# Delta Sync index over `page_content`, keyed by the `id` column, with embeddings
# computed by the named model endpoint. TRIGGERED means the index is refreshed
# on demand rather than continuously.
# NOTE(review): this call is not guarded against the index already existing —
# confirm how re-runs of this cell should behave.
vsc.create_delta_sync_index(
    endpoint_name=ENDPOINT_NAME,
    source_table_name=SOURCE_TABLE,
    index_name=INDEX_NAME,
    pipeline_type="TRIGGERED",
    primary_key="id",
    embedding_source_column="page_content",
    embedding_model_endpoint_name="databricks-bge-large-en",
)

In [0]:
# Install the Databricks Vector Search client and restart the Python process.
# NOTE(review): `databricks.vector_search` is imported by an earlier cell, so on
# a top-to-bottom run this install happens after its first use — confirm the
# intended cell order.
%pip install databricks-vectorsearch
dbutils.library.restartPython()

In [0]:
# Install langchain-databricks, databricks-vectorsearch, langchain-community and
# mlflow into the notebook environment, then restart the Python process.
# NOTE(review): databricks-vectorsearch is also installed by the preceding cell,
# and the same packages are listed in the generated requirements.txt —
# presumably this is for trying the app code from the notebook; confirm.
%pip install langchain-databricks databricks-vectorsearch langchain-community mlflow
dbutils.library.restartPython()

In [0]:
# Install Streamlit into the notebook environment and restart the Python process.
# NOTE(review): no notebook cell imports streamlit directly (only the generated
# app.py does, and requirements.txt lists it) — confirm this install is needed.
%pip install streamlit
dbutils.library.restartPython()

In [0]:
# Imported here because the preceding cells restart the Python process,
# which discards earlier imports.
import os

# Workspace folder that will hold the Streamlit app's source files.
project_path = "/Workspace/Users/koval.pn@ucu.edu.ua/rag_chatbot_app"

# Create the folder through the local filesystem API. The later cells write
# into this path with plain open(), whereas dbutils.fs.mkdirs() on a path
# without a scheme targets DBFS rather than the workspace file tree.
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(project_path, exist_ok=True)

print(project_path)

In [0]:
# Source code of the Streamlit chat app, kept as a string and written to app.py
# below. Backslashes are doubled (e.g. "\\n") so that the generated file
# contains the escape sequence itself rather than a literal line break.
app_py_content = """
import streamlit as st
import os
from databricks.sdk import WorkspaceClient
from databricks.vector_search.client import VectorSearchClient
from langchain_databricks import ChatDatabricks
from langchain_community.vectorstores import DatabricksVectorSearch
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

ENDPOINT_NAME = "rag_demo_endpoint"
INDEX_NAME = "rag_demo_catalog.default1.rag_knowledge_index"

st.set_page_config(page_title="Databricks RAG Chatbot", layout="wide")

st.title("RAG Chatbot")

# --- ФУНКЦІЇ RAG ---
@st.cache_resource
def get_retriever():
    try:
        w = WorkspaceClient()
        vsc = VectorSearchClient(
            workspace_url=w.config.host,
            personal_access_token=w.config.token,
            disable_notice=True
        )
    except Exception as e1:
        host = os.getenv("DATABRICKS_HOST")
        token = os.getenv("DATABRICKS_TOKEN")
        
        if not host or not token:
            raise ValueError("DATABRICKS_HOST та DATABRICKS_TOKEN не знайдено в оточенні")
        
        vsc = VectorSearchClient(
            workspace_url=host,
            personal_access_token=token,
            disable_notice=True
        )
    
    index = vsc.get_index(endpoint_name=ENDPOINT_NAME, index_name=INDEX_NAME)
    
    vectorstore = DatabricksVectorSearch(
        index, 
        text_column="page_content", 
        columns=["source_file"]
    )
    return vectorstore.as_retriever(search_kwargs={"k": 3})

@st.cache_resource
def get_llm():
    try:
        return ChatDatabricks(
            endpoint="databricks-meta-llama-3-3-70b-instruct",
            temperature=0.1
        )
    except Exception as e:
        st.error(f"Помилка підключення до LLM: {e}")
        return None

def format_docs(docs):
    return "\\n\\n".join([f"[Джерело: {d.metadata.get('source_file', 'Unknown')}]\\n{d.page_content}" for d in docs])

retriever = get_retriever()
llm = get_llm()

if retriever is None or llm is None:
    st.stop()

if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Write your question"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""
        
        template = '''You are a helpful assistant for Databricks. Answer in English.
Use the following pieces of retrieved context to answer the question. 
If the answer is not in the context, say "I cannot find the answer in the documents".

Контекст:
{context}

Питання: {question}
'''
        prompt_template = ChatPromptTemplate.from_template(template)
        
        chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
            | StrOutputParser()
        )
        
        try:
            for chunk in chain.stream(prompt):
                full_response += chunk
                message_placeholder.markdown(full_response + "▌")
            
            message_placeholder.markdown(full_response)
            st.session_state.messages.append({"role": "assistant", "content": full_response})
            
        except Exception as e:
            st.error(f"Помилка генерації відповіді: {e}")
"""

# Write the app source with an explicit UTF-8 encoding: the text contains
# Cyrillic strings and the "▌" cursor glyph, so relying on the locale's default
# encoding could fail or corrupt the file.
with open("/Workspace/Users/koval.pn@ucu.edu.ua/rag_chatbot_app/app.py", "w", encoding="utf-8") as f:
    f.write(app_py_content)

In [0]:
import os

# Read the workspace API URL and the current API token from the notebook context.
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
host = ctx.apiUrl().get()
token = ctx.apiToken().get()

# Shell command executed by the app container: export the two variables that
# app.py reads as its fallback credentials, then start Streamlit on the port
# given by DATABRICKS_APP_PORT (expanded by the shell, not by Python).
# NOTE(review): this embeds the notebook's API token in plain text in app.yaml
# inside the workspace folder — confirm this is acceptable, and whether the
# token stays valid for as long as the app is expected to run.
cmd = " && ".join([
    f"export DATABRICKS_HOST='{host}'",
    f"export DATABRICKS_TOKEN='{token}'",
    "streamlit run app.py --server.port=${DATABRICKS_APP_PORT} --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false",
])

# app.yaml runs the command through `/bin/sh -c`. The leading and trailing
# empty entries reproduce the blank first line and the final newline of the file.
app_yaml_content = "\n".join([
    "",
    "command:",
    '  - "/bin/sh"',
    '  - "-c"',
    f'  - "{cmd}"',
    "",
])

with open("/Workspace/Users/koval.pn@ucu.edu.ua/rag_chatbot_app/app.yaml", "w") as f:
    f.write(app_yaml_content)

In [0]:
# Minimum versions of the packages required by the generated Streamlit app.
# The empty first and last entries reproduce the leading blank line and the
# trailing newline of the file.
requirements_content = "\n".join([
    "",
    "streamlit>=1.31.0",
    "databricks-vectorsearch>=0.22",
    "databricks-sdk>=0.12.0",
    "langchain>=0.1.0",
    "langchain-databricks>=0.1.0",
    "langchain-community>=0.0.20",
    "mlflow>=2.9.0",
    "",
])

# Write requirements.txt into the app folder, next to app.py and app.yaml.
with open("/Workspace/Users/koval.pn@ucu.edu.ua/rag_chatbot_app/requirements.txt", "w") as f:
    f.write(requirements_content)


In [0]:
import requests
import json

# The notebook context supplies the workspace URL and an API token for the REST call.
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
token = ctx.apiToken().get()
host = ctx.apiUrl().get()

app_name = "rag-chatbot"

# Deployments endpoint of the named app.
deploy_url = f"{host}/api/2.0/apps/{app_name}/deployments"

# The deployment takes its source from the workspace folder populated by the
# earlier cells (app.py, app.yaml, requirements.txt).
deploy_config = {
    "source_code_path": "/Workspace/Users/koval.pn@ucu.edu.ua/rag_chatbot_app"
}

headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
deploy_response = requests.post(deploy_url, headers=headers, json=deploy_config, timeout=60)

# Fail loudly instead of printing the app URL after a rejected deployment;
# the response body is included because it carries the API's error message.
if not deploy_response.ok:
    raise RuntimeError(
        f"Deployment of '{app_name}' failed: "
        f"HTTP {deploy_response.status_code} {deploy_response.text}"
    )

print(f"{host}/apps/{app_name}")
