### Create Kusto table
.create table bookEmbeddings (document_name:string, content:string, embedding:dynamic)


In [25]:
%pip install openai==1.12.0 azure-kusto-data langchain tenacity langchain-openai pypdf


StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 36, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [None]:
from openai import AzureOpenAI
from IPython.display import display, HTML
import os
import textwrap
import json 
from notebookutils import mssparkutils
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table

from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

# --- Azure OpenAI settings ---------------------------------------------------
# Deployment names of the chat model and of the embedding model.
OPENAI_GPT4_DEPLOYMENT_NAME = "gpt-4"
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = "text-embedding-ada-002"

# The endpoint and key are read from the environment when available, so real
# credentials do not have to be typed into (and saved with) the notebook.
# If a variable is not set, the placeholder is kept and must be replaced by hand.
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "<your-azure openai endpoint>")
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "<your-azure openai api key>")

# --- Eventhouse (Kusto) settings ---------------------------------------------
KUSTO_URI = "<your-kusto-uri>"
KUSTO_DATABASE = "GenAI_eventhouse"
KUSTO_TABLE = "bookEmbeddings"
# Token for the Kusto cluster, requested once here and reused by the Spark
# read and write calls in this notebook.
accessToken = mssparkutils.credentials.getToken(KUSTO_URI)

In [None]:
# One Azure OpenAI client, configured from the constants defined above, is
# shared by the embedding and chat-completion calls.
client = AzureOpenAI(
    api_version="2023-09-01-preview",
    api_key=OPENAI_API_KEY,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)

# tenacity retries the embedding call with a randomized, exponentially growing
# wait between attempts, so throttling by the OpenAI service does not abort the run
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def generate_embeddings(text):
    """Return the embedding of ``text`` from the configured embedding deployment.

    Every newline in ``text`` is replaced by a single space before the request
    is sent (newlines can negatively affect embedding performance). The text
    is submitted as a one-element batch and the embedding of that single
    element is returned.

    The call is attempted up to 6 times, as configured by the ``retry``
    decorator above.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(
        model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
        input=[flattened],
    )
    first_item = response.data[0]
    return first_item.embedding



## Create embeddings for the data

In [None]:
# Chunking parameters, both measured in characters: the maximum chunk length
# and how much neighbouring chunks overlap.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 30

# No separators are passed, so the splitter uses its defaults
# (["\n\n", "\n", " ", ""]).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Label stored with every chunk in the `document_name` column.
documentName = "moby dick book"
# Copy File API path
fileName = "/lakehouse/default/Files/moby dick.pdf"
loader = PyPDFLoader(fileName)
# The splitter is applied while loading, so every item in `pages` is one chunk
# of text (the variable name is kept because later cells iterate over it).
pages = loader.load_and_split(text_splitter=splitter)
print("Number of chunks: ", len(pages))

In [29]:
# Save all the chunks into a pandas DataFrame; the embedding column starts as
# an empty string and is filled in by a later cell.
import pandas as pd

# Build every row first and construct the frame in a single call instead of
# growing it one row at a time with df.loc[len(df.index)] inside a loop.
df = pd.DataFrame(
    [(documentName, page.page_content, "") for page in pages],
    columns=['document_name', 'content', 'embedding'],
)
df.head()

StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 41, Finished, Available, Finished)

Unnamed: 0,document_name,content,embedding
0,moby dick book,The Project Gutenberg eBook of Moby -Dick; or ...,
1,moby dick book,CONTENTS \n \nETYMOLOGY. \n \nEXTRACTS (Supp...,
2,moby dick book,CHAPTER 11. Nightgown. \n \nCHAPTER 12. Biogr...,
3,moby dick book,CHAPTER 41. Moby Dick. \n \nCHAPTER 42. The W...,
4,moby dick book,CHAPTER 67. Cutting In. \n \nCHAPTER 68. The ...,


In [30]:
# Embed the text of every chunk with the OpenAI ada deployment and store the
# resulting vector in the `embedding` column of the same row.
df["embedding"] = df["content"].apply(generate_embeddings)
print(df.head(2))

StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 42, Finished, Available, Finished)

    document_name                                            content  \
0  moby dick book  The Project Gutenberg eBook of Moby -Dick; or ...   
1  moby dick book  CONTENTS  \n \nETYMOLOGY.  \n \nEXTRACTS (Supp...   

                                           embedding  
0  [-0.01946941949427128, -0.03041059710085392, -...  
1  [0.008645636960864067, 0.003408772638067603, 0...  


In [31]:
# Convert the pandas frame to a Spark DataFrame and append its rows to the
# Kusto table through the Kusto Spark connector.
df_sp = spark.createDataFrame(df)

(
    df_sp.write
    .format("com.microsoft.kusto.spark.synapse.datasource")
    .option("kustoCluster", KUSTO_URI)
    .option("kustoDatabase", KUSTO_DATABASE)
    .option("kustoTable", KUSTO_TABLE)
    .option("accessToken", accessToken)
    .mode("Append")
    .save()
)

StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 43, Finished, Available, Finished)

### Vector search on Fabric Eventhouse

In [32]:
def call_openAI(text):
    """Run a chat completion against the GPT-4 deployment and return the reply.

    ``text`` is passed unchanged as the ``messages`` argument of the chat
    completions call; the caller in this notebook passes a list of
    ``{"role": ..., "content": ...}`` dictionaries. The request uses
    ``temperature=0``.

    Returns the message content of the first choice in the response.
    """
    completion = client.chat.completions.create(
        messages=text,
        temperature=0,
        model=OPENAI_GPT4_DEPLOYMENT_NAME,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 44, Finished, Available, Finished)

In [36]:
def get_answer_from_eventhouse(question, nr_of_answers=1):
    """Return the stored chunks most similar to ``question``.

    The question is embedded with ``generate_embeddings``; a KQL query then
    computes the cosine similarity between that vector and the ``embedding``
    column of ``KUSTO_TABLE`` and keeps the ``nr_of_answers`` rows with the
    highest similarity.

    Parameters:
        question: text to search for.
        nr_of_answers: number of rows to return; coerced with ``int()``.

    Returns:
        The Spark DataFrame loaded from the Kusto Spark connector for that
        query (the table's columns plus the computed ``similarity`` column).

    Raises:
        TypeError / ValueError: if ``nr_of_answers`` cannot be converted to int.
    """
    # Coerce to int so that only a number can ever be spliced into the KQL text.
    top_n = int(nr_of_answers)

    searchedEmbedding = generate_embeddings(question)

    # todynamic() lets the query work whether the `embedding` column holds the
    # vector as JSON text (string column) or as a dynamic value;
    # series_cosine_similarity() operates on dynamic arrays.
    kusto_query = (
        f"{KUSTO_TABLE}"
        f" | extend similarity = series_cosine_similarity(dynamic({searchedEmbedding}), todynamic(embedding))"
        f" | top {top_n} by similarity desc "
    )

    kustoDf = (
        spark.read
        .format("com.microsoft.kusto.spark.synapse.datasource")
        .option("kustoCluster", KUSTO_URI)
        .option("kustoDatabase", KUSTO_DATABASE)
        .option("accessToken", accessToken)
        .option("kustoQuery", kusto_query)
        .load()
    )

    return kustoDf

StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 48, Finished, Available, Finished)

In [37]:
# Retrieve the chunks closest to the question from the Eventhouse, then ask the
# chat model to answer using only that retrieved text.
nr_of_answers = 2
question = "Why does the coffin prepared for Queequeg become Ishmael's life buoy once the Pequod sinks?"
answers_df = get_answer_from_eventhouse(question, nr_of_answers)

# Concatenate the content of every returned row, each preceded by one space.
answer = ""
for row in answers_df.rdd.toLocalIterator():
    answer += " " + row['content']

# Prepare the prompt: the question on the first line, the retrieved text on the next.
prompt = f"Question: {question}\nInformation: {answer}"
messages = [
    {
        "role": "system",
        "content": "You are a HELPFUL assistant answering users questions. Answer the question using the provided information and do not add anything else.",
    },
    {
        "role": "user",
        "content": prompt,
    },
]

result = call_openAI(messages)
display(result)


StatementMeta(, 96c61d3b-354b-4de5-b731-571402b6dfe8, 49, Finished, Available, Finished)

"The coffin prepared for Queequeg becomes Ishmael's life buoy once the Pequod sinks because it was rigged as a life-buoy by the ship's crew. The crew decided to use Queequeg's coffin as a substitute for the ship's missing life-buoy, following Queequeg's own hint concerning his coffin. The carpenter was instructed to rig the coffin as a life-buoy, which involved not nailing down the lid, caulking the seams, or paying over the same with pitch, as would have been done for a burial at sea. Instead, the coffin was prepared to serve as a buoyancy device, which ultimately saved Ishmael's life when the Pequod sank."