# RAG with Azure Data Explorer  

#### IMPORTANT!! Embeddings Creation - Run this only once !!!
You only need to run this once to create the embeddings and save them to Azure Data Explorer.   
Afterwards, you can reuse the existing database and table in Azure Data Explorer for retrieval.

Requires principal permissions on the database: the app registration must be added as a database admin, e.g.:  
.add database embeddings admins ('aadapp=22870bd2-cd57-4696-9420-9699f9bdc0c1;16b3c013-d300-468d-ac64-7eda0820b6d3') 'App Registration'

In [1]:
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display, HTML, JSON, Markdown
import os

# Run load_dotenv() first, then read every setting from the process environment.
# Each value is None when the corresponding variable is not set.
load_dotenv()

# Azure Data Explorer (Kusto) settings and the AAD application credentials.
AAD_TENANT_ID = os.environ.get("AAD_TENANT_ID")
KUSTO_CLUSTER = os.environ.get("KUSTO_CLUSTER")
KUSTO_DATABASE = os.environ.get("KUSTO_DATABASE")
KUSTO_TABLE = os.environ.get("KUSTO_TABLE")
KUSTO_MANAGED_IDENTITY_APP_ID = os.environ.get("KUSTO_MANAGED_IDENTITY_APP_ID")
KUSTO_MANAGED_IDENTITY_SECRET = os.environ.get("KUSTO_MANAGED_IDENTITY_SECRET")

# Azure OpenAI deployment names (chat, vision and embedding models).
OPENAI_GPT35_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT35_DEPLOYMENT_NAME")
OPENAI_GPT4_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_DEPLOYMENT_NAME")
OPENAI_GPT4V_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4V_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")

# Azure OpenAI endpoint and API key.
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table
from tenacity import retry, wait_random_exponential, stop_after_attempt


In [3]:
# Embedding client bound to the ada embedding deployment; the deployment name
# is passed as both `deployment` and `model`.
# NOTE(review): AzureOpenAIEmbeddings is not imported in any visible cell (the
# recorded output of this cell is a NameError) — presumably it comes from
# langchain; confirm the package and add the import.
embeddingmodel = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    chunk_size=1,
)



NameError: name 'AzureOpenAIEmbeddings' is not defined

In [4]:
# Source document: a display name stored alongside each chunk, and the PDF path.
documentName = "moby dick book"
fileName = "../data/moby dick.pdf"

# Split the text into chunks of 1000 characters with a 30 character overlap.
# Split points tried by the splitter: ["\n\n", "\n", " ", ""]
# NOTE(review): RecursiveCharacterTextSplitter and PyPDFLoader are not imported
# in any visible cell — confirm where they come from and add the imports.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)

# Load the PDF and split it with the splitter above; `pages` holds the
# resulting pieces, so the count printed below is the number of pieces.
loader = PyPDFLoader(fileName)
pages = loader.load_and_split(text_splitter=splitter)
print("Number of pages: ", len(pages))

Number of pages:  1475


In [None]:
# We use the tenacity library to add delays and retries when calling the OpenAI
# embeddings endpoint, to avoid hitting throttling limits: random exponential
# back-off between 1 and 20 seconds, giving up after 6 attempts.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text: str):
    """Return the embedding of *text* computed by the module-level ``embeddingmodel``.

    Newlines are replaced with spaces before embedding because they can
    negatively affect performance.

    Args:
        text: The text to embed.

    Returns:
        Whatever ``embeddingmodel.embed_query`` returns for the cleaned text
        (presumably a list of floats — confirm against the embedding client).
    """
    cleaned = text.replace("\n", " ")
    return embeddingmodel.embed_query(cleaned)

In [None]:
# Save all the pages into a pandas DataFrame, one row per page.
# The rows are built up front and handed to the constructor in one go, instead
# of enlarging the frame with df.loc[...] on every iteration (which re-allocates
# the frame for each added row). The 'embedding' column starts as an empty
# string placeholder and is filled in by a later cell.
df = pd.DataFrame(
    [(documentName, page.page_content, "") for page in pages],
    columns=['document_name', 'content', 'embedding'],
)
df.head()

In [None]:
# Embed the content of every row with calc_embeddings and store the result in
# the 'embedding' column.
df["embedding"] = df["content"].apply(calc_embeddings)

# Keep a CSV copy of the frame (without the index) and preview two rows.
df.to_csv('../data/adx_embeddings.csv', index=False)
print(df.head(2))

### Ingest the embeddings into Azure Data Explorer


In [None]:
# Database and table names used by the cells that follow.
# NOTE(review): KUSTO_TABLE is read from the environment in the first cell but
# the table name is hard-coded here — confirm which one is intended.
kusto_db = KUSTO_DATABASE
table_name = "embeddings"

# Authenticate against ADX as an AAD application (app id, app secret, tenant id)
# and open a client on the configured cluster.
cluster = KUSTO_CLUSTER
kcsb = KustoConnectionStringBuilder.with_aad_application_key_authentication(
    cluster,
    KUSTO_MANAGED_IDENTITY_APP_ID,
    KUSTO_MANAGED_IDENTITY_SECRET,
    AAD_TENANT_ID,
)
client = KustoClient(kcsb)

In [None]:
# Management command that creates the target table with three columns:
# document_name and content as strings, embedding as dynamic.
createTableCommand = (
    f".create table {table_name} "
    "(document_name:string, content:string, embedding:dynamic)"
)
response = client.execute_mgmt(KUSTO_DATABASE, createTableCommand)

# Render the command's primary result as a DataFrame (shown as the cell output).
dataframe_from_result_table(response.primary_results[0])

In [None]:
# Serialise the DataFrame to CSV text (no index column) and embed it directly
# in an inline-ingest command; ignoreFirstRecord=true is set because the CSV
# text starts with a header row.
# NOTE(review): the whole frame, including every embedding, is sent as a single
# command string — confirm this stays within what inline ingestion accepts.
csv_payload = df.to_csv(index=False)
ingestTableCommand = (
    f".ingest inline into table {table_name} "
    f"with (ignoreFirstRecord=true) <| {csv_payload} "
)
response = client.execute(KUSTO_DATABASE, ingestTableCommand)

# Render the command's primary result as a DataFrame (shown as the cell output).
dataframe_from_result_table(response.primary_results[0])
