# RAG with Fabric Real Time Analytics
#### IMPORTANT!! Embeddings Creation - Run this only once !!!
You only need to run this once to create the embeddings and save them to Fabric Real Time Analytics.  
We will index a CSV of product names and descriptions



In [1]:
# Import required libraries
import os
import json
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table

In [13]:
# Configure environment variables
load_dotenv()

AAD_TENANT_ID = os.getenv("AAD_TENANT_ID")
KUSTO_CLUSTER = os.getenv("KUSTO_CLUSTER")
KUSTO_DATABASE = os.getenv("KUSTO_DATABASE")
KUSTO_TABLE = os.getenv("KUSTO_TABLE")
KUSTO_MANAGED_IDENTITY_APP_ID = os.getenv("KUSTO_MANAGED_IDENTITY_APP_ID")
KUSTO_MANAGED_IDENTITY_SECRET = os.getenv("KUSTO_MANAGED_IDENTITY_SECRET")

OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")

OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_SUBSCRIPTION_ID = os.getenv("AZURE_SUBSCRIPTION_ID")
azure_openai_embedding_dimensions = 1536

In [4]:
# Configure OpenAI API
aoai_client = AzureOpenAI(
  azure_endpoint = OPENAI_DEPLOYMENT_ENDPOINT, 
  api_key=OPENAI_API_KEY,  
  api_version="2023-05-15"
)

In [5]:
# Generate Document Embeddings using OpenAI Ada Model
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def calc_embeddings(text):
    # model = "deployment_name"
    embeddings = aoai_client.embeddings.create(input = [text], model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME).data[0].embedding
    return embeddings

In [6]:
# Read the CSV file and generate embeddings for title and description fields
import pandas as pd

# Read the CSV file
product_data = pd.read_csv("./data/Product Dataset.csv",encoding = "ISO-8859-1")
product_data.name.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
product_data.description.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
# View the first 5 rows
product_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  product_data.name.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  product_data.description.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)


Unnamed: 0,id,name,description,price
0,552,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,580,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,$399.00
2,4696,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,$49.00
3,5644,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,6284,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,$158.00


In [34]:
import uuid
# calculate the embeddings using openAI ada 
product_data["name_embedding"] = product_data.name.apply(lambda x: calc_embeddings(x))
product_data["description_embedding"] = product_data.description.apply(lambda x: calc_embeddings(x))

product_data.to_csv('./data/prd_data_with_embeddings.csv', index=False)

print(product_data.head(2))

    id                                               name  \
0  552                          Sony Turntable - PSLX350H   
1  580  Bose Acoustimass 5 Series III Speaker System -...   

                                         description    price  \
0  Sony Turntable - PSLX350H/ Belt Drive System/ ...      NaN   
1  Bose Acoustimass 5 Series III Speaker System -...  $399.00   

                                      name_embedding  \
0  [0.00967357773333788, -0.008138509467244148, -...   
1  [-0.011157252825796604, 0.0002722474164329469,...   

                               description_embedding  
0  [5.8577807067194954e-05, 0.007230174727737904,...  
1  [-0.0070847030729055405, 0.014446204528212547,...  


In [47]:
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.exceptions import KustoServiceError
from azure.kusto.data.helpers import dataframe_from_result_table

# Connect to adx using AAD app registration
kcsb = KustoConnectionStringBuilder.with_aad_application_key_authentication(KUSTO_CLUSTER, KUSTO_MANAGED_IDENTITY_APP_ID, KUSTO_MANAGED_IDENTITY_SECRET,  AAD_TENANT_ID)
client = KustoClient(kcsb)
kusto_db = KUSTO_DATABASE


In [43]:
# Connect to adx using AAD app registration
table_name = "embeddingscsv"

# create table in ADX
createTableCommand = f".create table {table_name} (nr:int, name:string, description:string, name_embedding:dynamic, description_embedding:dynamic)"
response = client.execute_mgmt(KUSTO_DATABASE, createTableCommand)
for row in response.primary_results:
    print(row)


{"name": "Table_0", "kind": "PrimaryResult", "data": [{"TableName": "embeddingscsv", "Schema": "{\"Name\":\"embeddingscsv\",\"OrderedColumns\":[{\"Name\":\"nr\",\"Type\":\"System.Int32\",\"CslType\":\"int\"},{\"Name\":\"name\",\"Type\":\"System.String\",\"CslType\":\"string\"},{\"Name\":\"description\",\"Type\":\"System.String\",\"CslType\":\"string\"},{\"Name\":\"name_embedding\",\"Type\":\"System.Object\",\"CslType\":\"dynamic\"},{\"Name\":\"description_embedding\",\"Type\":\"System.Object\",\"CslType\":\"dynamic\"}]}", "DatabaseName": "embeddings", "Folder": null, "DocString": null}]}


In [44]:
# ingest the dataframe into the table
ingestTableCommand = f".ingest inline into table {table_name} with (ignoreFirstRecord=true) <| {product_data.to_csv(index=False)} "
response = client.execute(KUSTO_DATABASE, ingestTableCommand)
dataframe_from_result_table(response.primary_results[0])

Unnamed: 0,ExtentId,ItemLoaded,Duration,HasErrors,OperationId
0,544bdea6-c735-4002-9e91-30b20e5b21ba,inproc:3bf95958-9aea-4473-92be-4bbd0889ef08,0 days 00:00:11.482180400,False,64a9f4e3-9239-4bdd-a36d-0008ddd8ecb7
