# RAG with Azure AI search 
#### IMPORTANT!! Embeddings Creation - Run this only once !!!
You only need to run this once to create the embeddings and save them to Azure AI Search

We will index a CSV of product names and descriptions



In [1]:
# Import required libraries
import os
import json
from dotenv import load_dotenv

from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters
)


from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

load_dotenv()

True

In [2]:
# Configure environment variables
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
azure_openai_embedding_dimensions = 1536
index_name = "product_data_csv"



In [3]:
# Configure OpenAI API
aoai_client = AzureOpenAI(
  azure_endpoint = OPENAI_DEPLOYMENT_ENDPOINT, 
  api_key=OPENAI_API_KEY,  
  api_version="2023-05-15"
)
credential = AzureKeyCredential(key)


In [4]:
# Generate Document Embeddings using OpenAI Ada Model
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def calc_embeddings(text):
    # model = "deployment_name"
    embeddings = aoai_client.embeddings.create(input = [text], model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME).data[0].embedding
    return embeddings

In [5]:
# Read the CSV file and generate embeddings for title and description fields
import pandas as pd

# Read the CSV file
product_data = pd.read_csv("./data/Product Dataset.csv",encoding = "ISO-8859-1")
product_data.name.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
product_data.description.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
# View the first 5 rows
product_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  product_data.name.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  product_data.description.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)


Unnamed: 0,id,name,description,price
0,552,Sony Turntable - PSLX350H,Sony Turntable - PSLX350H/ Belt Drive System/ ...,
1,580,Bose Acoustimass 5 Series III Speaker System -...,Bose Acoustimass 5 Series III Speaker System -...,$399.00
2,4696,Sony Switcher - SBV40S,Sony Switcher - SBV40S/ Eliminates Disconnecti...,$49.00
3,5644,Sony 5 Disc CD Player - CDPCE375,Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change...,
4,6284,Bose 27028 161 Bookshelf Pair Speakers In Whit...,Bose 161 Bookshelf Speakers In White - 161WH/ ...,$158.00


In [37]:
import uuid
# calculate the embeddings using openAI ada 
product_data["name_embedding"] = product_data.name.apply(lambda x: calc_embeddings(x))
product_data["description_embedding"] = product_data.description.apply(lambda x: calc_embeddings(x))

product_data.to_csv('./data/prd_data_with_embeddings.csv', index=False)
print(product_data.head(2))

    id                                               name  \
0  552                          Sony Turntable - PSLX350H   
1  580  Bose Acoustimass 5 Series III Speaker System -...   

                                         description    price  \
0  Sony Turntable - PSLX350H/ Belt Drive System/ ...      NaN   
1  Bose Acoustimass 5 Series III Speaker System -...  $399.00   

                                      name_embedding  \
0  [0.00967357773333788, -0.008138509467244148, -...   
1  [-0.011157252825796604, 0.0002722474164329469,...   

                               description_embedding  
0  [5.8577807067194954e-05, 0.007230174727737904,...  
1  [-0.0070847030729055405, 0.014446204528212547,...  


In [49]:
#convert the id column to string
product_data['id'] = product_data['id'].apply(str)
# Drop the price column
product_data = product_data.drop('price', axis=1)

# Output embeddings to products.json file
output_path = os.path.join('.', 'data', 'prd_data_with_embeddings.json')

with open(output_path, 'w') as f:
    product_data.to_json(f, orient='records')


In [51]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="name", type=SearchFieldDataType.String),
    SearchableField(name="description", type=SearchFieldDataType.String),
    SearchField(name="name_embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="description_embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myVectorizer"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myVectorizer",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=OPENAI_DEPLOYMENT_ENDPOINT,
                deployment_id=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
                model_name=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
                api_key=OPENAI_API_KEY
            )
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="name"),
        content_fields=[SemanticField(field_name="description")]
    )
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 product_data_csv created


In [52]:
from azure.search.documents import SearchClient
import json

# Upload some documents to the index
output_path = os.path.join('.', 'data', 'prd_data_with_embeddings.json')
with open(output_path, 'r') as file:  
    documents = json.load(file)  
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents") 

Uploaded 1081 documents


If you are indexing a very large number of documents, you can use the SearchIndexingBufferedSender which is an optimized way to automatically index the docs as it will handle the batching for you

In [None]:
from azure.search.documents import SearchIndexingBufferedSender

# Upload some documents to the index 
output_path = os.path.join('.', 'data', 'prd_data_with_embeddings.json') 
with open(output_path, 'r') as file:  
    documents = json.load(file)  
  
# Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing  
with SearchIndexingBufferedSender(  
    endpoint=service_endpoint,  
    index_name=index_name,  
    credential=credential,  
) as batch_client:  
    # Add upload actions for all documents  
    batch_client.upload_documents(documents=documents)  
print(f"Uploaded {len(documents)} documents in total")  