In [1]:
import os
from dotenv import load_dotenv

# Load environment variables
if load_dotenv():
    print("Found Azure OpenAI API Base Endpoint: " + os.getenv("AZURE_OPENAI_ENDPOINT"))
else: 
    print("Azure OpenAI API Base Endpoint not found. Have you configured the .env file?")

Found Azure OpenAI API Base Endpoint: https://cog-t55y7cebih7rs.openai.azure.com/


In [12]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex

)

credential = AzureKeyCredential(os.environ["AZURE_AI_SEARCH_KEY"]) if len(os.environ["AZURE_AI_SEARCH_KEY"]) > 0 else DefaultAzureCredential()

index_name = "products"

index_client = SearchIndexClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"], 
    credential=credential
)

# Create a search index with the fields and a vector field which we will fill with a vector based on the overview field
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True),
    SearchableField(name="name", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String),
    SearchableField(name="price", type=SearchFieldDataType.String, sortable=True),
    SearchableField(name="imageUrl", type=SearchFieldDataType.String),
    SearchableField(name="stars", type=SearchFieldDataType.String),
    SearchableField(name="description", type=SearchFieldDataType.String, analyzer_name="en.lucene"),
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

#category_name,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,description

# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
)

# Configure the semantic search configuration to prefer title and tagline fields over overview
semantic_config = SemanticConfiguration(
    name="products-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="name"),
        keywords_fields=[SemanticField(field_name="category")],
        content_fields=[SemanticField(field_name="description")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 products created


In [13]:
import os
from openai import AzureOpenAI
from azure.search.documents import SearchClient
client = AzureOpenAI(
        api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
        api_version = os.getenv("AZURE_OPENAI_VERSION"),
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    )

embedding_model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

# use an embeddingsmodel to create embeddings
def get_embedding(text, model=embedding_model):
    return client.embeddings.create(input = [text], model=model).data[0].embedding

# 1. define function to parse csv row and create embedding for overview text
def parseProduct(df, ind):
    return dict([
        ("id", str(df['asin'][ind])),
        ("category", str(df['category_name'][ind])),
        ("name", str(df['title'][ind])),
        ("price", str(df['price'][ind])),
        ("stars", str(df['stars'][ind])),
        ("imageUrl", str(df['imgUrl'][ind])),
        ("description", str(df['description'][ind])),
        ("vector", get_embedding(str(df['description'][ind])))
    ])

# 2. load products from json
products = []
import pandas as pd
import numpy as np


search_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"], 
    index_name=index_name,
    credential=credential
)

## open all csv files in the data directory
from pathlib import Path
directory = '../data'

files = Path(directory).glob('filtered_*.csv')
for file in files:
    print(file)
    products_df = pd.read_csv(file)
    products_df.info()
    line_count = 0
    for ind in products_df.index:
        productEmbedding = parseProduct(products_df, ind)
        products.append(productEmbedding)
        line_count += 1
        print(productEmbedding)
    print(f'Processed {line_count} lines.')
    print('Loaded %s products.' % len(products))

    # 3. upload documents to vector store
    result = search_client.upload_documents(products)
    print(f"Successfully loaded {len(products)} products into Azure AI Search index.")




../data/filtered_dataset_107_backpacks.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category_name      500 non-null    object 
 1   asin               500 non-null    object 
 2   title              500 non-null    object 
 3   imgUrl             500 non-null    object 
 4   productURL         500 non-null    object 
 5   stars              500 non-null    float64
 6   reviews            500 non-null    int64  
 7   price              500 non-null    float64
 8   listPrice          500 non-null    float64
 9   category_id        500 non-null    int64  
 10  isBestSeller       500 non-null    bool   
 11  boughtInLastMonth  500 non-null    int64  
 12  description        500 non-null    object 
dtypes: bool(1), float64(3), int64(3), object(6)
memory usage: 47.5+ KB
{'id': 'B0007QCQGI', 'category': 'Backpacks', 'name': 'Super

KeyboardInterrupt: 

In [9]:
from openai import AzureOpenAI
from azure.search.documents.models import (
    VectorizedQuery
)

client = AzureOpenAI(
        api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
        api_version = os.getenv("AZURE_OPENAI_VERSION"),
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    )

deployment_name = os.getenv("AZURE_OPENAI_COMPLETION_DEPLOYMENT_NAME")
model_name = os.getenv("AZURE_OPENAI_COMPLETION_MODEL")

index_client = SearchClient(
    endpoint=os.environ["AZURE_AI_SEARCH_ENDPOINT"], 
    index_name=index_name,
    credential=credential
)

question = "Tell me about the latest Product. When was it released?"

# create a vectorized query based on the question
vector = VectorizedQuery(vector=get_embedding(question), k_nearest_neighbors=5, fields="vector")


# create search client to retrieve products from the vector store
found_docs = list(search_client.search(
    search_text=None,
    query_type="semantic",
    semantic_configuration_name="products-semantic-config",
    vector_queries=[vector],
    select=["name", "category", "description"],
    top=5
))

# print the found documents and the field that were selected
found_docs_as_text = " "
for doc in enumerate(found_docs, start=1):    
    print("Name: {}".format(doc["name"]))
    print("Category: {}".format(doc["category"]))
    print("----------")

    found_docs_as_text += " "+ "Name: {}".format(doc["name"]) +" "+ "Description: {}".format(doc["description"])

# augment the question with the found documents and ask the LLM to generate a response
system_prompt = "You are an assistant to the user, you are given some context below, please answer the query of the user with as detail as possible"

parameters = [system_prompt, ' Context:', found_docs_as_text , ' Question:', question]
joined_parameters = ''.join(parameters)

response = client.chat.completions.create(
        model = deployment_name,
        messages = [{"role" : "assistant", "content" : joined_parameters}],
    )

print (response.choices[0].message.content)

Name: TechView X500
Category: Tablets
----------
Name: GigaTab Pro 11
Category: Tablets
----------
Name: Galaxy Tab S8
Category: Tablets
----------
Name: UltraSmart Tablet Pro
Category: Electronics
----------
Name: ProTab X10
Category: Tablets
----------
The latest product is the **ProTab X10 by TechMaster**. This cutting-edge tablet is designed for both productivity and entertainment, featuring a 10.1-inch high-definition display, a powerful octa-core processor, 4GB of RAM, and 128GB of internal storage, which is expandable via microSD. It also boasts a battery life of up to 12 hours on a single charge. However, the specific release date for the ProTab X10 is not provided in the given context. For the exact release date, it would be advisable to check TechMaster's official announcements or contact their customer support.
