In [58]:
import urllib
import urllib.request
from time import sleep

import ijson
import pymongo
from dotenv import dotenv_values
from openai import AzureOpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential


# Load environment values and instantiate clients

In [59]:

# Name of the .env file to load. Copy the example.env template and point this
# at your own file if it is named differently.
env_name = ".env"
config = dotenv_values(env_name)

# --- Cosmos DB (MongoDB API) settings ---
cosmos_conn = config['cosmos_connection_string']
cosmos_database = config['cosmos_database_name']
cosmos_collection = config['cosmos_collection_name']
cosmos_vector_property = config['cosmos_vector_property_name']
cosmos_cache = config['cosmos_cache_collection_name']

# MongoDB client built from the connection string above
cosmos_client = pymongo.MongoClient(cosmos_conn)

# --- Location of the source data file that gets ingested ---
storage_file_url = config['storage_file_url']

# --- Azure OpenAI settings ---
openai_endpoint = config['openai_endpoint']
openai_key = config['openai_key']
openai_version = config['openai_version']
openai_embeddings_deployment = config['openai_embeddings_deployment']
openai_embeddings_model = config['openai_embeddings_model']
# .env values are strings; the vector index needs an integer dimension count
openai_embeddings_dimensions = int(config['openai_embeddings_dimensions'])
openai_completions_deployment = config['openai_completions_deployment']
openai_completions_model = config['openai_completions_model']

# Shared Azure OpenAI client
openai_client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_version,
)



In [61]:
def create_collection_and_vector_index(
    database,
    cosmos_collection,
    vector_property,
    embeddings_dimensions,
    *,
    m=16,
    ef_construction=64,
    similarity="COS",
):
    """
    Get a collection handle and create an HNSW vector index on it.

    Issues a ``createIndexes`` command against ``database`` that defines an
    index named ``VectorSearchIndex`` of kind ``vector-hnsw`` on
    ``vector_property``.

    Args:
        database: Database handle; must support ``database[name]`` lookup and
            ``database.command(dict)`` (e.g. a pymongo ``Database``).
        cosmos_collection (str): Name of the collection to index.
        vector_property (str): Name of the document field that stores the
            embedding vector.
        embeddings_dimensions (int): Number of dimensions of the stored vectors.
        m (int): Value sent as ``m`` in ``cosmosSearchOptions`` (HNSW maximum
            connections per layer). Defaults to 16.
        ef_construction (int): Value sent as ``efConstruction`` (HNSW candidate
            list size used while building the graph). Defaults to 64.
        similarity (str): Value sent as ``similarity``. Defaults to ``"COS"``.

    Returns:
        The collection handle obtained from ``database[cosmos_collection]``.
    """
    collection = database[cosmos_collection]

    hnsw_index = {
        "name": "VectorSearchIndex",
        "key": {
            vector_property: "cosmosSearch"
        },
        "cosmosSearchOptions": {
            "kind": "vector-hnsw",
            "m": m,
            "efConstruction": ef_construction,
            "similarity": similarity,
            "dimensions": embeddings_dimensions
        }
    }

    database.command(
        {
            "createIndexes": cosmos_collection,
            "indexes": [hnsw_index]
        }
    )

    return collection
    

In [56]:

# Start from a clean slate: if the target database already exists, drop it
if cosmos_database in cosmos_client.list_database_names():
    cosmos_client.drop_database(cosmos_database)

# Handle to the database named by `cosmos_database`
database = cosmos_client[cosmos_database]

# Data collection, with its vector index
collection = create_collection_and_vector_index(
    database=database,
    cosmos_collection=cosmos_collection,
    vector_property=cosmos_vector_property,
    embeddings_dimensions=openai_embeddings_dimensions,
)

# Cache collection, indexed the same way
cache = create_collection_and_vector_index(
    database=database,
    cosmos_collection=cosmos_cache,
    vector_property=cosmos_vector_property,
    embeddings_dimensions=openai_embeddings_dimensions,
)


In [71]:
import os
from openai import AzureOpenAI

@retry(wait=wait_random_exponential(min=1, max=200), stop=stop_after_attempt(20))
def generate_embeddings(input_string):
    """
    Generate an embedding vector for the given text using Azure OpenAI.

    The call is retried with randomized exponential backoff (up to 20
    attempts) when it raises.

    Args:
        input_string (str): The text to embed.

    Returns:
        The embedding of the first (only) input, taken from
        ``response.data[0].embedding``.
    """
    # Reuse the notebook-level client rather than building a new one per call;
    # this function runs once per ingested document.
    # The deployment name comes from the loaded .env config. dotenv_values()
    # does not export values to os.environ, so os.getenv() would yield None
    # unless the variable was also set in the process environment.
    response = openai_client.embeddings.create(
        input=input_string,
        model=openai_embeddings_deployment,
    )

    return response.data[0].embedding

# For testing purposes only
# object['overview'] = "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
# vectorArray = generate_embeddings(object['overview'])
# vectorArray


In [72]:
print("Started downloading file")
counter = 0

# Open the remote file as a stream; the `with` block guarantees the HTTP
# response is closed even if embedding generation or an insert fails midway.
with urllib.request.urlopen(storage_file_url) as stream:
    print("completed downloading file")

    # Parse the top-level JSON array incrementally, one element at a time.
    # `document` (not `object`) avoids shadowing the Python builtin for the
    # rest of the kernel session.
    for document in ijson.items(stream, 'item', use_float=True):

        # Embed the overview text and store the vector on the document itself
        document[cosmos_vector_property] = generate_embeddings(document['overview'])

        # Insert the enriched document into the collection
        collection.insert_one(document)

        counter += 1

        if counter % 100 == 0:
            print("Inserted {} documents into collection: '{}'.".format(counter, collection.name))
            sleep(.5)   # sleep for 0.5 seconds to help avoid rate limiting


print("Data inserted into collection: '{}'.\n".format(collection.name))

Started downloading file
completed downloading file
Inserted 100 documents into collection: 'movies'.
Inserted 200 documents into collection: 'movies'.
Inserted 300 documents into collection: 'movies'.
Inserted 400 documents into collection: 'movies'.
Inserted 500 documents into collection: 'movies'.
Inserted 600 documents into collection: 'movies'.
Inserted 700 documents into collection: 'movies'.
Inserted 800 documents into collection: 'movies'.
Inserted 900 documents into collection: 'movies'.
Inserted 1000 documents into collection: 'movies'.
Inserted 1100 documents into collection: 'movies'.
Inserted 1200 documents into collection: 'movies'.
Inserted 1300 documents into collection: 'movies'.
Inserted 1400 documents into collection: 'movies'.
Inserted 1500 documents into collection: 'movies'.
Inserted 1600 documents into collection: 'movies'.
Inserted 1700 documents into collection: 'movies'.
Inserted 1800 documents into collection: 'movies'.
Inserted 1900 documents into collection