In [1]:
import openai, json, os, requests, time, csv, uuid
from openai import AzureOpenAI

from tenacity import retry, wait_random_exponential, stop_after_attempt  
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from dotenv import load_dotenv
from cosmosdb_mongodb import  insert_one_if_not_exists, create_index , update_one ,copy
from urllib.parse import quote
from pymongo import MongoClient
import re
from unidecode import unidecode
import glob
import os.path

# Load settings from the local .env file into the process environment.
load_dotenv("./.env")

os.environ["OPENAI_API_TYPE"] = "azure"

# Each key is the variable name this notebook exports; each value is the
# AZURE_* setting it is copied from.
_ENV_ALIASES = {
    "OPENAI_API_VERSION": "AZURE_OPENAI_API_VERSION",
    "azure_endpoint": "AZURE_OPENAI_ENDPOINT",
    "OPENAI_API_KEY": "AZURE_OPENAI_API_KEY",
    "OPENAI_EMBEDDINGS_MODEL_NAME": "AZURE_OPENAI_EMBEDDING_MODEL",
}

# Assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType", so check everything up front and
# report exactly which settings are missing.
_missing = [source for source in _ENV_ALIASES.values() if os.getenv(source) is None]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing)
        + " (expected in ./.env or the process environment)"
    )

for _target, _source in _ENV_ALIASES.items():
    os.environ[_target] = os.environ[_source]

In [2]:
@retry(stop=stop_after_attempt(10), wait=wait_random_exponential(min=1, max=20))
def generate_embeddings(openai_client, text):
    """
    Return the embedding vector for ``text`` (OpenAI API v1.x client).

    The request is sent through ``openai_client.embeddings.create`` with the
    model named by the AZURE_OPENAI_EMBEDDING_MODEL environment variable, and
    the embedding of the first item in the response is returned.

    The call is wrapped in a tenacity ``retry`` configured with
    ``stop_after_attempt(10)`` and ``wait_random_exponential(min=1, max=20)``.
    """
    model_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")
    result = openai_client.embeddings.create(input=text, model=model_name)
    first_item = result.data[0]
    return first_item.embedding

In [3]:
def init_cosmos():
    """
    Initialize the CosmosDB (MongoDB API) client, database, and collections.

    Connection settings are read from the environment variables
    COSMOSDB_MONGODB_HOST, COSMOSDB_MONGODB_USERNAME and
    COSMOSDB_MONGODB_PASSWORD.

    Returns:
        tuple: ``(database, products_collection, customers_collection,
        insurances_collection, document_collection, all_collection)`` where
        ``database`` is the "database" database and the collections are
        "product", "customers", "insurance", "document" and "all" respectively.

    Raises:
        ValueError: if any of the required environment variables is unset or
            empty.
    """
    required = (
        "COSMOSDB_MONGODB_HOST",
        "COSMOSDB_MONGODB_USERNAME",
        "COSMOSDB_MONGODB_PASSWORD",
    )
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    host = settings["COSMOSDB_MONGODB_HOST"]
    # Percent-encode both credentials so characters such as '@', ':' or '/'
    # cannot break the URI structure.
    encoded_username = quote(settings["COSMOSDB_MONGODB_USERNAME"], safe='')
    encoded_password = quote(settings["COSMOSDB_MONGODB_PASSWORD"], safe='')

    # SECURITY: a literal connection string with an embedded username and
    # password used to overwrite this value. It has been removed; credentials
    # must only come from the environment, and the previously committed
    # password should be rotated.
    connection_string = f'mongodb+srv://{encoded_username}:{encoded_password}@{host}/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000'

    client = MongoClient(connection_string)

    database = client["database"]
    products_collection = database["product"]
    customers_collection = database["customers"]
    insurances_collection = database["insurance"]
    document_collection = database["document"]
    all_collection = database["all"]

    return database, products_collection, customers_collection , insurances_collection ,document_collection, all_collection

In [4]:
def add_doc(openai_client, collection, doc):
    """
    Enrich ``doc`` with its text and embedding, then store it in an Azure
    Cosmos DB for MongoDB vCore collection.

    ``doc`` is modified in place: "textContent" receives the JSON dump of the
    document and "vectorContent" the embedding of that text. The result is
    handed to ``insert_one_if_not_exists``. Any exception is printed rather
    than raised.
    """
    try:
        serialized = json.dumps(doc)
        doc["textContent"] = serialized
        doc["vectorContent"] = generate_embeddings(openai_client, serialized)
        insert_one_if_not_exists(collection, doc)
    except Exception as err:
        print(str(err))

In [5]:
def add_docdocument(openai_client, collection, doc):
    """
    Embed a raw document (e.g. one page of extracted text) and store it in an
    Azure Cosmos DB for MongoDB vCore collection.

    The stored record has three fields:
        "id":            deterministic UUID derived from the serialized content
        "textContent":   JSON dump of ``doc``
        "vectorContent": embedding of "textContent"

    Any exception is printed rather than raised (best-effort ingestion).
    """
    try:
        text_content = json.dumps(doc)
        docu = {
            # The id used to be a constant placeholder string, so every record
            # shared the same id. A name-based UUID gives distinct content a
            # distinct id while keeping re-runs over the same content stable.
            # NOTE(review): assumes insert_one_if_not_exists de-duplicates on
            # "id" — confirm against the cosmosdb_mongodb helper.
            "id": str(uuid.uuid5(uuid.NAMESPACE_OID, text_content)),
            "textContent": text_content,
            "vectorContent": generate_embeddings(openai_client, text_content),
        }

        insert_one_if_not_exists(collection, docu)

    except Exception as e:
        print(str(e))

In [6]:
# Connect to Cosmos DB and bind the database and collection handles used by
# the cells below
(
    database,
    products_collection,
    customers_collection,
    insurance_collection,
    document_collection,
    all_collection,
) = init_cosmos()

In [18]:
# Installs the PyPDF2 package, which the PDF ingestion cell imports to read
# the files under dataset/.
%pip install pyPDF2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import glob
import PyPDF2

# Azure OpenAI client passed to every embedding task submitted below.
openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Text is extracted page by page in this thread; embedding and insertion of
# each page is handed to the pool. Leaving the executor's `with` block waits
# for all submitted tasks to finish.
with ThreadPoolExecutor(max_workers=5) as executor:
    for source in glob.glob("dataset/*.pdf"):
        # The `with` statement closes the file; no explicit close() is needed.
        with open(source, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            print(f"{source}: {len(reader.pages)} page(s)")
            for page in reader.pages:
                text = page.extract_text()
                # Pages without extractable text (e.g. scanned images) carry
                # nothing worth embedding.
                if not text or not text.strip():
                    continue
                executor.submit(add_docdocument, openai_client, document_collection, text)
            
            

In [12]:

# Pass the document and insurance collections, each paired with all_collection,
# to the project's copy() helper (imported from cosmosdb_mongodb; this is not
# the stdlib `copy` module).
# NOTE(review): the recorded output of this cell is a series of "did not copy"
# lines — presumably printed by copy() for entries it skipped; confirm against
# the helper's implementation.
copy(document_collection,all_collection)
copy(insurance_collection,all_collection)


did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
did not copy
