In [1]:
# Import necessary libraries
# pip install azure-search-documents==11.4.0b6
import os

import openai
from azure.core.credentials import AzureKeyCredential
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

In [2]:
# Configure the OpenAI library for Azure OpenAI Service and set up the Azure Cognitive Search connection details.
#
# Secrets and resource-specific values are read from environment variables first, so real
# keys do not have to be saved inside the notebook. When a variable is not set, the original
# placeholder string is used, so filling in the placeholders below still works as before.

# --- Azure OpenAI Service ---
OPENAI_API_KEY = os.environ.get(
    "AZURE_OPENAI_API_KEY",
    "PLEASE_ENTER_YOUR_OWNED_AOAI_SERVICE_KEY",
)
OPENAI_API_BASE = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://PLESAE_ENTER_YOUR_OWNED_AOAI_RESOURCE_NAME.openai.azure.com/",
)
OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME",
    "PLEASE_ENTER_YOUR_OWNED_AOAI_EMBEDDING_MODEL_NAME",
)
OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
OPENAI_API_VERSION = "2023-05-15"
OPENAI_API_TYPE = "azure"

# Mirror the settings onto the openai module so code that relies on its globals sees them too.
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION
openai.api_type = OPENAI_API_TYPE

# --- Azure Cognitive Search ---
AZURE_COGNITIVE_SEARCH_ENDPOINT_NAME = os.environ.get(
    "AZURE_COGNITIVE_SEARCH_ENDPOINT_NAME",
    "https://PLESAE_ENTER_YOUR_OWNED_ACS_RESOURCE_NAME.search.windows.net",
)
AZURE_COGNITIVE_SEARCH_INDEX_NAME = os.environ.get(
    "AZURE_COGNITIVE_SEARCH_INDEX_NAME",
    "PLEASE_ENTER_YOUR_OWNED_ACS_INDEX_NAME",
)
AZURE_COGNITIVE_SEARCH_KEY = os.environ.get(
    "AZURE_COGNITIVE_SEARCH_KEY",
    "PLEASE_ENTER_YOUR_OWNED_ACS_SERVICE_KEY",
)
# Credential object wrapping the search key for Azure SDK clients.
acs_credential = AzureKeyCredential(AZURE_COGNITIVE_SEARCH_KEY)

In [3]:
# Build the loader that reads source documents from the local ./data/ directory.
pdfLoader = DirectoryLoader(path="./data/")

In [4]:
# Run the directory loader and collect everything it returns into a fresh list.
documents = list(pdfLoader.load())

In [5]:
# Report how many documents were loaded from the data folder.
print(f'You have {len(documents)} document(s) in your data folder.')

You have 1 document(s) in your data folder.


In [6]:
# Split the loaded documents into chunks, targeting 1000 characters per chunk with a
# 200-character overlap between neighbouring chunks.
# NOTE: the target size is not a hard cap; the recorded run below produced a chunk of
# 1164 characters.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
# `documents` is rebound from whole documents to their chunks; later cells upload the chunks.
documents = text_splitter.split_documents(documents)
print(len(documents))

Created a chunk of size 1164, which is longer than the specified 1000


203


In [7]:
# Create the Azure OpenAI embedding client used to turn text into vectors.
embeddings = OpenAIEmbeddings(
    deployment=OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_EMBEDDING_MODEL_NAME,
    openai_api_key=OPENAI_API_KEY,
    # Pass the endpoint and API version explicitly. Otherwise the client resolves them
    # on its own (module globals, environment variables or library defaults), which can
    # silently differ from the values configured in this notebook.
    openai_api_base=OPENAI_API_BASE,
    openai_api_version=OPENAI_API_VERSION,
    openai_api_type=OPENAI_API_TYPE,
    # One text per embedding request.
    # NOTE(review): presumably required by the Azure deployment's per-request input
    # limit at the time this was written; confirm before raising.
    chunk_size=1,
)

In [8]:
# Create the Azure Cognitive Search vector store client. If the index does not exist, it will be created.
# Queries are embedded through the `embeddings` client's `embed_query` method.
vector_store: AzureSearch = AzureSearch(
    index_name=AZURE_COGNITIVE_SEARCH_INDEX_NAME,
    embedding_function=embeddings.embed_query,
    azure_search_endpoint=AZURE_COGNITIVE_SEARCH_ENDPOINT_NAME,
    azure_search_key=AZURE_COGNITIVE_SEARCH_KEY,
)

In [9]:
# Embed every chunk and upload it to the vector store.
# The call's return value (the list of ID strings shown below) is displayed as the cell output.
vector_store.add_documents(documents)

['ZGUzZjEyMDItMzNiMy00ZDYyLWJmYjEtYjNhMmI4OTdhMjA0',
 'MjFjMDM1NDMtYTViMy00ZGQ3LWFjMjAtMDk1NDZiOGI1MTIx',
 'ODVhYWM4ZjAtZjg2Yi00MjM2LTg2ODAtNjM2MWUzNDNlMTlh',
 'MDc1NGMyNzQtNTJlZC00ZTBlLWI4ZTUtOTU2ZmIzOGQxZTQ5',
 'M2I1Yzg3YjktYmE2Ny00Yzc2LWJkMDYtZDJjMGZjMDMwYTEw',
 'MDgzNDhlZjEtNWUxYy00YzNlLTgwMTctMmViNGFjNDVkNjJj',
 'M2FiMTE1NjgtMTI2Ni00YmFiLTg5NWItNDA0YmY0ZDU4NTY2',
 'YTkwNzJiNjktZmQ3OC00YzI3LTliODEtYWViNzI2MzZjM2Q2',
 'MzM0ZmVkMTQtYzE5NC00MThiLWI1ZTYtZmFhMGFiMTc2MDll',
 'NDQ5Mzc0YzEtMDY5Mi00NWQxLTljMjktMGE2MmIzMDBlMGI3',
 'ZWU2NGM0NGUtNTljMi00ZWQ2LWExMDQtNjEwZDViMDc2ZGUw',
 'MzliY2RiYTQtZTQzZC00ZGIyLWEyMDgtMWI0NTc0MDFjZGZi',
 'MDEyNmQ4MjgtZGYxNC00Y2JkLThjZDMtZmFmMTA0ZTY5Y2My',
 'YjcwOWRhOTEtNzhhZS00YTEzLWI2NmYtZmY0YmJkZmI4MmQ1',
 'YWViM2MyMzUtMjY5Mi00MDQ4LWIxYjAtMDE2MjdjYWYzYzIw',
 'MzBhNmJkOTEtZTI0OC00YjI0LWExZjMtZDM4YmExNGNhMWQx',
 'ZDc1OWU5M2EtZjZkZC00MWNkLTg5YWMtN2VmZmQ4MWU4OWM2',
 'MWMyNmFiNTgtNDE3Ny00MDdhLThjODEtYjk0Y2Q0OTE2ZjAx',
 'ZTEzOWYwOTEtZWZkYS00Y2I0LWE2NTctODU5YTgzNzE1