In [1]:
# Import required libraries
import os
import json
from dotenv import load_dotenv
import pandas as pd
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PyPDFLoader
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)


from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

# Load variables from a .env file (python-dotenv) before reading the settings below
load_dotenv()

# Azure AI Search connection settings; each value is None when its variable is unset
service_endpoint = os.environ.get("COPILOT_DEMO_AISEARCH_ENDPOINT")
index_name = os.environ.get("COPILOT_DEMO_AISEARCH_INDEX")
key = os.environ.get("COPILOT_DEMO_AISEARCH_ADMIN_KEY")

In [2]:
# Azure OpenAI settings read from the environment (None when a variable is unset)
COPILOT_DEMO_AZURE_OPENAI_ENDPOINT = os.environ.get("COPILOT_DEMO_AZURE_OPENAI_ENDPOINT")
COPILOT_DEMO_AZURE_OPENAI_API_KEY = os.environ.get("COPILOT_DEMO_AZURE_OPENAI_API_KEY")
COPILOT_DEMO_AZURE_OPENAI_API_VERSION = os.environ.get("COPILOT_DEMO_AZURE_OPENAI_API_VERSION")

# The embedding deployment name is fixed here rather than read from the environment
COPILOT_DEMO_EMBEDDINGS_ADA_DEPLOYMENT_NAME = "text-embedding-ada-002"

In [3]:
# Configure the Azure OpenAI client from the settings defined above
aoai_client = AzureOpenAI(
    api_version=COPILOT_DEMO_AZURE_OPENAI_API_VERSION,
    api_key=COPILOT_DEMO_AZURE_OPENAI_API_KEY,
    azure_endpoint=COPILOT_DEMO_AZURE_OPENAI_ENDPOINT,
)

# Key-based credential built from the Azure AI Search admin key
credential = AzureKeyCredential(key)

In [4]:
# Generate Document Embeddings using OpenAI Ada Model
# Retried with a randomized exponential wait (min=1, max=20), at most 6 attempts.
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def calc_embeddings(text):
    """Return the embedding for ``text`` from the configured Ada deployment.

    Used for the title and content fields, and also for query embeddings.

    Args:
        text: The text to embed; it is sent as a single-item input list.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = aoai_client.embeddings.create(
        model=COPILOT_DEMO_EMBEDDINGS_ADA_DEPLOYMENT_NAME,  # model = deployment name
        input=[text],
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Recursive splitter: chunks of up to 1000 characters with a 30-character overlap.
# No separators are passed, so the splitter's own defaults are used
# (noted by the original author as ["\n\n", "\n", " ", ""]).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=1000)

# Source PDF and the document label used for its chunks
fileName = "./data/moby dick.pdf"
documentName = "moby dick book"

# Load the PDF and split it with the splitter configured above.
# NOTE: entries are split chunks, so the count may differ from the PDF page count.
loader = PyPDFLoader(fileName)
pages = loader.load_and_split(text_splitter=splitter)
print("Number of pages: ", len(pages))

In [None]:

import uuid

# Collect one record per chunk and build the DataFrame in a single call.
# Appending with df.loc[len(df.index)] inside a loop re-grows the frame on every
# iteration; constructing it once from a list of records avoids that.
chunk_records = [
    {
        "id": str(uuid.uuid4()),         # random unique id for the chunk
        "document_name": documentName,
        "content": page.page_content,
        "embedding": "",                 # placeholder, filled in by the embedding step
    }
    for page in pages
]

# Passing `columns` keeps the column order and yields the same empty frame
# (with all four columns) when `pages` is empty.
df = pd.DataFrame(chunk_records, columns=['id', 'document_name', 'content', 'embedding'])
df.head()

In [None]:
# Compute an embedding for every row's content (one calc_embeddings call per row)
df["embedding"] = df["content"].apply(calc_embeddings)

# Save the frame, embeddings included, as CSV without the index column
df.to_csv('./data/aia_embeddings.csv', index=False)
print(df.head(2))

In [None]:
# Output embeddings to a JSON file (one record per row).
# The file goes under ./data, the same directory the notebook uses for the PDF and
# the CSV, and the location the upload step reads this file from. The previous
# '../data' target pointed one level up, so the upload step could not find it.
output_path = os.path.join('.', 'data', 'moby_dick_with_embeddings.json')

with open(output_path, 'w') as f:
    # default_handler=str stringifies any value pandas cannot serialize natively
    df.to_json(f, orient='records', default_handler=str)

In [8]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Vector search setup: one HNSW algorithm configuration and one profile that refers to it
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="my-algorithms-config",
        )
    ],
)

# Index schema: string key, two searchable string fields, and a 1536-dimension vector field
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(
        name="document_name",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",  # must match the profile name above
    ),
]

# Create the index, or update it if one with this name already exists
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

books created


In [None]:
# option 1: upload documents to the index
from azure.search.documents import SearchClient
import json

# Load the records (including embeddings) from the JSON file
output_path = os.path.join('.', 'data', 'moby_dick_with_embeddings.json')
with open(output_path, 'r') as file:
    documents = json.load(file)

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document. Report how many were
# actually indexed instead of assuming every document in the batch succeeded.
failed_keys = [item.key for item in result if not item.succeeded]
print(f"Uploaded {len(documents) - len(failed_keys)} pages")
if failed_keys:
    # Show only a few ids so a large failure does not flood the cell output
    print(f"Failed to upload {len(failed_keys)} pages, e.g. ids: {failed_keys[:5]}")

Uploaded 1446 pages
