In [None]:
pip install pypdf azure-ai-formrecognizer azure-identity langchain azure-storage-blob python-dotenv unstructured openai azure-search-documents


In [None]:
pip install --index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004


In [3]:
import os, uuid
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core.credentials import AzureKeyCredential
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
import json
import openai 
from azure.search.documents import SearchClient

StatementMeta(, 5275eb82-3c4f-4314-9a9c-8affe181f54e, 7, Finished, Available)

**Update the fields in the next cell with your own values. They are used to connect to your Azure resources and to create the Azure Cognitive Search vector index from your data.**

In [44]:

# Notebook configuration.
# Secrets and service-principal identifiers are read from environment variables
# when they are set, so they do not have to be typed into (and saved with) the
# notebook. When a variable is not set, the fallback value on the same line is
# used; editing that fallback in place still works.

# Azure Data Lake account information using a service principal; make sure to
# grant 'blob reader' access to the service principal.
tenant_id = os.environ.get("AZURE_TENANT_ID", '')
client_id = os.environ.get("AZURE_CLIENT_ID", '')
client_secret = os.environ.get("AZURE_CLIENT_SECRET", '')
storage_account_name = ''  # just the account name, not the full URL

# OpenAI connectivity
openai.api_type = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
openai.api_base = "https://<your OpenAI instance name>.openai.azure.com/"
openai.api_version = "2022-12-01"
openai_embeddings_model = ""  # name of your Ada embeddings deployment

# Cognitive Search connection
search_endpoint = "https://<your cog search instance name>.search.windows.net"
# Derived from search_endpoint so the two values cannot drift apart.
search_endpoint_for_creating_index = f"{search_endpoint.rstrip('/')}/indexes?api-version=2023-07-01-Preview"
search_api_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")  # Cog search admin key

# Forms recognizer endpoint
forms_recognizer_endpoint = "https://<forms recognizer deployment name>.cognitiveservices.azure.com/"
forms_recognizer_key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "")

# Azure Data Lake or blob container where files are located
file_system_name = ''
path_to_files = '/'

# Name of search index you want to populate
index_name = "<any name you want for the index>"

# Chunk size and overlap passed to the text splitter when chunking documents
chunk_size = 1000
chunk_overlap = 200



StatementMeta(, 359d353c-d510-4139-b9f5-dba5128af4f5, 47, Finished, Available)

**This section creates the Azure Cognitive Search vector index**

In [None]:
## Build the JSON index definition (fields, vector search and semantic
## configuration) that is POSTed to the search service to create the index.

search_json = {
    "name": f"{index_name}",
    "fields": [
        # Document key
        {
            "name": "id",
            "type": "Edm.String",
            "key": True,
            "filterable": False,
            "searchable": False,
            "facetable": False,
            "retrievable": True,
            "sortable": True
        },
        {
            "name": "title",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True,
            "key": False,
            "filterable": False,
            "facetable": False,
            "sortable": False
        },
        {
            "name": "content",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True,
            "key": False,
            "filterable": False,
            "facetable": False,
            "sortable": False
        },
        {
            "name": "path",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True,
            "key": False,
            "filterable": True,
            "facetable": True,
            "sortable": True
        },
        # Vector field; it is tied to the algorithm configuration declared
        # under "vectorSearch" below through its name.
        {
            "name": "contentVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": False,
            "dimensions": 1536,
            "vectorSearchConfiguration": "my-vector-config"
        }
    ],
    "corsOptions": {
        "allowedOrigins": [
            "*"
        ],
        "maxAgeInSeconds": 60
    },
    "vectorSearch": {
        "algorithmConfigurations": [
            {
                "name": "my-vector-config",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "metric": "cosine"
                }
            }
        ]
    },
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "content"
                        }
                    ],
                    "prioritizedKeywordsFields": [
                        {
                            "fieldName": "content"
                        }
                    ]
                }
            }
        ]
    }
}

# This section makes the REST call to create the index

url = search_endpoint_for_creating_index
print("URL is " + url)
# The admin key is sent only in the request header. It is deliberately not
# printed: cell output is saved with the notebook and would leak the secret.
headers = {'Content-Type': 'application/json', 'api-key': search_api_key}
print("Search JSON is " + str(search_json))
print("About to make the rest call")
# A timeout keeps the cell from hanging forever if the service is unreachable.
response = requests.post(url, headers=headers, data=json.dumps(search_json), timeout=60)

# Creating an index answers 201 Created on success, so 200 alone is not enough.
if response.status_code in (200, 201):
    data = response.json()
    print(data)
else:
    print('Error:', response.status_code)
    # The response body carries the service's explanation of the failure.
    print(response.text)

print("rest call completed")


**This section reads the content in Azure storage, chunks it, creates embeddings, then uploads it to the index created above.**

In [None]:


# Function to generate embeddings for content
def generate_embeddings(text):
    """Return the embedding for ``text``.

    Calls ``openai.Embedding.create`` with ``text`` as input and the
    notebook-level ``openai_embeddings_model`` as the engine, then returns
    the ``embedding`` entry of the first item in the response's ``data`` list.
    """
    # engine = deployment name of your ada-0002 model
    api_response = openai.Embedding.create(
        engine=f"{openai_embeddings_model}",
        input=text,
    )
    first_item = api_response['data'][0]
    return first_item['embedding']


# Authenticate to storage account with service principal
credential = ClientSecretCredential(tenant_id, client_id, client_secret)
service_client = DataLakeServiceClient(account_url=f"https://{storage_account_name}.dfs.core.windows.net", credential=credential)

# Get the file system client and list the entries under the configured path
file_system_client = service_client.get_file_system_client(file_system=file_system_name)
paths = file_system_client.get_paths(path=f"{path_to_files}")  # path to files being read

# None of these depend on the file being processed, so they are built once
# instead of once per file / once per chunk.
document_analysis_client = DocumentAnalysisClient(
    endpoint=forms_recognizer_endpoint, credential=AzureKeyCredential(forms_recognizer_key)
)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)
search_client = SearchClient(search_endpoint, index_name, AzureKeyCredential(search_api_key))

# Running chunk counter across all files; it is used as the document key.
count = 0
for path in paths:
    # The listing also contains directory entries; only files can be analyzed.
    if path.is_directory:
        continue

    ## This section uses forms recognizer to read a file in blob store.
    # The URL is built from the configured container (file_system_name).
    # NOTE(review): the analysis is started from a URL, so the blob presumably
    # has to be reachable by the Form Recognizer service (public access or a
    # SAS token appended to the URL) - confirm for your storage account.
    formUrl = f"https://{storage_account_name}.blob.core.windows.net/{file_system_name}/{path.name}"

    poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-read", formUrl)
    result = poller.result()

    # Use the extracted text itself rather than slicing the string form of the
    # whole result object, which would put non-document text into the index.
    relevant_text = result.content
    if not relevant_text:
        # Nothing was extracted from this file, so there is nothing to index.
        continue

    # This chunks the relevant text using the chunk sizes defined above
    texts = text_splitter.create_documents([relevant_text])

    ## This section loops through the resulting chunks, generates embeddings for
    ## each of them, then adds them to the Azure Cog Search index created above.
    for item in texts:
        count = count + 1
        print("==========")
        # Index the chunk text only, not the string form of the Document wrapper.
        chunk_text = item.page_content

        embeddings = generate_embeddings(chunk_text)

        DOCUMENT = {
            "id": f"{count}",
            "title": f"{path.name}",
            "path": f"{formUrl}",
            "content": chunk_text,
            "contentVector": embeddings
        }

        result2 = search_client.upload_documents(documents=[DOCUMENT])

        print("Upload of new document succeeded: {}".format(result2[0].succeeded))
    

