In [None]:
!pip install llama-index llama-parse pydantic nest_asyncio
!pip install llama-index-vector-stores-azureaisearch
!pip install azure-search-documents 
!pip install llama-index-embeddings-azure-openai
!pip install llama-index-llms-azure-openai
!pip install llama-index-readers-docling llama-index-node-parser-docling
!pip install llama-index-readers-file

In [44]:
import os
import json
import time
import logging
from copy import deepcopy
from dotenv import load_dotenv
import nest_asyncio
from typing import Optional

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.schema import TextNode, ImageNode, NodeWithScore, MetadataMode
from llama_index.core.settings import Settings
from llama_index.core.query_engine import CustomQueryEngine, SimpleMultiModalQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.prompts import PromptTemplate
from llama_index.core.base.response.schema import Response
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore, IndexManagement, MetadataIndexFieldType
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownNodeParser
import os
import json
import time
import logging
from copy import deepcopy
from dotenv import load_dotenv
import nest_asyncio
from typing import Optional, List

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.schema import TextNode, ImageNode, NodeWithScore, MetadataMode
from llama_index.core.settings import Settings
from llama_index.core.workflow import Workflow, Event, StartEvent, StopEvent, step, Context
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore, IndexManagement, MetadataIndexFieldType
from llama_parse import LlamaParse
from pydantic import BaseModel, Field

In [27]:
# Pull settings from a local .env file (if any) into the process environment
load_dotenv()

# --- Azure OpenAI: endpoint, key and the two deployment names ---------------
# Each lookup yields None when the variable is not set.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME = os.environ.get(
    "AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME"
)
AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME"
)

# --- Azure AI Search: service endpoint, admin key and target index ----------
SEARCH_SERVICE_ENDPOINT = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
SEARCH_SERVICE_API_KEY = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
INDEX_NAME = "private-equity-due-diligence"

In [28]:
# Connection arguments shared by the chat model and the embedding model:
# both are built against the same endpoint, key and API version.
_azure_openai_connection = dict(
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version="2024-10-21",
)

# Chat-completion model; the deployment name doubles as the model name
llm = AzureOpenAI(
    model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    deployment_name=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
    **_azure_openai_connection,
)

# Embedding model, configured the same way with its own deployment
embed_model = AzureOpenAIEmbedding(
    model=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    deployment_name=AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL_NAME,
    **_azure_openai_connection,
)

# Register both as the LlamaIndex-wide defaults
Settings.llm = llm
Settings.embed_model = embed_model

# Azure AI Search: one key credential feeds the index-level client and the
# document-level client bound to INDEX_NAME
credential = AzureKeyCredential(SEARCH_SERVICE_API_KEY)
index_client = SearchIndexClient(
    endpoint=SEARCH_SERVICE_ENDPOINT,
    credential=credential,
)
search_client = SearchClient(
    endpoint=SEARCH_SERVICE_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
)


In [41]:
# Custom metadata fields to expose for filtering; none are defined yet,
# add entries here if filtered queries are needed.
metadata_fields = {}

# Vector store backed by the Azure AI Search index named INDEX_NAME
vector_store = AzureAISearchVectorStore(
    # Which service / index to talk to, and how the index is managed
    search_or_index_client=index_client,
    index_name=INDEX_NAME,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    # Names of the index fields used for each piece of a stored node
    id_field_key="id",
    doc_id_field_key="doc_id",
    chunk_field_key="chunk",
    embedding_field_key="embedding",
    metadata_string_field_key="metadata",
    filterable_metadata_field_keys=metadata_fields,
    # Vector settings; the dimensionality has to be updated to match the
    # embedding model in use
    embedding_dimensionality=1536,
    vector_algorithm_type="exhaustiveKnn",
    # Text analyzer applied to the index
    language_analyzer="en.lucene",
)

In [42]:
# Ingestion: parse PDFs with LlamaParse, split into token chunks, embed and
# store them in the Azure AI Search vector store.

# Jupyter already runs an asyncio event loop, and the synchronous
# `load_data()` call below goes through LlamaParse's async code path.
# Patching the loop here (idempotent) keeps this cell working on a fresh
# kernel instead of relying on an earlier, no-longer-visible cell.
nest_asyncio.apply()

# Use LlamaParse for ingestion: markdown output, with auto mode triggered on
# pages that contain images or tables
parser = LlamaParse(
    result_type="markdown",
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True
)

# Parse documents from directory; only .pdf files are routed through LlamaParse
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_dir="data/portfolio-companies", 
    file_extractor=file_extractor, 
).load_data()

print(f"Number of documents parsed: {len(documents)}")

# Apply token splitting transformation (512-token chunks, 50-token overlap)
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)

# Ingest split documents into Azure AI Search: the storage context points the
# index at `vector_store`, and `embed_model` produces the vectors
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    embed_model=embed_model,
    transformations=[text_splitter],
)

Started parsing the file under job_id 5c5f746d-943d-4821-8ac3-f9a4f373ad80
Started parsing the file under job_id 92b60da5-95a5-4c6e-83e4-364ebad254f7
Started parsing the file under job_id 449af97a-dd76-4d90-a16e-d2d9f6237de5
Started parsing the file under job_id 658c2de2-1a71-41b2-bce2-1dfe902b3442
Started parsing the file under job_id f449284b-971d-474d-a01b-ea287aa7d4c7
Started parsing the file under job_id b68b3cd9-5b41-4221-abf7-c33f43b27a6d
Number of documents parsed: 464


In [48]:
# Smoke-test the ingestion: ask one question against the freshly built index
QUERY = "Summarize the key risks identified in AMD Q3 report."
query_engine = index.as_query_engine(llm=llm)
result = query_engine.query(QUERY)

# Show the question, the whitespace-trimmed answer, then each source chunk
# together with its metadata
answer = result.response.strip()
print(f"Q: {QUERY}\nA: {answer}\n\nSources:")
for source_node in result.source_nodes:
    print(source_node.text, source_node.metadata)

Q: Summarize the key risks identified in AMD Q3 report.
A: The key risks identified in the AMD Q3 report include:

1. **Economic and Strategic Risks**:
   - Competition from dominant players like Intel and Nvidia may hinder AMD's ability to compete effectively.
   - The semiconductor industry is cyclical and can experience severe downturns.
   - Product demand is influenced by market conditions in various industries.
   - Timely introduction of products with valuable features is crucial for success.
   - Loss of significant customers could adversely impact the business.
   - Economic uncertainty may negatively affect operations and results.
   - Quarterly and seasonal sales patterns can influence operating results.
   - Inadequate protection of intellectual property could lead to competitive disadvantages.
   - Currency exchange rate fluctuations may have adverse effects.

2. **Operational and Technology Risks**:
   - Dependence on third-party manufacturers poses risks if they cannot d