In [1]:
import os
import json
from dotenv import load_dotenv

# Add OpenAI import
# from openai import AzureOpenAI

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import CharacterTextSplitter
from azure.search.documents.indexes.models import (
    FreshnessScoringFunction,
    FreshnessScoringParameters,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)
from tqdm import tqdm
import os
import pandas as pd
import nest_asyncio
nest_asyncio.apply()
import warnings
warnings.filterwarnings("ignore") 

In [3]:
# Get configuration settings 
load_dotenv()
azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
azure_oai_key = os.getenv("AZURE_OAI_KEY")
azure_openai_api_key = os.getenv("AZURE_OAI_KEY")
azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
azure_oai_text_deployment = os.getenv("AZURE_OAI_TEXT_DEPLOYMENT")
azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.getenv("AZURE_SEARCH_KEY")
azure_search_index = os.getenv("AZURE_SEARCH_INDEX")

In [4]:
def split_doc(filename_):
    print(f'Reading - {filename_}')
    loader = TextLoader(filename_, encoding="utf-8")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    return docs

def add_metadata(data,time):
    for chunk in data:
        chunk.metadata['last_update'] = time
    return data

In [5]:
# msft_q1 = split_doc('MSFT_q1_2024.txt')
# msft_q2 = split_doc('MSFT_q2_2024.txt')

In [5]:
# Adding same data with different last_update 
from datetime import datetime, timedelta

q2_time = (datetime.utcnow() - timedelta(days=90)).strftime(
    "%Y-%m-%dT%H:%M:%S-00:00"
)
q1_time = (datetime.utcnow() - timedelta(days=180)).strftime(
    "%Y-%m-%dT%H:%M:%S-00:00"
)

In [6]:
q2_time,q1_time

('2024-02-04T22:03:52-00:00', '2023-11-06T22:03:52-00:00')

In [8]:
# documents = msft_q1 + msft_q2

In [7]:
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_oai_text_deployment,
    api_key=azure_openai_api_key,
    azure_endpoint=azure_oai_endpoint
)
embedding_function=embeddings.embed_query

In [8]:
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    # Additional data field for last doc update
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        searchable=True,
        filterable=True,
    ),
]
# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(weights={"content": 5}),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear",
        )
    ],
)

In [9]:
from azure.identity import DefaultAzureCredential
credential = DefaultAzureCredential()

In [10]:
index_name = "earning_call-scoring-profile"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    fields=fields,
    scoring_profiles=[sc],
    default_scoring_profile=sc_name,
)

In [13]:
# %%time
# vector_store.add_documents(documents=documents)

In [11]:
azureai_retriever = vector_store.as_retriever(fetch_k=1)

In [20]:
index_response = azureai_retriever.invoke("Windows OEM revenue growth")

In [21]:
for document in index_response:
    print(document.metadata)
    print(document.page_content)
    print("\n\n")

{'source': 'MSFT_q2_2024.txt'}
In our on-premises server business, revenue increased 3% and 2% in constant currency, ahead of expectations, driven primarily by the better-than-expected demand related to Windows Server 2012 end of support. Enterprise and partner services revenue increased 1% and was relatively unchanged in constant currency with better-than-expected performance across enterprise support services and industry solutions. Segment gross margin dollars increased 20% and 18% in constant currency, and gross margin percentage was relatively unchanged. Excluding the impact of the change in accounting estimate, gross margin percentage increased roughly 1 point, driven by the improvement in Azure noted earlier, partially offset by the impact of scaling our AI infrastructure to meet growing demand.

Operating expenses decreased 8% and 9% in constant currency with 9 points of favorable impact from the prior-year Q2 charge. Operating income grew 40% and 37% in constant currency. Now 

In [75]:
llm = AzureChatOpenAI(azure_endpoint=azure_oai_endpoint,
                      api_key=azure_openai_api_key, 
                      api_version="2023-09-01-preview",
                      azure_deployment=azure_oai_deployment, 
                      max_tokens=100)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=azureai_retriever,
    metadata={"application_type": "question_answering"},
    return_source_documents=True,
)

In [73]:
query = "How is Windows OEM revenue growth?"

In [74]:
response = qa.invoke({"query": query})
response

{'query': 'How is Windows OEM revenue growth?',
 'result': 'Windows OEM',
 'source_documents': [Document(page_content='In our on-premises server business, revenue increased 3% and 2% in constant currency, ahead of expectations, driven primarily by the better-than-expected demand related to Windows Server 2012 end of support. Enterprise and partner services revenue increased 1% and was relatively unchanged in constant currency with better-than-expected performance across enterprise support services and industry solutions. Segment gross margin dollars increased 20% and 18% in constant currency, and gross margin percentage was relatively unchanged. Excluding the impact of the change in accounting estimate, gross margin percentage increased roughly 1 point, driven by the improvement in Azure noted earlier, partially offset by the impact of scaling our AI infrastructure to meet growing demand.\n\nOperating expenses decreased 8% and 9% in constant currency with 9 points of favorable impact f

In [68]:
# Accessing the list of documents
documents = response['source_documents']

# Extracting metadata from each document
for document in documents:
    print(f'Document is {document}')
    print(type(document))
    print(document.metadata)

Document is page_content='In our on-premises server business, revenue increased 3% and 2% in constant currency, ahead of expectations, driven primarily by the better-than-expected demand related to Windows Server 2012 end of support. Enterprise and partner services revenue increased 1% and was relatively unchanged in constant currency with better-than-expected performance across enterprise support services and industry solutions. Segment gross margin dollars increased 20% and 18% in constant currency, and gross margin percentage was relatively unchanged. Excluding the impact of the change in accounting estimate, gross margin percentage increased roughly 1 point, driven by the improvement in Azure noted earlier, partially offset by the impact of scaling our AI infrastructure to meet growing demand.\n\nOperating expenses decreased 8% and 9% in constant currency with 9 points of favorable impact from the prior-year Q2 charge. Operating income grew 40% and 37% in constant currency. Now to 

In [85]:
from azure.search.documents import SearchClient
search_client  = SearchClient(endpoint=azure_search_endpoint, index_name=index_name, credential=azure_search_key)

In [86]:
client = AzureOpenAI(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_oai_key,
    api_version="2023-09-01-preview")

In [88]:
from azure.search.documents.models import VectorizedQuery
search_text="Microsoft 365 Copilot"
embedding = client.embeddings.create(input=query, model=azure_oai_text_deployment).data[0].embedding
vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="contentVector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["id", "content", "metadata"],
)  
  
# for result in results:  
#     print(f"id: {result['id']}")  
#     print(f"Score: {result['@search.score']}")  
#     print(f"Content: {result['content']}")  
#     print(f"metadata: {result['metadata']}\n")  

In [95]:
for i, r in enumerate(results):
    print(f"Result {i}: {r['content']}")

AttributeError: 'str' object has no attribute 'get_token'

In [96]:
results

<iterator object azure.core.paging.ItemPaged at 0x2648f689030>