In [2]:
import os
import json
from dotenv import load_dotenv

# Add OpenAI import
from openai import AzureOpenAI

In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import CharacterTextSplitter
from azure.search.documents.indexes.models import (
    FreshnessScoringFunction,
    FreshnessScoringParameters,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)
from tqdm import tqdm
import os
import pandas as pd
import nest_asyncio
nest_asyncio.apply()
import warnings
warnings.filterwarnings("ignore") 

In [30]:
# Get configuration settings: load_dotenv() runs first so the lookups below
# can see any values it places in the process environment.
load_dotenv()

# --- Azure OpenAI ---
azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
# The same key is exposed under two names; later cells pass
# azure_openai_api_key as api_key.
azure_oai_key = azure_openai_api_key = os.getenv("AZURE_OAI_KEY")
azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
azure_oai_text_deployment = os.getenv("AZURE_OAI_TEXT_DEPLOYMENT")

# --- Azure AI Search ---
azure_search_endpoint, azure_search_key, azure_search_index = (
    os.getenv(setting)
    for setting in (
        "AZURE_SEARCH_ENDPOINT",
        "AZURE_SEARCH_KEY",
        "AZURE_SEARCH_INDEX",
    )
)

In [16]:
# Sanity check: display the embedding deployment name read from the environment.
azure_oai_text_deployment

'codeninjastextemdmodel'

In [37]:
# Search service endpoint and admin key, read from the environment.
# The variable name must be AZURE_SEARCH_ENDPOINT; the previous spelling
# ("ENDPOIND") matched no variable, so os.getenv returned None.
vector_store_address = os.getenv("AZURE_SEARCH_ENDPOINT")
vector_store_password = os.getenv("AZURE_SEARCH_KEY")

In [39]:
# Sanity check: display the search endpoint read from the environment.
azure_search_endpoint

'https://cognitive-search-lab7.search.windows.net'

In [6]:
def split_doc(filename_, chunk_size=2500, chunk_overlap=0):
    """Load a UTF-8 text file and split it into document chunks.

    Args:
        filename_: Path of the text file to read.
        chunk_size: Passed to CharacterTextSplitter as ``chunk_size``.
            Defaults to 2500, the value previously hard-coded here.
        chunk_overlap: Passed to CharacterTextSplitter as ``chunk_overlap``.
            Defaults to 0, the value previously hard-coded here.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_documents`` for
        the loaded file.
    """
    print(f'Reading - {filename_}')
    loader = TextLoader(filename_, encoding="utf-8")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

def add_metadata(data, time):
    """Stamp every chunk in ``data`` with a ``last_update`` metadata entry.

    Each chunk's ``metadata`` mapping is modified in place; the same ``data``
    object is returned so the call can be used inline.

    Args:
        data: Iterable of chunks, each exposing a ``metadata`` mapping.
        time: Value stored under the ``'last_update'`` key of every chunk.

    Returns:
        The ``data`` object that was passed in.
    """
    for doc in data:
        doc.metadata.update({'last_update': time})
    return data

In [7]:
# Load and chunk each quarter's transcript; one list of chunks per file.
msft_q1 = split_doc('MSFT_q1_2024.txt')
msft_q2 = split_doc('MSFT_q2_2024.txt')

Reading - MSFT_q1_2024.txt
Reading - MSFT_q2_2024.txt


In [8]:
# Adding same data with different last_update
from datetime import datetime, timedelta, timezone

# Timestamp layout used for the last_update values (the literal "-00:00"
# suffix is written out verbatim rather than derived from the datetime).
LAST_UPDATE_FORMAT = "%Y-%m-%dT%H:%M:%S-00:00"

# Read the clock once so both timestamps are offsets from the same instant
# (exactly 90 days apart). datetime.now(timezone.utc) replaces the deprecated
# datetime.utcnow() and formats to the same wall-clock fields.
now_utc = datetime.now(timezone.utc)
q2_time = (now_utc - timedelta(days=90)).strftime(LAST_UPDATE_FORMAT)
q1_time = (now_utc - timedelta(days=180)).strftime(LAST_UPDATE_FORMAT)

In [9]:
# Display both timestamps to confirm the format and the 90/180-day offsets.
q2_time,q1_time

('2024-01-29T00:47:33-00:00', '2023-10-31T00:47:33-00:00')

In [10]:
# Stamp each quarter's chunks with its own last_update before combining them.
# Without this the chunks carry no last_update value, so the freshness
# scoring function on that field has nothing to act on. add_metadata
# overwrites the key, so re-running this cell is safe.
documents = add_metadata(msft_q1, q1_time) + add_metadata(msft_q2, q2_time)

In [11]:
# Total number of chunks across both transcripts.
len(documents)

51

In [12]:
# Inspect the first chunk to check its content and metadata.
documents[0]

Document(page_content="Operator\n\nGreetings and welcome to the Microsoft fiscal year 2024 first quarter earnings conference call. At this time, all participants are in a listen-only mode. A question-and-answer session will follow the formal presentation. [Operator instructions] As a reminder, this conference is being recorded.\n\nI would now like to turn the call over to your host, Brett Iversen, vice president of investor relations. Mr. Iversen, please go ahead. \n\nBrett Iversen -- General Manager, Investor Relations\n\nGood afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer; Amy Hood, chief financial officer; Alice Jolla, chief accounting officer; and Keith Dolliver, corporate secretary and deputy general counsel. On the Microsoft investor relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks during today's call and provi

In [17]:
# # Initialize the Azure OpenAI client
# client = AzureOpenAI(
#     base_url=f"{azure_oai_endpoint}/openai/deployments/{azure_oai_text_deployment}/extensions",
#     api_key=azure_oai_key,
#     api_version="2023-09-01-preview")

In [31]:
# Embedding client bound to the Azure OpenAI text-embedding deployment.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_openai_api_key,
    azure_deployment=azure_oai_text_deployment,
)

# Callable that embeds a single query string; a later cell calls it once to
# measure the vector length for the index definition.
embedding_function = embeddings.embed_query

In [26]:
# Index schema: the key/content/vector/metadata fields plus two extra fields
# (source, last_update) that are filled from chunk metadata.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Embed a short probe string once to discover the model's vector size.
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    # Additional data field for last doc update.
    # `searchable=True` was dropped: SimpleField always builds a
    # non-searchable field, and a DateTimeOffset field cannot be
    # full-text searched, so the flag had no effect.
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        filterable=True,
    ),
]
# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(weights={"content": 5}),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            # The boosting window must cover the ages being compared. The
            # documents here are stamped 90 and 180 days old, so the previous
            # two-day window ("P2D") boosted neither set. One year keeps both
            # inside the window, with the newer set boosted more.
            parameters=FreshnessScoringParameters(boosting_duration="P365D"),
            interpolation="linear",
        )
    ],
)

In [28]:
from azure.identity import DefaultAzureCredential
# NOTE(review): `credential` is not referenced by any other cell visible in
# this notebook (the vector store is built with azure_search_key) — confirm
# whether this cell is still needed.
credential = DefaultAzureCredential()

In [36]:
# Sanity check: should display the search endpoint; no output means the
# environment lookup returned None.
vector_store_address

In [40]:
# Name of the Azure AI Search index backing this vector store.
index_name = "earning_call-scoring-profile"

# Vector store using the custom field list and the freshness scoring profile,
# which is also set as the index's default profile.
vector_store: AzureSearch = AzureSearch(
    index_name=index_name,
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    embedding_function=embeddings.embed_query,
    fields=fields,
    default_scoring_profile=sc_name,
    scoring_profiles=[sc],
)

In [43]:
%%time
# Upload all chunks to the index; the cell output lists the returned ids.
vector_store.add_documents(documents=documents)

CPU times: total: 1.11 s
Wall time: 10.5 s


['MjVkY2ZjOTEtMzU4Mi00NDg5LTg5MmEtZDE3OTc0MWQ1YWJh',
 'YmY5OWQ3MWEtMWYyMy00MzgwLTlkYWItN2NiMzVmNGVlZmE5',
 'ZGFhYjk0NjAtMTBkMC00OWM5LThmZDgtYjQ3MmUzNjFkMjgw',
 'ZDk4OTM2NzgtOWI0NS00NzZiLWIxNzgtM2QyYzg5NjZmNzY5',
 'ZGEwZmViMmQtMGVkNS00ZWMwLThmMTQtZGE5ZWJkZTVlNjlh',
 'OGRkOWUwOTMtMGE1Mi00MDdmLTk5YTItOGYyNjc0ZTc4YWU4',
 'YTkwYjhmMjktZjk1OS00MjVkLThkN2QtMTM2MzM1Y2ZiYjc4',
 'ZWM0YjY3MWItNjlmYy00NTBlLWJiZmMtZTQ0MDQ0NWI4MzMz',
 'OTAxZTc2NzItZWY0OC00MWY5LTllYjItMjFlZDIxMTQxNDdj',
 'NTU2NmU4N2UtMTEyNS00NmIzLTlmMTktZTJmNzFmNTYxMTk4',
 'ZjMyZThkZGItYjE5Yy00ZWJlLWE3NzQtMThkZWRkMzZmNDc1',
 'MTM3OWIxNDAtZDIzNi00MzM1LTgxNjYtZWZkOWJiMjdkMDMz',
 'MjAyNTQ5M2EtZWU5Zi00NDNmLThmZGYtMjcwYzc3MTJlOTNi',
 'NzgzMGNiZjgtMDU0Mi00MGQ3LTk0NjktZDFjODNmNDA2Njdi',
 'NTdmYTgyZGMtZWZjOC00YWVhLTllZjItMjYxYmNjNTRjZGUw',
 'NGQ3MDQxZGUtNzllMS00NmY2LWI0NDYtNDU5OTJkYmVkMDEy',
 'MWU2MDIyNjgtMWZlNS00Mzc5LTgwZmItMDM5YWQwYzdiMTVj',
 'Yzg5NjE5M2ItYmQ1YS00NjVhLTk3MjItNTJjMjlmMTRlMTYx',
 'MmQwOTc2ZmYtNjFkZi00YmVmLWJiNTEtZjEzNzhjMjkx

In [44]:
# Wrap the vector store as a retriever using its default settings.
azureai_retriever = vector_store.as_retriever()

In [45]:
# Smoke-test retrieval on its own before wiring it into the QA chain.
azureai_retriever.invoke("How is Windows OEM revenue growth?")

[Document(page_content='In our on-premises server business, revenue increased 3% and 2% in constant currency, ahead of expectations, driven primarily by the better-than-expected demand related to Windows Server 2012 end of support. Enterprise and partner services revenue increased 1% and was relatively unchanged in constant currency with better-than-expected performance across enterprise support services and industry solutions. Segment gross margin dollars increased 20% and 18% in constant currency, and gross margin percentage was relatively unchanged. Excluding the impact of the change in accounting estimate, gross margin percentage increased roughly 1 point, driven by the improvement in Azure noted earlier, partially offset by the impact of scaling our AI infrastructure to meet growing demand.\n\nOperating expenses decreased 8% and 9% in constant currency with 9 points of favorable impact from the prior-year Q2 charge. Operating income grew 40% and 37% in constant currency. Now to mo

In [59]:
# Chat model served by the Azure OpenAI chat deployment.
llm = AzureChatOpenAI(
    azure_deployment=azure_oai_deployment,
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_openai_api_key,
    api_version="2023-09-01-preview",
)

# Retrieval-augmented QA chain built from the chat model and the Azure AI
# Search retriever, using the 'stuff' chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=azureai_retriever,
    chain_type='stuff',
    metadata={"application_type": "question_answering"},
)

In [60]:
# Question put to the QA chain (same text as the retriever smoke test).
query = "How is Windows OEM revenue growth?"

In [61]:
# Run the chain; the output dict echoes 'query' and holds the answer in 'result'.
qa.invoke({"query": query})

{'query': 'How is Windows OEM revenue growth?',
 'result': 'Windows OEM revenue growth increased 11% year over year, ahead of expectations, driven by slightly better performance and higher monetizing consumer markets.'}