In [39]:
import os
import json
from dotenv import load_dotenv

# Add OpenAI import
from openai import AzureOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import StrOutputParser
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.text_splitter import CharacterTextSplitter
from azure.search.documents.indexes.models import (
    FreshnessScoringFunction,
    FreshnessScoringParameters,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)
from pypdf import PdfReader
from tqdm import tqdm
import os
import pandas as pd
import nest_asyncio
nest_asyncio.apply()
import warnings
warnings.filterwarnings("ignore") 
from azure.identity import DefaultAzureCredential
credential = DefaultAzureCredential()
# Get configuration settings 
load_dotenv()
azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
azure_oai_key = os.getenv("AZURE_OAI_KEY")
azure_openai_api_key = os.getenv("AZURE_OAI_KEY")
azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
azure_oai_text_deployment = os.getenv("AZURE_OAI_TEXT_DEPLOYMENT")
azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.getenv("AZURE_SEARCH_KEY")
azure_search_index = os.getenv("AZURE_SEARCH_INDEX")

def split_doc(filename_):
    print(f'Reading - {filename_}')
    loader = TextLoader(filename_, encoding="utf-8")
    documents = loader.load()
    print(documents[:100])
    text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=2500, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    return docs

def add_metadata(data,time):
    for chunk in data:
        chunk.metadata['last_update'] = time
    return data

def load_pdf(file_path):
    with open(file_path, "rb") as file:
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text
# Adding same data with different last_update 
from datetime import datetime, timedelta

q2_time = (datetime.utcnow() - timedelta(days=90)).strftime(
    "%Y-%m-%dT%H:%M:%S-00:00"
)
q1_time = (datetime.utcnow() - timedelta(days=180)).strftime(
    "%Y-%m-%dT%H:%M:%S-00:00"
)
# documents[0]
# # Initialize the Azure OpenAI client
# client = AzureOpenAI(
#     base_url=f"{azure_oai_endpoint}/openai/deployments/{azure_oai_text_deployment}/extensions",
#     api_key=azure_oai_key,
#     api_version="2023-09-01-preview")
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_oai_text_deployment,
    api_key=azure_openai_api_key,
    azure_endpoint=azure_oai_endpoint
)
embedding_function=embeddings.embed_query
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    # Additional data field for last doc update
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        searchable=True,
        filterable=True,
    ),
]
# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(weights={"content": 5}),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear",
        )
    ],
)


In [47]:
msft_q1 = split_doc('MSFT_q1_2024.txt')
msft_q2 = split_doc('MSFT_q2_2024.txt')
documentms = msft_q1 + msft_q2
# file = "./the-metamorphosis.pdf"
# documents = load_pdf(file)
# documentsp = split_doc(documents)
# documents = documentms + documentsp

print(len(documentms))

Reading - MSFT_q1_2024.txt
[Document(page_content="Operator\n\nGreetings and welcome to the Microsoft fiscal year 2024 first quarter earnings conference call. At this time, all participants are in a listen-only mode. A question-and-answer session will follow the formal presentation. [Operator instructions] As a reminder, this conference is being recorded.\n\nI would now like to turn the call over to your host, Brett Iversen, vice president of investor relations. Mr. Iversen, please go ahead. \n\nBrett Iversen -- General Manager, Investor Relations\n\nGood afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer; Amy Hood, chief financial officer; Alice Jolla, chief accounting officer; and Keith Dolliver, corporate secretary and deputy general counsel. On the Microsoft investor relations website, you can find our earnings press release and financial summary slide deck, which is intended to supplement our prepared remarks d

In [46]:


def extract_combine_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        combined_text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            combined_text += page_text + "\n\n"  # Add "\n\n" after each page
    return combined_text

def save_text_to_file(text, file_path):
    with open(file_path, 'w', encoding="utf-8") as file:
        file.write(text)

# def clean_text(text):
#     # Remove extra spaces, newline characters, and hexadecimal characters
#     cleaned_text = re.sub(r'[\x00-\x1F\x7F-\xFF]+', ' ', text)  # Remove non-printable ASCII characters
#     cleaned_text = ' '.join(cleaned_text.split())  # Remove extra spaces and newline characters
#     return cleaned_text


file_path = "./the-metamorphosis.pdf"
output_txt_path = "./metamorph_text.txt"

# Extract and combine text from all pages
combined_text = extract_combine_text_from_pdf(file_path)

# # Clean the combined text
# cleaned_text = clean_text(combined_text)

# Save cleaned combined text to a .txt file
save_text_to_file(combined_text, output_txt_path)
# Split the combined text using existing method
chunks = split_doc(output_txt_path)
len(chunks)

Reading - ./metamorph_text.txt


73

In [48]:
documents = documentms + chunks

In [49]:
print(len(documents))

124


In [61]:
# vector_store_address
index_name = "msearning_call-scoring-profile"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    fields=fields,
    scoring_profiles=[sc],
    default_scoring_profile=sc_name,
)

# vector_store.add_documents(documents=documents)
# azureai_retriever = vector_store.as_retriever()
azureai_retriever = vector_store.as_retriever(metadata=metadata)

llm = AzureChatOpenAI(azure_endpoint=azure_oai_endpoint,
                    api_key=azure_openai_api_key, 
                    api_version="2023-09-01-preview",
                    azure_deployment=azure_oai_deployment)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=azureai_retriever,
    metadata={"application_type": "question_answering"},
)

In [62]:
query = "who is Gregor Samsa? in the Metamorphosis? in 2 paragraphs?"
print(qa.invoke({"query": query}))

{'query': 'who is Gregor Samsa? in the Metamorphosis? in 2 paragraphs?', 'result': 'Gregor Samsa is the main protagonist in Franz Kafka\'s novella "The Metamorphosis." At the beginning of the story, Gregor wakes up to find himself transformed into a gigantic insect-like creature. This sudden and bizarre transformation disrupts his ordinary life and throws his family into turmoil. Gregor\'s initial reaction to his new form is one of confusion and distress, as he struggles to come to terms with his metamorphosis and the implications it has on his relationships and sense of self.\n\nAs the story unfolds, Gregor becomes increasingly isolated and alienated from his family and society. His physical transformation serves as a metaphor for his emotional and psychological disconnection. Gregor\'s family, initially shocked and repulsed by his transformation, slowly begins to resent and reject him. They view him as a burden and a disgrace, and their attitudes towards him reflect the themes of iso

In [63]:
query = "How is Windows OEM revenue growth? Msft metadata"
# query = "who is General Manager, Investor Relations"
print(qa.invoke({"query": query}))

{'query': 'How is Windows OEM revenue growth? Msft metadata', 'result': 'According to the provided context, Windows OEM revenue growth is expected to be relatively flat as PC market unit volumes continue at pre-pandemic levels.'}


In [59]:
azureai_retriever.invoke("How is Windows OEM revenue growth?")

[Document(page_content='Growth will be driven by our Azure consumption business with continued strong contribution from AI. Our per-user business should see benefit from Microsoft 365 Suite momentum though we expect continued moderation in seat growth rates given the size of the installed base. In our on-premises server business, we expect revenue growth in the low-to-mid single-digits with continued hybrid demand, including licenses running in multi-cloud environments. And in the enterprise and partner services revenue should decline approximately 10% on a high prior-year comparable for enterprise support services and more personal computing, we expect revenue of $14.7 billion, $15.1 billion, or growth between 11% and 14%.\n\nWindows OEM revenue growth should be relatively flat as PC market unit volumes continue at pre-pandemic levels. In Windows commercial products and cloud services, customer demand for Microsoft 365 and our Advanced Security Solutions should drive revenue growth in

In [60]:
metadata = {"source": "MSFT_q2_2024.txt"}