In [1]:
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv)
load_dotenv()

# Azure OpenAI configuration, read from environment variables.
# os.getenv returns None for any variable that is not set.
azure_openai_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_VERSION")
azure_openai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
# The previous lookup key ended with a stray space (and had a lowercase "m"),
# so it could not match a normally named variable. Try the canonical
# upper-case name first, then fall back to the mixed-case spelling in case an
# existing environment still defines it that way.
azure_openai_embedding_deployment_name = (
    os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
    or os.getenv("AZURE_OPENAI_EmBEDDING_DEPLOYMENT_NAME")
)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings
# Location of the persisted Chroma store on disk.
persist_directory = 'docs/chroma/'

# Azure OpenAI embedding client; configured from the values read in the
# configuration cell.
embedding = AzureOpenAIEmbeddings(
    deployment=azure_openai_embedding_deployment_name,
    api_version=azure_openai_api_version,
    api_key=azure_openai_api_key,
    azure_endpoint=azure_openai_endpoint,
)

# Open the vector store at persist_directory, using the embedding client above.
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

In [3]:
# Sanity check: how many entries the underlying Chroma collection holds.
# NOTE: _collection is a private attribute of the Chroma wrapper.
stored_count = vectordb._collection.count()
print(stored_count)

208


In [4]:
question = "What are major topics for this class?"
# Ask the vector store for the 3 most similar documents to the question.
docs = vectordb.similarity_search(query=question, k=3)
len(docs)

3

In [5]:
from langchain_openai import AzureChatOpenAI

# Chat model used by the QA chains below. temperature=0 is set for the least
# random output. The endpoint is passed explicitly, matching how the
# embeddings client is configured, instead of relying on the client's implicit
# AZURE_OPENAI_ENDPOINT environment lookup.
llm = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_deployment_name,
    api_version=azure_openai_api_version,
    api_key=azure_openai_api_key,
    temperature=0
)

In [6]:
from langchain.chains import RetrievalQA

# Retrieval QA over the vector store. "stuff" is the default chain type and is
# spelled out here only for readability.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

In [7]:
# Run the chain on the current question and show the answer text.
result = qa_chain.invoke(input={"query": question})
result["result"]

"The major topics for this class include **machine learning** concepts. The course will cover foundational material, and there will be **refresher sessions** on topics like **statistics** and **algebra** during discussion sections for those who need them. Additionally, the discussion sections will explore **extensions of the material** taught in the main lectures, as machine learning is a vast field, and some advanced topics couldn't be included in the main lectures."

In [8]:
from langchain.prompts import PromptTemplate

# Custom prompt: {context} receives the retrieved documents and {question}
# the user's query.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Rebuild the QA chain with the custom prompt; also return the retrieved
# source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

In [9]:
# New question for the prompt-customised chain.
question = "Is probability a class topic?"
result = qa_chain.invoke(input={"query": question})

In [10]:
# Display the answer text from the most recent chain run.
result["result"]

'Yes, probability is a class topic, as the instructor assumes familiarity with basic probability concepts like random variables, expectation, and variance, and mentions using probabilistic interpretations in the course. Refresher sessions on probability are also offered in discussion sections. Thanks for asking!'

In [11]:
# Display the first entry of the "source_documents" list in the result.
result["source_documents"][0]

Document(id='982e4981-6eb9-4514-8424-0c02fda4b56f', metadata={'title': '', 'moddate': '2008-07-11T11:25:23-07:00', 'source': 'docs/MachineLearning-Lecture01.pdf', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationdate': '2008-07-11T11:25:23-07:00', 'page_label': '5', 'author': '', 'page': 4, 'creator': 'PScript5.dll Version 5.2.2', 'total_pages': 22}, page_content="of this class will not be very programming intensive, although we will do some \nprogramming, mostly in either MATLAB or Octave. I'll say a bit more about that later.  \nI also assume familiarity with basic probability and statistics. So most undergraduate \nstatistics class, like Stat 116 taught here at Stanford, will be more than enough. I'm gonna \nassume all of you know what random variables are, that all of you know what expectation \nis, what a variance or a random variable is. And in case of some of you, it's been a while \nsince you've seen some of this material. At some of the discussion sections, we'll actu

In [12]:
# The chains above used the default "stuff" technique. For efficiency, try
# other chain types, starting with map_reduce.

# Re-create the LLM with an explicit model name. Without it, running the
# map_reduce chain failed with "'NoneType' object has no attribute
# 'startswith'" because the model name was None.
# The endpoint is passed explicitly, matching the embeddings client.
llm = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_deployment_name,
    api_version=azure_openai_api_version,
    api_key=azure_openai_api_key,
    temperature=0,
    model="gpt-4o",
)

qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

result = qa_chain_mr.invoke({"query": question})

result["result"]

'Yes, probability is a class topic. The text explicitly mentions that the class assumes familiarity with basic probability and statistics, including concepts like random variables, expectation, and variance. Additionally, probability concepts are applied in the course, such as using a "probabilistic interpretation" to derive a learning algorithm. Some discussion sections also cover these topics as a refresher.'

In [None]:
# Same question and retriever, answered with the "refine" chain type.
# NOTE: this rebinds qa_chain_mr, so the name no longer refers to a
# map_reduce chain after this cell runs.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=vectordb.as_retriever(),
)
result = qa_chain_mr.invoke(input={"query": question})
result["result"]

  result = qa_chain_mr({"query": question})


'Probability is not explicitly listed as a standalone class topic, but it plays an important role in the course. The instructor uses probabilistic interpretations to derive and explain learning algorithms, such as transitioning from regression to classification problems. A basic understanding of probability and statistics—covering concepts like random variables, expectation, and variance—is assumed as a prerequisite for the class. For students who may need a refresher, these topics, along with algebra, will be reviewed during discussion sections. Additionally, discussion sections will later be used to explore extensions of the material covered in the main lectures.'

In [None]:
# This chain is built without any memory, so a follow-up question is answered
# without knowledge of the question and answer produced here.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)
question = "Is probability a class topic?"
# Use .invoke() like the earlier cells; calling the chain object directly is
# deprecated in LangChain and emits a deprecation warning.
result = qa_chain.invoke({"query": question})
result["result"]

'Yes, probability is a class topic. The instructor assumes familiarity with basic probability and statistics, such as understanding random variables, expectation, and variance. These concepts are foundational and will be used in the course. Additionally, the instructor mentions that some discussion sections will serve as a refresher for these prerequisites if needed.'

In [15]:
# Follow-up question that refers back to the previous answer ("those").
question = "why are those prerequesites needed?"
# Use .invoke() like the earlier cells; calling the chain object directly is
# deprecated in LangChain and emits a deprecation warning.
result = qa_chain.invoke({"query": question})
result["result"]

"The prerequisites mentioned are necessary because they provide the foundational knowledge and skills required to understand and effectively apply the concepts taught in the class. Here's why each of the prerequisites is important:\n\n1. **Basic Probability and Statistics**:\n   - Machine learning algorithms often rely on probabilistic models and statistical reasoning. For example, understanding concepts like random variables, expectation, variance, and probability distributions is crucial for grasping how algorithms like Naive Bayes, Gaussian Mixture Models, or Bayesian inference work.\n   - Many machine learning techniques involve evaluating the likelihood of events, making predictions, or estimating parameters, all of which require a solid understanding of probability and statistics.\n\n2. **Basic Linear Algebra**:\n   - Linear algebra is the mathematical backbone of many machine learning algorithms. Concepts like vectors, matrices, matrix multiplication, and matrix inversion are us