In [2]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv before any os.getenv lookup below
# (presumably from a local .env file — confirm where that file lives)
load_dotenv()

# Get Azure OpenAI configuration from environment variables.
# os.getenv returns None for any variable that is not set, so a misspelled
# name does not fail here; it only surfaces later when a client is built.
azure_openai_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_VERSION")
azure_openai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
# The key must match the environment entry exactly: a stray trailing space or
# different letter case makes it a different (unset) variable.
azure_openai_embedding_deployment_name = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings
# Directory of the persisted Chroma store that the rest of the notebook queries
persist_directory = 'docs/chroma/'

# Initialize the embedding client used by the vector store
embedding = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    api_version=azure_openai_api_version,
    # `azure_deployment` is the documented keyword and matches the
    # AzureChatOpenAI construction further down in this notebook
    azure_deployment=azure_openai_embedding_deployment_name,
)

# Open the Chroma store under persist_directory with that embedding function
# (the notebook assumes the store was populated beforehand)
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [22]:
# Sanity check: print how many entries the underlying Chroma collection holds.
# NOTE(review): `_collection` is a private attribute of the wrapper, so this
# line may break with a different langchain_chroma version.
print(vectordb._collection.count())

208


In [5]:
# Try the search methods on a handful of sample sentences first
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

# Embed the sample sentences into a separate small Chroma store
# (no persist_directory is passed for this one)
smalldb = Chroma.from_texts(texts=texts, embedding=embedding)


In [6]:
# Plain similarity search: both hits are semantically close to the question,
# but neither one carries the important detail that this mushroom is poisonous
question = "Tell me about all-white mushrooms with large fruiting bodies"
smalldb.similarity_search(query=question, k=2)

[Document(id='18a49c24-ed68-43c1-9ed4-386b44dba44f', metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(id='7760ea31-4b5c-4c67-bf6a-5a4106eb463c', metadata={}, page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

In [7]:
# Max marginal relevance (MMR) search over 3 candidates, keeping 2:
# this time the sentence about the mushroom being poisonous is returned
smalldb.max_marginal_relevance_search(query=question, k=2, fetch_k=3)

[Document(id='18a49c24-ed68-43c1-9ed4-386b44dba44f', metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(id='35710a16-3d29-4bce-935c-da3b237cc128', metadata={}, page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

In [8]:
# MMR aims for both relevance to the query and diversity among the results.
# Back to the persisted Chroma store: start with a plain similarity search.

question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(query=question, k=3)
docs_ss[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [9]:
# Results 0 and 1 start with the same text: the vector store holds redundant (duplicate) chunks
docs_ss[1].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [10]:
# Use MMR to avoid the redundant results
docs_mmr = vectordb.max_marginal_relevance_search(query=question, k=3)
docs_mmr[0].page_content[:100]

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people c'

In [11]:
# With MMR the second result is a different chunk from the first
docs_mmr[1].page_content[:100] #now the results are different

'into his office and he said, "Oh, professor, professor, thank you so much for your \nmachine learning'

In [12]:
# Use document metadata to restrict the results to the third lecture.
# The manual way: hard-code the source file in a metadata filter.
question = "what did they say about regression in the third lecture?"
docs = vectordb.similarity_search(
    query=question, k=3, filter={"source": "docs/MachineLearning-Lecture03.pdf"}
)

In [13]:
# Print each hit's metadata to check which source file it came from
for d in docs:
    print(d.metadata) #all are from lecture 3

{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'title': '', 'moddate': '2008-07-11T11:25:03-07:00', 'source': 'docs/MachineLearning-Lecture03.pdf', 'page_label': '1', 'page': 0, 'total_pages': 16, 'creator': 'PScript5.dll Version 5.2.2', 'author': '', 'creationdate': '2008-07-11T11:25:03-07:00'}
{'total_pages': 16, 'creator': 'PScript5.dll Version 5.2.2', 'page_label': '14', 'author': '', 'source': 'docs/MachineLearning-Lecture03.pdf', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationdate': '2008-07-11T11:25:03-07:00', 'page': 13, 'title': '', 'moddate': '2008-07-11T11:25:03-07:00'}
{'source': 'docs/MachineLearning-Lecture03.pdf', 'moddate': '2008-07-11T11:25:03-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'total_pages': 16, 'page': 4, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'author': '', 'page_label': '5', 'title': '', 'creationdate': '2008-07-11T11:25:03-07:00'}


In [29]:
# Working with metadata through a self-query retriever: instead of hard-coding the filter, an LLM infers it from the question and the metadata field descriptions
# LangChain imports
from langchain_openai import AzureChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Describe the metadata fields that the self-query retriever may filter on
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `docs/MachineLearning-Lecture01.pdf`, `docs/MachineLearning-Lecture02.pdf`, or `docs/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

# Chat model handed to the self-query retriever. The endpoint is passed
# explicitly, like for the embeddings client, instead of relying on whatever
# happens to be in the process environment. temperature=0 keeps sampling
# randomness to a minimum.
llm = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_deployment_name,
    api_version=azure_openai_api_version,
    api_key=azure_openai_api_key,
    temperature=0
)

# Short description of what the stored documents contain, passed to the retriever
document_content_description = "Lecture notes"
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [30]:
# Same question as the hard-coded filter example, now sent through the self-query retriever
question = "what did they say about regression in the third lecture?"
docs = retriever.invoke(question)

In [32]:
# Print each hit's metadata to check which source file the retriever selected
for d in docs:
    print(d.metadata)

{'moddate': '2008-07-11T11:25:03-07:00', 'page_label': '1', 'page': 0, 'source': 'docs/MachineLearning-Lecture03.pdf', 'title': '', 'author': '', 'creator': 'PScript5.dll Version 5.2.2', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'total_pages': 16, 'creationdate': '2008-07-11T11:25:03-07:00'}
{'creationdate': '2008-07-11T11:25:03-07:00', 'moddate': '2008-07-11T11:25:03-07:00', 'title': '', 'total_pages': 16, 'page': 10, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'author': '', 'source': 'docs/MachineLearning-Lecture03.pdf', 'creator': 'PScript5.dll Version 5.2.2', 'page_label': '11'}
{'page': 5, 'creator': 'PScript5.dll Version 5.2.2', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'docs/MachineLearning-Lecture03.pdf', 'creationdate': '2008-07-11T11:25:03-07:00', 'author': '', 'total_pages': 16, 'title': '', 'page_label': '6', 'moddate': '2008-07-11T11:25:03-07:00'}
{'page': 2, 'creationdate': '2008-07-11T11:25:03-07:00', 'moddate': '2008-07-11T11:25:03-07:00', '

In [33]:
# Contextual compression: return only the relevant, compressed parts of each retrieved document instead of the full documents
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def pretty_print_docs(docs):
    """Print the page content of each document under a numbered heading.

    Each entry is rendered as "Document N:" (numbered from 1), a blank line,
    then the document's `page_content`. Consecutive entries are separated by
    a line of 100 dashes. Nothing is returned.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))

# Build an LLM-backed document compressor from the chat model
compressor = LLMChainExtractor.from_llm(llm)

# Wrap our vectorstore: the vector store's default retriever supplies the
# documents and the compressor is applied on top of it
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [34]:
question = "what did they say about matlab?"
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `retriever.invoke(question)` call used with the self-query retriever
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

those homeworks will be done in either MATLAB or in Octave, which is sort of — I  
know some people call it a free version of MATLAB, which it sort of is, sort of isn't.  
So I guess for those of you that haven't seen MATLAB before, and I know most of you  
have, MATLAB is I guess part of the programming language that makes it very easy to  
write codes using matrices, to write code for numerical routines, to move data around, to  
plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of  
learning algorithms.  
And in case some of you want to work on your own home computer or something if you  
don't have a MATLAB license, for the purposes of this class, there's also — [inaudible]  
write that down [inaudible] MATLAB — there' s also a software package called Octave  
that you can download for free off the Internet. And it has somewhat fewer features than  
MATLAB, but it's free, and for the purposes of this class, it will work for just 

In [35]:
# The compressed results above can still contain redundant entries, so combine
# the two techniques: compression on top of an MMR-based base retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr") #notice mmr
)

question = "what did they say about matlab?"
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `retriever.invoke(question)` call used with the self-query retriever
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

those homeworks will be done in either MATLAB or in Octave, which is sort of — I  
know some people call it a free version of MATLAB, which it sort of is, sort of isn't.  
So I guess for those of you that haven't seen MATLAB before, and I know most of you  
have, MATLAB is I guess part of the programming language that makes it very easy to  
write codes using matrices, to write code for numerical routines, to move data around, to  
plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of  
learning algorithms.  
And in case some of you want to work on your own home computer or something if you  
don't have a MATLAB license, for the purposes of this class, there's also — [inaudible]  
write that down [inaudible] MATLAB — there' s also a software package called Octave  
that you can download for free off the Internet. And it has somewhat fewer features than  
MATLAB, but it's free, and for the purposes of this class, it will work for just 

In [36]:
# There are other types of retrievers that do not use a vector store at all. Check the results to see how well they do.
# We are testing SVM (built from the texts plus the embedding model) and TF-IDF (built from the texts alone) here

from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the first lecture PDF and merge the text of everything the loader returns into one string
loader = PyPDFLoader("docs/MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split the merged text into chunks (chunk_size=1500, chunk_overlap=150)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_text(joined_page_text)


In [38]:
# Build both retrievers directly from the text chunks (no vector store involved):
# the SVM retriever also receives the embedding model, the TF-IDF retriever only the texts
svm_retriever = SVMRetriever.from_texts(splits,embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [39]:
question = "What are major topics for this class?"
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `retriever.invoke(question)` call used with the self-query retriever
docs_svm = svm_retriever.invoke(question)
docs_svm[0]

Document(metadata={}, page_content="Testing, testing. Okay, cool. Thanks. So all right, online resources. The class has a home page, so it's in on the handouts. I \nwon't write on the chalkboard — http:// cs229.stanford.edu. And so when there are \nhomework assignments or things like that, we usually won't sort of — in the mission of \nsaving trees, we will usually not give out many handouts in class. So homework \nassignments, homework solutions will be posted online at the course home page.  \nAs far as this class, I've also written, and I guess I've also revised every year a set of \nfairly detailed lecture notes that cover the technical content of this class. And so if you \nvisit the course homepage, you'll also find the detailed lecture notes that go over in detail \nall the math and equations and so on that I'll be doing in class.  \nThere's also a newsgroup, su.class.cs229, also written on the handout. This is a \nnewsgroup that's sort of a forum for people in the class to get 

In [40]:
question = "what did they say about matlab?"
# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `retriever.invoke(question)` call used with the self-query retriever
docs_tfidf = tfidf_retriever.invoke(question)
docs_tfidf[0]

Document(metadata={}, page_content="yourselves. You can also come and talk to me or the TAs if you want to brainstorm ideas \nwith us.  \nOkay. So one more organizational question. I'm curious, how many of you know \nMATLAB? Wow, cool, quite a lot. Okay. So as part of the — act ually how many of you \nknow Octave or have used Octave? Oh, okay, much smaller number.  \nSo as part of this class, especially in the homeworks, we'll ask you to implement a few \nprograms, a few machine learning algorithms as part of the homeworks. And most of those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn't.  \nSo I guess for those of you that haven't seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's s