In [2]:
import os
from dotenv import load_dotenv

# LangChain imports
from langchain_openai import AzureChatOpenAI

# Load environment variables
load_dotenv()

# Get Azure OpenAI configuration from environment variables
azure_openai_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_VERSION")
azure_openai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
azure_openai_embedding_deployment_name = os.getenv("AZURE_OPENAI_EmBEDDING_DEPLOYMENT_NAME ")

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture02.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture03.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [4]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [5]:
splits = text_splitter.split_documents(docs)

In [6]:
len(splits)

208

In [7]:
from langchain_openai import AzureOpenAIEmbeddings

# Initialize embeddings
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    api_version=azure_openai_api_version,
    deployment=azure_openai_embedding_deployment_name
)

In [8]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [9]:
embedding1 = embeddings.embed_query(sentence1)
embedding2 = embeddings.embed_query(sentence2)
embedding3 = embeddings.embed_query(sentence3)

In [None]:
import numpy as np #using numpy to check for similarity in embeddings

In [None]:
np.dot(embedding1, embedding2) #higher value is more similar

np.float64(0.9631511809630346)

In [13]:
np.dot(embedding1, embedding3)

np.float64(0.7702031371038216)

In [14]:
from langchain.vectorstores import Chroma #vector store as Chroma
persist_directory = 'docs/chroma/' #where to persist the vector store locally

In [15]:
!rm -rf ./docs/chroma  # remove old database files if any

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [17]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [None]:
print(vectordb._collection.count()) #same number as splits

208


In [20]:
question = "is there an email i can ask for help" #asking a question
docs = vectordb.similarity_search(question,k=3) #k denotes number of documents to return
len(docs)

3

In [21]:
docs[0].page_content

"cs229-qa@cs.stanford.edu. This goes to an account that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework problems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thing that I think will help you to succeed and \ndo well in this class and even help you to enjoy this class more is if you form a study \ngroup.  \nSo start looking around where you're sitting now or at the end of class today, mingle a \nlittle bit and get to know your classmates. I strongly encourage you to form st

In [25]:
#lets try some failures on why this is not perfect
question = "what did they say about matlab?"
docs = vectordb.similarity_search(question,k=5)
docs[0].page_content

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than \nMATLAB, but it\'s free, and for the purposes of this class, it will work for just abo

In [None]:
docs[1].page_content #both are same as we used same document twice. This should not have happened in a real world scenario

'those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than \nMATLAB, but it\'s free, and for the purposes of this class, it will work for just abo

In [None]:
#another failure example
question = "what did they say about regression in the third lecture?" #i am asking specifically in third lecture
docs = vectordb.similarity_search(question,k=5)
for doc in docs:
    print(doc.metadata) #all are returned and not only lecture 3 because this would have been a structured search

{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'title': '', 'author': '', 'source': 'docs/MachineLearning-Lecture03.pdf', 'page': 0, 'creationdate': '2008-07-11T11:25:03-07:00', 'moddate': '2008-07-11T11:25:03-07:00', 'total_pages': 16, 'page_label': '1'}
{'source': 'docs/MachineLearning-Lecture02.pdf', 'creator': 'PScript5.dll Version 5.2.2', 'total_pages': 18, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'moddate': '2008-07-11T11:25:05-07:00', 'title': '', 'author': '', 'page': 2, 'page_label': '3', 'creationdate': '2008-07-11T11:25:05-07:00'}
{'author': '', 'creator': 'PScript5.dll Version 5.2.2', 'total_pages': 18, 'source': 'docs/MachineLearning-Lecture02.pdf', 'creationdate': '2008-07-11T11:25:05-07:00', 'title': '', 'page': 17, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'moddate': '2008-07-11T11:25:05-07:00', 'page_label': '18'}
{'moddate': '2008-07-11T11:25:23-07:00', 'author': '', 'creationdate': '2008-07-11T11:25:23-07:00'