In [1]:

import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']


In [5]:
from langchain_classic.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("../docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("../docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("../docs/MachineLearning-Lecture02.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

# Split
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

len(splits)

161

In [12]:
# Let's take our splits and embed them
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"
sentence4 = "why is it so hard to figure out how to make it all work"

embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)
embedding4 = embedding.embed_query(sentence4)

import numpy as np
# np.dot(embedding1, embedding2)
# np.dot(embedding1, embedding3)
# np.dot(embedding2, embedding3)
np.dot(embedding1, embedding4)
# np.dot(embedding2, embedding4)
# np.dot(embedding3, embedding4)


np.float64(0.7400008457493766)

In [None]:
### Store vector splits in vector database

# ! pip install chromadb

from langchain_classic.vectorstores import Chroma

persist_directory = 'docs/chroma/'

# get_ipython().system('rm -rf ./docs/chroma  # remove old database files if any')

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

print(vectordb._collection.count())


'rm' is not recognized as an internal or external command,
operable program or batch file.


161


  vectordb.persist()


In [None]:
# ### Similarity Search

question = "is there an email i can ask for help"

docs = vectordb.similarity_search(question,k=3) # k documents to return

print(len(docs))

print(docs[0].page_content)

vectordb.persist()

3
cs229-qa@cs.stanford.edu. This goes to an account that's read by all the TAs and me. So 
rather than sending us email individually, if you send email to this account, it will 
actually let us get back to you maximally quickly with answers to your questions.  
If you're asking questions about homework problems, please say in the subject line which 
assignment and which question the email refers to, since that will also help us to route 
your question to the appropriate TA or to me appropriately and get the response back to 
you quickly.  
Let's see. Skipping ahead — let's see — for homework, one midterm, one open and term 
project. Notice on the honor code. So one thing that I think will help you to succeed and 
do well in this class and even help you to enjoy this class more is if you form a study 
group.  
So start looking around where you're sitting now or at the end of class today, mingle a 
little bit and get to know your classmates. I strongly encourage you to form study groups 

### Failure Modes

##### Duplicate chunks

In [None]:
# This seems great, and basic similarity search will get you 80% of the way there very easily. 
# But there are some failure modes that can creep up. 
# Here are some edge cases that can arise - we'll fix them in the next class.

question = "what did they say about matlab?"

docs = vectordb.similarity_search(question,k=5)
print(docs[0])



page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I 
know some people call it a free version of MATLAB, which it sort of is, sort of isn't.  
So I guess for those of you that haven't seen MATLAB before, and I know most of you 
have, MATLAB is I guess part of the programming language that makes it very easy to 
write codes using matrices, to write code for numerical routines, to move data around, to 
plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of 
learning algorithms.  
And in case some of you want to work on your own home computer or something if you 
don't have a MATLAB license, for the purposes of this class, there's also — [inaudible] 
write that down [inaudible] MATLAB — there' s also a software package called Octave 
that you can download for free off the Internet. And it has somewhat fewer features than 
MATLAB, but it's free, and for the purposes of this class, it will work for just about 
e

In [None]:
# Notice that we're getting duplicate chunks (because of the duplicate `MachineLearning-Lecture01.pdf` in the index).
print(docs[1]) # Duplicate chunks due to duplicate documents

page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I 
know some people call it a free version of MATLAB, which it sort of is, sort of isn't.  
So I guess for those of you that haven't seen MATLAB before, and I know most of you 
have, MATLAB is I guess part of the programming language that makes it very easy to 
write codes using matrices, to write code for numerical routines, to move data around, to 
plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of 
learning algorithms.  
And in case some of you want to work on your own home computer or something if you 
don't have a MATLAB license, for the purposes of this class, there's also — [inaudible] 
write that down [inaudible] MATLAB — there' s also a software package called Octave 
that you can download for free off the Internet. And it has somewhat fewer features than 
MATLAB, but it's free, and for the purposes of this class, it will work for just about 
e

##### Structured information in query lacking in results
- The structured info does not translate to the semantic vectorization taking place
- This leads to imprecise responses

In [None]:
# We can see a new failure mode.
# The question below asks a question about the third lecture, but includes results from other lectures as well.

question = "what did they say about regression in the third lecture?"

docs = vectordb.similarity_search(question,k=5)

for doc in docs:
    print(doc.metadata)


print(docs[4].page_content)

# Approaches discussed in the next lecture can be used to address both!


{'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'moddate': '2008-07-11T11:25:05-07:00', 'source': '../docs/MachineLearning-Lecture02.pdf', 'creator': 'PScript5.dll Version 5.2.2', 'page': 2, 'total_pages': 18, 'creationdate': '2008-07-11T11:25:05-07:00', 'author': '', 'page_label': '3', 'title': ''}
{'author': '', 'creationdate': '2008-07-11T11:25:05-07:00', 'source': '../docs/MachineLearning-Lecture02.pdf', 'title': '', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'moddate': '2008-07-11T11:25:05-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'page': 17, 'page_label': '18', 'total_pages': 18}
{'moddate': '2008-07-11T11:25:23-07:00', 'total_pages': 22, 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'page': 8, 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': '../docs/MachineLearning-Lecture01.pdf', 'title': '', 'page_label': '9'}
{'page_label': '9', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'page': 8, 'author':

In [None]:
# Possible failure: cannot compare across documents?

question = "what is the overlap between the two lectures?"

docs = vectordb.similarity_search(question,k=2)

for doc in docs:
    print(doc.metadata)
    print(doc.page_content)


{'page_label': '9', 'author': '', 'creator': 'PScript5.dll Version 5.2.2', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'title': '', 'creationdate': '2008-07-11T11:25:23-07:00', 'source': '../docs/MachineLearning-Lecture01.pdf', 'page': 8, 'moddate': '2008-07-11T11:25:23-07:00', 'total_pages': 22}
statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the discussion sections to go over extensions for the 
material that I'm teaching in the main lectures. So machine learning is a huge field, and 
there are a few extensions that we really want to teach but didn't have time in the main 
lectures for.
{'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'title': '', 'author': '', 'total_pages': 22, 'page': 8, 'moddate': '2008-07-11T11:25:23-07:00', 'page_label': '9', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': '../docs/