# Vectorstores and Embeddings

In [None]:
import os
import openai
from dotenv import load_dotenv

# Run python-dotenv's loader first so the environment lookup below happens
# after it; the key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
from langchain.document_loaders import PyPDFLoader

# Lecture 01 is listed twice on purpose: the duplicate simulates messy data.
pdf_paths = [
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture02.pdf",
    "docs/MachineLearning-Lecture03.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Concatenate whatever each loader returns into one flat list, in loader order.
docs = []
for loader in loaders:
    docs += loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_spilliter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
splits = text_spilliter.split_documents(docs)

In [None]:
len(splits)

## Embeddings

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [None]:
sentence1 = "i like |dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [None]:
embedding1 = embeddings.embed_query(sentence1)
embedding2 = embeddings.embed_query(sentence2)
embedding3 = embeddings.embed_query(sentence3)

In [None]:
import numpy as np

In [None]:
# Dot product of the sentence1 and sentence2 embeddings (the two similar sentences).
# NOTE(review): this equals cosine similarity only if the embedding vectors are
# unit-length - confirm for the embedding model in use.
np.dot(embedding1, embedding2)

In [None]:
# Dot product of the sentence1 and sentence3 embeddings (sentence3 is the weather sentence).
np.dot(embedding1, embedding3)

In [None]:
# Dot product of the sentence2 and sentence3 embeddings.
np.dot(embedding2, embedding3)

## Vectorstores

In [None]:
from langchain.vectorstores import Chroma

In [None]:
persist_directory = 'docs/chroma/'

In [None]:
vectordb = Chroma.from_documents(documents=splits,
                                 embedding=embeddings,
                                 persist_directory=persist_directory)

In [None]:
print(vectordb._collection.count())

## Similarity search

In [None]:
question = "is there an email i can ask for help"

In [None]:
docs = vectordb.similarity_search(question, k=3)

In [None]:
len(docs)

In [None]:
docs[0].page_content

## Failure modes

In [None]:
question = "what did they say about matlab?"

In [None]:
docs = vectordb.similarity_search(question, k=5)

In [None]:
docs[0]

In [None]:
docs[1]

In [None]:
question = "what did they say about regression in the third lecture?"

In [None]:
docs = vectordb.similarity_search(question, k=5)

In [None]:
for doc in docs:
    print(doc.metadata)

In [None]:
print(docs[4].page_content)