# Vector Stores

## Confirming that libraries are installed

## Importing libraries and API key

In [None]:

import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader



## Loading and splitting the documents

In [None]:
# Token-based splitter (tiktoken) with a 500-token chunk size; it is
# reused further down for the second speech as well.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)

# Read the FDR speech from disk as LangChain documents ...
loader = TextLoader("FDR_State_of_Union_1944.txt")
documents = loader.load()

# ... and cut it into the chunks that will be embedded.
docs = text_splitter.split_documents(documents)

## Creating the embeddings

In [None]:
# Embedding model shared by every Chroma call below (indexing and querying).
# NOTE(review): no API key is set anywhere in this notebook; presumably
# OpenAIEmbeddings picks it up from the OPENAI_API_KEY environment variable —
# confirm it is set before running.
embedding_function = OpenAIEmbeddings()

## Creating the database

In [None]:
# Embed the FDR chunks and store them in a Chroma database backed by the
# ./speech_new_db directory.
db = Chroma.from_documents(docs, embedding_function, persist_directory="./speech_new_db")

In [None]:
# Flush the database to the persist directory so it can be reopened later.
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
db.persist()

In [None]:
# Reopen the database that was persisted to ./speech_new_db. This is the way
# back in after a kernel or machine restart, when the in-memory `db` object
# no longer exists; the same embedding function must be supplied again.
db_new_connection = Chroma(persist_directory='./speech_new_db', embedding_function=embedding_function)

## Doing a similarity search

In [None]:
# Query text for the similarity search. Chroma does not answer the question;
# it only returns the stored chunks whose language is most similar to this
# text, so a terse query such as "cost of food law, FDR" would probably
# retrieve the same chunks.
new_doc = "What did FDR say about the cost of food law?"

In [None]:
# Retrieve the stored chunks closest to the query text.
similar_docs = db.similarity_search(new_doc)

In [None]:
# By default the search returns the four most similar documents, ordered from
# most to least similar, so index 0 is the best match.
print(similar_docs[0].page_content)

## Loading another document into the database

In [None]:
# Load the second speech. `loader` and `documents` are rebound here, so the
# FDR objects from the earlier cell are no longer reachable by these names.
loader = TextLoader("Lincoln_State_of_Union_1862.txt")
documents = loader.load()

In [None]:
# Split the Lincoln speech with the same 500-token splitter used for the FDR
# speech; `docs` now holds the Lincoln chunks only.
# NOTE(review): this cell emits chunk-size warnings. Presumably they appear
# because the splitter only cuts at its separator, so some passages of this
# speech cannot be made into chunks of the requested 500 tokens — confirm
# against the actual warning text.
docs = text_splitter.split_documents(documents)

In [None]:
# Append the Lincoln chunks to the store that is already open, rather than
# building a second Chroma wrapper over the same persist directory with
# from_documents(). The chunks are embedded with the embedding function the
# store was created with.
db.add_documents(docs)

# Write the newly added chunks to disk, matching the persist() call made
# after the first speech was loaded.
db.persist()

In [None]:
# Query with a topic expected to match the Lincoln speech.
# Note: `docs` is rebound from the list of chunks to the list of search hits.
docs = db.similarity_search("slavery")

In [None]:
# Show the text of the best match for the query above.
print(docs[0].page_content)

In [None]:
# Query with a topic expected to match the FDR speech.
docs = db.similarity_search("cost of food law")

In [None]:
# Comparing this hit with the "slavery" hit above is a quick check that the
# database has both speeches in it.
print(docs[0].page_content)

In [None]:
# The metadata shows which source file the best-matching chunk came from.
print(docs[0].metadata)

## Using a vector store retriever

In [None]:
# Wrap the vector store in a retriever object. This retriever will be the
# basis of multi-query retrieval and contextual compression.
retriever = db.as_retriever()

In [None]:
# Run the same "cost of food law" query through the retriever interface.
results = retriever.get_relevant_documents("cost of food law")

In [None]:
# Show the text of the first document the retriever returned.
print(results[0].page_content)