# Vector Store and Embedding

In [1]:
from langchain.document_loaders import PyPDFLoader
# One loader per lecture PDF. Lecture01 appears twice; the closing note of this
# notebook attributes the duplicated search hits to that repetition, so the
# duplicate entry is kept as-is.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in (
        "pdf/MachineLearning-Lecture01.pdf",
        "pdf/MachineLearning-Lecture01.pdf",
        "pdf/MachineLearning-Lecture02.pdf",
        "pdf/MachineLearning-Lecture03.pdf",
    )
]

# Collect whatever each loader returns into one flat list, in loader order.
docs = []
for loader in loaders:
    docs += loader.load()

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=1500 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

In [3]:
# Split every loaded document with the splitter configured above and show how
# many chunks came out (209 in the recorded run).
splits = text_splitter.split_documents(docs)
len(splits)

209

## Embedding

In [24]:
import os
import getpass
# Take the Google API key from the environment; only when it is not already
# set, ask for it interactively and store it in os.environ for later use.
if "GOOGLE_API_KEY" not in os.environ:
    # `getpass` is imported as a module, so the prompt function is
    # `getpass.getpass`; calling the module object itself raises TypeError.
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API key here")

# Key used below to configure the google.generativeai client.
API_KEY = os.environ["GOOGLE_API_KEY"]


In [25]:
import google.generativeai as genai
# Hand the key held in API_KEY to the google.generativeai client.
genai.configure(api_key=API_KEY)

In [6]:
# Print the supported_generation_methods list of every model that
# genai.list_models() returns, one model per line.
for model in genai.list_models():
    print(model.supported_generation_methods)

['generateMessage', 'countMessageTokens']
['generateText', 'countTextTokens', 'createTunedTextModel']
['embedText', 'countTextTokens']
['generateContent', 'countTokens']
['generateContent', 'countTokens']
['embedContent', 'countTextTokens']
['generateAnswer']


In [7]:
# Print the name of each listed model that advertises the embedContent method.
for model in genai.list_models():
    if "embedContent" not in model.supported_generation_methods:
        continue
    print(model.name)



models/embedding-001


In [8]:
# LangChain integration docs for this embeddings class:
# https://python.langchain.com/docs/integrations/text_embedding/google_generative_ai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding client bound to "models/embedding-001", the model name printed by
# the embedContent listing above.
# NOTE(review): presumably picks up the key from GOOGLE_API_KEY in the
# environment, since no key is passed here — confirm.
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [9]:
# Embed three probe sentences: two with closely related meaning and one unrelated.
embedding1, embedding2, embedding3 = (
    gemini_embeddings.embed_query(sentence)
    for sentence in (
        "I like dogs",
        "I like canines",
        "the weather is ugly outside",
    )
)

In [10]:
# Show the number of components in one embedding (768 in the recorded run).
display(len(embedding1))

768

In [11]:
import numpy as np

In [12]:
# Dot product of the "dogs" and "canines" embeddings, used as a similarity score.
# NOTE(review): this equals cosine similarity only if the vectors are
# unit-length — not verified in this notebook.
np.dot(embedding1, embedding2)

0.9793075046740478

In [13]:
# Dot product of the "dogs" and "weather" embeddings, used as a similarity score.
np.dot(embedding1, embedding3)

0.7909725167442816

In [14]:
# Dot product of the "canines" and "weather" embeddings, used as a similarity score.
np.dot(embedding2, embedding3)

0.7868589379350919

## Vector Stores

In [15]:
from langchain.vectorstores import Chroma

In [16]:
# Directory handed to Chroma as persist_directory (relative to the working directory).
persist_dir = "docs/chroma/"

In [17]:
# Start from an empty store: remove any previous Chroma directory.
# Uses persist_dir instead of repeating the path, and shutil instead of a shell
# `rm` so the cell also works where no `rm` command exists. ignore_errors=True
# keeps the "no complaint if the directory is missing" behaviour of `rm -f`.
import shutil

shutil.rmtree(persist_dir, ignore_errors=True)

In [18]:
# Build the Chroma vector store from the chunks in `splits`, embedding them with
# gemini_embeddings; persist_directory is set to persist_dir.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=gemini_embeddings,
    persist_directory=persist_dir
)

In [19]:
# Print how many entries the store holds (209 in the recorded run, the same as
# len(splits)). NOTE(review): `_collection` is an underscore-prefixed attribute,
# i.e. not part of the public interface — may change between versions.
print(vectordb._collection.count())

209


In [20]:
# Retrieve the 5 chunks most similar to the question and show how many came back.
# NOTE(review): this rebinds `docs`, which earlier held the loaded PDF pages;
# re-running the split cell after this one would split the search hits instead.
question = "what did they say about matlab?"
docs = vectordb.similarity_search(question, k=5)
len(docs)


5

In [23]:
# Dump each hit: a header with its position, then its text and its metadata,
# followed by one blank line.
for idx, doc in enumerate(docs):
    print(f"doc {idx} th", doc.page_content, doc.metadata, "", sep="\n")

doc 0 th
in industry where you're trying to get a syst em to work using a learning algorithm.  
To those of you that are not currently doing re search, one great way to do a project would 
be if you apply learning algorithms to just pick a problem that you care about. Pick a 
problem that you find interesting, and apply lear ning algorithms to that  and play with the 
ideas and see what happens.
{'page': 6, 'source': 'pdf/MachineLearning-Lecture01.pdf'}

doc 1 th
in industry where you're trying to get a syst em to work using a learning algorithm.  
To those of you that are not currently doing re search, one great way to do a project would 
be if you apply learning algorithms to just pick a problem that you care about. Pick a 
problem that you find interesting, and apply lear ning algorithms to that  and play with the 
ideas and see what happens.
{'page': 6, 'source': 'pdf/MachineLearning-Lecture01.pdf'}

doc 2 th
those homeworks will be done in either MATLA B or in Octave, which is sor

Because `MachineLearning-Lecture01.pdf` was loaded twice, the search results contain duplicates — for example doc 0 and doc 1 shown above are identical, as are Doc[2] and Doc[3].