# Setup

In [None]:
pip install langchain langchain-chroma langchain-openai

# Documents

In [3]:
from langchain_core.documents import Document

# Sample corpus: each Document pairs a short text (page_content) with metadata
# naming the source it came from. The (text, source) pairs are expanded into
# Document objects, each with its own metadata dict.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in (
        (
            "Dogs are great companions, known for their loyalty and friendliness.",
            "mammal-pets-doc",
        ),
        (
            "Cats are independent pets that often enjoy their own space.",
            "mammal-pets-doc",
        ),
        (
            "Goldfish are popular pets for beginners, requiring relatively simple care.",
            "fish-pets-doc",
        ),
        (
            "Parrots are intelligent birds capable of mimicking human speech.",
            "bird-pets-doc",
        ),
        (
            "Rabbits are social animals that need plenty of space to hop around.",
            "mammal-pets-doc",
        ),
    )
]

# refer
# https://api.python.langchain.com/en/latest/documents/langchain_core.documents.base.Document.html

# Vector stores

- **벡터 검색**(vector search)은 구조화돼 있지 않은 데이터를 저장하고 검색하는 일반적인 방법이다.
- 인 메모리 저장소를 구현하는 **크로마**(Chroma)를 사용해 랭체인의 **벡터 스토어**(vector store) 기능을 시연할 것이다.
- 벡터 스토어를 초기화하려면, 텍스트 데이터를 숫자 벡터로 변환하는 방법을 지정하는 임베딩 모델을 제공해야 한다.
- 여기서는 OpenAI embeddings를 사용할 것이다.

In [7]:
import getpass
import os

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# OpenAIEmbeddings needs an OpenAI API key, which it reads from the
# OPENAI_API_KEY environment variable. Ask for it here, the first place it is
# required, so the notebook runs top to bottom on a fresh kernel. Nothing is
# prompted when the key is already configured.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Embed every document and store the vectors in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(),
)

# refer
# https://api.python.langchain.com/en/latest/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html#langchain_openai.embeddings.base.OpenAIEmbeddings
# https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.VectorStore.html

- **from_documents** 메소드를 호출하면 벡터 스토어에 **documents**를 저장한다.
- **VectorStore** 인터페이스는 오브젝트가 초기화된 후에 documents를 추가하는 메소드를 구현한다.

## Examples

In [8]:
# Look up the stored documents most similar to a text query; the result list
# is ordered from best match to worst.
vectorstore.similarity_search(query="cat")

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

In [9]:
await vectorstore.asimilarity_search("cat")

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

In [11]:
# Same query, but each hit comes back as a (Document, score) tuple.
# NOTE(review): judging by the output, the score behaves like a distance
# (the best match has the lowest value) - confirm against the Chroma docs.
vectorstore.similarity_search_with_score(query="cat")

[(Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
  0.375326931476593),
 (Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
  0.4833090305328369),
 (Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
  0.4958883225917816),
 (Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'}),
  0.4974174499511719)]

In [13]:
# Embed the query text ourselves, then search with the raw vector instead of
# letting the vector store embed the string.
embedding = OpenAIEmbeddings().embed_query(text="cat")

vectorstore.similarity_search_by_vector(embedding=embedding)

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Parrots are intelligent birds capable of mimicking human speech.', metadata={'source': 'bird-pets-doc'})]

# Retrievers

- 랭체인의 VectorStore 오브젝트는 Runnable의 서브클래스가 아니므로, LCEL 체인과 바로 연계될 수 없다.

In [14]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

# Wrap the vector store's search method in a RunnableLambda so it can be used
# like any other Runnable, and bind k=1 so only the top hit is returned.
# Reference:
# https://api.python.langchain.com/en/latest/runnables/langchain_core.runnables.base.RunnableLambda.html#
retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)

# batch() runs the retriever once per query and returns one result list each.
retriever.batch(inputs=["cat", "shark"])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

- 크로마 같은 벡터 스토어는 **검색자**(retriever)를 생성하는 **as_retriever** 메소드를 구현한다.
- 이 검색자는 **VectorStoreRetriever** 오브젝트다.
- 이러한 검색자들은 특정 **search_type**과 **search_kwargs** 속성들을 가진다.
- 따라서 **as_retriever** 메소드를 호출할 때, **search_type**과 **search_kwargs** 속성값을 설정한다.

**refer**
- https://api.python.langchain.com/en/latest/vectorstores/langchain_chroma.vectorstores.Chroma.html#langchain_chroma.vectorstores.Chroma.as_retriever

In [15]:
# Build the retriever through the vector store's own factory method:
# plain similarity search, limited to the single best match (k=1).
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# One result list is returned per query.
retriever.batch(inputs=["cat", "shark"])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Goldfish are popular pets for beginners, requiring relatively simple care.', metadata={'source': 'fish-pets-doc'})]]

- **VectorStoreRetriever** 오브젝트는 "similarity", "mmr", "similarity_score_threshold" 같은 검색 타입(search type)을 지원한다.
- 검색자는 **검색 증강 생성**(retrieval-augmented generation, RAG)과 같은 더 복잡한 애플리케이션과 쉽게 연계될 수 있다.
- RAG 애플리케이션은 주어진 질문과 검색된 컨텍스트를 결합해 LLM에 전달할 프롬프트를 만든다.

In [16]:
pip install -qU langchain-openai

In [17]:
import getpass
import os

from langchain_openai import ChatOpenAI

# Prompt for the API key only when it is not configured yet. Re-running this
# cell (or running the notebook where the key is already exported) then
# neither blocks on input nor overwrites the existing key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Chat model used to generate answers in the RAG chain.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

··········


In [18]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Prompt template with two slots: the user's question and the retrieved context.
message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

# A single human message built from the template above.
prompt = ChatPromptTemplate.from_messages([("human", message)])

# RAG pipeline: the input string is sent to the retriever to fill `context`
# and passed through unchanged as `question`; the filled prompt then goes to
# the chat model.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

In [19]:
# Run the whole chain for one question: retrieve context, fill the prompt and
# ask the chat model.
response = rag_chain.invoke(input="tell me about cats")

# Show only the text of the model's reply.
print(response.content)

Cats are independent pets that often enjoy their own space.
