In [None]:
import hashlib
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

### Load files

In [11]:
# NOTE: despite its name, `current_dir` is the PARENT of the working directory
# (dirname of the absolute path of "."); the PDFs are read from <parent>/rag/files.
working_dir = os.path.abspath(".")
current_dir = os.path.dirname(working_dir)
print("current directory", current_dir)

# Only files matching *.pdf in that folder are handed to PyPDFLoader.
pdf_dir = f"{current_dir}/rag/files"
loader = DirectoryLoader(pdf_dir, glob="*.pdf", loader_cls=PyPDFLoader)

current directory /Users/roshi/Documents/AI/courses/platzy_agents_ai


In [14]:
# Parse every PDF matched by the loader into Document objects.
pages = loader.load()

# Show only a one-item preview: echoing the whole list floods the notebook with
# the full text and metadata (including local file paths) of every document.
pages[:1]

[Document(metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20240912034157', 'source': '/Users/roshi/Documents/AI/courses/platzy_agents_ai/rag/files/informe_analisis_ventas_tienda_abc.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Informe de Análisis de Ventas - Tienda ABC\nEste informe presenta un análisis detallado de las ventas realizadas en la Tienda ABC durante el\núltimo trimestre. El objetivo de este informe es identificar tendencias, analizar el rendimiento de los\nproductos, y proponer recomendaciones para mejorar las ventas en el próximo período.\n1. Rendimiento General de Ventas\nEn el último trimestre, las ventas totales alcanzaron un valor de $150,000, con un incremento del\n15% respecto al trimestre anterior. El número total de transacciones fue de 2,500, lo que representa\nun promedio de 833 transacciones por mes.\n2. Análisis por Categoría de Producto\nLas siguientes categorías de productos s

#### Split text

In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter; kept as named constants so they
# are easy to find and tune.
CHUNK_SIZE = 150
CHUNK_OVERLAP = 30

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# `splits` is the chunked corpus that is indexed further down.
splits = text_splitter.split_documents(pages)

In [23]:
# Sanity check: number of chunks produced, followed by the first chunk.
print(f"Corpus size: {len(splits)}")
first_chunk = splits[0]
print(first_chunk)

Corpus size: 69
page_content='Informe de Análisis de Ventas - Tienda ABC
Este informe presenta un análisis detallado de las ventas realizadas en la Tienda ABC durante el' metadata={'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creator': 'PyPDF', 'creationdate': 'D:20240912034157', 'source': '/Users/roshi/Documents/AI/courses/platzy_agents_ai/rag/files/informe_analisis_ventas_tienda_abc.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}


#### Embeddings and vector DB

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

In [29]:
# Embedding model used both to index the chunks and to embed queries.
embedding_function = OpenAIEmbeddings()

# Collection stored under ./chroma_db (persist_directory), so its contents are
# expected to outlive the kernel.
vector_store = Chroma(
    collection_name="documents",
    embedding_function=embedding_function,
    persist_directory="./chroma_db",
)

# Deterministic ids make this cell idempotent: without explicit ids every re-run
# would append the same chunks to the persisted collection again. The position
# is part of the hashed key so that two chunks with identical text still get
# distinct ids within one batch.
# NOTE(review): entries written by earlier runs under auto-generated ids are not
# removed by this; clear ./chroma_db once if the collection already has duplicates.
split_ids = [
    hashlib.sha256(
        "|".join(
            (
                str(doc.metadata.get("source")),
                str(doc.metadata.get("page")),
                str(position),
                doc.page_content,
            )
        ).encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(splits)
]
vector_store.add_documents(splits, ids=split_ids)

# Retriever with the store's default search settings.
retriever = vector_store.as_retriever()

#### Create chat model

In [None]:
# System message for the answering step. It must keep the `{context}`
# placeholder: the documents chain rejects a prompt that has no `context`
# input variable.
system_prompt = (
    "\n"
    "You are a helpful assistant that can answer questions about the documents, also include emojis in your response.\n"
    "you have the next {context}\n"
)

In [48]:
from langchain_openai import ChatOpenAI
# Chat model shared by the question-rewriting step and the answering chain.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=150,
)

In [53]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableBranch
from operator import itemgetter

# System prompt for the query-rewriting step: the model is asked to turn the
# latest user question into a standalone question, not to answer it.
# (The stray, unbalanced double quote that preceded "just reformulate" was removed.)
contextualize_q_system_promp = """
    Given a chat history and the latest user question
    which might reference context in the chat history,
    formulate a standalone question which can be understood
    without the chat history. Do NOT answer the question,
    just reformulate it if needed and otherwise return it as is.
    """

def make_history_aware_retriever(llm, retriever):
    """Build a retriever that condenses follow-up questions before searching.

    Args:
        llm: Chat model used to rewrite the latest question into a standalone one.
        retriever: Runnable that receives the final query string.

    Returns:
        A ``RunnableBranch`` that expects a mapping with an ``"input"`` entry and
        an optional ``"chat_history"`` entry. With a non-empty history the
        question is rewritten by the LLM and the rewritten text is sent to
        ``retriever``; otherwise the raw ``"input"`` value is sent unchanged.
    """

    def _has_history(payload):
        # A missing, None or empty chat history all count as "no history".
        return bool(payload.get("chat_history"))

    # Prompt asking the model to restate the question without relying on history.
    rewrite_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_promp),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )

    # The parser turns the model reply into a plain string for the retriever.
    standalone_question = rewrite_prompt | llm | StrOutputParser()

    rewrite_then_retrieve = standalone_question | retriever
    retrieve_directly = itemgetter("input") | retriever

    # The last positional argument is the default branch.
    return RunnableBranch(
        (_has_history, rewrite_then_retrieve),
        retrieve_directly,
    )

In [54]:
# Wrap the plain retriever so follow-up questions are rewritten before retrieval.
history_aware_retriever = make_history_aware_retriever(
    llm=llm,
    retriever=retriever,
)

In [55]:
# Answering prompt: system instructions, then the prior turns, then the new question.
# `system_prompt` has to contain the `{context}` placeholder when this cell runs;
# re-run the cell that defines it first if it was edited.
qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

In [56]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.retrieval import create_retrieval_chain

# Chain that answers from the retrieved documents using `qa_prompt`.
# It raises ValueError if the prompt does not accept a `context` variable.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=qa_chain,
)


ValueError: Prompt must accept context as an input variable. Received prompt with input variables: ['chat_history', 'input']

In [11]:
# Query the store directly for the 2 nearest entries.
vector_store.similarity_search("Hello, world!", k=2)

[Document(id='da844bc7-5bb5-4f76-a739-2b2b3a145388', metadata={'source': 'example.txt'}, page_content='Hello, world!'),
 Document(id='44c9b550-62b4-4f16-a44c-20419c37c015', metadata={'source': 'example.txt'}, page_content='This is a test!')]

In [12]:
# Same lookup, returning (document, score) pairs.
vector_store.similarity_search_with_score("Hello, world!", k=2)

[(Document(id='da844bc7-5bb5-4f76-a739-2b2b3a145388', metadata={'source': 'example.txt'}, page_content='Hello, world!'),
  0.0),
 (Document(id='44c9b550-62b4-4f16-a44c-20419c37c015', metadata={'source': 'example.txt'}, page_content='This is a test!'),
  0.2749747335910797)]