In [46]:
# !pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
# !pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv


In [47]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [48]:
# Load environment variables
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# 2. Define the LLM (Language Model)

In [49]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize the LLM using Google Generative AI
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=GOOGLE_API_KEY  # Replace with your Google API Key
)

# Test the LLM
response = llm.invoke("Chuyện cười chó và mèo")
response

AIMessage(content='Tuyệt vời! Bạn muốn nghe chuyện cười về chó và mèo đúng không? Để mình kể cho bạn vài câu nhé:\n\n1.  **Vì sao chó lại là bạn thân nhất của con người?**\n    *   Vì nó không bao giờ cho bạn mượn tiền! (Mèo thì có khi lại đòi bạn "cúng" đồ ăn cho nó ấy chứ!)\n\n2.  **Con mèo nào sợ chuột Mickey nhất?**\n    *   Tom! (Tom & Jerry đó!)\n\n3.  **Chó và mèo khác nhau ở điểm nào khi đi bơi?**\n    *   Chó thì bơi "gâu gâu", còn mèo thì... "meo meo" xin lên bờ!\n\n4.  **Một con chó đi vào thư viện. Nó nói gì?**\n    *   "Gâu gâu! Sách ở đâu?"\n\n5.  **Hai con mèo ngồi trên mái nhà. Một con bị ngã. Con còn lại nói gì?**\n    *   "Meo!" (Ý là "Một mình tao!")\n\nBạn thấy mấy câu này thế nào? Nếu bạn muốn nghe nữa, cứ bảo mình nhé! Mình còn nhiều chuyện về chó và mèo lắm!', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-a9ff2cc0-536b-4675-809c-7bae25d9677e-0', usage_

# 3. Process PDF Document

## 3.1 Load PDF Document

In [50]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF document
loader = PyPDFLoader("data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf")
pages = loader.load()
pages

Ignoring wrong pointing object 18 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'creator': 'Preview', 'creationdate': "D:20240909152042Z00'00'", 'author': 'Thu Vu', 'moddate': "D:20240910141854Z00'00'", 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='APPLIED COGNITIVE PSYCHOLOGY\nAppl. Cognit. Psychol. 20: 139–156 (2006)\nPublished online 31 October 2005 in Wiley InterScience\n(www.interscience.wiley.com) DOI: 10.1002/acp.1178\nConsequences of Erudite Vernacular Utilized Irrespective\nof Necessity: Problems with Using Long Words Needlessly\nDANIEL M. OPPENHEIMER*\nPrinceton University, USA\nSUMMARY\nMost texts on writing style encourage authors to avoid overly-complex words. However, a majority\nof undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give\nthe impression of intelligen

## 3.2 Split Document into Chunks

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the document into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "]
)
chunks = text_splitter.split_documents(pages)

In [61]:
len(chunks)

6

# 4. Create Embeddings

In [53]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

# Define the embedding function
def get_embedding_function():
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004",  # Replace with a valid Google embedding model
        api_key=GOOGLE_API_KEY  # Replace with your Google API Key
    )
    return embeddings

# Initialize the embedding function
embedding_function = get_embedding_function()

# Test the embedding function
test_vector = embedding_function.embed_query("dog")
print(test_vector)

[-0.009113340638577938, 0.025642933323979378, 0.019517434760928154, 0.0006511401734314859, 0.027571074664592743, 0.10505122691392899, 0.03237470984458923, 0.013585681095719337, 0.04713313654065132, 0.08451203256845474, -0.03134593740105629, 0.056636206805706024, 0.09321527928113937, 0.03322876989841461, 0.028888128697872162, -0.023924704641103745, -0.046473126858472824, 0.007877393625676632, -0.0012024304596707225, 0.04353101924061775, 0.0031726756133139133, -0.02610534243285656, 0.04230271279811859, 0.010575942695140839, -0.04981819540262222, -0.014631428755819798, 0.029210373759269714, -0.004830615129321814, 0.03686540573835373, -0.05973043665289879, -0.0018622172065079212, 0.06998338550329208, 0.008795134723186493, -0.02297103963792324, 0.031142476946115494, 0.02154422365128994, 0.018226802349090576, 0.04965130239725113, 0.007079762872308493, -0.06693778187036514, -0.008708070032298565, -0.018570711836218834, -0.059379804879426956, 0.02941335178911686, 0.051292601972818375, 0.012958

# 5. Create Vector Database

## 5.1 Define Vector Database Creation Function

In [54]:
import uuid
from langchain.vectorstores import Chroma

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    # Create unique IDs for each document chunk
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure unique documents
    unique_ids = set()
    unique_chunks = []
    for chunk, id in zip(chunks, ids):
        if id not in unique_ids:
            unique_ids.add(id)
            unique_chunks.append(chunk)
    
    # Create and persist the vector database
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        ids=list(unique_ids),
        embedding=embedding_function,
        persist_directory=vectorstore_path
    )
    vectorstore.persist()
    return vectorstore

In [58]:

print(f"Number of vectors in the vector store: {vector_count}")

Number of vectors in the vector store: 9


## 5.2 Create and Persist Vector Database

In [None]:
# Create the vector database
vectorstore = create_vectorstore(
    chunks=chunks,
    embedding_function=embedding_function,
    vectorstore_path="vectorstore_chroma"
)

In [None]:
vector_count = vectorstore.count()
print(f"Number of vectors in the vector store: {vector_count}")

AttributeError: 'Chroma' object has no attribute 'count'

# 6. Query for Relevant Data

## 6.1 Load Vector Database

In [None]:
# Load the vector database
vectorstore = Chroma(
    persist_directory="vectorstore_chroma",
    embedding_function=embedding_function
)

## 6.2 Retrieve Relevant Chunks

In [None]:
# Create a retriever to get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the author of the paper?")
relevant_chunks

[Document(metadata={'author': 'Thu Vu', 'creationdate': "D:20240909152042Z00'00'", 'creator': 'Preview', 'moddate': "D:20240910141854Z00'00'", 'page': 0, 'page_label': '1', 'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'total_pages': 3}, page_content='APPLIED COGNITIVE PSYCHOLOGY\nAppl. Cognit. Psychol. 20: 139–156 (2006)\nPublished online 31 October 2005 in Wiley InterScience\n(www.interscience.wiley.com) DOI: 10.1002/acp.1178\nConsequences of Erudite Vernacular Utilized Irrespective\nof Necessity: Problems with Using Long Words Needlessly\nDANIEL M. OPPENHEIMER*\nPrinceton University, USA\nSUMMARY\nMost texts on writing style encourage authors to avoid overly-complex words. However, a majority\nof undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give\nthe impression of intelligen

# 7. Generate Responses

## 7.1 Define Prompt Template

In [None]:
from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. DON'T MAKE UP ANYTHING.
{context}
--
Answer the question based on the above context: {question}
"""

## 7.2 Generate Response

In [None]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create and format the prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question="What is the title of the paper?")

# Invoke the LLM to generate a response
response = llm.invoke(prompt)
print(response)

content='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run-15d2c7d8-606b-4ee3-b91b-a25283c8395e-0' usage_metadata={'input_tokens': 1346, 'output_tokens': 22, 'total_tokens': 1368, 'input_token_details': {'cache_read': 0}}


# 8. Using Langchain Expression Language

In [None]:
from langchain_core.runnables import RunnablePassthrough

# Define a function to format documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Create a retrieval-augmented generation (RAG) chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)

# Invoke the RAG chain to get the title of the paper
response = rag_chain.invoke("What's the title of this paper?")
print(response)

content='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run-b2b0d23e-9f54-4351-9942-e7388118da5b-0' usage_metadata={'input_tokens': 1338, 'output_tokens': 23, 'total_tokens': 1361, 'input_token_details': {'cache_read': 0}}


# 9. Generate Structured Responses
## 9.1 Define Data Models

In [None]:
from pydantic import BaseModel, Field

# Define a model for the answer with sources and reasoning
class AnswerWithSources(BaseModel):
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

# Define a model for extracted information about the research article
class ExtractedInfo(BaseModel):
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

## 9.2 Generate Structured Response

In [None]:
# Create a RAG chain with structured output
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm.with_structured_output(ExtractedInfo)
)

# Invoke the RAG chain to get structured information
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

In [None]:
structured_response 

ExtractedInfo(paper_title=AnswerWithSources(answer='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', sources='Consequences of Erudite Vernacular Utilized Irrespective\nof Necessity: Problems with Using Long Words Needlessly', reasoning='The title of the paper is clearly stated at the beginning of the text.'), paper_summary=AnswerWithSources(answer='This paper explores the extent to which the strategy of using complex vocabulary to appear more intelligent is effective. Experiments 1–3 found a negative relationship between complexity and judged intelligence, regardless of essay quality or prior expectations. The negative impact of complexity was mediated by processing ﬂuency. Experiment 4 found that texts in hard to read fonts are judged to come from less intelligent authors. Experiment 5 investigated discounting of ﬂuency and found that people reduce their reliance on ﬂuency as a cue when obvious causes for low ﬂuency exi

## 9.3 Transform Response into a DataFrame

In [None]:
import pandas as pd

# Convert the structured response to a DataFrame
df = pd.DataFrame([structured_response.dict()])

# Transform into a table with two rows: 'answer' and 'source'
answer_row = []
source_row = []
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

# Create a new DataFrame with two rows: 'answer' and 'source'
structured_response_df = pd.DataFrame(
    [answer_row, source_row, reasoning_row],
    columns=df.columns,
    index=['answer', 'source', 'reasoning']
)

print(structured_response_df)

                                                 paper_title  \
answer     Consequences of Erudite Vernacular Utilized Ir...   
source     Consequences of Erudite Vernacular Utilized Ir...   
reasoning  The title of the paper is clearly stated at th...   

                                               paper_summary  \
answer     This paper explores the extent to which the st...   
source     Most texts on writing style encourage authors ...   
reasoning  The summary at the beginning of the paper desc...   

                                            publication_year  \
answer                                                  2006   
source     Appl. Cognit. Psychol. 20: 139–156 (2006)\nPub...   
reasoning  The publication year is included in the header...   

                                               paper_authors  
answer                                 DANIEL M. OPPENHEIMER  
source     DANIEL M. OPPENHEIMER*\nPrinceton University, USA  
reasoning  The author of the paper is li

C:\Users\Admin\AppData\Local\Temp\ipykernel_10724\919763079.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  df = pd.DataFrame([structured_response.dict()])


In [None]:
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,This paper explores the extent to which the st...,2006,DANIEL M. OPPENHEIMER
source,Consequences of Erudite Vernacular Utilized Ir...,Most texts on writing style encourage authors ...,Appl. Cognit. Psychol. 20: 139–156 (2006)\nPub...,"DANIEL M. OPPENHEIMER*\nPrinceton University, USA"
reasoning,The title of the paper is clearly stated at th...,The summary at the beginning of the paper desc...,The publication year is included in the header...,The author of the paper is listed under the ti...
