In [18]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.evaluation import load_evaluator

import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv

In [19]:
# Load settings from a local .env file (if present) so the os.getenv lookup
# in the next cell can see them.
load_dotenv()

True

In [20]:
# Look up the OpenAI key in the process environment; the result is None when
# the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")

## Create LLM Client

In [21]:
# Chat model client used by the cells below.
# NOTE(review): no API key is passed explicitly, so the client presumably
# picks it up from the environment — confirm.
llm = ChatOpenAI(model="gpt-4o-mini")

In [22]:
# Smoke test: a single round trip to check that the client is configured.
llm.invoke("tell a joke")

AIMessage(content='Why don’t skeletons fight each other?\n\nThey don’t have the guts!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 10, 'total_tokens': 27, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_b376dfbbd5', 'id': 'chatcmpl-BJROUXnrKnSiWc7Y5W1OAWkv8A1XZ', 'finish_reason': 'stop', 'logprobs': None}, id='run-356e13fd-5a33-4f4d-9c19-99362e374dd0-0', usage_metadata={'input_tokens': 10, 'output_tokens': 17, 'total_tokens': 27, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Process PDF Document 

In [23]:
# Read the report PDF from the working directory. `pages` holds the documents
# returned by the loader (presumably one per PDF page — confirm).
loader = PyPDFLoader("./wwf_report.pdf")
pages = loader.load()

In [24]:
# How many documents the loader produced.
len(pages)

94

In [25]:
# Cut the loaded pages into overlapping chunks. Lengths are measured with
# `len`, so chunk_size and chunk_overlap are character counts.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=200,
    chunk_size=1500,
)
chunks = text_splitter.split_documents(pages)

## Create embeddings

In [26]:
# Embedding client reused by the evaluator and the vector store below.
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [27]:
def get_embedding(model, chunk):
    """Return the embedding ``model`` produces for ``chunk``.

    Thin wrapper: forwards ``chunk`` to ``model.embed_query`` and hands back
    whatever that call returns.

    Args:
        model: Any object exposing an ``embed_query(text)`` method.
        chunk: The text to embed.
    """
    vector = model.embed_query(chunk)
    return vector

In [29]:
# Embedding-distance evaluator backed by the same embedding model, used below
# to compare two strings.
evaluator = load_evaluator(evaluator="embedding_distance",
                           embeddings=embedding_model)

In [35]:
# Sanity check on two unrelated words.
# NOTE(review): the score is a distance under the evaluator's default metric
# (presumably cosine — confirm), so lower means "closer".
evaluator.evaluate_strings(prediction="nature", reference="lmao")

{'score': 0.21930473337633283}

## Create vector database

In [None]:
# One-time index build, kept for reference: embeds `chunks` with
# `embedding_model` and writes a Chroma store to ./vectorstore.
# Uncomment to (re)build the store.
# NOTE(review): newer Chroma releases reportedly persist automatically, which
# would make the explicit persist() call unnecessary — confirm for the
# installed version.
# vectorstore = Chroma.from_documents(documents=chunks,
#                                     embedding=embedding_model,
#                                     persist_directory="vectorstore")
# vectorstore.persist()

  vectorstore.persist()


In [39]:
# Open the persisted Chroma store if it is already on disk; otherwise build it
# from `chunks`. Without this guard, a fresh checkout (no ./vectorstore
# directory) opens a store with nothing indexed, and every retrieval below
# comes back empty without any error.
VECTORSTORE_DIR = "vectorstore"

if os.path.isdir(VECTORSTORE_DIR):
    vectorstore = Chroma(persist_directory=VECTORSTORE_DIR,
                         embedding_function=embedding_model)
else:
    # Embedding every chunk calls the embeddings API, so this branch is the
    # expensive one and only runs when no store exists yet.
    vectorstore = Chroma.from_documents(documents=chunks,
                                        embedding=embedding_model,
                                        persist_directory=VECTORSTORE_DIR)

In [59]:
# Retriever view over the vector store using plain similarity search.
# NOTE(review): the number of results per query is left at the library
# default — confirm it suits this use case.
retriever = vectorstore.as_retriever(search_type="similarity")

In [60]:
def query_embedding(embedding_retriever, query):
    """Run ``query`` through ``embedding_retriever`` and return its result.

    Thin wrapper: forwards ``query`` to ``embedding_retriever.invoke`` and
    returns whatever that call yields.

    Args:
        embedding_retriever: Any object exposing an ``invoke(query)`` method.
        query: The search text.
    """
    matches = embedding_retriever.invoke(query)
    return matches

In [66]:
# Prompt for retrieval-augmented question answering. `{context}` receives the
# retrieved text and `{question}` the user's question; both placeholders are
# filled in when the template is formatted.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, say that you don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [69]:
# Retrieve the chunks most similar to the question and join their text into a
# single context string, with a visible rule between chunks.
query = "Give a summary of the report"
relevant_chunks = query_embedding(retriever, query)

context_text = "\n\n---\n\n".join(doc.page_content for doc in relevant_chunks)

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# Keep the prompt as a chat prompt value instead of flattening it with
# .format(): the flattened string starts with a literal "Human: " role label,
# which would be sent to the model as part of the message text. A prompt value
# can be passed to llm.invoke() directly and keeps the role as metadata.
prompt = prompt_template.format_prompt(context=context_text,
                                       question=query)
# to_string() renders the flattened text, so the printed preview is unchanged.
print(prompt.to_string())

Human: 
You are an assistant for question-answering task.
Use the following pieces of retrieved context to answer the question.
If you don't know the answe, say that you don't know. DON'T MAKE UP ANYTHING.

WWF LIVING PLANET REPORT 2024
13 EXECUTIVE SUMMARY
Making it happen
With every issue of the WWF Living Planet Report, we see a further decline in the state of nature and a 
destabilization of the climate. This cannot continue.
It is no exaggeration to say that what happens in the next five years will determine the future of life on Earth. 
We have five years to place the world on a sustainable trajectory before negative feedbacks of combined 
nature degradation and climate change place us on the downhill slope of runaway tipping points. The risk  
of failure is real – and the consequences almost unthinkable.
As a global community, we have agreed on a way forward. The global goals show where we want to be and 
the path we need to take. All of us – governments, companies, organization

In [None]:
# Send the assembled prompt to the chat model and display the text of the reply.
response = llm.invoke(prompt)
response.content