In [17]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
from pdfminer.high_level import extract_text

import fitz

## Load PDF File

In [3]:
# Relative path of the PDF that the loader cell below reads.
local_path = "wpp2022_summary_of_results.pdf"

In [19]:
# Read the PDF with PyMuPDFLoader. `data` holds the loaded documents that the
# later cells index (data[3]) and split into chunks.
loader = PyMuPDFLoader(file_path=local_path)
data = loader.load()

In [21]:
# Preview the text of the element at index 3 of `data`
# (zero-based, so this is the fourth loaded document, not the first).
data[3].page_content

'United Nations Department of Economic and Social Affairs, Population Division\nThe Department of Economic and Social Affairs of the United Nations Secretariat is a vital interface between \nglobal policies in the economic, social and environmental spheres and national action.  The Department \nworks in three main interlinked areas:  (i) it compiles, generates and analyses a wide range of economic, \nsocial and environmental data and information on which States Members of the United Nations draw to \nreview common problems and take stock of policy options; (ii) it facilitates the negotiations of Member \nStates in many intergovernmental bodies on joint courses of action to address ongoing or emerging \nglobal challenges; and (iii) it advises interested Governments on the ways and means of translating policy \nframeworks developed in United Nations conferences and summits into programmes at the country level \nand, through technical assistance, helps build national capacities.\nThe Popu

## Embeddings

In [32]:
!ollama pull mistral

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ 

In [33]:
# List the models known to the local Ollama install (sanity check after pulling).
!ollama list

NAME                   	ID          	SIZE  	MODIFIED       
mistral:latest         	f974a74358d6	4.1 GB	21 seconds ago	
nomic-embed-text:latest	0a109f422b47	274 MB	15 hours ago  	
llama2:latest          	78e26419b446	3.8 GB	7 days ago    	


In [23]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [24]:
# Split and chunk the loaded documents before embedding.
# Both sizes are hoisted into named constants so they can be tuned in one place.
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) — confirm against the LangChain docs.
CHUNK_SIZE = 7500     # upper bound on the size of a single chunk
CHUNK_OVERLAP = 100   # amount of text shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)

In [25]:
# Embed every chunk with the local Ollama embedding model and store the
# resulting vectors in a Chroma collection named "local-rag".
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local-rag",
)

OllamaEmbeddings: 100%|██████████| 50/50 [06:05<00:00,  7.31s/it]


## Retrieval

In [26]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [27]:
# LLM from Ollama: the chat model handed to MultiQueryRetriever.from_llm
# and used as the answering step of the RAG chain further down.
local_model = "mistral"
llm = ChatOllama(model=local_model)

In [28]:
# Prompt passed to MultiQueryRetriever: it asks the LLM to rewrite the user's
# question in five different ways, separated by newlines, so that retrieval is
# not tied to a single wording. `{question}` is the only placeholder and
# matches the single entry in `input_variables`.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [29]:
# Wrap the Chroma store's retriever in a MultiQueryRetriever. It is given `llm`
# and QUERY_PROMPT, which together ask for alternative phrasings of the question.
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(),
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt: instructs the model to answer strictly from the retrieved context.
# `{context}` and `{question}` are placeholders filled in when the chain runs.
# The literal must open with exactly three quotes; a fourth one becomes a stray
# `"` at the very start of the prompt text sent to the model.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

# Turn the template string into a chat prompt object for use in the chain.
prompt = ChatPromptTemplate.from_template(template)

In [30]:
# Assemble the RAG pipeline with pipe composition, read top to bottom:
#   1. build the prompt inputs: "context" comes from `retriever`, "question"
#      from RunnablePassthrough() (presumably the caller's input, unchanged)
#   2. fill the prompt template with those two values
#   3. send the filled prompt to the chat model
#   4. StrOutputParser as the last stage; the recorded results of
#      chain.invoke(...) in the cells below are plain strings
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [34]:
# Ask a first, open-ended question about the PDF.
# The question is hard-coded instead of being read with input("") so the cell
# works under "Restart & Run All" / headless execution (where stdin is not
# available) and so the question that produced the answer stays visible.
# NOTE(review): the question originally typed at the blank prompt was not
# recorded; this is a stand-in — replace it if the original wording is known.
question = "What is this document about?"
chain.invoke(question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.25s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


' The document is titled "World Population Prospects 2022: Summary of Results" and appears to be a summary or report from the United Nations Department of Economic and Social Affairs, Population Division. The text discusses the medium scenario presented in the World Population Prospects, which is considered the most likely future trend among various projections. This scenario is based on distinct trajectories of fertility and mortality for individual countries and areas, and it assumes a continuing decline in the level of fertility for countries where women are having more than two births over a lifetime, and a slight increase in the level of fertility for countries where women are having fewer than two births. The text also mentions that long-term mortality trends are assumed to continue improving, with recovery from the COVID-19 pandemic expected between 2022 and 2025. The document does not appear to have a clear subject or title beyond the page title "World Population Prospects 2022

In [35]:
# Ask a specific factual question; the answer is expected to come from the PDF.
# NOTE(review): the question text contains typos ("an" for "and", awkward word
# order). It is left untouched here because the string is the runtime input
# sent to the retriever and the LLM.
chain.invoke("Where does the projected increase in the global population between 2022 an 2050 is expected to be concentrated at?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]


' The projected increase in the global population between 2022 and 2050 is expected to be concentrated in eight countries: the Democratic Republic of the Congo, Egypt, Ethiopia, India, Nigeria, Pakistan, the Philippines, and the United Republic of Tanzania.'

In [None]:
# Clean up: delete the collection backing `vector_db` (created above with
# collection_name="local-rag"). The call acts on this store's collection only.
# NOTE(review): after this, `vector_db` presumably cannot be queried any more —
# re-run the "Add to vector database" cell before asking further questions.
vector_db.delete_collection()