In [1]:
import os
import sys

In [18]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

### Helper Functions
---

In [21]:
def replace_t_with_space(documents):
    """
    Swap every tab character in each document's text for a single space.

    The documents are edited in place, and the very same list object that was
    passed in is handed back to the caller.

    Args:
        documents: An iterable of document objects, each exposing a string
            'page_content' attribute.

    Returns:
        The input 'documents' object, whose items now have tab-free page content.
    """
    tab, space = "\t", " "
    for document in documents:
        original_text = document.page_content
        document.page_content = original_text.replace(tab, space)
    return documents

In [22]:
def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Retrieve the text of the chunks a retriever returns for a question.

    Args:
        question: The question to look up.
        chunks_query_retriever: An object exposing ``invoke(question)`` that
            returns an iterable of documents, each with a ``page_content``
            attribute.

    Returns:
        A list containing the ``page_content`` of every retrieved document, in
        the order the retriever returned them. The chunks are kept as separate
        list items (not joined into one string) so callers can display or
        process them individually.
    """
    # Retrieve relevant documents for the given question
    docs = chunks_query_retriever.invoke(question)

    # Keep one entry per retrieved chunk
    return [doc.page_content for doc in docs]

In [23]:
def show_context(context):
    """
    Print every item of a context list under a numbered heading.

    Args:
        context (list): The context items to print.

    Each item is preceded by a "Context N:" line (N counts from 1) and is
    followed by blank lines that separate it from the next item.
    """
    for position, item in enumerate(context, start=1):
        heading = f"Context {position}:"
        print(heading)
        print(item)
        print("\n")

### Ingest the Document
---

In [24]:
# Relative path to the PDF that is loaded, chunked and indexed further down.
path = "./data/Understanding_Climate_Change.pdf"

### Encode Document
---

In [25]:
def encode_pdf(path, hf_model, chunk_size=1000, chunk_overlap=200):
    """
    Build a FAISS vector store from the contents of a PDF file.

    Args:
        path: Location of the PDF handed to PyPDFLoader.
        hf_model: Model name passed to HuggingFaceEmbeddings as ``model_name``.
        chunk_size: Chunk size given to the text splitter; lengths are
            measured with ``len``.
        chunk_overlap: Overlap between chunks given to the text splitter.

    Returns:
        The FAISS vector store created from the tab-cleaned chunks.
    """
    # Read the PDF into document objects
    pdf_documents = PyPDFLoader(path).load()

    # Cut the documents into overlapping chunks, then strip tab characters
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = replace_t_with_space(splitter.split_documents(pdf_documents))

    # Embed with the requested HuggingFace model and index the chunks.
    # from_documents (rather than from_texts) is used because the chunks are
    # document objects, not plain strings.
    embedder = HuggingFaceEmbeddings(model_name=hf_model)
    return FAISS.from_documents(chunks, embedder)

In [26]:
# Build the vector store for the PDF with the chosen HuggingFace embedding model.
docpath = path
hf_model = "sentence-transformers/all-MiniLM-L6-v2"
chunks_vector_store = encode_pdf(
    path,
    hf_model,
    chunk_size=1000,
    chunk_overlap=200,
)

### Create Retriever
---

In [27]:
# Expose the vector store through the retriever interface; "k": 2 requests
# two chunks per query.
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 2})

In [28]:
# Try the retriever on a sample question and print the chunks it returns.
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question(test_query, chunks_query_retriever)
show_context(context)

Context 1:
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which 
began at the end of the last ice age, human societies f lourished, but the industrial era has seen 
unprecedented changes.  
Modern Observations  
Modern scientific observations indicate a rapid increase in global temperatures, sea levels, 
and extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has 
documented these changes extensively. Ice core samples, tree rings, and ocean sediments 
provide a historical record that scientists use to understand past climate conditions and 
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhou se gases.  
Chapter 2: Causes of Climate Change  
Greenhouse Gases


Context 2:
Changing Seasons  
Climate change is altering the timing and l