# Import dependencies

In [1]:
import chromadb
import os

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate

from langchain_community.retrievers import BM25Retriever
from langchain.docstore.document import Document
from langchain.retrievers import EnsembleRetriever
from langchain_community.vectorstores import Chroma
from ragas.testset.synthesizers import default_query_distribution
from datasets import Dataset

from PyPDF2 import PdfReader
import pandas as pd

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity,
)

  from .autonotebook import tqdm as notebook_tqdm


# Global variables

In [2]:
base_url = "http://61.28.230.60:11434"

llama3_model_name = "llama3.1:8b"
gemma2_model_name = "gemma2:9b"

pdf_path = "./data/google-2023-environmental-report.pdf"
user_query = "What are Google's environmental initiatives?"

collection_name = "google_environmental_report"

In [3]:
str_output_parser = StrOutputParser()

In [4]:
embedding_function = OllamaEmbeddings(model=llama3_model_name, base_url=base_url)

llm = ChatOllama(
    model=llama3_model_name, base_url=base_url, temperature=0.1, num_predict=1024
)

generator_llm = ChatOllama(
    model=llama3_model_name, base_url=base_url, temperature=0.1, num_predict=1024
)

critic_llm = ChatOllama(
    model=gemma2_model_name, base_url=base_url, temperature=0.1, num_predict=1024
)

critic_llm_embedding = OllamaEmbeddings(model=gemma2_model_name, base_url=base_url)

# Indexing

Load PDF file and extract text

In [5]:
pdf_reader = PdfReader(pdf_path)
text = ""
for page in pdf_reader.pages:
    text += page.extract_text()

In [6]:
print(text[:300])

Environmental 
Report
2023What’s 
inside
About this report
Google’s 2023 Environmental Report provides an overview of our environmental 
sustainability strategy and targets and our annual progress towards them. 1  
This report features data, performance highlights, and progress against our targets f


# Split

In [7]:
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1000, chunk_overlap=200
)

splits = character_splitter.split_text(text)

In [8]:
dense_documents = [
    Document(page_content=text, metadata={"id": str(i), "source": "dense"})
    for i, text in enumerate(splits)
]
sparse_documents = [
    Document(page_content=text, metadata={"id": str(i), "source": "sparse"})
    for i, text in enumerate(splits)
]

In [9]:
chroma_client = chromadb.Client()
vectorstore = Chroma.from_documents(
    documents=dense_documents,
    embedding=embedding_function,
    collection_name=collection_name,
    client=chroma_client,
)

In [10]:
dense_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
sparse_retriever = BM25Retriever.from_documents(sparse_documents, k=10)
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever], weights=[0.5, 0.5], c=0
)

# RETRIEVAL and GENERATION

In [11]:
# Prompt - ignore LangSmith warning, you will not need langsmith for this coding exercise
prompt = hub.pull("jclemens24/rag-prompt")



In [12]:
# Relevance check prompt
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.

    Question: {question}
    Retrieved Context: {retrieved_context}
    
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Relevance Score:"""
)

In [13]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [14]:
def extract_score(llm_output):
    try:
        score = float(llm_output.strip())
        return score
    except ValueError:
        return 0


# Chain it all together with LangChain
def conditional_answer(x):
    relevance_score = extract_score(x["relevance_score"])
    if relevance_score < 4:
        return "I don't know."
    else:
        return x["answer"]

In [15]:
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | RunnableParallel(
        {
            "relevance_score": (
                RunnablePassthrough()
                | (
                    lambda x: relevance_prompt_template.format(
                        question=x["question"], retrieved_context=x["context"]
                    )
                )
                | llm
                | str_output_parser
            ),
            "answer": (RunnablePassthrough() | prompt | llm | str_output_parser),
        }
    )
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [16]:
rag_chain_similarity = RunnableParallel(
    {"context": dense_retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [17]:
rag_chain_hybrid = RunnableParallel(
    {"context": ensemble_retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [18]:
# Question - Submitted to the similarity / dense vector search
result = rag_chain_similarity.invoke(user_query)
retrieved_docs = result["context"]

print(f"Original Question to Similarity Search: {user_query}\n")
print(f"Relevance Score: {result['answer']['relevance_score']}\n")
print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}"
    )
    print(f"Content:\n{doc.page_content}\n")

Original Question to Similarity Search: What are Google's environmental initiatives?

Relevance Score: 4

Final Answer:
Google's environmental initiatives include:

1. **CDP (formerly known as the Carbon Disclosure Project)**: Google has been reporting its carbon footprint to CDP since 2009 and has partnered with CDP to host its annual conference, a hack-a-thon, and to launch CDP scores in Google Finance.
2. **Clean Energy Buyers Association (CEBA)**: Google was involved in the creation of CEBA in 2018 and continues to serve as the Board Chair of this organization.
3. **Exponential Roadmap Initiative**: Google joined this initiative in 2021, which is committed to halving emissions before 2030 towards net-zero emissions by no later than 2050.
4. **First Movers Coalition**: Google joined this coalition in 2022 and committed to contract for durable and scalable net carbon dioxide removal to be achieved by the end of 2030.
5. **Google Earth Engine**: This platform provides access to reliab

In [19]:
# Question - Submitted to the hybrid / multi-vector search
result = rag_chain_hybrid.invoke(user_query)
retrieved_docs = result["context"]

print(f"Original Question to Dense Search: {user_query}\n")
print(f"Relevance Score: {result['answer']['relevance_score']}\n")
print(f"Final Answer:\n{result['answer']['final_answer']}\n\n")
print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, start=1):
    print(
        f"Document {i}: Document ID: {doc.metadata['id']} source: {doc.metadata['source']}"
    )
    print(f"Content:\n{doc.page_content}\n")

Original Question to Dense Search: What are Google's environmental initiatives?

Relevance Score: 5

Final Answer:
It appears that you have provided a large block of text from Google's 2023 Environmental Report. I'll do my best to summarize the main points related to the question "Emerging opportunities" and specifically answer the implied question about AI for sustainability.

**Summary of Emerging Opportunities:**

The report highlights several emerging opportunities, including:

1. **Artificial Intelligence (AI) for Sustainability**: Google is focusing on using AI to help individuals and organizations reduce their environmental impact.
2. **Data-Driven Decision Making**: Google's data analytics tools are helping organizations make more informed decisions about their operations and supply chains.

**Specific Answer:**

The report mentions that Google is focusing on using AI to help build a more sustainable future, but it does not provide specific details about the initiatives or proj

# SYNTHETIC DATA GENERATION

In [20]:
# generator with openai models
generator = TestsetGenerator.from_langchain(
    generator_llm, critic_llm, embedding_function
)

In [21]:
# Create a list of Document objects from the chunks
documents = [Document(page_content=chunk) for chunk in splits]

query_distribution = default_query_distribution(LangchainLLMWrapper(generator_llm))


#### FOR FOLLOWING CODE: Uncomment and run once to generate source for test dataset! ####
# generate testset -
testset = generator.generate_with_langchain_docs(
    documents,
    testset_size=10,
    query_distribution=query_distribution,
)

                                                                            

KeyboardInterrupt: 

In [None]:
# comparison dataframe
testset_df = testset.to_pandas()

# save dataframes to CSV files in the specified directory
testset_df.to_csv(os.path.join("./data/testset_data.csv"), index=False)

print("testset DataFrame saved successfully in the local directory.")

In [20]:
# pull data from saved testset, rather than generating above
### load dataframs from CSV file
saved_testset_df = pd.read_csv(os.path.join("./data/testset_data.csv"))
print("testset DataFrame loaded successfully from local directory.")
saved_testset_df.head(5)

testset DataFrame loaded successfully from local directory.


Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How did Google prioritize the human experience...,"['Moving ahead, we’ll continue to build partne...",Google prioritized the human experience and co...,simple,[{}],True
1,How does the climate-conscious data center coo...,"['In 2022, we described our climate-conscious ...",The climate-conscious data center cooling stra...,simple,[{}],True
2,How does Google aim to promote sustainability ...,['that Google can make a meaningful difference...,Google aims to promote sustainability through ...,simple,[{}],True
3,What is Google's involvement in the iMasons Cl...,['iMasons Climate AccordGoogle is a founding m...,Google is a founding member and part of the go...,simple,[{}],True
4,What is the impact of the Rødby solar farm in ...,['0246\n2.49Scope 2 emissions \n(million tCO2e...,The impact of the Rødby solar farm in Denmark ...,simple,[{}],True


# PREPARE SIMILARITY SEARCH DATASET

In [21]:
# Convert the DataFrame to a dictionary
saved_testing_data = saved_testset_df.astype(str).to_dict(orient="list")

# Create the testing_dataset
saved_testing_dataset = Dataset.from_dict(saved_testing_data)

# Update the testing_dataset to include only these columns -
# "question", "ground_truth", "answer", "contexts"
saved_testing_dataset_sm = saved_testing_dataset.remove_columns(
    ["evolution_type", "episode_done"]
)

In [22]:
saved_testing_dataset_sm

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'metadata'],
    num_rows: 10
})

# EVAL SETS FOR EACH CHAIN


In [23]:
# Function to generate answers using the RAG chain
def generate_answer(question, ground_truth, rag_chain):
    result = rag_chain.invoke(question)
    return {
        "question": question,
        "answer": result["answer"]["final_answer"],
        "contexts": [doc.page_content for doc in result["context"]],
        "ground_truth": ground_truth,
    }

In [24]:
# Add the "question", "answer", "contexts", and "ground_truth" to the testing_dataset
testing_dataset_similarity = saved_testing_dataset_sm.map(
    lambda x: generate_answer(x["question"], x["ground_truth"], rag_chain_similarity),
    remove_columns=saved_testing_dataset_sm.column_names,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:19<00:00,  1.99s/ examples]


# EVAL SCORING

In [30]:
# Similarity search score
score_similarity = evaluate(
    testing_dataset_similarity,
    llm=critic_llm,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness,
        answer_similarity,
    ],
    embeddings=critic_llm_embedding,
)
similarity_df = score_similarity.to_pandas()
similarity_df

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Exception raised in Job[7]: ResponseError({})
Evaluating:   2%|▏         | 1/60 [01:55<1:53:28, 115.40s/it]Exception raised in Job[5]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[0]: TimeoutError()
Exception raised in Job[2]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Evaluating:   3%|▎         | 2/60 [03:00<1:22:42, 85.56s/it] Exception raised in Job[6]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[1]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Evaluating:  28%|██▊       | 17/60 [03:34<06:03,  8.44s/it] Exception raised in Job[31]: AttributeError('StringIO' object has no at

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,semantic_similarity
0,How did Google prioritize the human experience...,"[Given the scale of the problem, innovation w...",To answer your question about how Google prior...,Google prioritized the human experience and co...,,,,,,
1,How does the climate-conscious data center coo...,"[Given the scale of the problem, innovation w...",The climate-conscious data center cooling stra...,The climate-conscious data center cooling stra...,,,,,,
2,How does Google aim to promote sustainability ...,"[Given the scale of the problem, innovation w...",Google aims to promote sustainability through ...,Google aims to promote sustainability through ...,,,,,,0.541176
3,What is Google's involvement in the iMasons Cl...,[CDP (formerly \nknown as the Carbon Disclosu...,I don't know.,Google is a founding member and part of the go...,,,,,,0.283941
4,What is the impact of the Rødby solar farm in ...,[CDP (formerly \nknown as the Carbon Disclosu...,I don't know.,The impact of the Rødby solar farm in Denmark ...,,,,,,0.55019
5,How did Google work with CSIRO and Kaggle on a...,"[Given the scale of the problem, innovation w...",I don't know.,Google collaborated with the Commonwealth Scie...,,,,,,
6,What's the recycled aluminum percentage in new...,"[Given the scale of the problem, innovation w...",I don't know.,Recycled aluminum in the enclosures of new Goo...,,,,,,0.354894
7,How has Google worked with BEF to support wate...,[CDP (formerly \nknown as the Carbon Disclosu...,I don't know the specific details about Google...,Google has partnered closely with Bonneville E...,,,,,,
8,What environmental data does an independent au...,[We respect the independence and agency of tra...,"Based on the provided context, an independent ...",An independent auditor reviews select environm...,,,,,,
9,How many Olympic-sized swimming pools are equi...,[AI for sustainability\nSeven years into our j...,To calculate how many Olympic-sized swimming p...,The answer to given question is not present in...,,,,,0.818011,
