# Setup

In [20]:
import os

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

True

## Observability & Monitoring

Phoenix is an open-source observability library designed for experimentation, evaluation, and troubleshooting. It allows AI Engineers and Data Scientists to quickly visualize their data, evaluate performance, track down issues, and export data to improve.
 

In [21]:
from phoenix.trace.langchain import LangChainInstrumentor
import phoenix as px

px.close_app()
session = px.launch_app()

LangChainInstrumentor().instrument()

Attempting to instrument while already instrumented


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [22]:
session.view()

📺 Opening a view to the Phoenix app. The app is running at http://localhost:6006/


# Data Loading

In [23]:
df = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,1280,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,"[""Qatar Petroleum ( QP) is targeting aggressiv...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1281,India Launches Its First 700 MW PHWR,2021-01-15,,"[""• Nuclear Power Corp. of India Ltd. ( NPCIL)...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,1283,New Chapter for US-China Energy Trade,2021-01-20,,"[""New US President Joe Biden took office this ...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,"[""The slow pace of Japanese reactor restarts c...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,1285,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,"[""Two of New York City's largest pension funds...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


# Preprocessing & Indexing

## Splitting content into paragraphs

The content is currently stored as a list of strings. We will convert this into a single string with paragraphs separated by two newlines.

In [24]:
import ast

df['content'] = df['content'].apply(ast.literal_eval)
df['content'] = df['content'].apply(lambda x: [str(i) for i in x])
df['content'] = df['content'].apply(lambda x: '\n\n'.join(x))
df['content'].head()

0    Qatar Petroleum ( QP) is targeting aggressive ...
1    • Nuclear Power Corp. of India Ltd. ( NPCIL) s...
2    New US President Joe Biden took office this we...
3    The slow pace of Japanese reactor restarts con...
4    Two of New York City's largest pension funds s...
Name: content, dtype: object

# Indexing

For the indexing we use the VectorStore class which bundles the embeddings and ChromaDB.

In [25]:
from src.vectorstore import VectorStore
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en", 
    model_kwargs={"device": "cpu"}, 
    encode_kwargs={"normalize_embeddings": True}
)

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32, length_function=len)

In [27]:
from langchain_core.documents import Document

def create_documents(df, splitter):
    docs = []
    for index, row in tqdm(df.iterrows()):
        content = row['content']

        row = row.fillna('')

        metadata = {
            "url": row['url'],
            "domain": row['domain'],
            "title": row['title'],
            "author": row['author'],
            "date": row['date']
        }
        
        for chunk in splitter.split_text(content):
            docs.append(Document(page_content=chunk, metadata=metadata))
    
    return docs

documents = create_documents(df, recursive_text_splitter)

0it [00:00, ?it/s]

9593it [00:01, 6393.11it/s]


In [28]:
print("ChromeDB Host: ", os.getenv('CHROMADB_HOST'))
print("ChromeDB Port: ", os.getenv('CHROMADB_PORT'))

bge_vector_store = VectorStore(embedding_function=bge_embeddings,
                               collection="cleantech-bge-small-en")

ChromeDB Host:  100.109.183.32
ChromeDB Port:  8192


In the next step we will add the documents to the vector store. This will take a while depending on the number of documents.

In [29]:
%%script false --no-raise-error

bge_vector_store.add_documents(documents, verbose=True, batch_size=128)

After adding the documents to the vector store we can now perform similarity searches.

In [30]:
bge_vector_store.similarity_search_w_scores("The company is also aiming to reduce gas flaring?")

[(Document(page_content='The company in 2020 announced its intention to reduce operational GHG emissions intensity by 35% to 45% by 2030, a revision from the previous goal of 5% to 15%, and to reach net zero emissions by 2050.\n\nThe path to net zero will involve tools such as emissions offsets and carbon capture, utilization and sequestration, Lance said.\n\nWithin the ESG initiatives, Lance and Sheffield also stressed the importance of curbing routine natural gas flaring, which remains a stubborn challenge in the Permian despite progress made on reducing flaring intensity.\n\nOther measures such as electrifying fracture fleets, and better monitoring of methane leaks, will be crucial as well, they said.', metadata={'author': '', 'date': '2021-03-05', 'domain': 'naturalgasintel', 'title': 'ConocoPhillips, Pioneer Natural CEOs Preach Discipline as Permian Activity Rebounds', 'url': 'https://www.naturalgasintel.com/conocophillips-pioneer-natural-ceos-preach-discipline-as-permian-activity

# Retrieval & Generation

In [31]:
from src.generation import get_llm_model, LLMModel

azure_model = get_llm_model(LLMModel.GPT_3_AZURE)

In [32]:
rag_prompt = """
Answer the question to your best knowledge when looking at the following context:
{context}
                
Question: {question}
"""

In [33]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | ChatPromptTemplate.from_template(rag_prompt)
        | azure_model
        | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {
        "context": bge_vector_store.get_retriever(), 
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

In [34]:
rag_chain_with_source.invoke("Is the company aiming to reduce gas flaring?")

{'context': [Document(page_content='However, the report does concede that “ there does seem to be a sounder logic of fewer high-intensity gas users connecting to a smaller network ”. These users include gas peaker plants, heavy industry requiring high-temperature heat, aviation and long-distance haulage.\n\nThe study does not point out that when hydrogen is combusted, it reacts with nitrogen in the air to produces nitrogen oxides that are greenhouse gases.\n\n* LETI is a UK-based voluntary network consisting of more than 1,000 developers, engineers, housing associations, architects, planners, academics, sustainability professionals, contractors and facilities managers.\n\nRecharge is part of DN Media Group. To read more about DN Media Group, click here\n\nRecharge is part of DN Media Group AS. From November 1st DN Media Group is responsible for controlling your data on Recharge.', metadata={'author': '', 'date': '2021-02-10', 'domain': 'rechargenews', 'title': 'Why using clean hydrogen

# Evaluation

In [35]:
df_eval = pd.read_csv('data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv')
df_eval.head()

Unnamed: 0,example_id,question_id,question,relevant_chunk,article_url
0,1,1,What is the innovation behind Leclanché's new ...,Leclanché said it has developed an environment...,https://www.sgvoice.net/strategy/technology/23...
1,2,2,What is the EU’s Green Deal Industrial Plan?,The Green Deal Industrial Plan is a bid by the...,https://www.sgvoice.net/policy/25396/eu-seeks-...
2,3,2,What is the EU’s Green Deal Industrial Plan?,The European counterpart to the US Inflation R...,https://www.pv-magazine.com/2023/02/02/europea...
3,4,3,What are the four focus areas of the EU's Gree...,The new plan is fundamentally focused on four ...,https://www.sgvoice.net/policy/25396/eu-seeks-...
4,5,4,When did the cooperation between GM and Honda ...,What caught our eye was a new hookup between G...,https://cleantechnica.com/2023/05/08/general-m...


In [36]:
from src.generation import get_llm_model, LLMModel
from langchain.evaluation import load_evaluator, EvaluatorType

EVAL_TYPES = {
    "accuracy": True
}

evaluator = load_evaluator(EvaluatorType.LABELED_SCORE_STRING, llm=get_llm_model(LLMModel.GPT_3_AZURE))

for _, item in df_eval.iterrows():
    question, relevant_chunk = item['question'], item['relevant_chunk']
    
    chain_result = rag_chain_with_source.invoke(question)
    answer, context = chain_result['answer'], chain_result['context']
    
    for key, _ in EVAL_TYPES.items():        
        score = evaluator.evaluate_strings(prediction=answer, reference=context, input=question)
        print(f"{key}: {score}")
        print()

accuracy: {'reasoning': "The submission is helpful and correct in stating that the article does not mention Leclanché's new method to produce lithium-ion batteries. However, it lacks depth in providing additional information or insights. \n\nRating: [[7]]", 'score': 7}

accuracy: {'reasoning': 'The assistant\'s response is helpful and relevant to the question. It accurately describes the EU\'s Green Deal Industrial Plan, including its objectives, financing, and focus on four pillars. The response also mentions the plan\'s aim to create a regulatory framework to speed up the deployment of clean energy solutions, ensure Europe remains competitive, and measures to facilitate "open and fair trade" in the clean tech space. Overall, the response demonstrates a good depth of thought and provides a comprehensive overview of the plan. \n\nRating: [[9]]', 'score': 9}

accuracy: {'reasoning': "The assistant provides a clear and concise answer to the user's question and covers the main aspects of 

Further try evaluating the model with Phoenix's evaluation tools. Read more: https://docs.arize.com/phoenix/evaluation/running-pre-tested-evals/retrieval-rag-relevance