# Setup

In [1]:
import os

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

True

## Observability & Monitoring

Phoenix is an open-source observability library designed for experimentation, evaluation, and troubleshooting. It allows AI Engineers and Data Scientists to quickly visualize their data, evaluate performance, track down issues, and export data to improve.
 

In [2]:
from phoenix.trace.langchain import LangChainInstrumentor
import phoenix as px

px.close_app()
session = px.launch_app()

LangChainInstrumentor().instrument()

  from .autonotebook import tqdm as notebook_tqdm


No active session to close
🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [3]:
session.view()

📺 Opening a view to the Phoenix app. The app is running at http://localhost:6006/


# Data Loading

In [4]:
import pandas as pd
        
df = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23_subset.csv')
df

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,77927,'Next level ': Iberdrola plans $ 2.6bn gigawat...,2021-12-02,,['Plans for a gigawatt-scale electrolyser to f...,rechargenews,https://www.rechargenews.com/news/2-1-1111100
1,81316,"Appalachian Power Issues RFP for 1,100 MW of R...",2022-01-19,,['Appalachian Power has issued a new request f...,solarindustrymag,https://solarindustrymag.com/appalachian-power...
2,1323,Orsted's Green Plans Lag Its Peers,2021-06-04,,"[""Danish utility Orsted intends to spend over ...",energyintel,https://www.energyintel.com/0000017b-a7dd-de4c...
3,78039,Offshore wind ' a massive untapped potential '...,2022-08-29,,['Offshore wind power is rapidly emerging as a...,rechargenews,https://www.rechargenews.com/news/2-1-1286193
4,16584,Germany Threatens To Derail EU Green Deal Over...,2023-03-03,,['The “ Climate Chancellor ” needs to draw a l...,cleantechnica,https://cleantechnica.com/2023/03/03/germany-t...
...,...,...,...,...,...,...,...
97,80490,How the Inflation Reduction Act will help more...,2022-12-21,,['The Inflation Reduction Act ( IRA) signed in...,solarpowerworldonline,https://www.solarpowerworldonline.com/2022/12/...
98,37851,Valmet Launches a Zero Effluent System for Enz...,2023-02-27,,"[""By clicking `` Allow All '' you agree to the...",azocleantech,https://www.azocleantech.com/news.aspx?newsID=...
99,6752,Scottish Government mulls upping offshore wind...,2023-01-11,,['Scottish ministers are considering whether t...,energyvoice,https://www.energyvoice.com/renewables-energy-...
100,78408,Strasbourg wants to have its say on geothermal...,2021-11-22,,['Committee investigating earthquakes connecte...,thinkgeoenergy,https://www.thinkgeoenergy.com/strasbourg-want...


# Preprocessing & Indexing

## Splitting content into paragraphs

The content is currently stored as a list of strings. We will convert this into a single string with paragraphs separated by two newlines.

## Cleaning

In [5]:
from src.preprocessing.preprocessor import Preprocessor

preprocessor = Preprocessor(df, explode=False, concatenate_contents=True)
df = preprocessor.preprocess()

In [6]:
len(df)

102

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,content,language,title,date,author,domain,url
0,1284,The slow pace of Japanese reactor restarts con...,en,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1323,Danish utility Orsted intends to spend over 5...,en,Orsted's Green Plans Lag Its Peers,2021-06-04,,energyintel,https://www.energyintel.com/0000017b-a7dd-de4c...
2,6093,A new report has set out the distinct lack of ...,en,Report - Scottish content in wind projects in ...,2021-09-10,,energyvoice,https://www.energyvoice.com/renewables-energy-...
3,6121,Global energy companies are shifting to ESG in...,en,Taking the temperature: ESG investing in the e...,2021-09-29,,energyvoice,https://www.energyvoice.com/markets/352513/esg...
4,6272,Oil and gas giant BP LSE BP has handed out a ...,en,BP hands out southern North Sea CCS contract,2022-01-19,,energyvoice,https://www.energyvoice.com/renewables-energy-...


# Indexing

For the indexing we use the VectorStore class which bundles the embeddings and ChromaDB.

In [11]:
from src.vectorstore import VectorStore
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en", 
    model_kwargs={"device": "cpu"}, 
    encode_kwargs={"normalize_embeddings": True}
)

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32, length_function=len)

In [17]:
from langchain_core.documents import Document
import ast

def concat_chunks(chunks):
    chunks = ast.literal_eval(chunks)

def create_documents(df, splitter):
    docs = []
    for index, row in tqdm(df.iterrows()):
        content = row['content']

        row = row.fillna('')

        metadata = {
            "url": row['url'],
            "domain": row['domain'],
            "title": row['title'],
            "author": row['author'],
            "date": row['date']
        }
        
        docs.append(Document(page_content=content, metadata=metadata))
        
    return docs

documents = create_documents(df, recursive_text_splitter)

102it [00:00, 8038.54it/s]


In [18]:
print("ChromeDB Host: ", os.getenv('CHROMADB_HOST'))
print("ChromeDB Port: ", os.getenv('CHROMADB_PORT'))

bge_vector_store = VectorStore(embedding_function=bge_embeddings,
                               collection="cleantech-subset-analysis-nils")

ChromeDB Host:  100.109.183.32
ChromeDB Port:  8192


In the next step we will add the documents to the vector store. This will take a while depending on the number of documents.

In [19]:
#%%script false --no-raise-error

bge_vector_store.add_documents(documents, verbose=True, batch_size=128)

100%|██████████| 1/1 [00:18<00:00, 18.65s/it]


After adding the documents to the vector store we can now perform similarity searches.

In [20]:
bge_vector_store.similarity_search_w_scores("The company is also aiming to reduce gas flaring?")

[(Document(page_content='LNG  Mexico  Natural Gas Prices  NGI All News Access  NGI The Weekly Gas Market Report Mexico Pacific Ltd LLC  MPL continues to advance its liquefied natural gas  LNG export project for the Pacific coast of the country This week the company signed a collaboration agreement with ConocoPhillips LNG Licensing LLC and Bechtel Together the companies plan to work with construction firm Techint SA de CV to pursue  innovative lower carbon LNG design solutions  for potential future phases of the project The collaboration plans to further reduce baseline emissions by exploring  energy transition and greenhouse gas emission reduction  technologies developed by ConocoPhillips LNG and designed by Bechtel This would include  carbonadvantaged gas feedstock  and  developing a commercial strategy for supplying carbon neutral LNG in the future  said ConocoPhillips Tom Mathiasmeier president of Global Gas Power and LNG  Get in the know Access to pipelines processing plants and LN

# Retrieval & Generation

## First Retrieval Cooking 


In [21]:
from src.generation import get_llm_model, LLMModel

azure_model = get_llm_model(LLMModel.GPT_3_AZURE)

In [22]:
rag_prompt = """
Answer the question to your best knowledge when looking at the following context:
{context}
                
Question: {question}
"""

In [23]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | ChatPromptTemplate.from_template(rag_prompt)
        | azure_model
        | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {
        "context": bge_vector_store.get_retriever(), 
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

In [24]:
rag_chain_with_source.invoke("Is the company aiming to reduce gas flaring?")

{'context': [Document(page_content='Global energy companies are shifting to ESG investing but what are the drivers to decarbonise and the innovative approaches being deployed In this new series Mike Scott takes the temperature of the industry s response to date as well as the challenges ahead that this shift will present The energy sector faces a turbulent transitional autumn framed by the latest report from the Intergovernmental Panel on Climate Change  IPCC and the crucial COP26 Glasgow Scotland meeting in November We re seeing a growing number of companies divesting their highcarbon assets as BHP has just done in selling its oil and gas unit to Australian compatriot Woodside However investors and campaigners are starting to point out that while this helps the seller to decarbonise it makes no difference to total carbon emissions  indeed if you sell to someone less concerned about climate change you may end up increasing carbon emissions The coal mining sector should give energy comp

## Alternative Retrieval Cooking

In [25]:
rag_prompt = """
Answer the question to your best knowledge when looking at the following context:
{context}
                
Question: {question}
"""

In [26]:
rag_gpt_improved_prompt = """

Critically answer the question, using the following context and your own knowledge:

Context: {context}
Question: {question}

In case that the provided context is not sufficient, explain that the knowledge is not available but that given your own knowledge you can provide an answer which you will tag as (own knowledge).

Ensure that the answer is well-structured and provides a clear and concise response.
"""

In [27]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | ChatPromptTemplate.from_template(rag_gpt_improved_prompt)
        | azure_model
        | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {
        "context": bge_vector_store.get_retriever(), 
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

In [28]:
rag_chain_from_docs

RunnableAssign(mapper={
  context: RunnableLambda(lambda x: format_docs(x['context']))
})
| ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='\n\nCritically answer the question, using the following context and your own knowledge:\n\nContext: {context}\nQuestion: {question}\n\nIn case that the provided context is not sufficient, explain that the knowledge is not available but that given your own knowledge you can provide an answer which you will tag as (own knowledge).\n\nEnsure that the answer is well-structured and provides a clear and concise response.\n'))])
| AzureChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x2d879dc70>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x2d879ede0>, openai_api_key=SecretStr('**********'), openai_proxy='', azure_endpoint='https://p-oai-htdsnlp-01.openai.azure.com/', deploymen

In [29]:
rag_chain_with_source.invoke("Is the company aiming to reduce gas flaring?")

{'context': [Document(page_content='Global energy companies are shifting to ESG investing but what are the drivers to decarbonise and the innovative approaches being deployed In this new series Mike Scott takes the temperature of the industry s response to date as well as the challenges ahead that this shift will present The energy sector faces a turbulent transitional autumn framed by the latest report from the Intergovernmental Panel on Climate Change  IPCC and the crucial COP26 Glasgow Scotland meeting in November We re seeing a growing number of companies divesting their highcarbon assets as BHP has just done in selling its oil and gas unit to Australian compatriot Woodside However investors and campaigners are starting to point out that while this helps the seller to decarbonise it makes no difference to total carbon emissions  indeed if you sell to someone less concerned about climate change you may end up increasing carbon emissions The coal mining sector should give energy comp

# Evaluation

In [30]:
df_eval = pd.read_csv('data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv')
df_eval.head()

Unnamed: 0,example_id,question_id,question,relevant_chunk,article_url
0,1,1,What is the innovation behind Leclanché's new ...,Leclanché said it has developed an environment...,https://www.sgvoice.net/strategy/technology/23...
1,2,2,What is the EU’s Green Deal Industrial Plan?,The Green Deal Industrial Plan is a bid by the...,https://www.sgvoice.net/policy/25396/eu-seeks-...
2,3,2,What is the EU’s Green Deal Industrial Plan?,The European counterpart to the US Inflation R...,https://www.pv-magazine.com/2023/02/02/europea...
3,4,3,What are the four focus areas of the EU's Gree...,The new plan is fundamentally focused on four ...,https://www.sgvoice.net/policy/25396/eu-seeks-...
4,5,4,When did the cooperation between GM and Honda ...,What caught our eye was a new hookup between G...,https://cleantechnica.com/2023/05/08/general-m...


In [31]:
df_eval_subset = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23_subset_eval.csv')
df_eval_subset

Unnamed: 0,question,relevant_chunk,answer,question_complexity,episode_done
0,How does the IRS guidance on the domestic cont...,"[""['The IRS’ guidance on the domestic content ...",The IRS guidance on the domestic content tax b...,simple,True
1,What is the fallacy of energy independence and...,"[""['Anytime anyone uses the term ‘ energy inde...",The fallacy of energy independence is that it ...,simple,True
2,What is the DOE doing to jump-start solar ener...,"[""clean energy economy, ” said U.S. Secretary ...",The DOE is working to jump-start solar energy ...,simple,True
3,How could the participation of prominent playe...,"['US supply chain, advance flexibility in tran...",The participation of prominent players in the ...,simple,True
4,What are the projected growth rates and instal...,"[""as markets/projects mature '' ( WGI Feb.24'2...",Orsted expects offshore wind to have an annual...,simple,True
5,Why is sustained effort and consistency import...,"[""pupils in your area or does it begin with gr...",Sustained effort and consistency are important...,simple,True
6,What is Appalachian Power's request for renewa...,"[""['Appalachian Power has issued a new request...",Appalachian Power has issued a request for pro...,reasoning,True
7,Can you provide examples of companies and ener...,"[""company’, combined with a SPAC in a deal tha...",Examples of companies investing in green energ...,reasoning,True
8,What actions is the Department of Energy takin...,"[""clean energy economy, ” said U.S. Secretary ...",The Department of Energy is working to jump-st...,reasoning,True
9,What are the potential benefits of the Califor...,"['US supply chain, advance flexibility in tran...",The California lease auction for offshore wind...,multi_context,True


In [32]:
from src.evaluation import RAGEvaluator

rag_evaluator = RAGEvaluator(chain=rag_chain_with_source,
                             llm_model=azure_model,
                             embeddings=bge_vector_store)

rag_evaluator.create_dataset_from_df(df_eval_subset)
#rag_evaluator.create_dataset_from_df_async(df_eval_subset, max_concurrency=2)
rag_evaluator.evaluate()

100%|██████████| 49/49 [05:02<00:00,  6.18s/it]


ArrowTypeError: Expected bytes, got a 'float' object

Further try evaluating the model with Phoenix's evaluation tools. Read more: https://docs.arize.com/phoenix/evaluation/running-pre-tested-evals/retrieval-rag-relevance