In [26]:
from langchain_community.vectorstores import Chroma
import chromadb
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

import nltk
from nltk.corpus import stopwords

from html import unescape
import re

# Fixed seed passed as `random_state` to the `sample` previews so they show the same rows on every run.
RANDOM_SEED = 1337

## Preprocessing

In [27]:
# Fetch the NLTK stop-word corpus (needed by the normalization step further down).
nltk.download("stopwords")

# Read the raw media dataset into a DataFrame.
documents = pd.read_csv(
    "data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv"
)

# Reproducible five-row preview.
documents.sample(n=5, random_state=RANDOM_SEED)

[nltk_data] Downloading package stopwords to /Users/dmnk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
5840,77870,World's first clean-hydrogen investment fund '...,2021-07-28,,['The world’ s first clean-hydrogen investment...,rechargenews,https://www.rechargenews.com/energy-transition...
526,6169,COP26: Scotland's peatlands are a carbon sink ...,2021-11-09,,"['West of the COP26 climate conference, among ...",energyvoice,https://www.energyvoice.com/renewables-energy-...
3589,37939,How Do Climate Change Mitigation Policies Impa...,2023-03-16,,"[""By clicking `` Allow All '' you agree to the...",azocleantech,https://www.azocleantech.com/news.aspx?newsID=...
8311,80434,Senate passes Inflation Reduction Act with lon...,2022-08-08,,['Update 08/16: President Biden signed the IRA...,solarpowerworldonline,https://www.solarpowerworldonline.com/2022/08/...
3537,37887,Researchers Use Drones to Monitor the Fragile ...,2023-03-07,,"[""By clicking `` Allow All '' you agree to the...",azocleantech,https://www.azocleantech.com/news.aspx?newsID=...


In [28]:
import ast

# `content` arrives from the CSV as the string form of a Python list.
# Parse each value back into a real list, then give every list element its own
# row (the remaining columns are repeated for each element).
documents = documents.assign(
    content=documents["content"].apply(ast.literal_eval)
).explode("content")

### Text Normalization

In [29]:
# Normalize the `content` column step by step. The order of the steps is kept as-is;
# only the two steps that previously had no effect (entity unescaping and tag
# stripping) were changed so that they actually run.

# set everything to lowercase (done before tokens are compared against the stop-word set)
documents['content'] = documents['content'].str.lower()

# remove stopwords from content (tokens are split on whitespace only)
stop_words = set(stopwords.words('english'))
documents['content'] = documents['content'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# remove straight single quotes (plain literal match, no regex needed)
documents['content'] = documents['content'].str.replace("'", '', regex=False)

# unescape HTML entities
# html.unescape expects a single str, so it has to be applied per value rather than
# being called once on the whole Series.
documents['content'] = documents['content'].apply(unescape)

# remove HTML tags if any
# regex=True is spelled out: without it, pandas 2.x treats the pattern as literal text.
documents['content'] = documents['content'].str.replace(r'<[^>]+>', '', regex=True)

# Removes specific unwanted characters
documents['content'] = documents['content'].str.replace(r"[\'\/`:“`’]+", '', regex=True)

# Removes non-ASCII (Unicode) characters
documents['content'] = documents['content'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Keeps only letters and whitespace
documents['content'] = documents['content'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# trim leading/trailing whitespace (runs of spaces inside the text are left untouched)
documents['content'] = documents['content'].str.strip()

documents.sample(5, random_state=RANDOM_SEED)

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
7102,79155,A closer look at liquid air energy storage – p...,2021-08-02,,content protected copyright may reused want co...,pv-magazine,https://www.pv-magazine.com/2021/08/02/a-close...
6683,78724,Pertamina Geothermal to issue USD 400 million ...,2023-04-24,,earlier year pge announced initial public offe...,thinkgeoenergy,https://www.thinkgeoenergy.com/pertamina-geoth...
7058,79111,Albania – pv magazine International,2021-06-03,,albanian utility kesh built mw groundmounted ...,pv-magazine,https://www.pv-magazine.com/region/albania/
7108,79162,PV-powered geothermal heat pump system for gre...,2021-08-16,,cookie settings website set allow cookies gi...,pv-magazine,https://www.pv-magazine.com/2021/08/16/pv-powe...
6919,78963,Finnish firm Voltan secures funding to strengt...,2023-10-25,,company founders miika richard considerable ex...,thinkgeoenergy,https://www.thinkgeoenergy.com/finnish-firm-vo...


## CSV - Embedding - Chroma

In [5]:
# Load the RAG evaluation table (read further down via its `question`,
# `relevant_chunk` and `article_url` columns).
evaluation_set = pd.read_csv(
    "./data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv"
)


In [48]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings


# Embedding model with library defaults; the same object is later handed to Chroma.
embeddings = HuggingFaceEmbeddings()

# Splitter driven by the embedding model above.
# NOTE(review): `content` has already been reduced to letters and whitespace, so if the
# splitter relies on sentence punctuation it may never split anything — confirm.
text_splitter = SemanticChunker(embeddings)


def _row_metadata(frame):
    """Return one metadata dict per row of `frame`, built from its descriptive columns."""
    columns = zip(frame.url, frame.title, frame.date, frame.author, frame.domain)
    return [
        {
            "url": url,
            "title": title,
            "date": date,
            "author": author,
            "domain": domain,
        }
        for url, title, date, author, domain in columns
    ]


# Slice the frame into consecutive pieces of at most `BatchSize` rows.
BatchSize = 41666
batches = [
    documents[start:start + BatchSize]
    for start in range(0, len(documents), BatchSize)
]

# Chunk every slice and collect the resulting Documents in one flat list.
docs = []
for batch in batches:
    docs += text_splitter.create_documents(batch.content, metadatas=_row_metadata(batch))


len(docs)

181397

In [49]:
# Second name for the chunk list: a later cell rebinds `docs` to search results,
# while the ingest step reads the chunks through `chunked_documents`.
chunked_documents = docs


[Document(page_content='qatar petroleum  qp targeting aggressive cuts greenhouse gas emissions prepares launch phase  planned  million ton per year lng expansion latest sustainability report published wednesday qp said goals include  reducing emissions intensity qatars lng facilities  upstream facilities least   company also aiming reduce gas flaring intensity across upstream facilities  raised carbon capture storage ambitions  million tonsyr  million tonsyr   million tonsyr carbon capture goal come  million tonyr phase  lng expansion also known north field east project  million tonsyr come phase  known north field south project raise qatars lng capacity  million tonsyr qatar currently lng production capacity around  million tonsyr eyeing phased expansion  million tonsyr qp says able eliminate routine gas flaring  methane emissions limited  setting methane intensity target  across facilities   company also plans build  gigawatts solar energy capacity  half come siraj solar power projec

In [ ]:
from langchain_community.document_loaders import DataFrameLoader 

# not used by any cell shown in this notebook
from langchain.text_splitter import CharacterTextSplitter

# dataframe from list
# DataFrameLoader only accepts a pandas DataFrame, while `chunked_documents` is a plain
# list of Documents. Build the frame first: one row per chunk, the chunk text in
# `content`, and every metadata key as its own column.
chunks_frame = pd.DataFrame(
    [{"content": chunk.page_content, **chunk.metadata} for chunk in chunked_documents]
)

loader = DataFrameLoader(chunks_frame, page_content_column='content')

# NOTE: this rebinds `documents` from the article DataFrame to a list of Documents, so
# the preprocessing cells must be re-run from the CSV load before they are used again.
documents = loader.load()




In [None]:
from langchain_community.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings


# Connect to the Chroma server on localhost and reset it before ingesting.
# NOTE(review): `reset()` presumably wipes everything stored on that server and may need
# to be enabled server-side — confirm before pointing this at a shared instance.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
chroma_client.reset()

# LangChain wrapper around one collection, using the embedding model defined earlier.
langchain_chroma = Chroma(
    embedding_function=embeddings,
    collection_name="my_langchain_collection",
    client=chroma_client,
)

# Insert the chunks in consecutive slices of at most `BatchSize` documents.
# NOTE(review): 41666 presumably mirrors a per-call insert limit of the store — confirm.
BatchSize = 41666
batch_starts = range(0, len(chunked_documents), BatchSize)
batches = [chunked_documents[start:start + BatchSize] for start in batch_starts]

for batch in batches:
    langchain_chroma.add_documents(documents=batch)

## Similarity Search
Query against the collection

In [11]:
# Run the first question of the RAG evaluation set against the collection and
# show the top-ranked chunk together with its metadata.
first_question = evaluation_set.iloc[0].question
print(first_question)

docs = langchain_chroma.similarity_search(first_question)
top_hit = docs[0]
print(top_hit.page_content)
print(top_hit.metadata)

What is the innovation behind Leclanché's new method to produce lithium-ion batteries?
leclanch said developed environmentally friendly way produce lithiumion  liion batteries replaced highly toxic organic solvents commonly used production process waterbased process make nickelmanganesecobaltaluminium cathodes  nmca
{'Unnamed: 0': 6780, 'date': '2023-01-20', 'domain': 'energyvoice', 'title': 'Leclanché’ s new disruptive battery boosts energy density', 'url': 'https://sgvoice.energyvoice.com/strategy/technology/23971/leclanches-new-disruptive-battery-boosts-energy-density/'}


In [12]:
# Reference chunk the evaluation set lists for the first question, shown for a
# side-by-side comparison with the retrieved text above.

evaluation_set.iloc[0].relevant_chunk

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

## similarity_search_with_score

In [13]:
# Same query as before, but each hit now comes back as a (document, score) pair.
docs_score = langchain_chroma.similarity_search_with_score(evaluation_set.iloc[0].question)

best_doc, best_score = docs_score[0]
print(best_doc.page_content)
print(best_doc.metadata)
print("Score: ", best_score)


leclanch said developed environmentally friendly way produce lithiumion  liion batteries replaced highly toxic organic solvents commonly used production process waterbased process make nickelmanganesecobaltaluminium cathodes  nmca
{'Unnamed: 0': 6780, 'date': '2023-01-20', 'domain': 'energyvoice', 'title': 'Leclanché’ s new disruptive battery boosts energy density', 'url': 'https://sgvoice.energyvoice.com/strategy/technology/23971/leclanches-new-disruptive-battery-boosts-energy-density/'}
Score:  0.63947594165802


In [14]:
# Reference chunk for the first evaluation question, repeated here next to the scored result.
evaluation_set.iloc[0].relevant_chunk

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

In [24]:
from fuzzywuzzy import fuzz


# Number of hits requested (and printed) per question.
TOP_K = 3

# Running totals over all questions that produced at least one hit.
# NOTE(review): the store's score looks like a distance (in the outputs the matching
# chunk scores lower than the weaker hits) — confirm before reading "Average Score"
# as higher-is-better.
score = 0
fuzzy_score = 0
evaluated = 0

# iterate through evaluation set
for index, row in evaluation_set.iterrows():
    similarity_search = langchain_chroma.similarity_search_with_score(row.question, k=TOP_K)
    print("Question: ", row.question)
    print("Relevant Chunk: ", row.relevant_chunk)

    # Print whatever actually came back; the store may return fewer than TOP_K hits.
    for i, (hit, hit_score) in enumerate(similarity_search):
        print("Result ", i, ": ", hit.page_content)
        print("Score ", i, ": ", hit_score)

    # Without any hit there is nothing to compare against; leave the row out of the averages.
    if not similarity_search:
        print("No results returned")
        print("----")
        continue

    best_hit, best_score = similarity_search[0]

    # define own similarity based on fuzzy matching (top hit vs. reference chunk)
    fuzzy_match = fuzz.token_set_ratio(row.relevant_chunk, best_hit.page_content)
    print("Fuzzy Match: ", fuzzy_match)
    print("Article Url: ", row.article_url)
    print("Content Url: ", best_hit.metadata['url'])
    fuzzy_match_url = fuzz.token_set_ratio(row.article_url, best_hit.metadata['url'])
    print("Fuzzy Match Url: ", fuzzy_match_url)
    print("----")

    score += best_score
    fuzzy_score += fuzzy_match
    evaluated += 1

# Averages over the evaluated questions; fuzzy ratios are 0-100, so rescale them to 0-1.
if evaluated:
    print("Average Score: ", score/evaluated)
    print("Average Fuzzy Score: ", fuzzy_score/(100*evaluated))
else:
    print("No questions could be evaluated")
    

Question:  What is the innovation behind Leclanché's new method to produce lithium-ion batteries?
Relevant Chunk:  Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).
Result  0 :  leclanch said developed environmentally friendly way produce lithiumion  liion batteries replaced highly toxic organic solvents commonly used production process waterbased process make nickelmanganesecobaltaluminium cathodes  nmca
Score  0 :  0.63947594165802
Result  1 :  way saying took well  years lithiumion replaced leadacid energy storage technology electric vehicles  process still  complete  along comes whole new battery chemistry innolith name  istate  new battery available commercially following testing pilot production
Score  1 :  0.7710361480712891
Result  2 :  rings bell sho

## Collection Query

In [None]:
# Query the underlying Chroma collection directly, bypassing the LangChain wrapper.
langchain_collection = chroma_client.get_collection("my_langchain_collection")

# Placeholder query vector holding the 768 values 0..767 (not a real embedding).
dummy_query_vector = list(range(768))

langchain_collection.query(query_embeddings=dummy_query_vector, n_results=1)

## SelfQueryRetriever

In [16]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema based on the values in the CSV: (name, description) per field.
# Every field is declared with type "string".
_field_specs = [
    ("url", "Url of the document"),
    ("title", "Title of the document"),
    ("date", "Date of the document"),
    ("author", "Author of the document"),
    ("domain", "Domain of the document, closely related to the source of the document"),
]
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _field_specs
]
document_content_description = "Article listing"

# Configure retriever: an OpenAI LLM at temperature 0 on top of the Chroma store.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    langchain_chroma,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

# Based on rag evaluation set: retrieve for the first question and show the top document.
first_question = evaluation_set.iloc[0].question
result = retriever.get_relevant_documents(first_question)
top_result = result[0]
print(top_result.page_content)
print(top_result.metadata)

lithium batteries great powering rechargeable electronics store lot energy long lifespans
{'Unnamed: 0': 6573, 'date': '2022-10-07', 'domain': 'energyvoice', 'title': 'Batteries: Contenders emerge for super-solutions - News for the Energy Sector', 'url': 'https://www.energyvoice.com/technology/446761/batteries-technology/'}
