In [105]:
import ast
import re
from html import unescape

import chromadb
import pandas as pd
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma

# Kept ahead of the nltk import, matching the original execution order.
load_dotenv()

import nltk
from nltk.corpus import stopwords

RANDOM_SEED = 1337

## Preprocessing

In [106]:
# Make sure the NLTK stop-word corpus is available for the normalization step.
nltk.download('stopwords')

# Load the Cleantech media corpus; the `content` column is normalized in the next section.
documents_csv_path = 'data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv'
documents = pd.read_csv(documents_csv_path)

# Fixed seed so the same five rows are shown before and after preprocessing.
documents.sample(n=5, random_state=RANDOM_SEED)

[nltk_data] Downloading package stopwords to /Users/nils/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
5840,77870,World's first clean-hydrogen investment fund '...,2021-07-28,,['The world’ s first clean-hydrogen investment...,rechargenews,https://www.rechargenews.com/energy-transition...
526,6169,COP26: Scotland's peatlands are a carbon sink ...,2021-11-09,,"['West of the COP26 climate conference, among ...",energyvoice,https://www.energyvoice.com/renewables-energy-...
3589,37939,How Do Climate Change Mitigation Policies Impa...,2023-03-16,,"[""By clicking `` Allow All '' you agree to the...",azocleantech,https://www.azocleantech.com/news.aspx?newsID=...
8311,80434,Senate passes Inflation Reduction Act with lon...,2022-08-08,,['Update 08/16: President Biden signed the IRA...,solarpowerworldonline,https://www.solarpowerworldonline.com/2022/08/...
3537,37887,Researchers Use Drones to Monitor the Fragile ...,2023-03-07,,"[""By clicking `` Allow All '' you agree to the...",azocleantech,https://www.azocleantech.com/news.aspx?newsID=...


### Text Normalization

In [107]:
# Normalize `documents['content']` into lowercase, letters-only text.
#
# Each raw cell holds the string form of a Python list of paragraphs
# (e.g. "['first paragraph', 'second paragraph']").
#
# NOTE: this cell overwrites `documents['content']`; re-running it without
# reloading the CSV fails, because the column no longer holds list literals.

stop_words = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed."""
    # NOTE: punctuation is still attached at this stage, so a token such as
    # "the," is not recognised as a stop word.
    return ' '.join(word for word in text.split() if word not in stop_words)


documents['content'] = (
    documents['content']
    # Parse the list literal and join its paragraphs into one string.
    # ast.literal_eval only accepts literals, so text from the CSV can never
    # execute code (the previous eval() could). Parsing happens before
    # lower-casing so case-sensitive escape sequences are not corrupted.
    .apply(lambda raw: ' '.join(ast.literal_eval(raw)))
    # set everything to lowercase
    .str.lower()
    # remove stopwords from content
    .apply(_drop_stopwords)
    # remove quotes
    .str.replace("'", '', regex=False)
    # unescape HTML entities; html.unescape works on a single str, so it has
    # to be applied per row (calling it on the Series returned it unchanged)
    .apply(unescape)
    # remove HTML tags if any; regex=True is required, otherwise newer pandas
    # treats the pattern as a literal string and removes nothing
    .str.replace(r'<[^>]+>', '', regex=True)
    # remove specific unwanted characters
    .str.replace(r"[\'\/`:“`’]+", '', regex=True)
    # remove non-ASCII (Unicode) characters
    .str.replace(r'[^\x00-\x7F]+', '', regex=True)
    # keep only letters and whitespace
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    # strip leading/trailing whitespace (runs of spaces inside the text remain)
    .str.strip()
)

documents.sample(5, random_state=RANDOM_SEED)

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
5840,77870,World's first clean-hydrogen investment fund '...,2021-07-28,,world first cleanhydrogen investment fund cou...,rechargenews,https://www.rechargenews.com/energy-transition...
526,6169,COP26: Scotland's peatlands are a carbon sink ...,2021-11-09,,west cop climate conference among lochs glens ...,energyvoice,https://www.energyvoice.com/renewables-energy-...
3589,37939,How Do Climate Change Mitigation Policies Impa...,2023-03-16,,clicking allow agree storing cookies device ...,azocleantech,https://www.azocleantech.com/news.aspx?newsID=...
8311,80434,Senate passes Inflation Reduction Act with lon...,2022-08-08,,update president biden signed ira law here de...,solarpowerworldonline,https://www.solarpowerworldonline.com/2022/08/...
3537,37887,Researchers Use Drones to Monitor the Fragile ...,2023-03-07,,clicking allow agree storing cookies device ...,azocleantech,https://www.azocleantech.com/news.aspx?newsID=...


## CSV - Embedding - Chroma

In [108]:
# Load the RAG evaluation set; later cells read its `question` and `relevant_chunk` columns.
evaluation_csv_path = './data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv'
evaluation_set = pd.read_csv(evaluation_csv_path)


In [109]:
from langchain_community.document_loaders import DataFrameLoader
# this is useless
from langchain.text_splitter import CharacterTextSplitter

# Wrap the preprocessed frame: `content` becomes the page content of each document.
loader = DataFrameLoader(
    documents,
    page_content_column='content',
)

# NOTE: `documents` is rebound here from the DataFrame to the loaded documents,
# so this cell cannot be re-run without re-running the preprocessing cells first.
documents = loader.load()

In [110]:
from langchain_community.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

# Number of documents embedded and uploaded per request. Sending the whole
# corpus in one call is slow and may exceed the server's batch limit; with
# batches, an interrupted run keeps the batches already stored.
ADD_BATCH_SIZE = 256

embeddings = HuggingFaceEmbeddings()

# Expects a Chroma server listening on localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
# NOTE(review): per the chromadb docs reset() clears the whole database, not
# only the collection used below -- confirm this is intended.
chroma_client.reset()

langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="my_langchain_collection",
    embedding_function=embeddings,
)

# Embed and insert the corpus batch by batch. The loop has no return value,
# so the returned ids are not echoed as cell output.
for batch_start in range(0, len(documents), ADD_BATCH_SIZE):
    batch = documents[batch_start:batch_start + ADD_BATCH_SIZE]
    langchain_chroma.add_documents(documents=batch)

KeyboardInterrupt: 

## Similarity Search
Query against the collection

In [None]:
# Use the first question of the RAG evaluation set as the query.
first_question = evaluation_set.iloc[0].question
print(first_question)

# Retrieve matches from the vector store and show the first one returned.
docs = langchain_chroma.similarity_search(first_question)
first_hit = docs[0]
print(first_hit.page_content)
print(first_hit.metadata)

What is the innovation behind Leclanché's new method to produce lithium-ion batteries?
: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being techni

In [None]:
# Reference chunk the evaluation set lists for the first question,
# shown for comparison with the retrieved document above.
evaluation_set['relevant_chunk'].iloc[0]

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

## similarity_search_with_score

In [None]:
# Same query as before, but each hit is returned as a (document, score) pair.
first_question = evaluation_set.iloc[0].question
docs_score = langchain_chroma.similarity_search_with_score(first_question)

first_doc, first_score = docs_score[0][0], docs_score[0][1]
print(first_doc.page_content)
print(first_doc.metadata)
print("Score: ", first_score)


: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being technically simpler, eliminating the use of organic solvents also eliminates the risk of expl

In [None]:
# Reference chunk for the first evaluation question, for comparison with the scored hit.
evaluation_set['relevant_chunk'].iloc[0]

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

## Collection Query

In [None]:
# Query the underlying Chroma collection directly, bypassing the LangChain wrapper.
langchain_collection = chroma_client.get_collection("my_langchain_collection")

# Placeholder query vector 0, 1, ..., 767 (not a real text embedding).
# Presumably 768 matches the collection's embedding dimension -- TODO confirm.
placeholder_embedding = list(range(768))

langchain_collection.query(
    query_embeddings=placeholder_embedding,
    n_results=1,
)

{'ids': [['031cc4e4-d64d-11ee-b185-92b1fc8ab9ab']],
 'distances': [[150699328.0]],
 'embeddings': None,
 'metadatas': [[{'author': '',
    'date': '2022-10-06',
    'domain': 'pv-magazine',
    'row': 7416,
    'source': './data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv',
    'title': 'World’ s largest compressed air energy storage project goes online in China – pv magazine International',
    'url': 'https://www.pv-magazine.com/2022/10/06/worlds-largest-compressed-air-energy-storage-project-goes-online-in-china/'}]],
 'documents': [[': 79492\ncontent: [\'The Chinese Academy of Sciences has switched on a 100 MW compressed air energy storage system in China’ s Hebei province. The facility can store more than 132 million kWh of electricity per year.\', \'A 100 MW compressed air energy storage system in Zhangjiakou, China.\', "The Institute of Engineering Thermophysics of the Chinese Academy of Sciences has switched on a 100 MW compressed air energy storage ( CAES)

## SelfQueryRetriever

In [None]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema based on the CSV columns, as (field name, description) pairs.
# Every field is declared with type "string".
_metadata_fields = [
    ("url", "Url of the document"),
    ("title", "Title of the document"),
    ("date", "Date of the document"),
    ("author", "Author of the document"),
    ("domain", "Domain of the document, closely related to the source of the document"),
]
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _metadata_fields
]
document_content_description = "Article listing"

# Configure the retriever.
# NOTE: the last recorded run of this cell failed with an ImportError asking
# for the `lark` package to be installed.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    langchain_chroma,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

# Query with the first question of the RAG evaluation set and show the first result.
result = retriever.get_relevant_documents(evaluation_set.iloc[0].question)
first_result = result[0]
print(first_result.page_content)
print(first_result.metadata)

ImportError: Cannot import lark, please install it with 'pip install lark'.