In [4]:
from langchain_community.vectorstores import Chroma
import chromadb
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

True

In [5]:
# read in evaluation
evaluation_set = pd.read_csv('./data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv')


## CSV - Embedding - Chroma

In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader
# this is useless
from langchain.text_splitter import CharacterTextSplitter


loader = CSVLoader(file_path='./data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv',
                   metadata_columns=['title', 'url', 'date', 'author', 'domain'],)

documents = loader.load()

In [7]:
from langchain_community.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

chroma_client = chromadb.HttpClient(host='localhost', port=8000)
chroma_client.reset()

langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="my_langchain_collection",
    embedding_function=embeddings,
)

langchain_chroma.add_documents(documents=documents)

['d987b5fe-d592-11ee-852e-1aa6e3a10dcb',
 'd987b7f2-d592-11ee-852e-1aa6e3a10dcb',
 'd987b81a-d592-11ee-852e-1aa6e3a10dcb',
 'd987b82e-d592-11ee-852e-1aa6e3a10dcb',
 'd987b84c-d592-11ee-852e-1aa6e3a10dcb',
 'd987b860-d592-11ee-852e-1aa6e3a10dcb',
 'd987b874-d592-11ee-852e-1aa6e3a10dcb',
 'd987b89c-d592-11ee-852e-1aa6e3a10dcb',
 'd987b8b0-d592-11ee-852e-1aa6e3a10dcb',
 'd987b8c4-d592-11ee-852e-1aa6e3a10dcb',
 'd987b8d8-d592-11ee-852e-1aa6e3a10dcb',
 'd987b8ec-d592-11ee-852e-1aa6e3a10dcb',
 'd987b8f6-d592-11ee-852e-1aa6e3a10dcb',
 'd987b90a-d592-11ee-852e-1aa6e3a10dcb',
 'd987b91e-d592-11ee-852e-1aa6e3a10dcb',
 'd987b932-d592-11ee-852e-1aa6e3a10dcb',
 'd987b93c-d592-11ee-852e-1aa6e3a10dcb',
 'd987b950-d592-11ee-852e-1aa6e3a10dcb',
 'd987b964-d592-11ee-852e-1aa6e3a10dcb',
 'd987b978-d592-11ee-852e-1aa6e3a10dcb',
 'd987b98c-d592-11ee-852e-1aa6e3a10dcb',
 'd987b996-d592-11ee-852e-1aa6e3a10dcb',
 'd987b9aa-d592-11ee-852e-1aa6e3a10dcb',
 'd987b9be-d592-11ee-852e-1aa6e3a10dcb',
 'd987b9d2-d592-

## Similarity Search
Query against the collection

In [8]:
# based on the rag evaluation set
print(evaluation_set.iloc[0].question)
docs = langchain_chroma.similarity_search(evaluation_set.iloc[0].question)
print(docs[0].page_content)
print(docs[0].metadata)

What is the innovation behind Leclanché's new method to produce lithium-ion batteries?
: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being techni

In [9]:
# comparison with the intended RAG context 

evaluation_set.iloc[0].relevant_chunk

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

## similarity_search_with_score

In [10]:
docs_score = langchain_chroma.similarity_search_with_score(evaluation_set.iloc[0].question)
print(docs_score[0][0].page_content)
print(docs_score[0][0].metadata)
print("Score: ", docs_score[0][1])


: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being technically simpler, eliminating the use of organic solvents also eliminates the risk of expl

In [11]:
evaluation_set.iloc[0].relevant_chunk

'Leclanché said it has developed an environmentally friendly way to produce lithium-ion (Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes (NMCA).'

## Collection Query

In [12]:
langchain_collection = chroma_client.get_collection("my_langchain_collection")

langchain_collection.query(
    query_embeddings=[i for i in range(768)],
    n_results=1,
)

{'ids': [['d98a67ae-d592-11ee-852e-1aa6e3a10dcb']],
 'distances': [[150699479.91140756]],
 'embeddings': None,
 'metadatas': [[{'author': '',
    'date': '2023-07-25',
    'domain': 'solarindustrymag',
    'row': 9512,
    'source': './data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv',
    'title': 'Corre Energy Agrees to West Texas Energy Storage Purchase',
    'url': 'https://solarindustrymag.com/corre-energy-agrees-to-west-texas-energy-storage-purchase'}]],
 'documents': [[": 81733\ncontent: ['Corre Energy US Development Company LLC, a North American subsidiary of Netherlands-based Corre Energy B.V., has entered into an exclusive agreement to acquire a 280 MW / 4.2 GWh energy storage project from Contour Energy, a Texas-based energy storage infrastructure developer.', 'Located in the West Texas region of ERCOT, the utility-scale storage project will be capable of continuously discharging 280 MW of electrical power for up to 15 hours, equating to 4.2 GWh of full

## SelfQueryRetriever

In [14]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema based on the values on the CSV
metadata_field_info = [
    AttributeInfo(
        name="url",
        description="Url of the document",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the document",
        type="string",
    ),
    AttributeInfo(
        name="date",
        description="Date of the document",
        type="string",
    ),
    AttributeInfo(
        name="author",
        description="Author of the document",
        type="string",
    ),
    AttributeInfo(
        name="domain",
        description="Domain of the document, closely related to the source of the document",
        type="string",
    ),
]
document_content_description = "Article listing"

# Configure retriver
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, langchain_chroma, document_content_description, metadata_field_info, verbose=True
)

# Based on rag evaluation set 
result = retriever.get_relevant_documents(evaluation_set.iloc[0].question)
print(result[0].page_content)
print(result[0].metadata)

: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being technically simpler, eliminating the use of organic solvents also eliminates the risk of expl