In [None]:
from langchain_community.vectorstores import Chroma
import chromadb
import pandas as pd

from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# OpenAI LLM further down — confirm against the project's .env file.
load_dotenv()

In [None]:
# Load the RAG evaluation set; later cells read its `question` and
# `relevant_chunk` columns.
evaluation_csv_path = './data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv'
evaluation_set = pd.read_csv(evaluation_csv_path)


## CSV - Embedding - Chroma

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
# this is useless
from langchain.text_splitter import CharacterTextSplitter


# Source articles and the columns that should travel as document metadata
# instead of being part of the page content.
media_csv_path = './data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv'
metadata_fields = ['title', 'url', 'date', 'author', 'domain']

loader = CSVLoader(file_path=media_csv_path, metadata_columns=metadata_fields)

# Materialise the loader's output as a list of LangChain documents.
documents = loader.load()

In [None]:
from langchain_community.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model shared by indexing and querying. No model name is passed,
# so the library default is used.
embeddings = HuggingFaceEmbeddings()

# Client for a Chroma server expected to be reachable on localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
# chroma_client.reset()  # left disabled; enable only to start from an empty server (destructive)

# LangChain vector-store wrapper bound to the server-side collection.
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="my_langchain_collection",
    embedding_function=embeddings,
)

# The collection lives on the server and outlives the kernel, so adding the
# documents unconditionally would re-embed and duplicate the whole corpus on
# every re-run of this cell. Only ingest when the collection is still empty.
existing_count = chroma_client.get_collection("my_langchain_collection").count()
if existing_count == 0:
    langchain_chroma.add_documents(documents=documents)

## Similarity Search
Query against the collection

In [None]:
# Use the first question of the RAG evaluation set as the query and show
# the best-ranked document returned by the vector store.
first_question = evaluation_set.iloc[0].question
print(first_question)

docs = langchain_chroma.similarity_search(first_question)
top_hit = docs[0]
print(top_hit.page_content)
print(top_hit.metadata)

In [None]:
# Reference context the evaluation set expects for the first question,
# shown for comparison with the retrieved document.
evaluation_set["relevant_chunk"].iloc[0]

## similarity_search_with_score

In [27]:
# Same first-question lookup, but every hit comes back as a
# (document, score) pair; show the best-ranked pair.
docs_score = langchain_chroma.similarity_search_with_score(evaluation_set.iloc[0].question)
best_doc, best_score = docs_score[0]
print(best_doc.page_content)
print(best_doc.metadata)
print("Score: ", best_score)


: 6780
content: ['Energy storage company Leclanché ( SW.LECN) has designed a new battery cell that uses less cobalt and boosts energy density by 20%. The company says it is also produced in an environmentally friendly way, making it more recyclable or easy to dispose of at end-of-life.', 'Leclanché said it has developed an environmentally friendly way to produce lithium-ion ( Li-ion) batteries. It has replaced highly toxic organic solvents, commonly used in the production process, with a water-based process to make nickel-manganese-cobalt-aluminium cathodes ( NMCA).', 'Organic solvents, such as N-methyl pyrrolidone ( NMP), are highly toxic and harmful to the environment. The use of NMP has been restricted by the European Commission, having been added to the list of Substances of Very High Concern, which can have serious irreversible effects on human health and the environment.', 'Besides being technically simpler, eliminating the use of organic solvents also eliminates the risk of expl

In [ ]:
# Expected context for the first evaluation question, to compare with the scored hit.
evaluation_set["relevant_chunk"].iloc[0]

## Collection Query

In [None]:
# Query the underlying chromadb collection directly, without the LangChain wrapper.
langchain_collection = chroma_client.get_collection("my_langchain_collection")

# Embed a real evaluation question with the same model that indexed the
# documents. This replaces an arbitrary integer ramp of hard-coded length, so
# the vector always matches the collection's dimensionality and the nearest
# neighbour is meaningful.
query_vector = embeddings.embed_query(evaluation_set.iloc[0].question)

langchain_collection.query(
    query_embeddings=[query_vector],  # batch containing a single query embedding
    n_results=1,
)

## SelfQueryRetriever

DOESN'T WORK WITH CHROMA but it should:

https://arc.net/l/quote/yerdxjxz

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema exposed to the query-constructor LLM. The names mirror the
# CSV columns that were loaded as document metadata.
metadata_field_info = [
    AttributeInfo(
        name="url",
        description="Url of the document",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="Title of the document",
        type="string",
    ),
    AttributeInfo(
        name="date",
        description="Date of the document",
        type="string",
    ),
    AttributeInfo(
        name="author",
        description="Author of the document",
        type="string",
    ),
    AttributeInfo(
        name="domain",
        description="Domain of the document, closely related to the source of the document",
        type="string",
    ),
]
# Describes the page content to the LLM; it must match the actual corpus
# (cleantech media articles), not a copy-pasted "Product listing".
document_content_description = "Text of a cleantech media article"

# Configure retriever.
# SelfQueryRetriever needs the LangChain vector store (`langchain_chroma`) so it
# can pick the matching query translator. Passing the raw chromadb client
# (`chroma_client`) is rejected as an unsupported vector store type.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, langchain_chroma, document_content_description, metadata_field_info, verbose=True
)

# Based on rag evaluation set
retriever.get_relevant_documents(evaluation_set.iloc[0].question)