In [None]:
from langchain_community.vectorstores import Chroma
import chromadb
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

import nltk
from nltk.corpus import stopwords

from html import unescape
import re

# Fixed seed passed as `random_state` to the DataFrame.sample() previews in this
# notebook, so they show the same rows on every run.
RANDOM_SEED = 1337

## Preprocessing

In [None]:
# The stop-word corpus has to be present locally before stopwords.words() is called.
nltk.download('stopwords')

# Raw media articles, one row per article.
documents = pd.read_csv(
    filepath_or_buffer='data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv',
)

# Preview a reproducible handful of rows.
documents.sample(n=5, random_state=RANDOM_SEED)

In [None]:
import ast

# `content` is stored as the string repr of a Python list; parse it back into a
# real list, then emit one row per list element (the other columns are repeated).
documents = (
    documents
    .assign(content=documents['content'].apply(ast.literal_eval))
    .explode(column='content')
)

### Text Normalization

In [None]:
# Normalise the `content` column step by step. Every step rewrites the column,
# so the order of the steps below matters.

# set everything to lowercase
documents['content'] = documents['content'].str.lower()

# remove stopwords from content (done before quote stripping so that
# contractions such as "don't" still match the NLTK stop-word list)
stop_words = set(stopwords.words('english'))
documents['content'] = documents['content'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# remove quotes (plain literal replacement, no regex needed)
documents['content'] = documents['content'].str.replace("'", '', regex=False)

# unescape HTML entities
# html.unescape works on a single string, so it must be applied per element.
# Calling it on the whole Series is a silent no-op: its `'&' not in s` shortcut
# checks the Series *index*, and the Series is returned unchanged.
documents['content'] = documents['content'].apply(unescape)

# remove HTML tags if any
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, in which case no tag would ever be removed.
documents['content'] = documents['content'].str.replace(r'<[^>]+>', '', regex=True)

# Removes specific unwanted characters
documents['content'] = documents['content'].str.replace(r"[\'\/`:“`’]+", '', regex=True)

# Removes non-ASCII (Unicode) characters
documents['content'] = documents['content'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

# Keeps only letters and whitespace
documents['content'] = documents['content'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# trim extra spaces
documents['content'] = documents['content'].str.strip()

# Preview the normalised text on the same reproducible sample.
documents.sample(5, random_state=RANDOM_SEED)

## CSV - Embedding - Chroma

In [None]:
# Load the RAG evaluation set; later cells read its `question`,
# `relevant_chunk` and `article_url` columns.
evaluation_set = pd.read_csv(
    filepath_or_buffer='./data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv',
)


In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings


# Embedding model built with its default settings; the same object is reused
# later as the embedding function of the Chroma collection.
embeddings = HuggingFaceEmbeddings()

# Splitter that uses the embedding model above to place chunk boundaries.
text_splitter = SemanticChunker(embeddings)

# create batches: positional slices of at most BatchSize rows each
BatchSize = 41666
batches = [documents[i:i + BatchSize] for i in range(0, len(documents), BatchSize)]

# Split every batch into chunks. Each row contributes one metadata dict, passed
# in the same order as the texts so every chunk carries its source article's fields.
docs = []
for batch in batches:
    metadatas = [
        dict(zip(("url", "title", "date", "author", "domain"), row_values))
        for row_values in zip(batch.url, batch.title, batch.date, batch.author, batch.domain)
    ]
    docs.extend(text_splitter.create_documents(batch.content, metadatas=metadatas))

# Total number of chunks produced.
len(docs)

In [None]:
# Keep a second reference to the chunk list: `docs` is rebound by the
# similarity-search cell further down, while the indexing cell reads
# `chunked_documents`.
chunked_documents = docs


In [None]:
from langchain_community.document_loaders import DataFrameLoader 

# this is useless
from langchain.text_splitter import CharacterTextSplitter

# dataframe from list
# DataFrameLoader only accepts a pandas DataFrame and raises ValueError for any
# other type, but `chunked_documents` is a plain list of LangChain Documents.
# Flatten each chunk into one row first: its text under 'content' plus one
# column per metadata field. The explicit column list keeps the frame well
# formed (and loadable) even when there are no chunks at all.
chunks_frame = pd.DataFrame(
    [{**chunk.metadata, 'content': chunk.page_content} for chunk in chunked_documents],
    columns=['content', 'url', 'title', 'date', 'author', 'domain'],
)

loader = DataFrameLoader(chunks_frame, page_content_column='content')

# NOTE: this rebinds `documents` from the article DataFrame to a list of
# Documents, so the preprocessing cells above cannot be re-run after this cell
# without reloading the CSV first.
documents = loader.load()




In [None]:
from langchain_community.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings


# Connect to a Chroma server expected to be listening on localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
# NOTE(review): reset() presumably clears whatever the server already holds so
# the index is rebuilt from scratch - confirm before pointing this at a shared server.
chroma_client.reset()

# LangChain wrapper around the collection, using the embedding model created
# in the chunking cell.
langchain_chroma = Chroma(
    client=chroma_client,
    collection_name="my_langchain_collection",
    embedding_function=embeddings,
)

# create batches: consecutive slices of at most BatchSize chunks
BatchSize = 41666
batches = []
for start in range(0, len(chunked_documents), BatchSize):
    batches.append(chunked_documents[start:start + BatchSize])

# Index the chunks one batch per request.
for batch in batches:
    langchain_chroma.add_documents(documents=batch)

## Similarity Search
Query against the collection

In [None]:
# Use the first question of the RAG evaluation set as a smoke-test query.
first_question = evaluation_set.iloc[0].question
print(first_question)

# Note: `docs` now holds the search hits, no longer the chunk list.
docs = langchain_chroma.similarity_search(first_question)

# Show the best hit: its text, then its metadata.
top_hit = docs[0]
print(top_hit.page_content)
print(top_hit.metadata)

In [None]:
# The chunk the evaluation set marks as relevant for that first question,
# shown for manual comparison with the retrieved text above.

evaluation_set.iloc[0]['relevant_chunk']

## similarity_search_with_score

In [None]:
# Same query as before, but each hit is returned as a (document, score) pair.
docs_score = langchain_chroma.similarity_search_with_score(evaluation_set.iloc[0].question)

best_document, best_score = docs_score[0]
print(best_document.page_content)
print(best_document.metadata)
print("Score: ", best_score)


In [None]:
# Expected chunk for the first evaluation question, for comparison with the scored hit.
evaluation_set.iloc[0]['relevant_chunk']

In [None]:
from fuzzywuzzy import fuzz


# Evaluate retrieval over the whole evaluation set.
# For every question the top hits are printed, and the best hit is compared
# with the expected chunk (and its URL) using fuzzy token-set matching.
# `score` accumulates the retriever's own score of the best hit, `fuzzy_score`
# accumulates the 0-100 fuzzy ratio between expected and retrieved text.
score = 0
fuzzy_score = 0
answered = 0  # number of questions for which the store returned at least one hit

# iterate through evaluation set
for index, row in evaluation_set.iterrows():
    similarity_search = langchain_chroma.similarity_search_with_score(row.question)[:3]
    print("Question: ", row.question)
    print("Relevant Chunk: ", row.relevant_chunk)
    # Iterate over the hits actually returned: the store may hold fewer than
    # three matching chunks, and indexing 0..2 unconditionally would raise IndexError.
    for i, (hit, hit_score) in enumerate(similarity_search):
        print("Result ", i, ": ", hit.page_content)
        print("Score ", i, ": ", hit_score)

    if not similarity_search:
        # Nothing to compare against; leave this question out of the averages.
        print("----")
        continue

    best_hit, best_score = similarity_search[0]

    # define own similarity based on fuzzy matching
    fuzzy_match = fuzz.token_set_ratio(row.relevant_chunk, best_hit.page_content)
    print("Fuzzy Match: ", fuzzy_match)
    print("Article Url: ", row.article_url)
    print("Content Url: ", best_hit.metadata['url'])
    fuzzy_match_url = fuzz.token_set_ratio(row.article_url, best_hit.metadata['url'])
    print("Fuzzy Match Url: ", fuzzy_match_url)
    print("----")

    score += best_score
    fuzzy_score += fuzzy_match
    answered += 1

# Average over the questions that produced a hit. When every question did, this
# equals the average over the full evaluation set. The guard avoids a
# ZeroDivisionError when the evaluation set is empty or nothing was retrieved.
if answered:
    print("Average Score: ", score/answered)
    # fuzzy ratios are on a 0-100 scale; rescale the average to 0-1
    print("Average Fuzzy Score: ", fuzzy_score/(100*answered))
else:
    print("No results to evaluate")
    

## Collection Query

In [None]:
# Query the underlying Chroma collection directly, bypassing LangChain.
langchain_collection = chroma_client.get_collection("my_langchain_collection")

# Dummy 768-value query vector (0, 1, ..., 767).
# NOTE(review): presumably 768 matches the dimension of the embedding model - confirm.
dummy_query_vector = list(range(768))

langchain_collection.query(
    query_embeddings=dummy_query_vector,
    n_results=1,
)

## SelfQueryRetriever

In [None]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema exposed to the self-query retriever. The fields mirror the
# metadata keys attached to every chunk; all of them are declared as strings.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("url", "Url of the document"),
        ("title", "Title of the document"),
        ("date", "Date of the document"),
        ("author", "Author of the document"),
        ("domain", "Domain of the document, closely related to the source of the document"),
    )
]
document_content_description = "Article listing"

# Configure the retriever: an LLM with temperature 0 builds the structured
# query that is then run against the Chroma store.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    langchain_chroma,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

# Try it on the first question of the RAG evaluation set and show the best hit.
result = retriever.get_relevant_documents(evaluation_set.iloc[0].question)
first_result = result[0]
print(first_result.page_content)
print(first_result.metadata)