# Setup

In [None]:
import os

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where CHROMADB_HOST / CHROMADB_PORT (read
# later via os.getenv) come from — confirm against the project's .env file.
load_dotenv()

## Observability & Monitoring

Phoenix is an open-source observability library designed for experimentation, evaluation, and troubleshooting. It allows AI engineers and data scientists to quickly visualize their data, evaluate performance, track down issues, and export data for further improvement.
 

In [None]:
from phoenix.trace.langchain import LangChainInstrumentor
import phoenix as px

# Close a Phoenix app that may still be running, then launch a new one.
# NOTE(review): presumably this keeps re-runs of the cell from leaving an old
# session behind — confirm against the Phoenix docs.
px.close_app()
session = px.launch_app()

# Instrument LangChain so that chain runs are reported to Phoenix.
LangChainInstrumentor().instrument()

In [None]:
# Show the Phoenix UI of the session held in `session`.
session.view()

# Data Loading

In [None]:
# Read the raw Cleantech Media articles; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv',
)
# Preview the first rows as the cell output.
df.head()

# Preprocessing & Indexing

## Splitting content into paragraphs

The `content` column is read from the CSV as the string representation of a list of paragraphs. We parse each value back into a list and join its paragraphs into a single string, separated by two newlines.

In [None]:
import ast

# Each `content` value is the textual form of a Python list of paragraphs.
# Three passes, in this order: parse the literal, coerce every paragraph to
# str, then glue the paragraphs together with a blank line in between.
# NOTE: this cell is meant to run once per kernel — on an already converted
# column the joined text is generally no longer a list literal, so parsing
# it again fails.
df['content'] = (
    df['content']
    .apply(ast.literal_eval)
    .apply(lambda paragraphs: [str(paragraph) for paragraph in paragraphs])
    .apply('\n\n'.join)
)
df['content'].head()

# Indexing

For the indexing we use the VectorStore class which bundles the embeddings and ChromaDB.

In [None]:
from src.vectorstore import VectorStore
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model used by the vector store and, later, by the evaluator.
# The model is placed on the CPU via `model_kwargs`.
# NOTE(review): `normalize_embeddings=True` is passed through as an encode
# option, presumably to obtain unit-length vectors — confirm.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size and overlap are measured with `len`, i.e. in characters:
# chunks of up to 1024 characters with an overlap of 32 characters.
recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=32, length_function=len)

In [None]:
from langchain_core.documents import Document


def create_documents(df, splitter):
    """Split every row's text into chunks and wrap each chunk in a ``Document``.

    Args:
        df: DataFrame with a ``content`` column holding the text to split and
            the metadata columns ``url``, ``domain``, ``title``, ``author``
            and ``date``.
        splitter: Object exposing ``split_text(text)`` that returns the
            chunks of a text.

    Returns:
        A flat list with one ``Document`` per chunk, in row order. Every
        document gets its own metadata dict built from the row; missing
        metadata values are stored as empty strings.

    Raises:
        KeyError: If one of the expected columns is missing from ``df``.
    """
    metadata_columns = ("url", "domain", "title", "author", "date")

    docs = []
    # iterrows() is a generator without a length, so the total is passed
    # explicitly to let tqdm show a real progress bar.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # The text is read before missing values are replaced, so a missing
        # body is not silently turned into an empty string.
        content = row['content']

        filled_row = row.fillna('')
        metadata = {column: filled_row[column] for column in metadata_columns}

        for chunk in splitter.split_text(content):
            # Hand each document its own dict so chunks of one row never
            # share a single mutable metadata object.
            docs.append(Document(page_content=chunk, metadata=dict(metadata)))

    return docs


# Chunk the whole dataset with the recursive character splitter.
documents = create_documents(df, recursive_text_splitter)

In [None]:
# Print the ChromaDB connection settings found in the environment; os.getenv
# yields None for a variable that is not set.
# NOTE(review): whether VectorStore itself reads these variables is not
# visible here — confirm in src.vectorstore.
print("ChromaDB Host: ", os.getenv('CHROMADB_HOST'))
print("ChromaDB Port: ", os.getenv('CHROMADB_PORT'))

# Vector store for the BGE embeddings, bound to its own named collection.
bge_vector_store = VectorStore(embedding_function=bge_embeddings,
                               collection="cleantech-bge-small-en")

In the next step we add the documents to the vector store. This can take a while, depending on the number of documents. The cell below is disabled with `%%script false`; remove that line to actually run the indexing.

In [None]:
%%script false --no-raise-error
# The cell magic above hands this cell to the shell command `false` instead of
# running it as Python, so nothing below is executed; `--no-raise-error` keeps
# the non-zero exit status from being reported as an error. Remove the magic
# line to actually add the documents to the vector store.

bge_vector_store.add_documents(documents, verbose=True, batch_size=128)

After adding the documents to the vector store we can now perform similarity searches.

In [None]:
# Quick check of the index with a sample query.
# NOTE(review): the method name suggests that scores are returned together
# with the matching documents — confirm in src.vectorstore.
bge_vector_store.similarity_search_w_scores("The company is also aiming to reduce gas flaring?")

# Retrieval & Generation

In [None]:
from src.generation import get_llm_model, LLMModel

# Model used for answer generation in the RAG chain and passed to the evaluator.
azure_model = get_llm_model(LLMModel.GPT_3_AZURE)

In [None]:
# Prompt template of the RAG chain. The {context} and {question} placeholders
# are filled in when the template is used via ChatPromptTemplate.from_template.
rag_prompt = """
Answer the question to your best knowledge when looking at the following context:
{context}
                
Question: {question}
"""

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel


def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


# Generation part of the pipeline. It expects a mapping with a "context" entry
# (the retrieved documents) and a "question" entry:
#   1. overwrite "context" with the documents' text joined by format_docs,
#   2. render the prompt template,
#   3. call the model,
#   4. reduce the model output to a plain string.
rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | ChatPromptTemplate.from_template(rag_prompt)
        | azure_model
        | StrOutputParser()
)

# Full pipeline: the input is passed both to the retriever ("context") and
# through unchanged ("question"); the generated text is then added under
# "answer", so the retrieved context stays available next to the answer.
# NOTE(review): assumes the retriever accepts the raw question string and
# returns a list of documents — confirm in src.vectorstore.
rag_chain_with_source = RunnableParallel(
    {
        "context": bge_vector_store.get_retriever(),
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

In [None]:
# Run the full chain once on a sample question; the result is the cell output.
rag_chain_with_source.invoke("Is the company aiming to reduce gas flaring?")

# Evaluation

In [None]:
# Load the evaluation subset of the dataset (path relative to the working
# directory) and display it as the cell output.
df_eval_subset = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23_subset_eval.csv')
df_eval_subset

In [None]:
# Drop rows without a value in the `answer` column, then drop rows that are
# exact duplicates of an earlier row.
df_eval_subset = (
    df_eval_subset
    .dropna(subset=['answer'])
    .drop_duplicates()
)

In [None]:
from src.evaluation import RAGEvaluator

# Evaluator wired to the chain under test; it is given the same model and
# embeddings that the pipeline itself uses.
rag_evaluator = RAGEvaluator(chain=rag_chain_with_source,
                             llm_model=azure_model,
                             embeddings=bge_embeddings)

In [None]:
# Only the first rows (`head()` defaults to five) are evaluated.
# NOTE(review): `raise_exceptions=False` presumably lets the run continue when
# a single evaluation fails — confirm in src.evaluation.
rag_evaluator.create_dataset_from_df(df_eval_subset.head())
rag_evaluator.evaluate(raise_exceptions=False)