In [None]:
import os

from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a .env file before the project import below.
# NOTE(review): presumably src.generation needs these variables to configure the model — confirm.
load_dotenv()
from src.generation import get_llm_model, LLMModel

# LLM used as the generation step of the RAG chain later in this notebook.
azure_model = get_llm_model(LLMModel.GPT_3_AZURE)

In [None]:
from phoenix.trace.langchain import LangChainInstrumentor
import phoenix as px

# Tracing is best-effort: any failure while starting Phoenix is reported and
# the notebook carries on without it.
try:
    # NOTE(review): close_app() presumably shuts down a session left over from
    # an earlier run of this cell — confirm.
    px.close_app()
    session = px.launch_app()

    LangChainInstrumentor().instrument()
    session.view()
except Exception as launch_error:
    print("Error while launching Phoenix", launch_error, sep="\n")

In [None]:
import pandas as pd

# Raw dataset, read from a path relative to the notebook's working directory.
df = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv')
df.head()

In [None]:
from src.preprocessing.preprocessor import Preprocessor

# Run the project preprocessor over the raw frame; `default_df` is the frame
# every later step (document creation, evaluation) works on.
# NOTE(review): the exact effect of explode=False / concatenate_contents=True is
# defined in src.preprocessing.preprocessor and is not visible here.
default_preprocesser = Preprocessor(df, verbose=True, explode=False, concatenate_contents=True)
default_df = default_preprocesser.preprocess()

In [None]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model used for the vector store: BAAI/bge-small-en on CPU, with
# normalized embeddings requested at encode time.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

In [None]:
from langchain_core.documents import Document


def create_documents(df):
    """Convert every row of ``df`` into a LangChain ``Document``.

    The ``content`` column becomes the page content; ``url``, ``domain``,
    ``title``, ``author`` and ``date`` are copied into the metadata.

    Missing values are replaced by empty strings before *any* field is read,
    so a row without content produces an empty document instead of handing
    NaN to ``Document`` as its page content.

    Args:
        df: DataFrame providing the columns ``content``, ``url``, ``domain``,
            ``title``, ``author`` and ``date``.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    docs = []
    # iterrows() is a generator without a length, so tqdm needs an explicit
    # total to display a percentage and ETA.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Clean the row first so content and metadata are treated the same way.
        row = row.fillna('')

        metadata = {
            "url": row['url'],
            "domain": row['domain'],
            "title": row['title'],
            "author": row['author'],
            "date": row['date']
        }

        docs.append(Document(page_content=row['content'], metadata=metadata))

    return docs


# One Document per row of the preprocessed frame.
documents = create_documents(default_df)

# Sanity check: no row was dropped or duplicated during conversion.
assert len(documents) == len(default_df)

In [None]:
from src.vectorstore import VectorStore

# Print the ChromaDB connection settings found in the environment
# (None means the variable is not set).
print(f"ChromaDB Host: {os.getenv('CHROMADB_HOST')}")
print(f"ChromaDB Port: {os.getenv('CHROMADB_PORT')}")

# Vector store backed by the BGE embeddings; all documents of this notebook
# live in the "cleantech-bge-small-en" collection.
bge_vector_store = VectorStore(embedding_function=bge_embeddings,
                               collection="cleantech-bge-small-en")

In [None]:
%%script false --no-raise-error
# Cell intentionally disabled: the magic above hands the cell body to the
# `false` command instead of the Python kernel, so the line below never runs.
# Remove the magic line to add the documents to the collection.
bge_vector_store.add_documents(documents, verbose=True, batch_size=128)

# Baseline Pipeline

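The baseline pipeline is a first, simple implementation of the RAG pipeline: documents are retrieved from the vector store, their contents are inserted into the prompt as context, and the LLM answers the question.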


In [None]:
# Prompt template for the baseline chain. It has two placeholders:
# {context} (the retrieved document text) and {question} (the user question).
rag_prompt = """
Answer the question to your best knowledge when looking at the following context:
{context}
                
Question: {question}
"""

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel


def format_docs(docs):
    """Join the page contents of ``docs`` into one string.

    Consecutive documents are separated by a blank line; an empty input
    gives an empty string.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


def pretty_print_docs(docs):
    """Print the page content of each document under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    """
    separator = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))


# Answer-generation part of the pipeline. It receives a dict with "context"
# (a list of documents) and "question", replaces "context" with the text
# produced by format_docs, fills the prompt template, calls the LLM and
# parses the model output into a plain string.
rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | ChatPromptTemplate.from_template(rag_prompt)
        | azure_model
        | StrOutputParser()
)

# Full pipeline: the input question is sent to the retriever (result stored
# under "context") and also passed through unchanged (under "question"); the
# generated text is then added under "answer", so the result keeps the
# retrieved documents alongside the answer.
rag_chain_with_source = RunnableParallel(
    {
        "context": bge_vector_store.get_retriever(),
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

In [None]:
# Smoke test: run the full chain on one sample question and display the result.
rag_chain_with_source.invoke("Is the company aiming to reduce gas flaring?")

# Evaluation

In [None]:
# Alternative evaluation file, currently unused:
#df_eval_subset = pd.read_csv('data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23_subset_eval.csv')

df_eval_subset = pd.read_csv('data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv')

#df_eval_subset = df_eval_subset.dropna(subset=['answer'])
# Evaluate on 10 distinct rows. The seed is fixed so that Restart & Run All
# always picks the same rows and evaluation results stay comparable.
df_eval_subset = df_eval_subset.drop_duplicates().sample(10, random_state=42)
df_eval_subset

In [None]:
# Number of documents built from the preprocessed frame.
len(documents)

In [None]:
# Count of repeated titles in the preprocessed frame vs. the raw frame.
default_df['title'].duplicated().sum(), df['title'].duplicated().sum()

In [None]:
from src.preprocessing.eval_preprocessor import EvaluationSetPreprocessor

# Build the evaluation frame from the preprocessed articles and the sampled
# evaluation rows.
# NOTE(review): the matching logic lives in src.preprocessing.eval_preprocessor;
# later cells rely on the 'relevant_chunk' and 'best_match_id' columns it yields.
eval_processor = EvaluationSetPreprocessor(default_df, df_eval_subset, verbose=True)
eval_df = eval_processor.preprocess()

In [None]:
# Display the evaluation frame produced by the evaluation preprocessor.
eval_df

In [None]:
# Spot check on the first two evaluation rows: print the relevant chunk next to
# the title of the article whose id equals the row's best_match_id.
for index, row in eval_df.head(2).iterrows():
    best_match = default_df.loc[default_df['id'] == row['best_match_id']]

    print(f"Relevant chunk: {row['relevant_chunk']}")
    print("\n\n")
    print(f"Best Match: {best_match['title'].values}")

    # Large vertical gap between the two entries.
    print("\n\n" * 10)