In [None]:
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the Cleantech media articles and preview the first rows.
media_csv_path = 'data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv'
df = pd.read_csv(media_csv_path)
df.head()

# Preprocessing & Indexing

In [None]:
import ast

def _join_paragraphs(raw):
    """Parse a stringified Python literal and join its items into one text.

    Each item is converted with ``str`` and the items are separated by a
    blank line.
    """
    return '\n\n'.join(str(part) for part in ast.literal_eval(raw))


# NOTE: this overwrites the column in place, so the cell is not re-runnable
# without reloading `df` first.
df['content'] = df['content'].apply(_join_paragraphs)

df['content'].head()

In [None]:
from src.embedding_strategy import EmbeddingStrategy
strategy = EmbeddingStrategy.get_default_strategy()

# Turn every article into chunk documents that carry the article's metadata.
documents = []
for _, row in tqdm(df.iterrows(), total=len(df)):  # total= lets tqdm show progress and an ETA
    content = row['content']

    # Replace missing values with '' so every metadata field holds a value.
    row = row.fillna('')

    metadata = {
        "url": row['url'],
        "domain": row['domain'],
        "title": row['title'],
        "author": row['author'],
        "date": row['date']
    }

    # Chunk the cleaned text. Previously the result of clean() was discarded
    # and the raw content was chunked, so the cleaning step had no effect.
    # NOTE(review): assumes clean() returns the cleaned text - confirm in src.
    cleaned_content = strategy.processor.clean(content)
    row_docs = strategy.processor.chunk(cleaned_content, metadata)

    documents += row_docs

# The cap is applied after all rows have been chunked.
documents = documents[:100] # Only use the first 100 documents for testing

In [None]:
# Add the chunk documents to the vector store, then run one sample query against it.
strategy.vector_store.add_documents(documents, batch_size=128, verbose=True)

probe_query = "The company is also aiming to reduce gas flaring?"
strategy.vector_store.similarity_search(probe_query)

# Retrieval & Generation

In [None]:
from src.generator import Generator
import os

# The API key is read from the environment; os.getenv returns None when it is unset.
openai_key = os.getenv("OPENAI_API_KEY")
gen = Generator(embedding_strategy=strategy, openai_api_key=openai_key)

# Ask one sample question end to end.
gen.ask("Who was in Paris?")

In [None]:
# Query the strategy's retriever directly, without going through the generator.
retriever = strategy.retriever

paris_query = "Who was in Paris?"
retriever.invoke(paris_query)

In [None]:
# Question based on the RAG evaluation set.
# Uses `invoke`, matching the retriever call in the previous cell;
# `get_relevant_documents` is deprecated in LangChain.
result = retriever.invoke("Who was in Paris?")

# Show the top hit, guarding against an empty result so the cell cannot
# fail with an IndexError.
if result:
    print(result[0].page_content)
    print(result[0].metadata)
else:
    print("No documents retrieved.")

# Evaluation

In [None]:
# Read the RAG evaluation data and preview the first rows.
eval_csv_path = 'data/Cleantech Media Dataset/cleantech_rag_evaluation_data_2024-02-23.csv'
df_eval = pd.read_csv(eval_csv_path)
df_eval.head()

In [None]:
%%script false --no-raise-error
# The cell magic above hands this cell to the `false` command, so the body
# below is skipped without raising; remove that line to run the evaluation.

from src.assessor import Assessor
assessor = Assessor()

# One metrics object per evaluation row, concatenated once after the loop.
metrics_list = []

for _, row in tqdm(df_eval.iterrows(), total=len(df_eval)):
    question = row["question"]
    relevant_chunk = row["relevant_chunk"]

    # gen.ask returns the answer together with the documents it retrieved.
    answer, relevant_documents = gen.ask(question)
    contexts = [doc.page_content for doc in relevant_documents]

    metrics = assessor.assess_example(question, answer, contexts, relevant_chunk)
    metrics_list.append(metrics)

    # Blank lines separate the output of consecutive examples.
    print("\n\n\n")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the evaluation set has no rows.
evaluation = pd.concat(metrics_list, ignore_index=True) if metrics_list else pd.DataFrame()
evaluation.head(20)

In [None]:
%%script false --no-raise-error
# Skipped by default via the cell magic above: per the original note, this
# cell is very expensive to run. Remove the magic line to execute it.

from src.assessor import Assessor, generate_testset
assessor = Assessor()

# Generate a test set from the indexed chunk documents.
testset = generate_testset(documents)
# Only the last expression of a cell is rendered automatically, so the
# mid-cell preview needs an explicit display() call to be shown.
display(testset.to_pandas().head(20))

evaluation = assessor.assess_dataset(testset.to_dataset())
evaluation.head(20)