In [115]:
import os
import shutil

import chromadb
import pandas as pd
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.llms.openai import OpenAI
from llama_index.core import ServiceContext
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core import get_response_synthesizer
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.query_engine.retriever_query_engine import (
    RetrieverQueryEngine,
)
from llama_index.core.llama_dataset import LabelledRagDataset
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.response_synthesizers import ResponseMode

from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

In [114]:
# --- Paths ---------------------------------------------------------------
dataset_name = "PaulGrahamEssayDataset"
dataset_source_dir = "./data/paul_graham/source_files"  # raw documents to index
dataset_config_dir = "./data/paul_graham/dataset"  # cached RAG dataset JSON files
index_dir = "./data/paul_graham/index"  # persisted vector indexes
eval_dir = "./data/paul_graham/eval"  # benchmark outputs

# --- Indexing / retrieval knobs --------------------------------------------
chunk_size, chunk_overlap = 512, 50
similarity_top_k = 2
# Supported embeddings:
# https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/#list-of-supported-embeddings
embedding_type = "openai"
# Available response modes:
# https://docs.llamaindex.ai/en/stable/module_guides/querying/response_synthesizers/
response_mode = "compact"

# --- Dataset generation ------------------------------------------------------
llm = "gpt-4"  # model name handed to the OpenAI LLM wrapper
num_questions_per_chunk = 1

In [94]:
def process_nodes(nodes,
                  chunk_size: int,
                  chunk_overlap: int,
                  embedding_type: str=None):
    """Chunk the given documents and, optionally, embed the chunks.

    Args:
        nodes: The documents (or nodes) to ingest. They are forwarded to
            ``IngestionPipeline.run(documents=...)``.
        chunk_size: ``chunk_size`` handed to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``SentenceSplitter``.
        embedding_type: ``None`` or ``'openai'`` appends an ``OpenAIEmbedding``
            step after the splitter. Any other value runs the splitter only,
            i.e. no embedding transformation is added here.

    Returns:
        The result of ``IngestionPipeline.run`` for the supplied input.
    """
    transformations = [
        SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap),
    ]
    if embedding_type in (None, 'openai'):
        transformations.append(OpenAIEmbedding())

    # https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/
    pipeline = IngestionPipeline(transformations=transformations)

    # Run on the *argument*: the previous version read a global named
    # `documents`, so the function ignored its input and failed with a
    # NameError (or silently used stale data) outside the one calling cell.
    return pipeline.run(documents=nodes)


def make_query_engine(index,
                      similarity_top_k: int=2,
                      response_mode: str='compact'):
    """Build a retriever-backed query engine on top of ``index``.

    Args:
        index: Index handed to ``VectorIndexRetriever``.
        similarity_top_k: ``similarity_top_k`` passed to the retriever.
        response_mode: ``response_mode`` passed to ``get_response_synthesizer``.

    Returns:
        A ``RetrieverQueryEngine`` wired with that retriever and synthesizer.
    """
    # Keyword arguments are evaluated left to right, so the retriever is still
    # created before the response synthesizer.
    return RetrieverQueryEngine(
        retriever=VectorIndexRetriever(
            index=index,
            similarity_top_k=similarity_top_k,
        ),
        response_synthesizer=get_response_synthesizer(
            response_mode=response_mode,
        ),
    )

In [97]:
# LLM configuration; `llm_context` is reused by the dataset-generation cell.
# NOTE(review): newer llama_index releases deprecate ServiceContext — migrate
# this together with every consumer of `llm_context` when upgrading.
llm_context = ServiceContext.from_defaults(
    llm=OpenAI(model=llm, temperature=0.2)
)

# One index folder per (chunk size, overlap, embedding) combination so that
# different configurations never overwrite each other.
index_dir_ = os.path.join(index_dir, f'index_cs_{chunk_size}_co_{chunk_overlap}_em_{embedding_type}')

nodes = None
if not os.path.exists(index_dir_) or not os.listdir(index_dir_):  # create and save index if not exist
    # Fix: this used to read the undefined name `dataset_source_dir_`, which
    # raised a NameError whenever the index actually had to be built.
    documents = SimpleDirectoryReader(dataset_source_dir).load_data()
    nodes = process_nodes(documents, chunk_size, chunk_overlap, embedding_type)
    # https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_guide/
    index = VectorStoreIndex(nodes)
    index.storage_context.persist(persist_dir=index_dir_)
else:  # load index if already exists
    # https://docs.llamaindex.ai/en/stable/understanding/storing/storing/
    storage_context = StorageContext.from_defaults(persist_dir=index_dir_)
    index = load_index_from_storage(storage_context)
    # Recover the chunked nodes from the persisted docstore so later cells
    # that need `nodes` (dataset generation) also work with a cached index.
    # Stays None when the docstore holds no nodes.
    nodes = list(index.docstore.docs.values()) or None

query_engine = make_query_engine(index, similarity_top_k, response_mode)

  llm_context = ServiceContext.from_defaults(


In [103]:
# Path of the cached dataset JSON for this chunking configuration
# (despite the `_dir_` suffix this is a file path, not a directory).
dataset_config_dir_ = os.path.join(dataset_config_dir, f'rag_dataset_cs_{chunk_size}_co_{chunk_overlap}.json')

if not os.path.exists(dataset_config_dir_):  # create and save dataset if not exist
    if nodes is None:
        raise RuntimeError(
            "Cannot generate the RAG dataset: `nodes` is None. "
            "Run the index cell so that it produces the chunked nodes first."
        )
    # Create the output folder *before* the expensive generation so the final
    # save cannot fail on a missing directory after all the LLM calls.
    dataset_parent_dir = os.path.dirname(dataset_config_dir_)
    if dataset_parent_dir:
        os.makedirs(dataset_parent_dir, exist_ok=True)

    dataset_generator = RagDatasetGenerator.from_documents(
        nodes,
        service_context=llm_context,
        num_questions_per_chunk=num_questions_per_chunk,
        show_progress=True,
    )
    # this operation may take a while
    rag_dataset = dataset_generator.generate_dataset_from_nodes()
    rag_dataset.save_json(dataset_config_dir_)
else:  # load dataset if already exists
    rag_dataset = LabelledRagDataset.from_json(dataset_config_dir_)
rag_dataset.to_pandas()

./data/paul_graham/dataset/rag_dataset_cs_512_co_50.json


Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,"In the context, the author describes his early...",[What I Worked On\n\nFebruary 2021\n\nBefore c...,The author's early experiences with programmin...,ai (gpt-4),ai (gpt-4)
1,Discuss the transition from time-sharing machi...,"[On a machine without time-sharing, this was a...",The text describes a transition from time-shar...,ai (gpt-4),ai (gpt-4)
2,"In the context, the author mentions a novel an...",[I couldn't have put this into words when I wa...,"The novel mentioned by the author is ""The Moon...",ai (gpt-4),ai (gpt-4)
3,Based on the author's experience and observati...,[I had gotten into a program at Cornell that d...,The author considered the approach to Artifici...,ai (gpt-4),ai (gpt-4)
4,"In the context, the author discusses his journ...","[So I decided to focus on Lisp. In fact, I dec...","The author initially focused on Lisp, a progra...",ai (gpt-4),ai (gpt-4)
5,"In the provided text, the author discusses his...","[There, right on the wall, was something you c...",The author attempts to balance his interests i...,ai (gpt-4),ai (gpt-4)
6,What was the author's initial topic of dissert...,[Then one day in April 1990 a crack appeared i...,"The author initially chose ""applications of co...",ai (gpt-4),ai (gpt-4)
7,"In the text, the author describes a unique arr...",[I had some money saved from consulting work I...,The author describes an arrangement between th...,ai (gpt-4),ai (gpt-4)
8,"In the provided passage, the author discusses ...",[[3]\n\nWhile I was a student at the Accademia...,The author perceives a significant difference ...,ai (gpt-4),ai (gpt-4)
9,"Discuss the author's experience at Interleaf, ...",[This is not the only way to paint. I'm not 10...,The author's experience at Interleaf was his c...,ai (gpt-4),ai (gpt-4)


In [104]:
# Smoke test: send one ad-hoc question through the query engine and keep the
# response object for inspection in the following cells.
response = query_engine.query("Who is Paul Graham.")

In [105]:
# Number of source nodes attached to the response
# (presumably bounded by `similarity_top_k` — TODO confirm).
len(response.source_nodes)

2

In [108]:
# The answer text carried by the response object.
response.response

'Paul Graham is an individual who was involved in organizing a Summer Founders Program for startups, where he funded a group of impressive individuals including reddit founders, Justin Kan, Emmett Shear, Aaron Swartz, and Sam Altman. Additionally, he worked on a project called Bel, an interpreter written in itself, which he completed in the fall of 2019.'

In [110]:
# Text of the first source node, to eyeball what the answer was built from.
response.source_nodes[0].text

'That part we discovered by accident, not merely implicitly but explicitly due to our ignorance about investing. We needed to get experience as investors. What better way, we thought, than to fund a whole bunch of startups at once? We knew undergrads got temporary jobs at tech companies during the summer. Why not organize a summer program where they\'d start startups instead? We wouldn\'t feel guilty for being in a sense fake investors, because they would in a similar sense be fake founders. So while we probably wouldn\'t make much money out of it, we\'d at least get to practice being investors on them, and they for their part would probably have a more interesting summer than they would working at Microsoft.\n\nWe\'d use the building I owned in Cambridge as our headquarters. We\'d all have dinner there once a week â\x80\x94 on tuesdays, since I was already cooking for the thursday diners on thursdays â\x80\x94 and after dinner we\'d bring in experts on startups to give talks.\n\nWe kn

In [None]:
from llama_index.packs.rag_evaluator.base import RagEvaluatorPack

# Pair the query engine under test with the labelled dataset. `rag_evaluator`
# is kept because later cells read its `prediction_dataset`.
rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine,
    rag_dataset=rag_dataset,
)
# NOTE(review): presumably issues LLM-backed calls for every dataset row, so
# this can be slow and costly — TODO confirm before re-running.
benchmark_df = rag_evaluator.run()

In [113]:
# Preview the benchmark table returned by `rag_evaluator.run()`.
benchmark_df.head()

rag,base_rag
metrics,Unnamed: 1_level_1
mean_correctness_score,3.9875
mean_relevancy_score,0.875
mean_faithfulness_score,1.0
mean_context_similarity_score,0.961158


In [124]:
# One evaluation folder per (chunking, embedding, top-k) configuration.
eval_dir_ = os.path.join(eval_dir, f'index_cs_{chunk_size}_co_{chunk_overlap}_em_{embedding_type}_k_{similarity_top_k}')
os.makedirs(eval_dir_, exist_ok=True)

# Move the artifacts found in the working directory (presumably written by
# `rag_evaluator.run()` — TODO confirm) into that folder.
# The destination is spelled out as a *file* path: moving into an existing
# directory raises "Destination path ... already exists" when a previous run
# already left a file of the same name there, which made this cell fail on
# re-run. With an explicit file path the old copy is overwritten instead.
shutil.move('_evaluations.json', os.path.join(eval_dir_, '_evaluations.json'))
shutil.move('benchmark.csv', os.path.join(eval_dir_, 'benchmark.csv'))

'./data/paul_graham/eval/index_cs_512_co_50_em_openai/benchmark.csv'

In [None]:
# Evaluate the retriever on its own: compare retrieved contexts with the reference contexts (hit rate, MRR)

In [129]:
# Show the evaluator's prediction dataset as a DataFrame; the retrieval
# metrics computed below read its `contexts` column.
rag_evaluator.prediction_dataset.to_pandas()

Unnamed: 0,response,contexts
0,The author faced challenges with programming o...,[What I Worked On\n\nFebruary 2021\n\nBefore c...
1,The transition from time-sharing machines to m...,"[On a machine without time-sharing, this was a..."
2,"The author mentions a novel called ""The Moon i...",[I had gotten into a program at Cornell that d...
3,The author considered the approach to Artifici...,[I had gotten into a program at Cornell that d...
4,The author initially focused on Lisp and syste...,"[So I decided to focus on Lisp. In fact, I dec..."
5,The author in the text describes his interest ...,"[There, right on the wall, was something you c..."
6,The author's initial topic of dissertation was...,[Then one day in April 1990 a crack appeared i...
7,The author describes an arrangement at the Acc...,[I had some money saved from consulting work I...
8,The author highlights the distinction between ...,[[3]\n\nWhile I was a student at the Accademia...
9,The author's experience at Interleaf provided ...,[The good part was that I got paid huge amount...


In [161]:
from llama_index.core.evaluation import RetrieverEvaluator, RetrievalEvalResult
from llama_index.core.evaluation.retrieval.metrics import HitRate, MRR, resolve_metrics

# TODO add cohere rank
metric_names = ["mrr", "hit_rate"]
metric_types = resolve_metrics(metric_names)
metrics = [metric() for metric in metric_types]
df_rag_evaluator = rag_evaluator.prediction_dataset.to_pandas()
df_rag_dataset = rag_dataset.to_pandas()

# Materialize each column once. The previous version called `.tolist()[i]`
# several times per row and metric, rebuilding the whole list on every access.
queries = df_rag_dataset['query'].tolist()
reference_contexts = df_rag_dataset['reference_contexts'].tolist()
retrieved_contexts = df_rag_evaluator['contexts'].tolist()

# Rows are paired by position, so both frames must describe the same examples.
if len(retrieved_contexts) != len(reference_contexts):
    raise ValueError(
        f"prediction dataset has {len(retrieved_contexts)} rows but the "
        f"reference dataset has {len(reference_contexts)}"
    )

# The context *texts* are passed as ids because no node ids are available in
# these frames. NOTE(review): presumably the metrics match ids by equality, so
# a retrieved chunk only counts when its text is identical to a reference
# context — TODO confirm.
eval_results = []
for query, retrieved, expected in zip(queries, retrieved_contexts, reference_contexts):
    metric_dict = {
        metric.metric_name: metric.compute(
            retrieved_ids=retrieved, expected_ids=expected
        )
        for metric in metrics
    }
    eval_results.append(
        RetrievalEvalResult(metric_dict=metric_dict,
                            query=query,
                            retrieved_texts=retrieved,
                            retrieved_ids=retrieved,
                            expected_ids=expected,
                            expected_texts=expected)
    )

In [174]:
# One row per evaluated query, one column per metric name.
# Relies on `import pandas as pd` from the top import cell; this cell used
# `pd` without it ever being imported.
metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]
full_df = pd.DataFrame(metric_dicts)

# Average every metric over all queries.
mean_metrics = {
    "mean_hit_rate": full_df["hit_rate"].mean(),
    "mean_mrr": full_df["mrr"].mean(),
}
data = {
    "metrics": list(mean_metrics.keys()),
    "score": list(mean_metrics.values()),
}

# A freshly built frame already has a 0..n-1 index, so no reset is needed.
metric_df = pd.DataFrame(data)
metric_df.head()

Unnamed: 0,metrics,score
0,mean_hit_rate,0.9
1,mean_mrr,0.85


In [173]:
# Store the aggregated retrieval metrics in `eval_dir_`, the per-configuration
# evaluation folder created in an earlier cell.
eval_path = os.path.join(eval_dir_, 'retrieval_benchmark.csv')
metric_df.to_csv(eval_path, index=False)  # index=False: do not write the row index