In [1]:
# Only run this if you have an editable install.
# Loads IPython's autoreload extension and sets mode 2 so that edits to the
# package source are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
from datasets import load_dataset

# Load the "test" split of the "main" configuration of the FiQA dataset.
fiqa_test = load_dataset(
    "explodinggradients/fiqa",
    "main",
    split="test",
)
# Show the dataset summary (features and row count) as the cell output.
fiqa_test

Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/main/1.0.0/953cfddc4a440cf2e290172be2563e5b51a953f2e4266940fc2b311e135cea69)


Dataset({
    features: ['question', 'ground_truths'],
    num_rows: 648
})

## k=1

We know the performance of the baseline model, so let's see if we can improve on it.

In [3]:
from llama_index import GPTVectorStoreIndex, MockEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext, StorageContext

# Service context backed by a HuggingFace embedding model, wrapped through
# langchain so that llama_index can use it.
hf_sc = ServiceContext.from_defaults(
    embed_model=LangchainEmbedding(HuggingFaceEmbeddings()),
)

# Service context backed by mock embeddings (dimension 1536).
embed_model = MockEmbedding(embed_dim=1536)
mock = ServiceContext.from_defaults(embed_model=embed_model)

# Service context built purely from the library defaults; the notebook treats
# this one as the OpenAI-embeddings configuration.
openai_sc = ServiceContext.from_defaults()

In [4]:
# load the index
from llama_index import StorageContext, load_index_from_storage, ServiceContext

# Rebuild the storage context from the files persisted under ./storage.
storage_context = StorageContext.from_defaults(persist_dir="./storage")

# Load the previously persisted index from that storage context.
# NOTE(review): no service_context is passed here, so the index is loaded with
# whatever defaults the library applies -- confirm this matches how it was built.
index = load_index_from_storage(storage_context)

# Build a query engine straight from the index, using the default-constructed
# service context (`openai_sc`).
# NOTE(review): `qe` is rebound to a RetrieverQueryEngine in a later cell, so on
# a top-to-bottom run this engine is not the one `generate_response` queries.
qe = index.as_query_engine(
    mode="embedding", 
    verbose=True, 
    service_context=openai_sc,
    use_async = False
)

In [5]:
from llama_index import (
    GPTVectorStoreIndex,
    ResponseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# Retriever: fetch only the single most similar node from the index (k=1).
retriever = VectorIndexRetriever(index=index, similarity_top_k=1)

# Response synthesizer: retrieved nodes pass through a similarity
# postprocessor configured with a cutoff of 0.7.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.7)
response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[similarity_filter],
)
del similarity_filter  # keep the notebook namespace identical to the original cell

# Query engine: wire the retriever and the synthesizer together. This rebinds
# `qe`, which `generate_response` reads at call time.
qe = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [6]:
def generate_response(row):
    """Answer one dataset row with the query engine and attach the results.

    Sends ``row["question"]`` to the module-level query engine ``qe`` and
    stores the outcome on the same ``row`` mapping:

    * ``row["answer"]``   -- the ``response`` attribute of the query result.
    * ``row["contexts"]`` -- a list with the ``node.text`` of every entry in
      the result's ``source_nodes``, in order.

    Args:
        row: Mapping with at least a ``"question"`` key.

    Returns:
        The same ``row`` object, updated in place with the two new keys.
    """
    result = qe.query(row["question"])
    row["answer"] = result.response

    contexts = []
    for scored_node in result.source_nodes:
        contexts.append(scored_node.node.text)
    row["contexts"] = contexts

    return row

# Smoke test on a single row: generate_response(fiqa_test[0])

In [9]:
# Run the query engine over the first 30 test questions only; each row gains
# the "answer" and "contexts" columns added by `generate_response`.
gen_ds = (
    fiqa_test
    .select(range(30))
    .map(generate_response)
)
# Show the resulting dataset summary as the cell output.
gen_ds

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'ground_truths', 'answer', 'contexts'],
    num_rows: 30
})

In [10]:
# evaluate
from ragas.metrics import factuality, answer_relevancy, context_relevancy
from ragas import evaluate

# Score the generated answers with the three imported ragas metrics; the
# result is left as the last expression so the notebook displays it.
evaluate(gen_ds, metrics=[factuality, answer_relevancy, context_relevancy])

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
100%|█████████████████████████████████████████████████████████████| 2/2 [01:18<00:00, 39.17s/it]


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

100%|█████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.51s/it]


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

100%|█████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.51s/it]


{'NLI_score': 0.8822222222222222, 'answer_relevancy': 0.8647333333333332, 'context_relavency': 0.8236333333333333, 'ragas_score': 0.8561498126750564}