In [1]:
%load_ext autoreload
%autoreload 2

In [18]:
import os
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)
from ragas.integrations.llama_index import evaluate
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.vertex import Vertex
from google.oauth2 import service_account
from ragas import SingleTurnSample, EvaluationDataset



In [9]:
filename = "/home/stackops/secret/work/vngcloud/ai-platform/vertex-ai-credential.json"
credentials: service_account.Credentials = (
    service_account.Credentials.from_service_account_file(filename)
)

In [10]:
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
Settings.llm = Vertex(
    model="gemini-1.5-flash", project=credentials.project_id, credentials=credentials
)

In [11]:
dir_path = "/home/stackops/langchain-labs/data/vks/pdf/vi"


documents = SimpleDirectoryReader(dir_path).load_data()
index = VectorStoreIndex.from_documents(documents)

In [12]:
query_engine = index.as_query_engine()


In [None]:
eval_questions = [
    "Can you provide a concise description of the TinyLlama model?",
    "I would like to know the speed optimizations that TinyLlama has made.",
    "Why TinyLlama uses Grouped-query Attention?",
    "Is the TinyLlama model open source?",
    "Tell me about starcoderdata dataset",
]
eval_answers = [
    "TinyLlama is a compact 1.1B language model pretrained on around 1 trillion tokens for approximately 3 epochs. Building on the architecture and tokenizer of Llama 2, TinyLlama leverages various advances contributed by the open-source community (e.g., FlashAttention), achieving better computational efficiency. Despite its relatively small size, TinyLlama demonstrates remarkable performance in a series of downstream tasks. It significantly outperforms existing open-source language models with comparable sizes.",
    "During training, our codebase has integrated FSDP to leverage multi-GPU and multi-node setups efficiently. Another critical improvement is the integration of Flash Attention, an optimized attention mechanism. We have replaced the fused SwiGLU module from the xFormers (Lefaudeux et al., 2022) repository with the original SwiGLU module, further enhancing the efficiency of our codebase. With these features, we can reduce the memory footprint, enabling the 1.1B model to fit within 40GB of GPU RAM.",  
    "To reduce memory bandwidth overhead and speed up inference, we use grouped-query attention in our model. We have 32 heads for query attention and use 4 groups of key-value heads. With this technique, the model can share key and value representations across multiple heads without sacrificing much performance",
    "Yes, TinyLlama is open-source",
    "This dataset was collected to train StarCoder (Li et al., 2023), a powerful opensource large code language model. It comprises approximately 250 billion tokens across 86 programming languages. In addition to code, it also includes GitHub issues and text-code pairs that involve natural languages.",
]
eval_answers = [[a] for a in eval_answers]

In [15]:
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]

In [34]:
# Sample 1
sample1 = SingleTurnSample(
    user_input="What is the capital of Germany?",
    retrieved_contexts=["Berlin is the capital and largest city of Germany."],
    response="The capital of Germany is Berlin.",
    reference="Berlin",
)

# Sample 2
sample2 = SingleTurnSample(
    user_input="Who wrote 'Pride and Prejudice'?",
    retrieved_contexts=["'Pride and Prejudice' is a novel by Jane Austen."],
    response="'Pride and Prejudice' was written by Jane Austen.",
    reference="Jane Austen",
)

# Sample 3
sample3 = SingleTurnSample(
    user_input="What's the chemical formula for water?",
    retrieved_contexts=["Water has the chemical formula H2O."],
    response="The chemical formula for water is H2O.",
    reference="H2O",
)

dataset = EvaluationDataset(samples=[sample1, sample2, sample3])


In [42]:
result = evaluate(dataset=dataset, metrics=metrics, query_engine=query_engine, llm=Vertex(
    model="gemini-1.5-flash", project=credentials.project_id, credentials=credentials
), embeddings=HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"))
result.to_pandas().to_csv('./test.csv', sep=',')

Running Query Engine: 100%|██████████| 3/3 [00:00<00:00,  4.89it/s]
Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]Exception raised in Job[2]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:   8%|▊         | 1/12 [00:44<08:10, 44.59s/it]Exception raised in Job[4]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:  17%|█▋        | 2/12 [00:57<04:20, 26.02s/it]Exception raised in Job[11]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:  25%|██▌       | 3/12 [01:04<02:34, 17.18s/it]Exception raised in Job[9]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:  33%|███▎      | 4/12 [01:12<01:48, 13.58s/it]Exception raised in Job[1]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:  42%|████▏     | 5/12 [01:33<01:55, 16.47s/it]Exception raised in Job[8]: ValueError(Unknown field for GenerationConfig: n)
Evaluating:  50%|█████     | 6/12 [01:40<01:18, 13.08s/it]Exception raised in Job[6]: ValueError(Unknown field for Generati

In [44]:
print(result)

{'faithfulness': nan, 'answer_relevancy': nan, 'context_precision': nan, 'context_recall': nan}
