In [229]:
import os
import re
import time
from io import StringIO

from dotenv import load_dotenv

import pandas as pd

from tqdm.auto import tqdm

import requests
from bs4 import BeautifulSoup

from sentence_transformers import SentenceTransformer

import tiktoken

from openai import OpenAI

from elasticsearch import Elasticsearch

import json


In [302]:
#!pip install pandas

# 

In [166]:
load_dotenv()

True

# 

First of all we will load de embeddings model, the vector db generated in the ingestion step to continue with the app.

Once it's finished, we'll generate the .py associated leaving only the code needed for the production version of the app.

In [3]:
# Initialize the embeddings model
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the Elasticsearch client
es_client = Elasticsearch("http://localhost:9200")



In [4]:
# ElasticSearch Index names
index_name_cosine = "messixpert_cosine"
index_name_dot_product = "messixpert_dot_product"

# 

# 

# 

# Evaluating Retrieval

In [5]:
# Custom function to calculate the hit-rate of a vector search
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [6]:
# Custom function to calculate the MRR (Mean Reciprocal Rank) of a vector search
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [118]:
#Custom function to evaluate the search function on a ground truth dataset.
def evaluate(ground_truth, search_function, es_client, index):
    relevance_total = []

    for question in tqdm(ground_truth):
        chunk_id = question['chunk_id']
        results = search_function(question['question'], es_client, index)

        retrieved_chunk_ids = [result['_source']['chunk_id'] for result in results]
        
        relevance = [id == chunk_id for id in retrieved_chunk_ids]        
        
        relevance_total.append(relevance)

    print(relevance_total)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [71]:
def text_search(user_query, es_client, index):
    query = {
        "size": 10,  
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "content": user_query  # Search in the content field
                        }
                    }
                ]
            }
        }
    }

    results = es_client.search(index=index, body=query)
    
    return results["hits"]["hits"]

In [72]:
def knn_search(user_query, es_client, index):

    user_query = embeddings_model.encode(user_query)

    query = {
        "k": 10,  
        "field": "content_embeddings",
        "query_vector": user_query
    }

    results = es_client.search(index=index, knn=query)

    return results["hits"]["hits"]

In [67]:
def hybrid_search(user_query, es_client, index):

    text_results = text_search(user_query=user_query, es_client=es_client, index=index)
    knn_results = knn_search(user_query=user_query, es_client=es_client, index=index)
        
    combined_results = text_results + knn_results
    
    return combined_results

In [138]:
def search_by_chunk_id(es_client, index, chunk_id):
    query = {
        "query": {
            "term": {
                "chunk_id": chunk_id  
            }
        }
    }
    
    results = es_client.search(index=index, body=query)
    
    return results['hits']['hits']

In [152]:
def compute_rrf(rank, k=60):
    """ Our own implementation of the relevance score """
    return 1 / (k + rank)

def elastic_search_hybrid_rrf(question, es_client, index, k=60):
    
    hybrid_search_results = hybrid_search(user_query=question, es_client=es_client, index=index)

    
    rrf_scores = {}
    # Calculate RRF from search results
    for rank, hit in enumerate(hybrid_search_results):
        doc_id = hit['_source']['chunk_id']
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top-K documents by the score
    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = search_by_chunk_id(es_client, index_name_cosine, doc_id)
        final_results.append(doc[0])
    
    return final_results

In [156]:
def calculate_cost(response):

    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens

    input_tokens_cost_per_1k = 0.00015
    output_tokens_cost_per_1k = 0.0006

    input_tokens_cost = input_tokens_cost_per_1k * (input_tokens / 1000)
    output_tokens_cost = output_tokens_cost_per_1k * (output_tokens / 1000)
    total_cost = input_tokens_cost + output_tokens_cost

    print("------------------------------------")
    print(f"Input Tokens: {input_tokens}       Cost: ${input_tokens_cost:.8f}")
    print(f"Completion Tokens: {output_tokens}       Cost: ${output_tokens_cost:.8f}")
    print(f"Total Cost: ${total_cost:.8f}")
    print("------------------------------------")

    return total_cost

In [264]:
def llm(prompt, open_ai_client):
    response = open_ai_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    cost = calculate_cost(response)

    return response.choices[0].message.content, cost

In [175]:
def rewrite_query(query, llm, open_ai_client):

    rewritten_query, cost = llm(f"Reescribe esta pregunta para que sea más precisa: {query}", open_ai_client)
    return rewritten_query, cost

In [176]:
def hybrid_search_query_rewriting(user_query, es_client, index, open_ai_client):

    query_rewrited, cost = rewrite_query(user_query, llm, open_ai_client)

    text_results = text_search(user_query=query_rewrited, es_client=es_client, index=index)
    knn_results = knn_search(user_query=query_rewrited, es_client=es_client, index=index)
        
    combined_results = text_results + knn_results
    
    return combined_results, cost

In [178]:
#Custom function to evaluate the search function on a ground truth dataset.
def evaluate_hybrid_search_query_rewriting(ground_truth, search_function, es_client, index, open_ai_client):
    relevance_total = []

    total_cost = 0

    for question in tqdm(ground_truth):
        chunk_id = question['chunk_id']
        results, cost = search_function(question['question'], es_client, index, open_ai_client=open_ai_client)

        total_cost+=cost

        retrieved_chunk_ids = [result['_source']['chunk_id'] for result in results]
        
        relevance = [id == chunk_id for id in retrieved_chunk_ids]        
        
        relevance_total.append(relevance)

    print(relevance_total)
    print(total_cost)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [180]:
questions_dict

[{'chunk_id': '20240921163112_000000',
  'question': '¿Cuál es la fecha de nacimiento de Lionel Messi y en qué ciudad nació?'},
 {'chunk_id': '20240921163112_000000',
  'question': '¿En qué equipo de la MLS juega Lionel Messi desde 2023?'},
 {'chunk_id': '20240921163112_000000',
  'question': '¿Cuántos títulos ganó Messi con el Fútbol Club Barcelona durante su tiempo en el club?'},
 {'chunk_id': '20240921163112_000000',
  'question': '¿Cuántas veces ha ganado Messi el Balón de Oro en su carrera?'},
 {'chunk_id': '20240921163112_000000',
  'question': '¿A qué edad hizo su debut oficial en el primer equipo del Barcelona?'},
 {'chunk_id': '20240921163112_000001',
  'question': '¿Qué títulos ganó Lionel Messi durante su tiempo en el Fútbol Club Barcelona?'},
 {'chunk_id': '20240921163112_000001',
  'question': '¿A qué edad hizo Messi su debut oficial con el primer equipo del Barcelona?'},
 {'chunk_id': '20240921163112_000001',
  'question': '¿Cuál es la relación entre Lionel Messi y Diego 

# 

# 

# 

We'll start loading the groundtruth dataset and generating a dataframe.

Then, we'll try different retrieval and rag approaches.

The metrics used are the hit_rate and the MRR

In [13]:
questions_df = pd.read_csv("../tests/wiki_Lionel_Messi-GroundTruth.csv", sep=";")

In [14]:
questions_dict = questions_df.to_dict(orient='records')

In [119]:
text_search_metrics = evaluate(questions_dict, text_search, es_client, index_name_cosine)
text_search_metrics

  0%|          | 0/550 [00:00<?, ?it/s]

[[False, True, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, False]

{'hit_rate': 0.09454545454545454, 'mrr': 0.026481240981240967}

In [74]:
knn_search_metrics = evaluate(questions_dict, knn_search, es_client, index_name_cosine)
knn_search_metrics

  0%|          | 0/550 [00:00<?, ?it/s]

[[False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, False], [False, False, True, False, False, False, False, False, False, False], [True, False, False, False, False, False, False, False, False, Fal

{'hit_rate': 0.09636363636363636, 'mrr': 0.03125324675324675}

The low metrics of hit_rate and mrr are related to the knowledge source. It's not a problem to have such metrics.

Wikipedia has different sections of information and the answer of a question can be found in different sections, and also in different chunks of each section information.

The most relevant evaluation will be the RAG evaluation, possibly with an LLM As a Judge method.

# 

# 

# 

# Improving Metrics at Retrieval

We'll try different techniques for improving metrics.

- Hybrid search.
- Document re-ranking
- LLM Query rewriting

In [76]:
hybrid_search_metrics = evaluate(questions_dict, hybrid_search, es_client, index_name_cosine)
hybrid_search_metrics

  0%|          | 0/550 [00:00<?, ?it/s]

[[False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False], [False, 

{'hit_rate': 0.17272727272727273, 'mrr': 0.033116644379168995}

In [167]:
OpenAI_client = OpenAI()

In [192]:
hybrid_rerank_search_metrics = evaluate_hybrid_search_query_rewriting(questions_dict, hybrid_search_query_rewriting, es_client, index_name_cosine, open_ai_client=OpenAI_client)
hybrid_rerank_search_metrics

  0%|          | 0/550 [00:00<?, ?it/s]

------------------------------------
Input Tokens: 34       Cost: $0.00000510
Completion Tokens: 15       Cost: $0.00000900
Total Cost: $0.00001410
------------------------------------
------------------------------------
Input Tokens: 33       Cost: $0.00000495
Completion Tokens: 24       Cost: $0.00001440
Total Cost: $0.00001935
------------------------------------
------------------------------------
Input Tokens: 38       Cost: $0.00000570
Completion Tokens: 17       Cost: $0.00001020
Total Cost: $0.00001590
------------------------------------
------------------------------------
Input Tokens: 35       Cost: $0.00000525
Completion Tokens: 23       Cost: $0.00001380
Total Cost: $0.00001905
------------------------------------
------------------------------------
Input Tokens: 33       Cost: $0.00000495
Completion Tokens: 15       Cost: $0.00000900
Total Cost: $0.00001395
------------------------------------
------------------------------------
Input Tokens: 34       Cost: $0.000005

{'hit_rate': 0.16727272727272727, 'mrr': 0.03706549008892098}

In [193]:
hybrid_rerank_search_metrics

{'hit_rate': 0.16727272727272727, 'mrr': 0.03706549008892098}

The best metrics were achieved with hybrid search

# 

# 

# 

# 

# RAG Evaluation

We will evaluate the RAG with LLM as a Judge. We'll try two different LLMs for evaluating the RAG.

In [194]:
def build_llm_as_a_judge_prompt(question, llm_answer):
    
    prompt_template = """
        You are an expert evaluator for a RAG system.
        Your task is to analyze the relevance of the LLM generated answer to the users given question.
        The goal is to have 3 classification levels as if the relevance scores were 0, 0.5, or 1.
        Based on the relevance of the generated answer, you will classify it
        as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

        Here is the data for evaluation:

        Question: {question}
        Generated Answer: {llm_answer}

        Please analyze the content and context of the generated answer in relation to the question
        and provide your evaluation in parsable JSON without using code blocks:

        {{
        "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
        "Explanation": "[Provide a brief explanation for your evaluation in spanish]"
        }}
    """.strip()
    
    prompt = prompt_template.format(question=question, llm_answer=llm_answer).strip()
    return prompt

In [263]:
def build_prompt(query, search_results_text_list):
    
    prompt_template = """
You are an expert biographer on the life and career of Lionel Messi, with deep knowledge of his entire history and statistics. 
Your task is to answer users questions based solely on the context provided to you. 
Answer respectfully and in a warm way, as if you are an assistant.
Answer in the same language that you are asked, if you are asked in english then answer in english, but if you are asked in spanish then answer in spanish.
Use only the data from the context to answer the question. 
If you cannot answer the question with the provided information, respond: “I’m sorry, but I don’t have enough information to answer that. Is there anything else I can help you with?”

CONTEXT: {context}

QUESTION: 
{question}
""".strip()

    context = ""
    
    for text in search_results_text_list:
        context = context + f"{text}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [206]:
def get_answers_content(answers):

    retrieved_texts = [hit["_source"]["content"] for hit in answers]

    return retrieved_texts
    

In [209]:
def get_answers_content(answers):

    retrieved_texts = [hit["_source"]["content"] for hit in answers]

    return retrieved_texts
    

In [211]:
def generate_answer(question, search_function, es_client, index_name, open_ai_client):

    top_k_chunks = search_function(user_query=question, es_client=es_client, index=index_name)

    questions = get_answers_content(top_k_chunks)
    
    builded_prompt = build_prompt(query=question, search_results_text_list=questions)
    
    answer = llm(builded_prompt, open_ai_client=open_ai_client)

    return answer

In [216]:
top_k_chunks = hybrid_search(user_query=a['question'], es_client=es_client, index=index_name_cosine)
answers = get_answers_content(top_k_chunks)
builded_prompt = build_prompt(query=a['question'], search_results_text_list=answers)
answer,cost = llm(builded_prompt, open_ai_client=OpenAI_client)
builded_prompt


------------------------------------
Input Tokens: 9515       Cost: $0.00142725
Completion Tokens: 29       Cost: $0.00001740
Total Cost: $0.00144465
------------------------------------


'You are an expert biographer on the life and career of Lionel Messi, with deep knowledge of his entire history and statistics. \nYour task is to answer users questions based solely on the context provided to you. \nAnswer respectfully and in a warm way, as if you are an assistant.\nAnswer in the same language that you are asked, if you are asked in english then answer in english, but if you are asked in spanish then answer in spanish.\nUse only the data from the context to answer the question. \nIf you cannot answer the question with the provided information, respond: “I’m sorry, but I don’t have enough information to answer that. Is there anything else I can help you with?”\n\nCONTEXT: Lionel Andrés Messi nació el 24 de junio de 1987 en el Hospital Italiano Garibaldi de la ciudad de Rosario, en la provincia de Santa Fe. Es el tercer hijo de Jorge Horacio Messi y Celia María Cuccittini. Tiene dos hermanos mayores, Rodrigo y Matías, y una hermana menor, María Sol.[14]\u200b Su familia 

We'll create a random sample questions dataset from the ground truth and we'll evaluate the RAG with it

In [254]:
df_sample = questions_df.sample(n=200, random_state=1)
question_samples = df_sample.to_dict(orient='records')
question_samples

[{'chunk_id': '20240921163112_000032',
  'question': '¿Qué posición juega Lionel Messi en el fútbol?'},
 {'chunk_id': '20240921163112_000061',
  'question': '¿Cuántos títulos ganó Messi con el Fútbol Club Barcelona?'},
 {'chunk_id': '20240921163112_000013',
  'question': '¿Cuál es la fecha de nacimiento de Lionel Messi y en qué ciudad nació?'},
 {'chunk_id': '20240921163112_000084',
  'question': '¿Cuántos Balones de Oro ha ganado Messi a lo largo de su carrera?'},
 {'chunk_id': '20240921163112_000027',
  'question': '¿En qué año nació Lionel Messi y dónde?'},
 {'chunk_id': '20240921163112_000041',
  'question': '¿A qué edad debutó Messi oficialmente con el primer equipo del Barcelona?'},
 {'chunk_id': '20240921163112_000062',
  'question': '¿Cuántos títulos ganó Messi con el FC Barcelona?'},
 {'chunk_id': '20240921163112_000059',
  'question': '¿Cuál es la fecha de nacimiento de Lionel Messi y dónde nació?'},
 {'chunk_id': '20240921163112_000054',
  'question': '¿Qué posición juega Li

In [255]:
evaluations = []
total_cost = 0

for record in tqdm(question_samples):
    
    question = record['question']
    answer_llm, answer_cost = generate_answer(question=question, search_function=hybrid_search, es_client=es_client, index_name=index_name_cosine, open_ai_client=OpenAI_client)


    prompt = build_llm_as_a_judge_prompt(question, answer_llm)
    evaluation, evaluation_cost = llm(prompt, OpenAI_client)

    total_cost = total_cost + answer_cost + evaluation_cost

    evaluation = json.loads(evaluation)
    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

------------------------------------
Input Tokens: 9814       Cost: $0.00147210
Completion Tokens: 11       Cost: $0.00000660
Total Cost: $0.00147870
------------------------------------
------------------------------------
Input Tokens: 212       Cost: $0.00003180
Completion Tokens: 48       Cost: $0.00002880
Total Cost: $0.00006060
------------------------------------
------------------------------------
Input Tokens: 10555       Cost: $0.00158325
Completion Tokens: 27       Cost: $0.00001620
Total Cost: $0.00159945
------------------------------------
------------------------------------
Input Tokens: 232       Cost: $0.00003480
Completion Tokens: 48       Cost: $0.00002880
Total Cost: $0.00006360
------------------------------------
------------------------------------
Input Tokens: 9515       Cost: $0.00142725
Completion Tokens: 26       Cost: $0.00001560
Total Cost: $0.00144285
------------------------------------
------------------------------------
Input Tokens: 233       Cost:

In [258]:
df_evaluations = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_evaluations['chunk_id'] = df_evaluations.record.apply(lambda d: d['chunk_id'])
df_evaluations['question'] = df_evaluations.record.apply(lambda d: d['question'])

df_evaluations['relevance'] = df_evaluations.evaluation.apply(lambda d: d['Relevance'])
df_evaluations['explanation'] = df_evaluations.evaluation.apply(lambda d: d['Explanation'])

df_evaluations = df_evaluations[['chunk_id', 'question', 'answer', 'relevance', 'explanation']]

df_evaluations


Unnamed: 0,chunk_id,question,answer,relevance,explanation
0,20240921163112_000032,¿Qué posición juega Lionel Messi en el fútbol?,Lionel Messi juega como delantero o centrocamp...,RELEVANT,La respuesta es relevante porque responde dire...
1,20240921163112_000061,¿Cuántos títulos ganó Messi con el Fútbol Club...,Lionel Messi ganó un total de 35 títulos con e...,RELEVANT,La respuesta generada proporciona una cifra pr...
2,20240921163112_000013,¿Cuál es la fecha de nacimiento de Lionel Mess...,Lionel Messi nació el 24 de junio de 1987 en l...,RELEVANT,La respuesta generada proporciona la fecha de ...
3,20240921163112_000084,¿Cuántos Balones de Oro ha ganado Messi a lo l...,Lionel Messi ha ganado un total de ocho Balone...,RELEVANT,La respuesta generada proporciona la cantidad ...
4,20240921163112_000027,¿En qué año nació Lionel Messi y dónde?,Lionel Messi nació el 24 de junio de 1987 en l...,RELEVANT,La respuesta generada proporciona tanto el año...
...,...,...,...,...,...
195,20240921163112_000024,¿Qué premios ha ganado Messi a lo largo de su ...,Lionel Messi ha ganado a lo largo de su carrer...,RELEVANT,La respuesta generada proporciona información ...
196,20240921163112_000100,¿En qué equipo juega Lionel Messi a partir de ...,"A partir de 2023, Lionel Messi juega en el Int...",RELEVANT,La respuesta generada proporciona información ...
197,20240921163112_000061,¿Qué distinciones ha ganado Lionel Messi a lo ...,Lionel Messi ha ganado numerosas distinciones ...,RELEVANT,La respuesta generada proporciona una lista pr...
198,20240921163112_000004,¿A qué edad debutó oficialmente Messi con el p...,Lionel Messi debutó oficialmente con el primer...,RELEVANT,La respuesta generada proporciona la edad exac...


In [259]:
df_evaluations.relevance.value_counts()

relevance
RELEVANT           197
PARTLY_RELEVANT      3
Name: count, dtype: int64

In [260]:
df_evaluations.relevance.value_counts(normalize=True)


relevance
RELEVANT           0.985
PARTLY_RELEVANT    0.015
Name: proportion, dtype: float64

In [307]:
df_not_relevant_answers = df_evaluations[df_evaluations.relevance != "RELEVANT"]

for idx,row in df_not_relevant_answers.iterrows():

    for column in df_not_relevant_answers.columns:
        print(f"{column.upper()} = {row[column]}")

    print("______________________________________________")

CHUNK_ID = 20240921163112_000037
QUESTION = ¿Cuándo hizo Messi su debut oficial con el primer equipo del Barcelona y qué logros tuvo en su primera temporada completa?
ANSWER = Lionel Messi hizo su debut oficial con el primer equipo del Barcelona el 16 de octubre de 2004, a los diecisiete años. En su primera temporada completa, que fue la 2008-09, Messi tuvo un desempeño excepcional, anotando 38 goles y proporcionando 18 asistencias en 62 partidos. Además, fue fundamental para que el Barcelona lograra el primer triplete del fútbol español, ganando La Liga, la Copa del Rey y la Liga de Campeones de la UEFA.
RELEVANCE = PARTLY_RELEVANT
EXPLANATION = La respuesta menciona correctamente la fecha del debut de Messi con el Barcelona, pero confunde la primera temporada completa con la 2008-09 en lugar de la 2004-05. Además, se enfoca en logros de una temporada posterior, lo que hace que la información sea parcialmente relevante.
______________________________________________
CHUNK_ID = 2024092

In [261]:
df_evaluations.to_csv('../tests/wiki_Lionel_Messi-rag_evaluation_gpt-4o-mini.csv', sep=';', index=False)

We'll also try an open-source LLM to act as an "llm as a judge". We'll use the phi3 model from ollama.

docker run -it -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama

In [277]:
def open_source_llm(prompt, open_source_client):
    response = open_source_client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=100
    )
    
    return response.choices[0].message.content

In [266]:
open_source_client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)


In [283]:
open_source_evaluations = []

for idx, row in df_evaluations.iterrows():

    chunk_id = row['chunk_id']
    question = row['question']
    answer = row['answer']

    prompt = build_llm_as_a_judge_prompt(question, answer)
    evaluation = open_source_llm(prompt, open_source_client)

    evaluation = json.loads(evaluation)
    open_source_evaluations.append((row, answer, evaluation))




In [293]:
df_open_source_evaluations = pd.DataFrame(open_source_evaluations, columns=['record', 'answer', 'evaluation'])

df_open_source_evaluations['chunk_id'] = df_open_source_evaluations.record.apply(lambda d: d['chunk_id'])
df_open_source_evaluations['question'] = df_open_source_evaluations.record.apply(lambda d: d['question'])

df_open_source_evaluations['relevance'] = df_open_source_evaluations.evaluation.apply(lambda d: d['Relevance'])
df_open_source_evaluations['explanation'] = df_open_source_evaluations.evaluation.apply(lambda d: d['Explanation'])

df_open_source_evaluations = df_open_source_evaluations[['chunk_id', 'question', 'answer', 'relevance', 'explanation']]

df_open_source_evaluations


Unnamed: 0,chunk_id,question,answer,relevance,explanation
0,20240921163112_000032,¿Qué posición juega Lionel Messi en el fútbol?,Lionel Messi juega como delantero o centrocamp...,RELEVANT,La respuesta es relevante porque responde dire...
1,20240921163112_000061,¿Cuántos títulos ganó Messi con el Fútbol Club...,Lionel Messi ganó un total de 35 títulos con e...,RELEVANT,La respuesta generada proporciona una cifra pr...
2,20240921163112_000013,¿Cuál es la fecha de nacimiento de Lionel Mess...,Lionel Messi nació el 24 de junio de 1987 en l...,RELEVANT,La respuesta generada proporciona la fecha de ...
3,20240921163112_000084,¿Cuántos Balones de Oro ha ganado Messi a lo l...,Lionel Messi ha ganado un total de ocho Balone...,RELEVANT,La respuesta generada proporciona la cantidad ...
4,20240921163112_000027,¿En qué año nació Lionel Messi y dónde?,Lionel Messi nació el 24 de junio de 1987 en l...,RELEVANT,La respuesta generada proporciona tanto el año...
...,...,...,...,...,...
195,20240921163112_000024,¿Qué premios ha ganado Messi a lo largo de su ...,Lionel Messi ha ganado a lo largo de su carrer...,RELEVANT,La respuesta generada proporciona información ...
196,20240921163112_000100,¿En qué equipo juega Lionel Messi a partir de ...,"A partir de 2023, Lionel Messi juega en el Int...",RELEVANT,La respuesta generada proporciona información ...
197,20240921163112_000061,¿Qué distinciones ha ganado Lionel Messi a lo ...,Lionel Messi ha ganado numerosas distinciones ...,RELEVANT,La respuesta generada proporciona una lista pr...
198,20240921163112_000004,¿A qué edad debutó oficialmente Messi con el p...,Lionel Messi debutó oficialmente con el primer...,RELEVANT,La respuesta generada proporciona la edad exac...


In [294]:
df_open_source_evaluations.to_csv('../tests/wiki_Lionel_Messi-rag_evaluation_ollama-phi3.csv', sep=';', index=False)

# 

# 

# 