In [52]:
from dotenv import load_dotenv

import pandas as pd

from tqdm.auto import tqdm

import json

from sentence_transformers import SentenceTransformer

from openai import OpenAI

from elasticsearch import Elasticsearch


In [302]:
#!pip install pandas

# 

In [34]:
load_dotenv()

True

# 

# 

# 

# 

First of all we will load de embeddings model, the vector db generated in the ingestion step to continue with the app.

Once it's finished, we'll generate the .py associated leaving only the code needed for the production version of the app.

In [6]:
# Initialize the embeddings model
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the Elasticsearch client
es_client = Elasticsearch("http://localhost:9200")



In [7]:
# ElasticSearch Index names
index_name_cosine = "messixpert_cosine"
index_name_dot_product = "messixpert_dot_product"

# 

#

# 

# Ground Truth Generation

In [9]:
def build_ground_truth_prompt(document_text):
    
    prompt_template = """
        You are an user of a chatbot assistant that is an expert biographer on the life and career of Lionel Messi,  
        with deep knowledge of his entire history and statistics.
        Your task is to generate five questions about the text context sent to you, to generate a ground truth dataset for the app.
        Please, formulate five questions that an user might ask based on the provided text.
        The recourd should contain the answer to the question, and the question should be complete and not too short. Use as fewer
        words as possible from the text provided.
        Generate the questions only in spanish.

        TEXT PROVIDED: 
        {document_text}

        Provide the output in parsable JSON without using code blocks:
        {{"questions": ["Question 1", "Question 2", "Question 3", "Question 4", "Question 5"]}}
    """.strip()
    
    prompt = prompt_template.format(document_text=document_text).strip()
    return prompt

In [13]:
def calculate_cost(response):

    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens

    input_tokens_cost_per_1k = 0.00015
    output_tokens_cost_per_1k = 0.0006

    input_tokens_cost = input_tokens_cost_per_1k * (input_tokens / 1000)
    output_tokens_cost = output_tokens_cost_per_1k * (output_tokens / 1000)
    total_cost = input_tokens_cost + output_tokens_cost

    print("------------------------------------")
    print(f"Input Tokens: {input_tokens}       Cost: ${input_tokens_cost:.8f}")
    print(f"Completion Tokens: {output_tokens}       Cost: ${output_tokens_cost:.8f}")
    print(f"Total Cost: ${total_cost:.8f}")
    print("------------------------------------")

    return total_cost

In [14]:
def llm_generate_question(prompt, open_ai_client):
    response = open_ai_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    cost = calculate_cost(response)

    return response.choices[0].message.content, cost

In [31]:
def generate_question(document, open_ai_client):
    builded_prompt = build_ground_truth_prompt(document)
    question, cost = llm_generate_question(builded_prompt, open_ai_client)
    return question, cost


In [59]:
# Function for extracting all the chunks of an index
def extract_chunks_from_index(index_name, es_client, scroll_size=1000):

    # First search of chunk size
    response = es_client.search(
        index=index_name, 
        scroll="2m",  # Window duration parameter
        size=scroll_size,  # Chunks per batch window search
        body={
            #"_source": ["content"],  # F
            "query": {"match_all": {}}  # Extraer todos los documentos
        }
    )
    
    
    chunks = []

    scroll_id = response["_scroll_id"]
    hits = response["hits"]["hits"]
    
    while hits:
        for hit in hits:
            chunks.append(hit["_source"])
        
        # Next scroll step
        response = es_client.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = response["_scroll_id"]
        hits = response["hits"]["hits"]

    return chunks

In [37]:
OpenAI_client = OpenAI()

In [38]:
chunks = extract_chunks_from_index(index_name=index_name_cosine, es_client=es_client)

print(f"Total chunks extracted = {len(chunks)}")

Total chunks extracted = 110


  response = es_client.search(


In [57]:
questions = {}

In [58]:
total_cost = 0

for chunk in tqdm(chunks):
    doc_id = chunk.get('chunk_id') 
    content = chunk.get('content')
    
    print(doc_id, content)


    if doc_id in questions:
        continue

    question, cost = generate_question(chunks[2]["content"], open_ai_client=OpenAI_client)

    total_cost+=cost

    questions[doc_id] = json.loads(question)['questions']


  0%|          | 0/110 [00:00<?, ?it/s]

20240921163112_000000 Lionel Messi                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Lionel Messi.1                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Lionel Messi.2       Unnamed: 3
                

In [62]:
len(questions)

110

In [66]:
questions

{'20240921163112_000000': ['¿Cuál es la fecha de nacimiento de Lionel Messi y en qué ciudad nació?',
  '¿En qué equipo de la MLS juega Lionel Messi desde 2023?',
  '¿Cuántos títulos ganó Messi con el Fútbol Club Barcelona durante su tiempo en el club?',
  '¿Cuántas veces ha ganado Messi el Balón de Oro en su carrera?',
  '¿A qué edad hizo su debut oficial en el primer equipo del Barcelona?'],
 '20240921163112_000001': ['¿Qué títulos ganó Lionel Messi durante su tiempo en el Fútbol Club Barcelona?',
  '¿A qué edad hizo Messi su debut oficial con el primer equipo del Barcelona?',
  '¿Cuál es la relación entre Lionel Messi y Diego Maradona según el texto?',
  '¿Qué club integra Lionel Messi a partir de 2023?',
  '¿Cuáles son algunos de los récords que posee Messi en su carrera futbolística?'],
 '20240921163112_000002': ['¿Cuántos títulos ganó Lionel Messi con el Fútbol Club Barcelona durante su carrera?',
  '¿Qué distinciones ha recibido Lionel Messi a lo largo de su trayectoria como futb

In [69]:
print(f"The total cost of generating the ground truth dataset is ${total_cost:.8f}")

The total cost of generating the ground truth dataset is $0.01749210


Now, we'll store the ground truth dataset into a csv

In [72]:
# Converting the questions into a question-row dataframe, to store as a csv
data = [(chunk_id, question) for chunk_id, questions in questions.items() for question in questions]

questions_df = pd.DataFrame(data, columns=['chunk_id', 'question'])

questions_df.to_csv('../tests/wiki_Lionel_Messi-GroundTruth.csv', index=False, sep=';')

# 