In [2]:
# General imports

from llama_index.core.llama_dataset import LabelledRagDataset
from openai import OpenAI

import numpy as np
import pandas as pd
import os

In [3]:
# General variables and data import
print(f'Current working directory: {os.getcwd()}')

# OpenAI-compatible chat endpoint. The defaults target the Ollama server this
# notebook was written against; set JUSTICIO_LLM_BASE_URL / JUSTICIO_LLM_API_KEY
# in the environment to use another server (e.g. OpenAI) without editing the
# notebook or committing a real key.
client = OpenAI(
    base_url=os.environ.get('JUSTICIO_LLM_BASE_URL', 'http://192.168.1.220:11434/v1'),
    api_key=os.environ.get('JUSTICIO_LLM_API_KEY', 'ollama'),  # required, but unused by Ollama
)

# Default model config
model = 'eas/nous-hermes-2-solar-10.7b:q8_0'

# Directory, relative to the working directory, that holds the test dataset
test_dir = 'test'

rag_dataset = LabelledRagDataset.from_json(
    os.path.join(test_dir, 'rag_test_justicio_dataset.json')
)

# Tabular view of the labelled dataset
rag_dataset_df = rag_dataset.to_pandas()


Current working directory: e:\dev\justicio\src\etls\boe\loading


In [4]:
# Keep only the first five rows for a quick trial run.
# Comment out the next line to work on the whole dataset.
rag_dataset_df = rag_dataset_df.iloc[:5]

In [5]:
# To save space, drop the reference_contexts column: only the questions are evaluated.
# errors='ignore' keeps this cell re-runnable after the column has already been removed.

rag_dataset_df = rag_dataset_df.drop(columns=['reference_contexts'], errors='ignore')


In [6]:
# Evaluation functions

# Function to rate how fit the question is for a legal professional
def rate_pro_fit_question(question):
    """Ask the chat model to rate a question's value for a legal professional.

    The request is sent through the module-level `client` using the
    module-level `model`.

    Args:
        question: Text of the question to rate.

    Returns:
        The rating as a string of digits (for example '7'), or the string
        'NaN' when the reply has no content or is not a whole number.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant, that reponses using a single digit from 1 to 10.'},
            {
                'role': 'user',
                # The requested answer range matches the 1-to-10 scale stated earlier in the same prompt.
                'content': f'Rate the following question on a scale of 1 to 10 based on its relevance and usefulness for a legal professional with experience in legal databases in Spain. Consider complexity, specificity, and practical value. Return a single integer from 1 to 10, without any additional comments or explanations. Question: {question}',
            },
        ],
    )
    content = response.choices[0].message.content
    # A reply without text maps to the sentinel instead of raising AttributeError mid-apply.
    if content is None:
        return 'NaN'
    try:
        # int() is what actually validates the reply; str.strip() alone never raises.
        rating = str(int(content.strip()))
    except ValueError:
        # In case the response is not a number, default to NaN
        rating = 'NaN'
    return rating

# Function to classify whether a question is about data or metadata (Note: True/False works better than data/metadata, which tends to add artifacts to the output)
def rate_data_metadata_question(question):
    """Ask the chat model whether a question is about document data or metadata.

    The request is sent through the module-level `client` using the
    module-level `model`.

    Args:
        question: Text of the question to classify.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for "True" (data) or "False" (otherwise), but the reply is returned
        as-is without validation. Returns 'NaN' when the reply has no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant, that especializes in data classification.'},
            {
                'role': 'user',
                'content': f"""Classify the following question considering whether is deals with data on the document or with metadata information. 
                
                Questions about data might include:
                - Which article of the GDPR is about data protection?
                - When was this law dictated?
                - What department was in charge of publishing this law?
                - When was this law approved?

                Questions about metadata might include: 
                - What is the name of the file?
                - When was the file created?

                If the question provided deals with data return the word "True". Otherwise, return the word "False". Do not add any additional comments or explanations. 
                Question: {question}""",
            },
        ],
    )
    content = response.choices[0].message.content
    # A reply without text maps to the sentinel instead of raising AttributeError mid-apply.
    if content is None:
        return 'NaN'
    return content.strip()

# Function to evaluate if this is a quick search question
def rate_quick_search_question(question):
    """Ask the chat model whether a question is a "quick search" question.

    The request is sent through the module-level `client` using the
    module-level `model`.

    Args:
        question: Text of the question to classify.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for "True" or "False", but the reply is returned as-is without
        validation. Returns 'NaN' when the reply has no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant, that especializes in data classification.'},
            {
                'role': 'user',
                'content': f"""Classify the following question considering whether it is a quick search question or not. 
                
                Quick search questions follow the following pattern "artículo xxx de la ley" like:
                - Artículo 12 de la Ley de Propiedad Horizontal
                - Artículo primero del Código Civil
                - Párrafo 3 del artículo 5 de la Ley de Enjuiciamiento Civil

                If the question provided is a quick search question, return "True". Otherwise, return the "False". Do not add any additional comments or explanations. 
                Question: {question}""",
            },
        ],
    )
    content = response.choices[0].message.content
    # A reply without text maps to the sentinel instead of raising AttributeError mid-apply.
    if content is None:
        return 'NaN'
    return content.strip()

# Function to evaluate if this is a direct law question
def rate_direct_law_question(question):
    """Ask the chat model whether a question is a "direct law" question.

    The request is sent through the module-level `client` using the
    module-level `model`.

    Args:
        question: Text of the question to classify.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for "True" or "False", but the reply is returned as-is without
        validation. Returns 'NaN' when the reply has no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant, that especializes in data classification.'},
            {
                'role': 'user',
                'content': f"""Classify the following question considering whether it is a direct law question or not. 
                
                Direct law questions are meant to know recent changes on issues that are modified frequently.

                These are several examples of direct law questions:
                - How many ministries are currently in Spain?
                - What is the current minimum wage in Spain?
                - What is the current VAT in Spain for plane tickets?
                
                If the question provided is a direct law question, return "True". Otherwise, return the "False". Do not add any additional comments or explanations. 
                Question: {question}""",
            },
        ],
    )
    content = response.choices[0].message.content
    # A reply without text maps to the sentinel instead of raising AttributeError mid-apply.
    if content is None:
        return 'NaN'
    return content.strip()

# Function to evaluate if the question requires multiple sources of information
def rate_multiple_sources_question(question):
    """Ask the chat model whether a question requires multiple sources.

    The request is sent through the module-level `client` using the
    module-level `model`.

    Args:
        question: Text of the question to classify.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for "True" or "False", but the reply is returned as-is without
        validation. Returns 'NaN' when the reply has no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant, that especializes in data categorization.'},
            {
                'role': 'user',
                'content': f"""Rate the following question considering whether it deals with multiple sources of information or not.

                Multiple sources questions are usually asked when preparing a case, a client response, or a legal document. They are meant to gather information from different sources to build a solid argument. 
                
                If the question provided is a question requiring multiple sources, return "True". Otherwise, return the "False". Do not add any additional comments or explanations.
                Question: {question}""",
            },
        ],
    )
    content = response.choices[0].message.content
    # A reply without text maps to the sentinel instead of raising AttributeError mid-apply.
    if content is None:
        return 'NaN'
    return content.strip()


In [7]:
# Run a complete evaluation

def rate_questions(rag_dataset_df, num_runs, model):
    """Rate every query in the dataset with all five evaluation prompts.

    Args:
        rag_dataset_df: DataFrame with a 'query' column holding the questions.
        num_runs: Value stored in the 'run' column of every row.
        model: Value stored in the 'model' column of every row.
            NOTE(review): the rating functions read the module-level `model`
            rather than this argument, so the two must match for the label to
            describe the model that was actually queried.

    Returns:
        A new DataFrame with the input columns plus 'run', the five rating
        columns and 'model'. Columns that already exist are overwritten in
        their current position. The input frame is not modified.
    """
    queries = rag_dataset_df['query']
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return rag_dataset_df.assign(
        run=num_runs,
        is_fit_for_pro=queries.apply(rate_pro_fit_question),
        is_data_or_metadata=queries.apply(rate_data_metadata_question),
        is_quick_search=queries.apply(rate_quick_search_question),
        is_direct_law=queries.apply(rate_direct_law_question),
        is_multiple_sources=queries.apply(rate_multiple_sources_question),
        model=model,
    )


In [8]:
# Let's generate multiple runs for a single model
test_runs = 2

# Community model names include '/', '.' and ':', none of which belong in a
# file name, so each of them is mapped to an underscore
sanitized_model = model.translate(str.maketrans('/.:', '___'))

# Every run of this model lands in the same CSV file
evaluations_csv = os.path.join(test_dir, f'rag_dataset_evaluations_{sanitized_model}.csv')
report_columns = [
    'query',
    'run',
    'is_fit_for_pro',
    'is_data_or_metadata',
    'is_quick_search',
    'is_direct_law',
    'is_multiple_sources',
    'model',
]

# Run 0 (re)creates the file and writes the header row
rag_dataset_df = rate_questions(rag_dataset_df, 0, model)
rag_dataset_df.to_csv(evaluations_csv, mode='w', index=True, header=True)

# Later runs are printed and appended below the existing rows, without a header
for run_number in range(1, test_runs):
    rag_dataset_df = rate_questions(rag_dataset_df, run_number, model)
    print(rag_dataset_df[report_columns])
    rag_dataset_df.to_csv(evaluations_csv, mode='a', header=False)

                                               query  run is_fit_for_pro  \
0  En qué fecha se actualizó la información conte...    1              4   
1  ¿Cuál es el origen legislativo del documento c...    1              7   
2  A qué departamento del gobierno pertenece la p...    1              7   
3  ¿Cuál es el tipo de rango asignado al document...    1              7   
4  En qué fecha se dictó la disposición contenida...    1              7   

  is_data_or_metadata is_quick_search is_direct_law is_multiple_sources  \
0                True           False          True               False   
1                True           False          True                True   
2                True           False         False                True   
3                True           False          True               False   
4               False           False         False               False   

                                model  
0  eas/nous-hermes-2-solar-10.7b:q8_0  
1  eas/nous-

In [9]:
# Now, let's generate multiple runs for multiple models
num_runs = 10

models = [
    'eas/nous-hermes-2-solar-10.7b:q8_0',
    'cas/nous-hermes-2-mistral-7b-dpo',
    'openhermes2.5-mistral',
    'neural-chat:7b-v3.3-q5_K_M',
    'macadeliccc/laser-dolphin-mixtral-2x7b-dpo',
    # 'ifioravanti/alphamonarch',
]

report_columns = [
    'query',
    'run',
    'is_fit_for_pro',
    'is_data_or_metadata',
    'is_quick_search',
    'is_direct_law',
    'is_multiple_sources',
    'model',
]

# The loop variable must stay named `model`: the rating functions read the
# module-level `model`, so rebinding it here is what switches the model queried.
for model in models:
    # '/', '.' and ':' are each replaced by '_' to obtain a usable file name
    sanitized_model = model.translate(str.maketrans('/.:', '___'))
    evaluations_csv = os.path.join(test_dir, f'rag_dataset_evaluations_{sanitized_model}.csv')

    # Run 0 (re)creates this model's file and writes the header row
    rag_dataset_df = rate_questions(rag_dataset_df, 0, model)
    rag_dataset_df.to_csv(evaluations_csv, mode='w', index=True, header=True)

    # Remaining runs are printed and appended without a header
    for run_number in range(1, num_runs):
        rag_dataset_df = rate_questions(rag_dataset_df, run_number, model)
        print(rag_dataset_df[report_columns])
        rag_dataset_df.to_csv(evaluations_csv, mode='a', header=False)