In [1]:
import sys
import json
import prompts
import evaluation
import pandas as pd
from tqdm import tqdm
sys.path.append('../')
import post_processing

##### Load data

In [2]:
# load the dataset through the project's post-processing helper
data = post_processing.load_data()

# preview the first two rows
data.head(n=2)

Unnamed: 0,id,date,exertion_points,step_goal,minutes_below_zone_1,minutes_in_zone_1,steps,very_active_minutes,minutes_in_zone_2,minutes_in_zone_3,altitude,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,exercises,exercise_duration,sleep_points,sleep_duration,calories,cluster
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,27.0,0.0,1349.0,83.0,99.0,33.0,0.0,0.0,0.0,149.0,24.0,713.0,2,0.967,25.0,0.0,16.82,1
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,27.0,0.0,1349.0,83.0,0.0,33.0,0.0,0.0,0.0,149.0,24.0,713.0,2,0.967,25.0,0.0,2.29,1


##### Instances and parameters

In [3]:
# instances to interpret, and the instances handed to the one-/few-shot prompts as examples
instances_interpret = [
    27606, 15179, 29493, 19985, 3144,
    19966, 945, 6368, 18524, 30216,
    13578, 13400, 5213, 2855, 9869,
    15183, 13296, 15463, 19307, 4658,
]
example_instances = [17170, 570]

# parameters for the data
granularity = 'hourly'

# parameters for the ML component
ml_task, real_task = 'clustering', 'well-being'
target = 'cluster'
target_encoding = {
    0: 'negative',
    1: 'positive',
}

# parameters for the XAI component
scope, xai_method = 'local', 'lime'

##### Evaluation

In [4]:
# LLM parameters: which model's responses to evaluate, and the prompting mode
# ('zero', 'one' or 'few') those responses were produced with
model, learning = 'mistral', 'few'

In [5]:
# Evaluate the stored LLM responses for every instance in `instances_interpret`.
# For each instance two independent evaluations are attempted:
#   * structural metrics, from the `<instance>_user.txt` response and the prompt;
#   * content metrics, from the `<instance>_developer.json` response.
# A missing response file skips that evaluation for that instance; any other
# failure is reported with its real cause instead of being silently swallowed.

# initialize the metrics (one entry is appended per successfully evaluated instance)
coherences, grammaticals, readabilities, sentiments, conc_covs, conc_intrs = [], [], [], [], [], []
spearman_corrs, ndcg_difs, eucl_dists = [], [], []

# fail fast on an unsupported learning mode: otherwise no prompt could be built
# and every instance would be reported as a per-instance failure
if learning not in ('zero', 'one', 'few'):
    raise ValueError(f"Unknown learning mode {learning!r}; expected 'zero', 'one' or 'few'")

# directory holding the stored responses for this model / learning mode
responses_dir = f'../data/llms_output/{model}_{learning}'

# iterating over tqdm directly keeps the progress bar in sync and closes it
# even when the loop is interrupted
for instance in tqdm(instances_interpret, desc="Processing instances", unit="inst"):
    # ------------ STRUCTURAL EVALUATION ------------ #
    try:
        # read the user response (it might not exist for every instance)
        with open(f'{responses_dir}/{instance}_user.txt', 'r') as f:
            user_response = f.read()

        # create the prompt
        if learning == 'zero':
            query = prompts.zero_prompt(data, instance, target, target_encoding, granularity, real_task)
        elif learning == 'one':
            query = prompts.one_prompt(data, instance, example_instances[0], target, target_encoding, granularity, real_task)
        else:  # 'few' (validated above)
            query = prompts.few_prompt(data, instance, example_instances, target, target_encoding, granularity, real_task)

        # structural metrics
        coherence, grammatical, readability, sentiment, coverage, concepts = evaluation.structural_quality_evaluation(query, user_response)
    except FileNotFoundError:
        print(f'Instance {instance} has not a user response')
    except Exception as exc:
        # best-effort: keep processing the other instances, but surface the real error
        # (Exception, unlike a bare except, lets KeyboardInterrupt stop the long run)
        print(f'Instance {instance}: structural evaluation failed ({type(exc).__name__}: {exc})')
    else:
        # record the metrics only when the whole evaluation succeeded
        coherences.append(coherence)
        grammaticals.append(grammatical)
        readabilities.append(readability)
        sentiments.append(sentiment)
        conc_covs.append(coverage)
        conc_intrs.append(concepts)

    # ------------ CONTENT EVALUATION ------------ #
    try:
        # read the developer response (it might not exist for every instance)
        with open(f'{responses_dir}/{instance}_developer.json', 'r') as f:
            llm_response = json.load(f)

        # content metrics
        spearman_corr, ndcg_dif, eucl_dist = evaluation.content_xai_quality_evaluation(instance, llm_response)
    except FileNotFoundError:
        print(f'Instance {instance} has not a developer response')
    except Exception as exc:
        # e.g. malformed JSON or a failure inside the metric computation
        print(f'Instance {instance}: content evaluation failed ({type(exc).__name__}: {exc})')
    else:
        spearman_corrs.append(spearman_corr)
        ndcg_difs.append(ndcg_dif)
        eucl_dists.append(eucl_dist)

# print the results
evaluation.aggregated_evaluation(model, learning, coherences, grammaticals, readabilities, sentiments, conc_covs, conc_intrs, spearman_corrs, ndcg_difs, eucl_dists)

Processing instances:  10%|█         | 2/20 [00:43<06:28, 21.57s/inst]

Instance 29493 has not a user response
Instance 29493 has not a developer response


Processing instances: 100%|██████████| 20/20 [28:10<00:00, 84.54s/inst] 

mistral  in  few -shot learning: 
Avg coherence: 0.7140676661541587
Avg number of grammatical errors: 0.7368421052631579
Avg ARI: 50.382810217856324
Avg sentiment consistency: 0.20196744421944168
Avg percentage of concepts covered: 0.37372843874391865
Avg percentage of new concepts introduced: 0.5959171444205982
Avg spearman rank correlation: 0.7463044905277404
Avg NDCG differences: 0.012097979504638453
Avg euclidean distances: 0.20424991864383735



