In [1]:
import sys
import ollama
import prompts
import evaluation
import pandas as pd
# Append '../' (resolved against the current working directory) to the module
# search path.
# NOTE(review): this runs after `prompts` and `evaluation` have already been
# imported, so it cannot help resolve those two modules - confirm whether it
# should come before them or is only needed for imports made later.
sys.path.append('../')

##### Denormalize the data

In [2]:
# Read the final clustering results. Every column except the cluster label is
# kept as a final feature name (captured before any dtype changes).
data = pd.read_csv('../data/clustering_results/kmeans_2_results_hourly_categories_nooutliers3.csv')
data_features = data.columns.drop('cluster')
data = data.assign(
    id=data['id'].astype(str),
    date=pd.to_datetime(data['date']),
)

# Read the most recent frame described as denormalized and give its join keys
# the same dtypes as the clustering results.
# NOTE(review): pickle files execute code on load - only read trusted files.
old_data = pd.read_pickle('../data/preprocessing_temps/date_engineered_training_df.pkl')
old_data = old_data.assign(
    id=old_data['id'].astype(str),
    date=pd.to_datetime(old_data['date']),
)

# Restrict to the final feature columns, then keep only the (id, date) rows
# present in the clustering results, attaching their cluster label.
# NOTE(review): the name `normalized_data` is kept for later cells even though
# the source frame is described as denormalized.
normalized_data = (
    old_data.loc[:, data_features]
    .merge(data[['id', 'date', 'cluster']], on=['id', 'date'], how='inner')
)
normalized_data.head(2)

Unnamed: 0,id,date,exertion_points,step_goal,minutes_below_zone_1,minutes_in_zone_1,steps,very_active_minutes,minutes_in_zone_2,minutes_in_zone_3,altitude,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,exercises,exercise_duration,sleep_points,sleep_duration,calories,cluster
0,621e2e8e67b776a24055b564,2021-05-24 00:00:00,27.0,0.0,1349.0,83.0,99.0,33.0,0.0,0.0,0.0,149.0,24.0,713.0,2,0.966944,25.0,0.0,16.82,1
1,621e2e8e67b776a24055b564,2021-05-24 01:00:00,27.0,0.0,1349.0,83.0,0.0,33.0,0.0,0.0,0.0,149.0,24.0,713.0,2,0.966944,25.0,0.0,2.29,1


##### Apply the learning technique

In [3]:
instance = 0  # select the instance to interpret (passed to prompts.zero_prompt)
# Build the prompt text for the selected instance from the merged frame.
prompt = prompts.zero_prompt(normalized_data, instance)
# Display the generated prompt as the cell output.
prompt

"Why a user on Monday at 0 o'clock, who has 27.0 exertion points, 0.0 step goal, 1349.0 minutes below zone 1, 83.0 minutes in zone 1, 99.0 steps, 33.0 very active minutes, 0.0 minutes in zone 2, 0.0 minutes in zone 3, 0.0 altitude, 149.0 lightly active minutes, 24.0 moderately active minutes, 713.0 sedentary minutes, 2 exercises, 0.9669444444444444 exercise duration, and 25.0 sleep points, 0.0 sleep duration, 16.82 calories has positive well-being?"

##### Prompt the LLM
Choose between: llama3, llama2-uncensored, mistral, phi3, gemma:2b

In [16]:
# Text sent to the model as the user message.
query = prompt

# Alternative system prompt kept from earlier experiments:
# 'You are an AI model that can help me interpret the clustering results of my data. I have a dataset with hourly data from different sensors and I have clustered the data into 2 clusters. I would like to understand the characteristics of each cluster. Can you help me with that?'
# lime and format

# System instructions for the model. The pieces are joined with explicit
# spaces so the numbered steps do not run into each other
# (previously "queries:1." and "2.Answer").
system_prompt = (
    'Follow these steps to answer the user queries: '
    '1. Answer only the word YES without any other information, and then '
    '2. Answer only the word NO without any other information. '
)

# Send the system instructions followed by the user query to the chosen model.
chat_result = ollama.chat(
    model='gemma:2b',
    messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': query},
    ],
)

# `response` holds the plain reply text; it is passed on to the evaluation cell.
response = chat_result['message']['content']

# The two answers are expected to be separated by a blank line. Empty chunks
# are dropped and surrounding whitespace is stripped so extra blank lines do
# not shift the answers.
split_responses = [part.strip() for part in response.split('\n\n') if part.strip()]
if len(split_responses) < 2:
    # Model output is not guaranteed to follow the requested format; report it
    # instead of failing with an IndexError.
    print("Warning: expected 2 blank-line-separated answers, got", len(split_responses))

# Pad with empty strings so a missing answer yields '' rather than an error.
developer_response, user_response = (split_responses + ['', ''])[:2]
print("The developer_response is:", developer_response)
print("The user_response is:", user_response)

The developer_response is: YES
The user_response is: NO


##### Evaluate the structural quality of the explanation

In [12]:
# Score the structural quality of the model reply (`response`) against the
# prompt it was generated from.
# NOTE(review): this cell's execution count (12) is lower than the prompting
# cell's (16), so the displayed metrics may come from an earlier response -
# re-run after prompting.
evaluation.structural_quality_evaluation(response, prompt)

Coherence/Relevance Score: 0.7036965489387512
Number of Grammatical Errors: 1
Automated Readability Index: 41.327865168539326
Sentiment Consistency Score: 0.18077651515151513
Concepts Covered: {'points', 'user', 'exertion', 'positive', '25', '27', 'sleep'}
New Concepts Introduced: {'lightly', 'altitude', 'calories', 'step', '99', '1349', 'clock', 'exercise', 'zone', 'minutes', '82', 'sedentary', 'moderately', '24', 'active', '83', '33', 'monday', 'duration', 'exercises', '9669444444444444', '16', 'goal', '149', '713', 'steps'}


##### Evaluate the content quality of the explanation

In [8]:
# TODO: implement the content-quality evaluation of the explanation