In [228]:
import json
import os
import re
import numpy as np
import pprint
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from krippendorff import alpha
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters, cohens_kappa, to_table
from irrCAC.raw import CAC

### Create Variables and Load Dialogues from JSON + Batch IDs

In [229]:
mwoz_judge_json = "results/judge_results_mwoz_autotod/20250403_025805/mwoz-autotod-gpt-4o_j.json"
tau_air_judge_json = "results/judge-results-tau/20250131_152503-tau-4o-airline/tau-gpt-4o_j.json"
tau_retail_judge_json = "results/judge-results-tau/20250131_152422-tau-4o-retail/tau-gpt-4o_j.json"
human_eval_batch_dir = "datasets/main_human_eval"

# load dialogues: order mwoz, tau-retail, tau-airline
with open(mwoz_judge_json, 'r') as f:
		mwoz_judge = json.load(f)
mwoz_dials = mwoz_judge.get('dialogues', [])
with open(tau_retail_judge_json, 'r') as f:
		tau_retail_judge = json.load(f)
tau_retail_dials = tau_retail_judge['dialogues']
with open(tau_air_judge_json, 'r') as f:
		tau_air_judge = json.load(f)
tau_air_dials = tau_air_judge['dialogues']
# load batches
num_batches = 10
batch_list = {}
batch_order = {}
for ind in range(1, num_batches + 1):
	with open(f"{human_eval_batch_dir}/batch{ind}.json", 'r') as f:
		curr_batch_list = json.load(f)
	if curr_batch_list is None or len(curr_batch_list) == 0:
		print('No batches found at this path:',  ind)
		exit()
	# add batches to full list
	batch_list[ind] = {}
	if "autotod_mwoz" not in batch_list:
		batch_list[ind]["autotod_mwoz"] = curr_batch_list["autotod_mwoz"]
	else:
		batch_list[ind]["autotod_mwoz"].extend(curr_batch_list["autotod_mwoz"])
	if "tau" not in batch_list:
		batch_list[ind]["tau"] = curr_batch_list["tau"]
	else:
		batch_list[ind]["tau"]["retail"].extend(curr_batch_list["tau"]["retail"])
		batch_list[ind]["tau"]["airline"].extend(curr_batch_list["tau"]["airline"])
	batch_order[ind] = curr_batch_list["order"]

pprint.pprint(batch_list, compact=True)
pprint.pprint(batch_order, compact=True)

{1: {'autotod_mwoz': ['mul1076', 'sng0323', 'pmul2859', 'pmul3731', 'mul0297',
                      'sng01957'],
     'tau': {'airline': ['2', '11'], 'retail': ['6', '10']}},
 2: {'autotod_mwoz': ['mul0233', 'pmul1210', 'mul1624', 'pmul1931', 'pmul1657',
                      'pmul1105'],
     'tau': {'airline': ['31', '34'], 'retail': ['14', '21']}},
 3: {'autotod_mwoz': ['pmul3921', 'mul0690', 'pmul4125', 'pmul4569', 'pmul2513',
                      'mul0144'],
     'tau': {'airline': ['15', '27'], 'retail': ['15', '29']}},
 4: {'autotod_mwoz': ['pmul4255', 'pmul0566', 'pmul0815', 'pmul0048',
                      'pmul1373', 'pmul0630'],
     'tau': {'airline': ['46', '48'], 'retail': ['28', '51']}},
 5: {'autotod_mwoz': ['mul0528', 'mul0594', 'mul0035', 'pmul2080', 'sng1041',
                      'mul2466'],
     'tau': {'airline': ['19', '33'], 'retail': ['25', '45']}},
 6: {'autotod_mwoz': ['mul1926', 'mul0309', 'pmul1344', 'pmul1266', 'mul2206',
                      'mul2294

### Compile Relevant Dialogues From Batches

In [230]:
def get_batch_dialogues(
	mwoz_dialogues: dict, 
	tau_air_dialogues: dict, 
	tau_retail_dialogues: dict, 
	batch_list: dict
) -> dict:
	#load dialogues
	batch_dials = []
	mwoz_batch_ids = batch_list["autotod_mwoz"]
	tau_air_batch_ids = batch_list["tau"]["airline"]
	tau_retail_batch_ids = batch_list["tau"]["retail"]
	# load batch (order: mwoz, tau-retail, tau-airline)
	for batch_id in mwoz_batch_ids:
		for id, dial in mwoz_dialogues.items():
			if id.split(".json")[0].lower() == batch_id:
				batch_dials.append(dial)
				break
	for batch_id in tau_retail_batch_ids:
		for id, dial in tau_retail_dialogues.items():
			if id == batch_id:
				batch_dials.append(dial)
				break
	for batch_id in tau_air_batch_ids:
		for id, dial in tau_air_dialogues.items():
			if id == batch_id:
				batch_dials.append(dial)
				break
	tot_batch_len = len(mwoz_batch_ids) + len(tau_air_batch_ids) + len(tau_retail_batch_ids)
	if len(batch_dials) != tot_batch_len:
		print("filtered dials size does not match batches:", len(batch_dials), tot_batch_len)
		exit()
	return batch_dials

# get batch dialogues for all eval batches
batch_dials = {}
for b_ind, b_list in batch_list.items():
	dials = get_batch_dialogues(
		mwoz_dials, 
		tau_air_dials, 
		tau_retail_dials, 
		b_list
	)
	batch_dials[b_ind] = dials

# pprint.pprint(batch_dials, compact=True)

### Extract Qualtrics Human Eval CSV Data

In [231]:
def extract_human_csv_data(
	human_eval_csv: dict, 
	batch_dialogues: dict, 
	batch_order: dict
) -> dict:
	"""Read CSV data and convert to appropriate format"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	input_csv = human_eval_csv["csv_file"]
	eval_csv = pd.read_csv(input_csv, on_bad_lines='warn') 
	start_col = human_eval_csv['start_col']
	end_col = human_eval_csv['end_col']
	search_str = '2025'
	turn_result = {}
	dial_result = {}
	first_eval_row = eval_csv.StartDate.str.contains(search_str).idxmax()
	human_scores = eval_csv.loc[first_eval_row:, start_col:end_col].to_numpy()
	mapping = {
		"Very Good": 5.0, 
		"Good": 4.0, 
		"Fair": 3.0, 
		"Bad": 2.0, 
		"Very Bad": 1.0
	}
	# mapping = {
	# 	"Very Good": 3.0, 
	# 	"Good": 3.0, 
	# 	"Fair": 2.0, 
	# 	"Bad": 1.0, 
	# 	"Very Bad": 1.0
	# }
	vectorized_map = np.vectorize(lambda x: mapping[x.strip()])
	int_scores = vectorized_map(human_scores)
	# extract scores into results dialogue map
	scores_idx = 0
	for i, dial in enumerate(batch_dialogues):
		if batch_order[i]["type"] == "tau_retail":
			dial_id = f"retail_{batch_order[i]['id']}"
		elif batch_order[i]["type"] == "tau_airline":
			dial_id = f"airline_{batch_order[i]['id']}"
		else:
			dial_id = batch_order[i]["id"]
		turn_result[dial_id] = {}
		# add turn scores
		for _ in dial:
			for i, metric in enumerate(dims):
				if metric not in turn_result[dial_id]:
					turn_result[dial_id][metric] = int_scores[:,scores_idx+i]
				else:
					turn_result[dial_id][metric] = np.concat(
						(turn_result[dial_id][metric], int_scores[:,scores_idx+i])
					)
				# turn_result[dial_id].append({
				# 	'conv_consistency': int_scores[:,scores_idx],
				# 	'backend_consistency': int_scores[:,scores_idx+1],
				# 	'policy_completeness': int_scores[:, scores_idx+2]
				# })
			scores_idx += 3
		# add dial scores
		dial_result[dial_id] = {
				'conv_consistency': int_scores[:,scores_idx],
				'backend_consistency': int_scores[:,scores_idx+1],
				'policy_completeness': int_scores[:, scores_idx+2]
		}
		scores_idx += 3
	return turn_result, dial_result

human_eval_csv = {
	1: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_1.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID187_3" 
	},
	2: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_2.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID135_3" 
	},
	3: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_3.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID180_3" 
	},
	4: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_4.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID123_3" 
	},
	5: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_5.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID129_3" 
	},
	7: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_7.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID137_3" 
	},
	9: { 
		"csv_file": "qualtrics/results/main_human_eval/eval_9.csv", 
		"start_col": "QID3_1", 
		"end_col": "QID115_3" 
	}
}

human_batch_scores = {}
for i in batch_dials.keys():
	if i not in human_eval_csv:
		continue
	turn_eval_data, dial_eval_data = extract_human_csv_data(
		human_eval_csv[i], 
		batch_dials[i], 
		batch_order[i]
	)
	human_batch_scores[i] = { 
		"turn_level": turn_eval_data, 
		"dial_level": dial_eval_data
	}

pprint.pprint(human_batch_scores, compact=True)

{1: {'dial_level': {'airline_11': {'backend_consistency': array([5.]),
                                   'conv_consistency': array([5.]),
                                   'policy_completeness': array([5.])},
                    'airline_2': {'backend_consistency': array([5.]),
                                  'conv_consistency': array([5.]),
                                  'policy_completeness': array([5.])},
                    'mul0297': {'backend_consistency': array([5.]),
                                'conv_consistency': array([2.]),
                                'policy_completeness': array([5.])},
                    'mul1076': {'backend_consistency': array([5.]),
                                'conv_consistency': array([4.]),
                                'policy_completeness': array([5.])},
                    'pmul2859': {'backend_consistency': array([5.]),
                                 'conv_consistency': array([5.]),
                                 'policy_c

### Extract TD-Eval Scores

In [232]:
"""Compare human evaluation data with LLM evaluation data"""    
def extract_score(score_str: str) -> int:
	try:
		# regex matching
		match = re.search(r'Score: (\d+)', str(score_str))
		if not match:
			print("Score not found in string:",score_str)
			print("Checking substring")
			# check substring
			if "Very Good" in score_str:
				return 5
			elif "Good" in score_str:
				return 4
			elif "Fair" in score_str:
				return 3
			# check more detailed string first
			elif "Very Bad" in score_str: 
				return 4
			elif "Bad" in score_str:
				return 1
			else:
				print("Score still not found with substring check")
				return 5
		return int(match.group(1)) if match else 5
	except:
		print("Score not found in string:",score_str)
		return 5

# TODO: extract [real] dialogue level scores as well
def extract_tdeval_scores(
	batch_dialogues: list, 
	autotod_dial_level: dict,
	tau_retail_dial_level: dict,
	tau_airline_dial_level: dict,
	batch_order: dict
) -> tuple:
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	turn_scores = {}
	dial_scores = {}
	for idx, batch_dial in enumerate(batch_order):
		# convert batch dial id format
		batch_dial_id = batch_dial["id"]
		if batch_dial['type'] == "tau_retail":
			batch_dial_id = f"retail_{batch_dial_id}"
		elif batch_dial['type'] == "tau_airline":
			batch_dial_id = f"airline_{batch_dial_id}"
		llm_dial = batch_dialogues[idx]
		# extract scores from td-eval json data
		turn_scores[batch_dial_id] = {}
		for turn_idx, turn in enumerate(llm_dial):    
			turn_score = turn["scores"]
			# skip turn score if any negative/invalid scores exist
			all_scores = np.array([
				extract_score(turn_score['conv_consistency']["score"]), 
				extract_score(turn_score['backend_consistency']["score"]), 
				extract_score(turn_score['policy_completeness']["score"])
			])
			if np.any(all_scores <= 0):
				print("missing a score")
				continue
			# Get LLM scores
			for metric in dims:
				if metric not in turn_scores[batch_dial_id]:
					turn_scores[batch_dial_id][metric] = [
						extract_score(turn_score[metric]["score"])
					]
				else:
					turn_scores[batch_dial_id][metric].append(
						extract_score(turn_score[metric]["score"])
					)
		# get dialogue level score
		dial_scores[batch_dial_id] = {}
		# grab from TD-Eval 
		for metric in dims:
			if batch_dial['type'] == "autotod_mwoz":
				prev_dial_id = f"{batch_dial_id.upper()}.json"
				score = extract_score(autotod_dial_level[prev_dial_id][metric]["score"])
			elif batch_dial['type'] == "tau_retail":  
				prev_dial_id = batch_dial_id.replace("retail_", "")
				score = extract_score(
					tau_retail_dial_level[prev_dial_id][metric]["score"]
				)
			elif batch_dial['type'] == "tau_airline":
				prev_dial_id = batch_dial_id.replace("airline_", "")
				score = extract_score(
					tau_airline_dial_level[prev_dial_id][metric]["score"]
				)
			dial_scores[batch_dial_id][metric] = score

		# compile dialogue level scores by averaging turn level scores
		# 	dial_scores[batch_dial_id][metric] = np.round(
		# 		np.mean(turn_scores[batch_dial_id][metric]),
		# 		decimals=2
		# 	).item()
		
	return turn_scores, dial_scores

autotod_dial_level_path = "results/dial_level_results/autotod/mwoz-dial-level-gpt-4o_j.json"
with open(autotod_dial_level_path, 'r') as f:
	autotod_dial_level_data = json.load(f)
autotod_dial_level = autotod_dial_level_data["dialogues"]
tau_retail_dial_level_path = "results/dial_level_results/tau/retail-dial-level-gpt-4o_j.json"
with open(tau_retail_dial_level_path, 'r') as f:
	tau_retail_dial_level_data = json.load(f)
tau_retail_dial_level = tau_retail_dial_level_data["dialogues"]
tau_airline_dial_level_path = "results/dial_level_results/tau/airline-dial-level-gpt-4o_j.json"
with open(tau_airline_dial_level_path, 'r') as f:
	tau_airline_dial_level_data = json.load(f)
tau_airline_dial_level = tau_airline_dial_level_data["dialogues"]

tdeval_batch_scores = {}
for i in batch_dials.keys():
	if i not in human_eval_csv:
		continue
	turn_scores, dial_scores = extract_tdeval_scores(
		batch_dials[i], 
		autotod_dial_level,
		tau_retail_dial_level,
		tau_airline_dial_level,
		batch_order[i]
	)
	tdeval_batch_scores[i] = {
		"turn_level": turn_scores, 
		"dial_level": dial_scores 
	}
pprint.pprint(tdeval_batch_scores, compact=True)

Score not found in string: Score: Very Good (5)
Checking substring
Score not found in string: Score: Very Good (5)
Checking substring
Score not found in string: Score: Very Good (5)
Checking substring
Score not found in string: Score: Very Good (5)
Checking substring
Score not found in string: Score: Very Good (5)
Checking substring
{1: {'dial_level': {'airline_11': {'backend_consistency': 5,
                                   'conv_consistency': 5,
                                   'policy_completeness': 4},
                    'airline_2': {'backend_consistency': 5,
                                  'conv_consistency': 5,
                                  'policy_completeness': 3},
                    'mul0297': {'backend_consistency': 5,
                                'conv_consistency': 5,
                                'policy_completeness': 5},
                    'mul1076': {'backend_consistency': 5,
                                'conv_consistency': 5,
                     

### Extract LMUnit Scores

In [233]:
autotod_lmunit_path = "results/judge_results_lmunit/autotod/mwoz-autotod-lmunit_j.json"
tau_lmunit_path = "results/judge_results_lmunit/tau/lmunit_scores.json"

# TODO: extract dialogue level scores
def extract_lmunit_scores(
	autotod_lmunit_path: str, 
	tau_lmunit_path: str, 
	batch_order: dict
):
	# load lmunit dialogue files
	with open(autotod_lmunit_path, 'r') as f:
		autotod_lmunit_judge = json.load(f)
	autotod_lmunit_dials = autotod_lmunit_judge['dialogues']
	with open(tau_lmunit_path) as f:
		tau_lmunit_judge = json.load(f)
	tau_lmunit_dials = tau_lmunit_judge['dialogues']
	# bundle lmunit scores by batch and id
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	turn_scores = {}
	for batch_dial in batch_order:
		# convert batch dial id format
		batch_dial_id = batch_dial["id"]
		if batch_dial["type"] == "autotod_mwoz":
			prev_autotod_id = f"{batch_dial_id.upper()}.json"
			llm_dial = autotod_lmunit_dials[prev_autotod_id]
		elif batch_dial['type'] == "tau_retail":
			batch_dial_id = f"retail_{batch_dial_id}"
			llm_dial = tau_lmunit_dials[batch_dial_id]
		elif batch_dial['type'] == "tau_airline":
			batch_dial_id = f"airline_{batch_dial_id}"
			llm_dial = tau_lmunit_dials[batch_dial_id]
		# extract scores from lmunit json data
		turn_scores[batch_dial_id] = {}
		for turn_idx, turn in enumerate(llm_dial):    
			turn_score = turn["scores"]
			# skip turn score if any negative/invalid scores exist
			all_scores = np.array([
				turn_score['conv_consistency']["score"], 
				turn_score['backend_consistency']["score"], 
				turn_score['policy_completeness']["score"]
			])
			if np.any(all_scores <= 0):
				print("missing a score")
				continue
			# Get LLM scores
			for metric in dims:
				if metric not in turn_scores[batch_dial_id]:
					turn_scores[batch_dial_id][metric] = [
						turn_score[metric]["score"]
					]
				else:
					turn_scores[batch_dial_id][metric].append(
						turn_score[metric]["score"]
					)
		# get dialogue level score
		dial_scores[batch_dial_id] = {}
		# grab from TD-Eval 
		for metric in dims:
		# compile dialogue level scores by averaging turn level scores
			dial_scores[batch_dial_id][metric] = np.round(
				np.mean(turn_scores[batch_dial_id][metric]),
				decimals=2
			).item()
	return turn_scores, dial_scores

lmunit_batch_scores = {}
for i in batch_dials.keys():
	if i not in human_eval_csv:
		continue
	turn_scores, dial_scores = extract_lmunit_scores(
		autotod_lmunit_path,
		tau_lmunit_path,
		batch_order[i]
	)
	lmunit_batch_scores[i] = {
		"turn_level": turn_scores, 
		"dial_level": dial_scores 
	}
pprint.pprint(lmunit_batch_scores, compact=True)


{1: {'dial_level': {'airline_11': {'backend_consistency': 4.06,
                                   'conv_consistency': 4.4,
                                   'policy_completeness': 3.44},
                    'airline_12': {'backend_consistency': 3.62,
                                   'conv_consistency': 4.47,
                                   'policy_completeness': 3.64},
                    'airline_15': {'backend_consistency': 3.81,
                                   'conv_consistency': 4.4,
                                   'policy_completeness': 3.41},
                    'airline_19': {'backend_consistency': 3.82,
                                   'conv_consistency': 4.7,
                                   'policy_completeness': 3.33},
                    'airline_2': {'backend_consistency': 3.92,
                                  'conv_consistency': 4.74,
                                  'policy_completeness': 3.72},
                    'airline_22': {'backend_consistency'

### Extract Inform/Success of AutoTOD and Tau Reward from Tau Bench

In [234]:
autotod_dial_path = "datasets/out_basic_100_fm_eval.json"
tau_airline_dial_path = "results/agent-results-tau/tool-calling-gpt-4o-0.0_range_0--1_user-gpt-4o-llm_0114160308-airline.json"
tau_retail_dial_path = "results/agent-results-tau/tool-calling-gpt-4o-0.0_range_0--1_user-gpt-4o-llm_0114161231-retail.json"

with open(autotod_dial_path, 'r') as f:
	autotod_dial = json.load(f)
with open(tau_airline_dial_path, 'r') as f:
	tau_airline_dial = json.load(f)
with open(tau_retail_dial_path, 'r') as f:
	tau_retail_dial = json.load(f)

def extract_dial_level_scores(
	autotod_dial: dict,
	tau_airline_dial: list,
	tau_retail_dial: list,
	batch_order: dict
) -> dict:
	# extract autotod and tau bench dialogue scores
	dial_scores = {}
	for batch_dial in batch_order:
		# convert batch dial id format
		batch_dial_id = batch_dial["id"]
		if batch_dial["type"] == "autotod_mwoz":
			prev_autotod_id = f"{batch_dial_id.upper()}.json"
			dial_summary = autotod_dial[prev_autotod_id]["eval_summary"]
			# inform and success are only 1 if all domains are successful, otherwise 0
			inform = True
			success = True
			for domain, values in dial_summary.items():
				if "inform" in values:
					inform = inform and values["inform"]
				if "success" in values:
					success = success and values["success"]
			score = { "inform": int(inform), "success": int(success) }
		elif batch_dial['type'] == "tau_retail":
			score = tau_retail_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"retail_{batch_dial_id}"
		elif batch_dial['type'] == "tau_airline":
			score = tau_airline_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"airline_{batch_dial_id}"
		dial_scores[batch_dial_id] = score
	return dial_scores

dial_level_batch_scores = {}
for i in batch_dials.keys():
	if i not in human_eval_csv:
		continue
	turn_scores = extract_dial_level_scores(
		autotod_dial,
		tau_airline_dial,
		tau_retail_dial,
		batch_order[i]
	)
	dial_level_batch_scores[i] = turn_scores
pprint.pprint(dial_level_batch_scores, compact=True)

{1: {'airline_11': 0.0,
     'airline_2': 0.0,
     'mul0297': {'inform': 1, 'success': 1},
     'mul1076': {'inform': 0, 'success': 0},
     'pmul2859': {'inform': 1, 'success': 0},
     'pmul3731': {'inform': 1, 'success': 1},
     'retail_10': 1.0,
     'retail_6': 1.0,
     'sng01957': {'inform': 1, 'success': 1},
     'sng0323': {'inform': 1, 'success': 1}},
 2: {'airline_31': 1.0,
     'airline_34': 1.0,
     'mul0233': {'inform': 1, 'success': 1},
     'mul1624': {'inform': 1, 'success': 1},
     'pmul1105': {'inform': 1, 'success': 1},
     'pmul1210': {'inform': 1, 'success': 1},
     'pmul1657': {'inform': 1, 'success': 0},
     'pmul1931': {'inform': 1, 'success': 1},
     'retail_14': 0.0,
     'retail_21': 0.0},
 3: {'airline_15': 0.0,
     'airline_27': 1.0,
     'mul0144': {'inform': 1, 'success': 1},
     'mul0690': {'inform': 0, 'success': 0},
     'pmul2513': {'inform': 1, 'success': 0},
     'pmul3921': {'inform': 0, 'success': 0},
     'pmul4125': {'inform': 1, 'suc

### Inter-Rater Agreement Functions

In [235]:
def calculate_krippendorff_alpha(data):
	"""Calculate Krippendorff's alpha for ordinal data"""
	try:
		return alpha(reliability_data=data.astype(np.int64), value_domain=[1,2,3,4,5], level_of_measurement='interval')
	except Exception as e:
		print(f"Krippendorff calculation error: {e}")
		return None
    
def calculate_fleiss_kappa(data, n_cat):
	"""Calculate Fleiss' kappa (for more than 2 raters)"""
	try:
		data_table, _ = aggregate_raters(data=data, n_cat=n_cat)
		# randolph method gives best performance
		return fleiss_kappa(table=data_table, method='randolph') 
	except Exception as e:
		print(f"Fleiss calculation error: {e}")
		return None
    
def calculate_cohen_kappa(data, n_cat):
	"""Calculate Fleiss' kappa for 2 raters"""
	try:
		data_table, _ = to_table(data=data, bins=n_cat)
		return cohens_kappa(table=data_table, wt='linear')['kappa']
	except Exception as e:
		print(f"Cohen calculation error: {e}")
		return None
	
def calculate_gwet_ac(data: np.ndarray) -> float:
	"""Calculate Gwet's AC1 for data with skew in distribution"""
	cac_raters = CAC(pd.DataFrame(data))
	gwet = cac_raters.gwet()
	# pprint.pprint(gwet, compact=True)
	return float(gwet['est']['coefficient_value'])

### Calculate Turn-Level Correlations: Human vs TD-Eval, LMUnit

In [236]:
def calculate_turn_level_corrs(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict
):
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	# organize scores by batch
	batch_human_scores = {}
	batch_tdeval_scores = {}
	batch_lmunit_scores = {}
	# consolidate scores into one large array
	all_human_scores = []
	all_tdeval_scores = []
	all_lmunit_scores = []
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["turn_level"]
		tdeval_dials = tdeval_batch_scores[bId]["turn_level"]
		lmunit_dials = lmunit_batch_scores[bId]["turn_level"]
		batch_human_scores[bId] = []
		batch_tdeval_scores[bId] = []
		batch_lmunit_scores[bId] = []
		for dId in human_dials.keys():
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			lmunit_scores = lmunit_dials[dId]
			for metric in dims:
				batch_human_scores[bId] = np.concat(
					(batch_human_scores[bId], human_scores[metric])
				)
				all_human_scores = np.concat(
					(all_human_scores, human_scores[metric])
				)
				batch_tdeval_scores[bId] = np.concat(
					(batch_tdeval_scores[bId], tdeval_scores[metric])
				)
				all_tdeval_scores = np.concat(
					(all_tdeval_scores, tdeval_scores[metric])
				)
				batch_lmunit_scores[bId] = np.concat(
					(batch_lmunit_scores[bId], lmunit_scores[metric])
				)
				all_lmunit_scores = np.concat(
					(all_lmunit_scores, lmunit_scores[metric])
				)
	# take correlation for each batch
	turn_corrs = {}
	for bId in human_batch_scores.keys():
		tdeval_co, tdeval_p = stats.pearsonr(
			batch_human_scores[bId], 
			batch_tdeval_scores[bId]
		)
		lmunit_co, lmunit_p = stats.pearsonr(
			batch_human_scores[bId], 
			batch_lmunit_scores[bId]
		)
		turn_corrs[f"{bId}"] = {
			"human-tdeval": { "coeff": tdeval_co, "pval": tdeval_p }, 
			"human-lmunit": { "coeff": lmunit_co, "pval": lmunit_p }
		}

	tdeval_co, tdeval_p = stats.pearsonr(all_human_scores, all_tdeval_scores)
	lmunit_co, lmunit_p = stats.pearsonr(all_human_scores, all_lmunit_scores)
	turn_corrs["all"] = {
		"human-tdeval": { "coeff": tdeval_co, "pval": tdeval_p }, 
		"human-lmunit": { "coeff": lmunit_co, "pval": lmunit_p }
	}
	return {
		'tdeval': (tdeval_co.item(), tdeval_p.item()), 
		'lmunit': (lmunit_co.item(), lmunit_p.item())}

calculate_turn_level_corrs(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores
)

{'tdeval': (0.0941747420414556, 0.0015656288653562024),
 'lmunit': (0.04306800254779333, 0.14884908132078)}

### Calculate IRR For Turn Level Scores

In [237]:
def calculate_turn_level_irr(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict
):
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	# organize scores by batch
	batch_human_scores = {}
	batch_tdeval_scores = {}
	batch_lmunit_scores = {}
	# consolidate scores into one large array
	all_human_scores = []
	all_tdeval_scores = []
	all_lmunit_scores = []
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["turn_level"]
		tdeval_dials = tdeval_batch_scores[bId]["turn_level"]
		lmunit_dials = lmunit_batch_scores[bId]["turn_level"]
		batch_human_scores[bId] = []
		batch_tdeval_scores[bId] = []
		batch_lmunit_scores[bId] = []
		for dId in human_dials.keys():
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			lmunit_scores = lmunit_dials[dId]
			for metric in dims:
				batch_human_scores[bId] = np.concat(
					(batch_human_scores[bId], human_scores[metric])
				)
				all_human_scores = np.concat(
					(all_human_scores, human_scores[metric])
				)
				batch_tdeval_scores[bId] = np.concat(
					(batch_tdeval_scores[bId], tdeval_scores[metric])
				)
				all_tdeval_scores = np.concat(
					(all_tdeval_scores, tdeval_scores[metric])
				)
				batch_lmunit_scores[bId] = np.concat(
					(batch_lmunit_scores[bId], lmunit_scores[metric])
				)
				all_lmunit_scores = np.concat(
					(all_lmunit_scores, lmunit_scores[metric])
				)
	# take correlation for each batch
	turn_irr = {}
	for bId in human_batch_scores.keys():
		human_tdeval = np.vstack(
			(batch_human_scores[bId], batch_tdeval_scores[bId])
		)
		human_lmunit = np.vstack(
			(batch_human_scores[bId], batch_lmunit_scores[bId])
		)
		human_tdeval_irr = calculate_gwet_ac(human_tdeval)
		human_lmunit_irr = calculate_gwet_ac(human_lmunit)
		turn_irr[f"{bId}"] = {
			"human-tdeval": human_tdeval_irr, 
			"human-lmunit": human_lmunit_irr
		}

	human_tdeval = np.vstack(
		(all_human_scores, all_tdeval_scores)
	)
	human_lmunit = np.vstack(
		(all_human_scores, all_lmunit_scores)
	)
	human_tdeval_irr = calculate_gwet_ac(human_tdeval)
	human_lmunit_irr = calculate_gwet_ac(human_lmunit)
	turn_irr["all"] = {
		"human-tdeval": human_tdeval_irr, 
		"human-lmunit": human_lmunit_irr
	}
	return turn_irr

calculate_turn_level_irr(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores
)

{'1': {'human-tdeval': 0.73463, 'human-lmunit': 0.45449},
 '2': {'human-tdeval': 0.57557, 'human-lmunit': 0.3229},
 '3': {'human-tdeval': 0.65789, 'human-lmunit': 0.4107},
 '4': {'human-tdeval': 0.57024, 'human-lmunit': 0.28019},
 '5': {'human-tdeval': 0.59388, 'human-lmunit': 0.40387},
 '7': {'human-tdeval': 0.58513, 'human-lmunit': 0.29608},
 '9': {'human-tdeval': 0.69996, 'human-lmunit': 0.42785},
 'all': {'human-tdeval': 0.5273, 'human-lmunit': 0.28467}}

### Calculate Dialogue-Level Correlations: Human vs TD-Eval, LMUnit, AutoTOD, Inform/Success + Tau Reward

In [244]:
def calculate_dial_level_corrs(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict,
	dial_level_batch_scores: dict
):
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	# organize scores by batch
	batch_human_scores = {}
	batch_tdeval_scores = {}
	batch_lmunit_scores = {}
	batch_dial_inform_scores = {}
	batch_dial_success_scores = {}
	# consolidate scores into one large array
	all_human_scores = []
	all_tdeval_scores = []
	all_lmunit_scores = []
	all_dial_inform_scores = []
	all_dial_success_scores = []
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["dial_level"]
		tdeval_dials = tdeval_batch_scores[bId]["dial_level"]
		lmunit_dials = lmunit_batch_scores[bId]["dial_level"]
		dial_level_dials = dial_level_batch_scores[bId]
		batch_human_scores[bId] = []
		batch_tdeval_scores[bId] = []
		batch_lmunit_scores[bId] = []
		batch_dial_inform_scores[bId] = []
		batch_dial_success_scores[bId] = []
		for dId in human_dials.keys():
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			lmunit_scores = lmunit_dials[dId]
			dial_level_scores = dial_level_dials[dId]
			# Inform/Success/Tau reward need to be concatenated differently
			if ("airline" not in dId) and ("retail" not in dId):
				batch_dial_inform_scores[bId] = np.append(
					batch_dial_inform_scores[bId], 
					dial_level_scores["inform"]
				)
				batch_dial_success_scores[bId] = np.append(
					batch_dial_success_scores[bId],
					dial_level_scores["success"]
				)
				all_dial_inform_scores = np.append(
					all_dial_inform_scores,
	  			dial_level_scores["inform"]
				)
				all_dial_success_scores = np.append(
					all_dial_success_scores, 
					dial_level_scores["success"]
				)
			else:
				batch_dial_inform_scores[bId] = np.append(
					batch_dial_inform_scores[bId], 
					dial_level_scores
				)
				batch_dial_success_scores[bId] = np.append(
					batch_dial_success_scores[bId], 
					dial_level_scores
				)
				all_dial_inform_scores = np.append(
					all_dial_inform_scores, 
					dial_level_scores
				)
				all_dial_success_scores = np.append(
					all_dial_success_scores, 
					dial_level_scores
				)
			dial_level_scores = dial_level_dials[dId]
			for metric in dims:
				batch_human_scores[bId] = np.append(
					batch_human_scores[bId], 
					human_scores[metric]
				)
				all_human_scores = np.append(
					all_human_scores, 
					human_scores[metric]
				)
				batch_tdeval_scores[bId] = np.append(
					batch_tdeval_scores[bId], 
					tdeval_scores[metric]
				)
				all_tdeval_scores = np.append(
					all_tdeval_scores, 
					tdeval_scores[metric]
				)
				batch_lmunit_scores[bId] = np.append(
					batch_lmunit_scores[bId], 
					lmunit_scores[metric]
				)
				all_lmunit_scores = np.append(
					all_lmunit_scores, 
					lmunit_scores[metric]
				)

	# take correlation for each batch
	dial_corrs = {}
	for bId in human_batch_scores.keys():
		avg_human = np.mean(batch_human_scores[bId].reshape((-1, 3)), axis=1)
		avg_human = np.where(avg_human >= 4, 1, 0)
		avg_tdeval = np.mean(batch_tdeval_scores[bId].reshape(-1, 3), axis=1)
		avg_tdeval = np.where(avg_tdeval >= 4, 1, 0)
		avg_lmunit = np.mean(batch_lmunit_scores[bId].reshape(-1, 3), axis=1)
		avg_lmunit = np.where(avg_lmunit >= 4, 1, 0)
		tdeval_co, tdeval_p = stats.pearsonr(avg_human, avg_tdeval)
		lmunit_co, lmunit_p = stats.pearsonr(avg_human, avg_lmunit)
		inform_co, inform_p = stats.pearsonr(
			avg_human, 
			batch_dial_inform_scores[bId]
		)
		success_co, success_p = stats.pearsonr(
			avg_human, 
			batch_dial_success_scores[bId]
		)
		dial_corrs[f"{bId}"] = {
			"human-tdeval": { "coeff": tdeval_co.item(), "pval": tdeval_p.item() }, 
			"human-lmunit": { "coeff": lmunit_co.item(), "pval": lmunit_p.item() },
			"human-inform": { "coeff": inform_co.item(), "pval": inform_p.item() },
			"human-success": { "coeff": success_co.item(), "pval": success_p.item() }
		}

	# take correlation for all
	avg_all_human = np.mean(all_human_scores.reshape((-1, 3)), axis=1)
	avg_all_human = np.where(avg_all_human >= 4, 1, 0)
	avg_all_tdeval = np.mean(all_tdeval_scores.reshape(-1, 3), axis=1)
	avg_all_tdeval = np.where(avg_all_tdeval >= 4, 1, 0)
	avg_all_lmunit = np.mean(all_lmunit_scores.reshape((-1, 3)), axis=1)
	avg_all_lmunit = np.where(avg_all_lmunit >= 4, 1, 0)
	tdeval_co, tdeval_p = stats.pearsonr(avg_all_human, avg_all_tdeval)
	lmunit_co, lmunit_p = stats.pearsonr(avg_all_human, avg_all_lmunit)
	inform_co, inform_p = stats.pearsonr(avg_all_human, all_dial_inform_scores)
	success_co, success_p = stats.pearsonr(avg_all_human, all_dial_success_scores)
	dial_corrs["all"] = {
		"human-tdeval": { "coeff": tdeval_co.item(), "pval": tdeval_p.item() }, 
		"human-lmunit": { "coeff": lmunit_co.item(), "pval": lmunit_p.item() },
		"human-inform": { "coeff": inform_co.item(), "pval": inform_p.item() },
		"human-success": { "coeff": success_co.item(), "pval": success_p.item() }
	}
	return dial_corrs

dial_level_corr = calculate_dial_level_corrs(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores,
	dial_level_batch_scores
)
pprint.pprint(dial_level_corr["all"], compact=True)

{'human-inform': {'coeff': 0.20080483222562467, 'pval': 0.09554854566179914},
 'human-lmunit': {'coeff': 0.22986138207504433, 'pval': 0.055588473821794014},
 'human-success': {'coeff': 0.1317183200654749, 'pval': 0.2770652265015951},
 'human-tdeval': {'coeff': -0.12903225806451613, 'pval': 0.28706555486918506}}


  tdeval_co, tdeval_p = stats.pearsonr(avg_human, avg_tdeval)
  lmunit_co, lmunit_p = stats.pearsonr(avg_human, avg_lmunit)
  inform_co, inform_p = stats.pearsonr(
  success_co, success_p = stats.pearsonr(


### Calculate Dialogue-Level IRR: Human vs TD-Eval, LMUnit, AutoTOD, Inform/Success + Tau Reward

In [None]:
def calculate_dial_level_corrs(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict,
	dial_level_batch_scores: dict
):
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	# organize scores by batch
	batch_human_scores = {}
	batch_tdeval_scores = {}
	batch_lmunit_scores = {}
	batch_dial_inform_scores = {}
	batch_dial_success_scores = {}
	# consolidate scores into one large array
	all_human_scores = []
	all_tdeval_scores = []
	all_lmunit_scores = []
	all_dial_inform_scores = []
	all_dial_success_scores = []
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["dial_level"]
		tdeval_dials = tdeval_batch_scores[bId]["dial_level"]
		lmunit_dials = lmunit_batch_scores[bId]["dial_level"]
		dial_level_dials = dial_level_batch_scores[bId]
		batch_human_scores[bId] = []
		batch_tdeval_scores[bId] = []
		batch_lmunit_scores[bId] = []
		batch_dial_inform_scores[bId] = []
		batch_dial_success_scores[bId] = []
		for dId in human_dials.keys():
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			lmunit_scores = lmunit_dials[dId]
			dial_level_scores = dial_level_dials[dId]
			# Inform/Success/Tau reward need to be concatenated differently
			if ("airline" not in dId) and ("retail" not in dId):
				batch_dial_inform_scores[bId] = np.append(
					batch_dial_inform_scores[bId], 
					dial_level_scores["inform"]
				)
				batch_dial_success_scores[bId] = np.append(
					batch_dial_success_scores[bId],
					dial_level_scores["success"]
				)
				all_dial_inform_scores = np.append(
					all_dial_inform_scores,
	  			dial_level_scores["inform"]
				)
				all_dial_success_scores = np.append(
					all_dial_success_scores, 
					dial_level_scores["success"]
				)
			else:
				batch_dial_inform_scores[bId] = np.append(
					batch_dial_inform_scores[bId], 
					dial_level_scores
				)
				batch_dial_success_scores[bId] = np.append(
					batch_dial_success_scores[bId], 
					dial_level_scores
				)
				all_dial_inform_scores = np.append(
					all_dial_inform_scores, 
					dial_level_scores
				)
				all_dial_success_scores = np.append(
					all_dial_success_scores, 
					dial_level_scores
				)
			dial_level_scores = dial_level_dials[dId]
			for metric in dims:
				batch_human_scores[bId] = np.append(
					batch_human_scores[bId], 
					human_scores[metric]
				)
				all_human_scores = np.append(
					all_human_scores, 
					human_scores[metric]
				)
				batch_tdeval_scores[bId] = np.append(
					batch_tdeval_scores[bId], 
					tdeval_scores[metric]
				)
				all_tdeval_scores = np.append(
					all_tdeval_scores, 
					tdeval_scores[metric]
				)
				batch_lmunit_scores[bId] = np.append(
					batch_lmunit_scores[bId], 
					lmunit_scores[metric]
				)
				all_lmunit_scores = np.append(
					all_lmunit_scores, 
					lmunit_scores[metric]
				)
	# take IRR for each batch
	dial_irr = {}
	for bId in human_batch_scores.keys():
		avg_human = np.mean(batch_human_scores[bId].reshape((-1, 3)), axis=1)
		avg_human = np.where(avg_human >= 4, 1, 0)
		avg_tdeval = np.mean(batch_tdeval_scores[bId].reshape(-1, 3), axis=1)
		avg_tdeval = np.where(avg_tdeval >= 4, 1, 0)
		avg_lmunit = np.mean(batch_lmunit_scores[bId].reshape(-1, 3), axis=1)
		avg_lmunit = np.where(avg_lmunit >= 4, 1, 0)
		human_tdeval = np.vstack(
			(avg_human, avg_tdeval)
		)
		human_tdeval_irr = calculate_gwet_ac(human_tdeval)
		human_lmunit = np.vstack(
			(avg_human, avg_lmunit)
		)
		human_lmunit_irr = calculate_gwet_ac(human_lmunit)
		human_inform = np.vstack(
			(avg_human, batch_dial_inform_scores[bId])
		)
		human_inform_irr = calculate_gwet_ac(human_inform)
		human_success = np.vstack(
			(avg_human, batch_dial_success_scores[bId])
		)
		human_success_irr = calculate_gwet_ac(human_success)
		dial_irr[f"{bId}"] = {
			"human-tdeval": human_tdeval_irr, 
			"human-lmunit": human_lmunit_irr,
			"human-inform": human_inform_irr,
			"human-success": human_success_irr
		}

	# take correlation for all
	avg_all_human = np.mean(all_human_scores.reshape((-1, 3)), axis=1)
	avg_all_human = np.where(avg_all_human >= 4, 1, 0)
	avg_all_tdeval = np.mean(all_tdeval_scores.reshape(-1, 3), axis=1)
	avg_all_tdeval = np.where(avg_all_tdeval >= 4, 1, 0)
	avg_all_lmunit = np.mean(all_lmunit_scores.reshape((-1, 3)), axis=1)
	avg_all_lmunit = np.where(avg_all_lmunit >= 4, 1, 0)
	human_tdeval = np.vstack(
		(avg_all_human, avg_all_tdeval)
	)
	human_tdeval_irr = calculate_gwet_ac(human_tdeval)
	human_lmunit = np.vstack(
		(avg_all_human, avg_all_lmunit)
	)
	human_lmunit_irr = calculate_gwet_ac(human_lmunit)
	human_inform = np.vstack(
		(avg_all_human, all_dial_inform_scores)
	)
	human_tdeval_irr = calculate_gwet_ac(human_tdeval)
	human_inform = np.vstack(
		(avg_all_human, all_dial_inform_scores)
	)
	human_inform_irr = calculate_gwet_ac(human_inform)
	human_success = np.vstack(
		(avg_all_human, all_dial_success_scores)
	)
	human_success_irr = calculate_gwet_ac(human_success)
	dial_irr["all"] = {
		"human-tdeval": human_tdeval_irr, 
		"human-lmunit": human_lmunit_irr,
		"human-inform": human_inform_irr,
		"human-success": human_success_irr
	}
	return dial_irr

dial_level_corr = calculate_dial_level_corrs(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores,
	dial_level_batch_scores
)
pprint.pprint(dial_level_corr["all"], compact=True)

{'human-inform': 0.47536, 'human-success': 0.40976, 'human-tdeval': 0.74248}


  (weights_mat_sum / (self.q * (self.q - 1)))
  (weights_mat_sum / (self.q * (self.q - 1)))
  (weights_mat_sum / (self.q * (self.q - 1)))
  (weights_mat_sum / (self.q * (self.q - 1)))


### Compile Results and Save to File

In [240]:
# comparison_results = {
# 	'turn_level_scores': scores,
# 	'inter_annotator_agreement': agreement_metrics,
# 	'human_llm_agreement': human_llm_corr,
# 	'llm_mwzeval_agreement': llm_mwzeval_agreement,
# 	'human_mwzeval_agreement': human_mwzeval_corr
# }
# output_dir = "agreement_scores"
# with open(os.path.join(output_dir, 'main_human_eval.json'), 'w') as f:
# 	json.dump(comparison_results, f, indent=4)    
# print(f"\nResults saved to {output_dir}")