In [None]:
import json
import os
import re
import numpy as np
import pprint
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from krippendorff import alpha
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters, cohens_kappa, to_table
from irrCAC.raw import CAC

#### Create Variables and Load Dialogues from JSON + Batch IDs

In [None]:
# Paths to the TD-Eval judge outputs and to the human-eval batch definitions.
mwoz_judge_json = "results/judge_results_mwoz_autotod/20250403_025805/mwoz-autotod-gpt-4o_j.json"
tau_air_judge_json = "results/judge-results-tau/20250131_152503-tau-4o-airline/tau-gpt-4o_j.json"
tau_retail_judge_json = "results/judge-results-tau/20250131_152422-tau-4o-retail/tau-gpt-4o_j.json"
human_eval_batch_dir = "datasets/main_human_eval"

# load dialogues: order mwoz, tau-retail, tau-airline
with open(mwoz_judge_json, 'r') as f:
	mwoz_judge = json.load(f)
# default to an empty dict (not a list): downstream code iterates with .items()
mwoz_dials = mwoz_judge.get('dialogues', {})
with open(tau_retail_judge_json, 'r') as f:
	tau_retail_judge = json.load(f)
tau_retail_dials = tau_retail_judge['dialogues']
with open(tau_air_judge_json, 'r') as f:
	tau_air_judge = json.load(f)
tau_air_dials = tau_air_judge['dialogues']

# load batches
num_batches = 10
batch_list = {}
batch_order = {}
for ind in range(1, num_batches + 1):
	with open(f"{human_eval_batch_dir}/batch{ind}.json", 'r') as f:
		curr_batch_list = json.load(f)
	if not curr_batch_list:
		# raise instead of exit(): exit() does not give a useful traceback in a notebook
		raise ValueError(f"No batches found at this path: {ind}")
	# Each batch gets its own fresh entry, so the ids are stored directly.
	# (The former "extend" branches tested the int-keyed outer dict and could never run.)
	batch_list[ind] = {
		"autotod_mwoz": curr_batch_list["autotod_mwoz"],
		"tau": curr_batch_list["tau"],
	}
	batch_order[ind] = curr_batch_list["order"]

pprint.pprint(batch_list, compact=True)
pprint.pprint(batch_order, compact=True)

#### Compile Relevant Dialogues From Batches

In [None]:
def get_batch_dialogues(
	mwoz_dialogues: dict, 
	tau_air_dialogues: dict, 
	tau_retail_dialogues: dict, 
	batch_list: dict
) -> list:
	"""Collect the dialogues referenced by one human-eval batch.

	Args:
		mwoz_dialogues: MultiWOZ dialogues keyed by file-style id (e.g. "XYZ.json").
			Keys are matched against batch ids after dropping ".json" and lowercasing.
		tau_air_dialogues: Tau airline dialogues keyed by the id used in the batch file.
		tau_retail_dialogues: Tau retail dialogues keyed by the id used in the batch file.
		batch_list: one batch definition with keys "autotod_mwoz" and
			"tau" -> {"airline", "retail"}, each holding a list of ids.

	Returns:
		List of dialogues ordered mwoz, tau-retail, tau-airline, following the
		id order inside each group.

	Raises:
		ValueError: if any batch id has no matching dialogue.
	"""
	mwoz_batch_ids = batch_list["autotod_mwoz"]
	tau_air_batch_ids = batch_list["tau"]["airline"]
	tau_retail_batch_ids = batch_list["tau"]["retail"]

	# Normalised MultiWOZ lookup; setdefault keeps the first dialogue when two
	# keys normalise to the same id, matching a first-match linear scan.
	mwoz_by_id = {}
	for dial_key, dial in mwoz_dialogues.items():
		mwoz_by_id.setdefault(dial_key.split(".json")[0].lower(), dial)

	# load batch (order: mwoz, tau-retail, tau-airline)
	groups = (
		("autotod_mwoz", mwoz_batch_ids, mwoz_by_id),
		("tau_retail", tau_retail_batch_ids, tau_retail_dialogues),
		("tau_airline", tau_air_batch_ids, tau_air_dialogues),
	)
	batch_dials = []
	missing = []
	for group_name, batch_ids, lookup in groups:
		for batch_id in batch_ids:
			if batch_id in lookup:
				batch_dials.append(lookup[batch_id])
			else:
				missing.append((group_name, batch_id))

	tot_batch_len = len(mwoz_batch_ids) + len(tau_air_batch_ids) + len(tau_retail_batch_ids)
	if missing:
		raise ValueError(
			"filtered dials size does not match batches: "
			f"{len(batch_dials)} {tot_batch_len}; missing ids: {missing}"
		)
	return batch_dials

# get batch dialogues for all eval batches
# batch_dials maps each batch index to the list returned by get_batch_dialogues
batch_dials = {}
for b_ind, b_list in batch_list.items():
	dials = get_batch_dialogues(
		mwoz_dials, 
		tau_air_dials, 
		tau_retail_dials, 
		b_list
	)
	batch_dials[b_ind] = dials

# pprint.pprint(batch_dials, compact=True)

#### Extract Qualtrics Human Eval CSV Data

In [None]:
def extract_human_csv_data(
	human_eval_csv: dict, 
	batch_dialogues: list, 
	batch_order: list
) -> tuple:
	"""Read one Qualtrics CSV export and map its rating columns onto dialogues.

	Columns between `start_col` and `end_col` (inclusive) are consumed left to
	right: for every dialogue, three columns (conv, backend, policy) per turn,
	followed by three dialogue-level columns.

	Args:
		human_eval_csv: dict with keys "csv_file", "start_col" and "end_col".
		batch_dialogues: dialogues of the batch; only the number of turns
			(`len` via iteration) of each dialogue is used.
		batch_order: per-dialogue entries with "type" and "id", index-aligned
			with `batch_dialogues`.

	Returns:
		(turn_result, dial_result). Both map dialogue id -> metric -> 1-D float
		array. Turn-level arrays concatenate the response rows of each turn in
		turn order; dialogue-level arrays hold one value per response row.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	n_dims = len(dims)
	eval_csv = pd.read_csv(human_eval_csv["csv_file"], on_bad_lines='warn')
	start_col = human_eval_csv['start_col']
	end_col = human_eval_csv['end_col']
	# Rows before the first StartDate containing this string are skipped
	# (presumably Qualtrics header/metadata rows -- verify against the export).
	search_str = '2025'
	first_eval_row = eval_csv.StartDate.str.contains(search_str, na=False).idxmax()
	human_scores = eval_csv.loc[first_eval_row:, start_col:end_col].to_numpy()
	mapping = {
		"Very Good": 5.0, 
		"Good": 4.0, 
		"Fair": 3.0, 
		"Bad": 2.0, 
		"Very Bad": 1.0
	}
	# otypes fixes the output dtype and lets vectorize handle zero response rows
	vectorized_map = np.vectorize(lambda x: mapping[x.strip()], otypes=[float])
	int_scores = vectorized_map(human_scores)

	turn_result = {}
	dial_result = {}
	# extract scores into results dialogue map
	scores_idx = 0
	for dial_idx, dial in enumerate(batch_dialogues):
		order_entry = batch_order[dial_idx]
		if order_entry["type"] == "tau_retail":
			dial_id = f"retail_{order_entry['id']}"
		elif order_entry["type"] == "tau_airline":
			dial_id = f"airline_{order_entry['id']}"
		else:
			dial_id = order_entry["id"]
		# add turn scores: gather the column slices, concatenate once per metric
		per_metric = {metric: [] for metric in dims}
		for _ in dial:
			for offset, metric in enumerate(dims):
				per_metric[metric].append(int_scores[:, scores_idx + offset])
			scores_idx += n_dims
		# np.concatenate (not np.concat) so this also runs on NumPy < 2.0;
		# a dialogue without turns keeps an empty dict
		turn_result[dial_id] = {
			metric: np.concatenate(chunks)
			for metric, chunks in per_metric.items()
			if chunks
		}
		# add dial scores
		dial_result[dial_id] = {
			metric: int_scores[:, scores_idx + offset]
			for offset, metric in enumerate(dims)
		}
		scores_idx += n_dims
	return turn_result, dial_result

# Qualtrics export per human-eval batch: every file shares the same path
# pattern and first rating column; only the last rating column differs.
human_eval_csv = {
	batch_num: {
		"csv_file": f"qualtrics/results/main_human_eval/eval_{batch_num}.csv",
		"start_col": "QID3_1",
		"end_col": last_col,
	}
	for batch_num, last_col in (
		(1, "QID187_3"),
		(2, "QID135_3"),
		(3, "QID180_3"),
		(4, "QID123_3"),
		(5, "QID129_3"),
		(6, "QID341_3"),
		(7, "QID137_3"),
		(8, "QID142_3"),
		(9, "QID115_3"),
	)
}

# Human ratings per batch: batch index -> {"turn_level": ..., "dial_level": ...}
human_batch_scores = {}
for i in batch_dials.keys():
	# batches without a CSV entry in human_eval_csv are skipped
	if i not in human_eval_csv:
		continue
	turn_eval_data, dial_eval_data = extract_human_csv_data(
		human_eval_csv[i], 
		batch_dials[i], 
		batch_order[i]
	)
	human_batch_scores[i] = { 
		"turn_level": turn_eval_data, 
		"dial_level": dial_eval_data
	}

pprint.pprint(human_batch_scores, compact=True)

#### Extract TD-Eval Scores

In [None]:
"""Compare human evaluation data with LLM evaluation data"""    
def extract_score(score_str: str) -> int:
	"""Parse a judge score out of free text.

	A "Score: <digits>" pattern is tried first. If it is absent, the text is
	searched for a rating label; the two-word labels are tested before their
	one-word suffixes so that "Very Bad" is not mistaken for "Bad".

	Args:
		score_str: judge output; non-string values are converted with str().

	Returns:
		The parsed integer score, or 5 when nothing can be extracted
		(the fallback value is kept for backward compatibility).
	"""
	text = str(score_str)
	# regex matching
	match = re.search(r'Score: (\d+)', text)
	if match:
		return int(match.group(1))
	print("Score not found in string:",score_str)
	print("Checking substring")
	# check more detailed string first ("Very Bad" before "Bad")
	label_values = (
		("Very Good", 5),
		("Very Bad", 1),
		("Good", 4),
		("Fair", 3),
		("Bad", 2),
	)
	for label, value in label_values:
		if label in text:
			return value
	print("Score still not found with substring check")
	return 5

def extract_tdeval_scores(
	batch_dialogues: list, 
	autotod_dial_level: dict,
	tau_retail_dial_level: dict,
	tau_airline_dial_level: dict,
	batch_order: dict
) -> tuple:
	"""Extract TD-Eval turn-level and dialogue-level scores for one batch.

	Args:
		batch_dialogues: dialogues of the batch, index-aligned with `batch_order`;
			each turn carries turn["scores"][metric]["score"].
		autotod_dial_level: dialogue-level results keyed by "<ID>.json" (upper-case id).
		tau_retail_dial_level: dialogue-level results keyed by the raw batch id.
		tau_airline_dial_level: dialogue-level results keyed by the raw batch id.
		batch_order: iterable of entries with "id" and "type".

	Returns:
		(turn_scores, dial_scores). turn_scores maps dialogue id -> metric ->
		list of ints (turns with any score <= 0 are dropped); dial_scores maps
		dialogue id -> metric -> int. Tau ids are prefixed "retail_"/"airline_".

	Raises:
		ValueError: for an unrecognised dialogue type. Without this check the
			dialogue-level score of the previous dialogue would be reused silently.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	turn_scores = {}
	dial_scores = {}
	for idx, batch_dial in enumerate(batch_order):
		# convert batch dial id format and pick the dialogue-level source
		raw_id = batch_dial["id"]
		dial_type = batch_dial['type']
		if dial_type == "autotod_mwoz":
			batch_dial_id = raw_id
			dial_level_source = autotod_dial_level
			orig_dial_id = f"{raw_id.upper()}.json"
		elif dial_type == "tau_retail":
			batch_dial_id = f"retail_{raw_id}"
			dial_level_source = tau_retail_dial_level
			orig_dial_id = str(raw_id)
		elif dial_type == "tau_airline":
			batch_dial_id = f"airline_{raw_id}"
			dial_level_source = tau_airline_dial_level
			orig_dial_id = str(raw_id)
		else:
			raise ValueError(f"Unknown dialogue type {dial_type!r} for id {raw_id!r}")

		# extract scores from td-eval json data
		per_metric = {}
		for turn in batch_dialogues[idx]:
			turn_score = turn["scores"]
			# parse each metric once (parsing may print diagnostics)
			parsed = [extract_score(turn_score[metric]["score"]) for metric in dims]
			# skip turn score if any negative/invalid scores exist
			if np.any(np.array(parsed) <= 0):
				print("missing a score")
				continue
			for metric, value in zip(dims, parsed):
				per_metric.setdefault(metric, []).append(value)
		turn_scores[batch_dial_id] = per_metric

		# get dialogue level score from TD-Eval
		dial_entry = dial_level_source[orig_dial_id]
		dial_scores[batch_dial_id] = {
			metric: extract_score(dial_entry[metric]["score"]) for metric in dims
		}

	return turn_scores, dial_scores

# Dialogue-level TD-Eval judge results; each file is read and its "dialogues" entry kept.
autotod_dial_level_path = "results/dial_level_results/autotod/mwoz-dial-level-gpt-4o_j.json"
with open(autotod_dial_level_path, 'r') as f:
	autotod_dial_level_data = json.load(f)
autotod_dial_level = autotod_dial_level_data["dialogues"]
tau_retail_dial_level_path = "results/dial_level_results/tau/retail-dial-level-gpt-4o_j.json"
with open(tau_retail_dial_level_path, 'r') as f:
	tau_retail_dial_level_data = json.load(f)
tau_retail_dial_level = tau_retail_dial_level_data["dialogues"]
tau_airline_dial_level_path = "results/dial_level_results/tau/airline-dial-level-gpt-4o_j.json"
with open(tau_airline_dial_level_path, 'r') as f:
	tau_airline_dial_level_data = json.load(f)
tau_airline_dial_level = tau_airline_dial_level_data["dialogues"]

# TD-Eval scores per batch: batch index -> {"turn_level": ..., "dial_level": ...}
tdeval_batch_scores = {}
for i in batch_dials.keys():
	# only batches that have a human-eval CSV entry are processed
	if i not in human_eval_csv:
		continue
	turn_scores, dial_scores = extract_tdeval_scores(
		batch_dials[i], 
		autotod_dial_level,
		tau_retail_dial_level,
		tau_airline_dial_level,
		batch_order[i]
	)
	tdeval_batch_scores[i] = {
		"turn_level": turn_scores, 
		"dial_level": dial_scores 
	}
pprint.pprint(tdeval_batch_scores, compact=True)

#### Extract LMUnit Scores

In [None]:
def extract_lmunit_scores(
	autotod_lmunit_dials: dict, 
	tau_lmunit_dials: dict, 
	lmunit_dial_level: dict,
	batch_order: dict
):
	"""Bundle LMUnit turn-level and dialogue-level scores for one batch.

	Args:
		autotod_lmunit_dials: AutoTOD dialogues keyed by "<ID>.json" (upper-case id).
		tau_lmunit_dials: Tau dialogues keyed by "retail_<id>" / "airline_<id>".
		lmunit_dial_level: dialogue-level scores keyed by the batch dialogue id
			(raw id for AutoTOD, prefixed id for Tau).
		batch_order: iterable of entries with "id" and "type".

	Returns:
		(turn_scores, dial_scores). turn_scores maps dialogue id -> metric ->
		list of scores (turns with any score <= 0 are dropped); dial_scores maps
		dialogue id -> the matching `lmunit_dial_level` entry.

	Raises:
		ValueError: for an unrecognised dialogue type. Without this check the
			turns of the previous dialogue would be reused silently.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	turn_scores = {}
	dial_scores = {}
	for batch_dial in batch_order:
		# convert batch dial id format
		raw_id = batch_dial["id"]
		dial_type = batch_dial["type"]
		if dial_type == "autotod_mwoz":
			batch_dial_id = raw_id
			llm_dial = autotod_lmunit_dials[f"{raw_id.upper()}.json"]
		elif dial_type == "tau_retail":
			batch_dial_id = f"retail_{raw_id}"
			llm_dial = tau_lmunit_dials[batch_dial_id]
		elif dial_type == "tau_airline":
			batch_dial_id = f"airline_{raw_id}"
			llm_dial = tau_lmunit_dials[batch_dial_id]
		else:
			raise ValueError(f"Unknown dialogue type {dial_type!r} for id {raw_id!r}")

		# extract scores from lmunit json data
		per_metric = {}
		for turn in llm_dial:
			turn_score = turn["scores"]
			values = [turn_score[metric]["score"] for metric in dims]
			# skip turn score if any negative/invalid scores exist
			if np.any(np.array(values) <= 0):
				print("missing a score")
				continue
			for metric, value in zip(dims, values):
				per_metric.setdefault(metric, []).append(value)
		turn_scores[batch_dial_id] = per_metric
		# get dialogue level score
		dial_scores[batch_dial_id] = lmunit_dial_level[batch_dial_id]

	return turn_scores, dial_scores

# LMUnit judge outputs (turn-level for AutoTOD and Tau) and dialogue-level scores.
autotod_lmunit_path = "results/judge_results_lmunit/autotod/mwoz-autotod-lmunit_j.json"
tau_lmunit_path = "results/judge_results_lmunit/tau/lmunit_scores.json"
lmunit_dial_level_path = "results/dial_level_results/lmunit/lmunit_dial_scores.json"

# load lmunit dialogue files
with open(autotod_lmunit_path, 'r') as f:
	autotod_lmunit_judge = json.load(f)
autotod_lmunit_dials = autotod_lmunit_judge['dialogues']
with open(tau_lmunit_path) as f:
	tau_lmunit_judge = json.load(f)
tau_lmunit_dials = tau_lmunit_judge['dialogues']
# the dialogue-level file is used as loaded (no "dialogues" wrapper is read)
with open(lmunit_dial_level_path, 'r') as f:
	lmunit_dial_level = json.load(f)

# LMUnit scores per batch: batch index -> {"turn_level": ..., "dial_level": ...}
lmunit_batch_scores = {}
for i in batch_dials.keys():
	# only batches that have a human-eval CSV entry are processed
	if i not in human_eval_csv:
		continue
	turn_scores, dial_scores = extract_lmunit_scores(
		autotod_lmunit_dials,
		tau_lmunit_dials,
		lmunit_dial_level,
		batch_order[i]
	)
	lmunit_batch_scores[i] = {
		"turn_level": turn_scores, 
		"dial_level": dial_scores 
	}
pprint.pprint(lmunit_batch_scores, compact=True)


#### Extract Inform/Success of AutoTOD and Tau Reward from Tau Bench

In [None]:
# Agent result files: AutoTOD evaluation output and the two Tau-Bench runs.
autotod_dial_path = "datasets/out_basic_100_fm_eval.json"
tau_airline_dial_path = "results/agent-results-tau/tool-calling-gpt-4o-0.0_range_0--1_user-gpt-4o-llm_0114160308-airline.json"
tau_retail_dial_path = "results/agent-results-tau/tool-calling-gpt-4o-0.0_range_0--1_user-gpt-4o-llm_0114161231-retail.json"

with open(autotod_dial_path, 'r') as f:
	autotod_dial = json.load(f)
with open(tau_airline_dial_path, 'r') as f:
	tau_airline_dial = json.load(f)
with open(tau_retail_dial_path, 'r') as f:
	tau_retail_dial = json.load(f)

def extract_trad_scores(
	autotod_dial: dict,
	tau_airline_dial: list,
	tau_retail_dial: list,
	batch_order: dict
) -> dict:
	"""Extract AutoTOD inform/success and Tau reward scores for one batch.

	Args:
		autotod_dial: AutoTOD results keyed by "<ID>.json" (upper-case id); each
			entry has an "eval_summary" mapping domain -> dict that may contain
			"inform" and/or "success".
		tau_airline_dial: sequence indexed by int(batch id); entries have "reward".
		tau_retail_dial: sequence indexed by int(batch id); entries have "reward".
		batch_order: iterable of entries with "id" and "type".

	Returns:
		Dict mapping dialogue id to {"inform": 0/1, "success": 0/1} for AutoTOD
		dialogues, or to the raw reward for Tau dialogues (ids prefixed
		"retail_"/"airline_").

	Raises:
		ValueError: for an unrecognised dialogue type. Without this check the
			score of the previous dialogue would be reused silently.
	"""
	# extract autotod and tau bench dialogue scores
	dial_scores = {}
	for batch_dial in batch_order:
		# convert batch dial id format
		batch_dial_id = batch_dial["id"]
		dial_type = batch_dial["type"]
		if dial_type == "autotod_mwoz":
			prev_autotod_id = f"{batch_dial_id.upper()}.json"
			dial_summary = autotod_dial[prev_autotod_id]["eval_summary"]
			# inform and success are only 1 if all domains are successful, otherwise 0
			inform = True
			success = True
			for values in dial_summary.values():
				if "inform" in values:
					inform = inform and values["inform"]
				if "success" in values:
					success = success and values["success"]
			score = { "inform": int(inform), "success": int(success) }
		elif dial_type == "tau_retail":
			score = tau_retail_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"retail_{batch_dial_id}"
		elif dial_type == "tau_airline":
			score = tau_airline_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"airline_{batch_dial_id}"
		else:
			raise ValueError(f"Unknown dialogue type {dial_type!r} for id {batch_dial_id!r}")
		dial_scores[batch_dial_id] = score
	return dial_scores

# AutoTOD inform/success and Tau reward per batch: batch index -> {dialogue id: score}
trad_batch_scores = {}
for i in batch_dials.keys():
	# only batches that have a human-eval CSV entry are processed
	if i not in human_eval_csv:
		continue
	trad_scores = extract_trad_scores(
		autotod_dial,
		tau_airline_dial,
		tau_retail_dial,
		batch_order[i]
	)
	trad_batch_scores[i] = trad_scores
pprint.pprint(trad_batch_scores, compact=True)

#### Extract Traditional Automatic Scores: MultiWOZ Inform/Success and Tau-Bench Reward

In [None]:
# MultiWOZ result file; entries are later read via their "match" and "success" fields.
mwoz_dial_path = "datasets/basic_result_final.json"

with open(mwoz_dial_path, 'r') as f:
	mwoz_dial = json.load(f)

def extract_all_trad_auto_scores(
	mwoz_dial: dict,
	tau_airline_dial: list,
	tau_retail_dial: list,
	batch_order: dict
) -> dict:
	"""Extract one traditional automatic score per dialogue for one batch.

	For MultiWOZ dialogues the score is `match.total and success.total`
	(each defaulting to True when "total" is absent); for Tau dialogues it is
	the reward. Every score is converted with int().

	Args:
		mwoz_dial: MultiWOZ results keyed by "<ID>.json" (upper-case id) with
			"match" and "success" mappings.
		tau_airline_dial: sequence indexed by int(batch id); entries have "reward".
		tau_retail_dial: sequence indexed by int(batch id); entries have "reward".
		batch_order: iterable of entries with "id" and "type".

	Returns:
		Dict mapping dialogue id (Tau ids prefixed "retail_"/"airline_") to an int.

	Raises:
		ValueError: for an unrecognised dialogue type. Without this check the
			score of the previous dialogue would be reused silently.
	"""
	# extract mwoz and tau bench dialogue scores
	trad_auto_scores = {}
	for batch_dial in batch_order:
		# convert batch dial id format
		batch_dial_id = batch_dial["id"]
		dial_type = batch_dial["type"]
		if dial_type == "autotod_mwoz":
			mwoz_entry = mwoz_dial[f"{batch_dial_id.upper()}.json"]
			# inform and success default to True when no "total" is recorded
			inform_scores = mwoz_entry["match"]
			inform = inform_scores["total"] if "total" in inform_scores else True
			success_scores = mwoz_entry["success"]
			success = success_scores["total"] if "total" in success_scores else True
			score = inform and success
		elif dial_type == "tau_retail":
			score = tau_retail_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"retail_{batch_dial_id}"
		elif dial_type == "tau_airline":
			score = tau_airline_dial[int(batch_dial_id)]["reward"]
			batch_dial_id = f"airline_{batch_dial_id}"
		else:
			raise ValueError(f"Unknown dialogue type {dial_type!r} for id {batch_dial_id!r}")
		trad_auto_scores[batch_dial_id] = int(score)
	return trad_auto_scores

# Traditional automatic scores per batch, plus MultiWOZ-only and Tau-only views.
trad_auto_batch_scores = {}
trad_auto_mwoz_batch_scores = {}
tau_batch_scores = {}
# ids produced by extract_all_trad_auto_scores for Tau dialogues carry these prefixes
tau_prefixes = ("retail_", "airline_")
for i in batch_dials.keys():
	if i not in human_eval_csv:
		continue
	trad_auto_scores = extract_all_trad_auto_scores(
		mwoz_dial,
		tau_airline_dial,
		tau_retail_dial,
		batch_order[i]
	)
	trad_auto_batch_scores[i] = trad_auto_scores
	# NOTE(review): this cell previously read `trad_auto_mwoz_scores` (never
	# defined -> NameError on a fresh kernel) and `trad_scores` (left over from
	# an earlier cell's final loop iteration). The per-batch split below is the
	# presumed intent -- confirm against the cells that consume these dicts.
	trad_auto_mwoz_scores = {
		dial_id: score
		for dial_id, score in trad_auto_scores.items()
		if not str(dial_id).startswith(tau_prefixes)
	}
	tau_scores = {
		dial_id: score
		for dial_id, score in trad_auto_scores.items()
		if str(dial_id).startswith(tau_prefixes)
	}
	trad_auto_mwoz_batch_scores[i] = trad_auto_mwoz_scores
	tau_batch_scores[i] = tau_scores
pprint.pprint(trad_auto_batch_scores, compact=True)
pprint.pprint(trad_auto_mwoz_batch_scores, compact=True)
pprint.pprint(tau_batch_scores, compact=True)

#### Inter-Rater Agreement Functions

In [None]:
def calculate_krippendorff_alpha(data: np.ndarray) -> float:
	"""Compute Krippendorff's alpha on the 1-5 value domain.

	The ratings are cast to int64 and scored with the 'interval' level of
	measurement. Any exception is printed and turned into a None result.
	"""
	try:
		int_ratings = data.astype(np.int64)
		result = alpha(
			reliability_data=int_ratings,
			value_domain=[1,2,3,4,5],
			level_of_measurement='interval'
		)
	except Exception as err:
		print(f"Krippendorff calculation error: {err}")
		result = None
	return result

def calculate_kappa(data: np.ndarray, n_cat: int) -> dict:
	"""Compute Fleiss' and Randolph's kappa from raw category assignments.

	`data` is aggregated into a per-category count table first. Returns
	{"fleiss": float, "randolph": float}, or None (after printing the error)
	when the computation fails.
	"""
	try:
		counts_table, _categories = aggregate_raters(data=data, n_cat=n_cat)
		# both variants come from the same aggregated table
		kappas = {
			method: float(fleiss_kappa(table=counts_table, method=method))
			for method in ("fleiss", "randolph")
		}
	except Exception as err:
		print(f"Fleiss calculation error: {err}")
		return None
	return kappas

def calculate_cohen_kappa(data: np.ndarray, n_cat: int) -> float:
	"""Compute linearly weighted Cohen's kappa for two raters.

	`data` is converted to a contingency table with `n_cat` bins. Returns the
	'kappa' entry of the result, or None (after printing the error) on failure.
	"""
	try:
		contingency, _bins = to_table(data=data, bins=n_cat)
		kappa_result = cohens_kappa(table=contingency, wt='linear')
		return kappa_result['kappa']
	except Exception as err:
		print(f"Cohen calculation error: {err}")
		return None
	
def calculate_gwet_ac(data: np.ndarray) -> float:
	"""Return Gwet's agreement coefficient for `data`.

	The array is wrapped in a DataFrame, handed to irrCAC's CAC, and the
	'coefficient_value' of the gwet() estimate is returned as a float.
	"""
	ratings_frame = pd.DataFrame(data)
	gwet_result = CAC(ratings_frame).gwet()
	coefficient = gwet_result['est']['coefficient_value']
	return float(coefficient)

#### Calculate Turn-Level Correlations: Human vs TD-Eval, LMUnit

In [None]:
def compile_all_scores_turn_level(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict
):
	"""Flatten turn-level scores of all sources into aligned 1-D arrays.

	Scores are interleaved per turn as
	[conv_1, backend_1, policy_1, conv_2, backend_2, policy_2, ...] so that the
	stride-3 slices used by the consumers ([::3], [1::3], [2::3]) select exactly
	one metric. Concatenating whole per-metric arrays one after another would
	only line up with those slices for single-turn dialogues.

	Args:
		human_batch_scores: batch id -> {"turn_level": {dial id -> metric -> scores}}.
		tdeval_batch_scores: same layout, TD-Eval scores.
		lmunit_batch_scores: same layout, LMUnit scores.
		The dialogue ids of the human scores drive the iteration; the other two
		sources must contain the same batch and dialogue ids.

	Returns:
		{"batch": {source: {batch id: array}}, "all": {source: array}} with
		source in {"human", "tdeval", "lmunit"} and float arrays.

	Raises:
		ValueError: if the three metrics of a dialogue differ in length.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	sources = {
		'human': human_batch_scores,
		'tdeval': tdeval_batch_scores,
		'lmunit': lmunit_batch_scores,
	}

	def _interleave(dial_scores: dict) -> np.ndarray:
		# (n_items, 3) matrix flattened row by row -> metric-interleaved vector
		columns = [np.asarray(dial_scores[metric], dtype=float) for metric in dims]
		return np.column_stack(columns).ravel()

	def _join(chunks: list) -> np.ndarray:
		# np.concatenate rejects an empty list, so handle "no data" explicitly
		return np.concatenate(chunks) if chunks else np.array([], dtype=float)

	# organize scores by batch
	batch_scores = {name: {} for name in sources}
	for bId in human_batch_scores.keys():
		dial_ids = list(human_batch_scores[bId]["turn_level"].keys())
		for name, source in sources.items():
			dials = source[bId]["turn_level"]
			batch_scores[name][bId] = _join([_interleave(dials[dId]) for dId in dial_ids])

	# consolidate scores into one large array (single concatenation per source)
	all_scores = {
		name: _join(list(per_batch.values()))
		for name, per_batch in batch_scores.items()
	}
	return {
		'batch': batch_scores,
		'all': all_scores,
	}

def map_all_correlations_dial_level(
	human_scores: dict, 
	tdeval_scores: dict, 
	lmunit_scores: dict, 
):
	"""Spearman correlation of human scores against TD-Eval and LMUnit scores.

	The inputs are flat, index-aligned sequences. "conv", "backend" and "policy"
	use the stride-3 slices starting at offsets 0, 1 and 2; "overall" uses the
	full sequences. Coefficient and p-value are rounded to three decimals.

	Returns:
		{"human-tdeval": {...}, "human-lmunit": {...}}, each mapping
		"conv"/"backend"/"policy"/"overall" to {"coeff": float, "pval": float}.
	"""
	metric_slices = (
		("conv", slice(0, None, 3)),
		("backend", slice(1, None, 3)),
		("policy", slice(2, None, 3)),
		("overall", slice(None)),
	)

	def _spearman_entry(reference, candidate) -> dict:
		coeff, pval = stats.spearmanr(reference, candidate)
		return {
			"coeff": round(float(coeff), 3),
			"pval": round(float(pval), 3)
		}

	dial_corrs = {"human-tdeval": {}, "human-lmunit": {}}
	for label, picker in metric_slices:
		reference = human_scores[picker]
		dial_corrs["human-tdeval"][label] = _spearman_entry(reference, tdeval_scores[picker])
		dial_corrs["human-lmunit"][label] = _spearman_entry(reference, lmunit_scores[picker])
	return dial_corrs

def calculate_turn_level_corrs(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict
):
	"""Turn-level Spearman correlations per batch and over all batches.

	Returns a dict keyed by every batch id in `human_batch_scores` plus "all";
	each value is the output of map_all_correlations_dial_level.
	"""
	compiled = compile_all_scores_turn_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
	)
	per_batch = compiled['batch']
	pooled = compiled['all']
	# take correlation for each batch
	turn_corrs = {
		bId: map_all_correlations_dial_level(
			per_batch['human'][bId],
			per_batch['tdeval'][bId],
			per_batch['lmunit'][bId],
		)
		for bId in human_batch_scores.keys()
	}
	# pooled correlation over every batch
	turn_corrs["all"] = map_all_correlations_dial_level(
		pooled['human'],
		pooled['tdeval'],
		pooled['lmunit'],
	)
	return turn_corrs

# Turn-level correlations for every batch; only the pooled "all" entry is printed.
turn_level_corr = calculate_turn_level_corrs(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores
)
pprint.pprint(turn_level_corr["all"], compact=True)

#### Calculate IRR For Turn Level Scores

In [None]:
def map_all_irr_turn_level(
	human_scores: dict, 
	tdeval_scores: dict, 
	lmunit_scores: dict
):
	"""Inter-rater agreement of human scores against TD-Eval and LMUnit scores.

	The inputs are flat, index-aligned sequences. "conv", "backend" and "policy"
	use the stride-3 slices starting at offsets 0, 1 and 2; "overall" uses the
	full sequences. Each pair is cast to int64 and scored with
	calculate_gwet_ac and the 'randolph' value of calculate_kappa.

	The rating matrix is built with one row per rated item and one column per
	rater (human, automatic judge). This is the layout statsmodels'
	aggregate_raters and irrCAC's CAC expect; stacking the two raters as rows
	would make them treat every item as a separate rater.

	Returns:
		{"human-tdeval": {...}, "human-lmunit": {...}}, each mapping
		"conv"/"backend"/"policy"/"overall" to {"gwet-ac1": ..., "r_kappa": ...}.
		"r_kappa" is None when calculate_kappa fails.
	"""
	metric_slices = (
		("conv", slice(0, None, 3)),
		("backend", slice(1, None, 3)),
		("policy", slice(2, None, 3)),
		("overall", slice(None)),
	)

	def _pair_irr(reference, candidate) -> dict:
		# rows = items, columns = the two raters
		ratings = np.column_stack((reference, candidate)).astype(dtype=np.int64)
		gwet = calculate_gwet_ac(ratings)
		# shift 1..5 ratings to 0..4 category indices for aggregate_raters
		kappa = calculate_kappa(ratings - 1, n_cat=5)
		return {
			"gwet-ac1": gwet,
			"r_kappa": kappa['randolph'] if kappa is not None else None
		}

	dial_irr = {"human-tdeval": {}, "human-lmunit": {}}
	for label, picker in metric_slices:
		reference = human_scores[picker]
		dial_irr["human-tdeval"][label] = _pair_irr(reference, tdeval_scores[picker])
		dial_irr["human-lmunit"][label] = _pair_irr(reference, lmunit_scores[picker])
	return dial_irr


def calculate_turn_level_irr(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict
):
	"""Turn-level inter-rater agreement per batch and over all batches.

	Returns a dict keyed by every batch id in `human_batch_scores` plus "all";
	each value is the output of map_all_irr_turn_level.
	"""
	compiled = compile_all_scores_turn_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
	)
	per_batch = compiled['batch']
	pooled = compiled['all']
	# agreement for each batch
	turn_irr = {
		bId: map_all_irr_turn_level(
			per_batch['human'][bId],
			per_batch['tdeval'][bId],
			per_batch['lmunit'][bId]
		)
		for bId in human_batch_scores.keys()
	}
	# pooled agreement over every batch
	turn_irr["all"] = map_all_irr_turn_level(
		pooled['human'],
		pooled['tdeval'],
		pooled['lmunit']
	)
	return turn_irr

# Turn-level agreement for every batch; only the pooled "all" entry is printed.
turn_level_irr = calculate_turn_level_irr(
	human_batch_scores, 
	tdeval_batch_scores, 
	lmunit_batch_scores
)
pprint.pprint(turn_level_irr["all"], compact=True)

#### Calculate Dialogue-Level Agreement (Spearman): Human vs TD-Eval, LMUnit, AutoTOD, Inform/Success + Tau Reward

In [None]:
def map_all_correlations_dial_level(
	human_scores: np.ndarray,
	tdeval_scores: np.ndarray,
	lmunit_scores: np.ndarray,
	inform_reward_scores: np.ndarray,
	success_reward_scores: np.ndarray,
	trad_auto_scores: np.ndarray
):
	"""Spearman correlation of human dialogue scores against every automatic metric.

	`human_scores` and `tdeval_scores` are flat arrays read in strides of three:
	index 0 of each triple is reported as "conv", index 1 as "backend" and
	index 2 as "policy". The remaining arrays are correlated directly against
	each per-dimension human slice, so they need one entry per triple.

	For "overall", human vs TD-Eval uses the full interleaved arrays, while the
	other metrics are compared with the mean of each human triple.

	Returns:
		dict keyed by "human-tdeval", "human-lmunit", "human-inform_reward",
		"human-success_reward" and "human-trad_auto"; each maps "conv",
		"backend", "policy" and "overall" to {"coeff", "pval"}, both rounded
		to three decimals.
	"""
	def rank_corr(reference, candidate):
		# spearmanr yields (coefficient, p-value); store both as rounded floats
		coeff, pval = stats.spearmanr(reference, candidate)
		return {"coeff": round(float(coeff), 3), "pval": round(float(pval), 3)}

	# metrics that provide a single score per dialogue
	per_dialogue = (
		("human-lmunit", lmunit_scores),
		("human-inform_reward", inform_reward_scores),
		("human-success_reward", success_reward_scores),
		("human-trad_auto", trad_auto_scores),
	)
	dial_corrs = {"human-tdeval": {}}
	for label, _ in per_dialogue:
		dial_corrs[label] = {}

	# per-dimension correlations: every third entry starting at the dimension's offset
	for offset, dim in enumerate(("conv", "backend", "policy")):
		human_dim = human_scores[offset::3]
		dial_corrs["human-tdeval"][dim] = rank_corr(
			human_dim,
			tdeval_scores[offset::3]
		)
		for label, scores in per_dialogue:
			dial_corrs[label][dim] = rank_corr(human_dim, scores)

	# overall: collapse each human triple to its mean for the per-dialogue metrics
	avg_human_scores = np.mean(
		human_scores.reshape((-1, 3)),
		axis=1
	)
	# TD-Eval has a score per dimension, so compare the full arrays element-wise
	dial_corrs["human-tdeval"]["overall"] = rank_corr(human_scores, tdeval_scores)
	for label, scores in per_dialogue:
		dial_corrs[label]["overall"] = rank_corr(avg_human_scores, scores)
	return dial_corrs

def compile_all_scores_dial_level(
	human_batch_scores: dict,
	tdeval_batch_scores: dict,
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
	trad_auto_scores: dict
):
	"""Flatten dialogue-level scores into aligned arrays, per batch and pooled.

	Batches are taken from the keys of `human_batch_scores`; dialogues from the
	keys of `human_batch_scores[bId]["dial_level"]`. For each dialogue `dId`:

	* `human_batch_scores[bId]["dial_level"][dId]` and the matching TD-Eval
	  entry are indexed by 'conv_consistency', 'backend_consistency' and
	  'policy_completeness', contributing three consecutive values (in that
	  order) to the 'human' and 'tdeval' series.
	* `lmunit_batch_scores[bId]["dial_level"][dId]` and
	  `trad_auto_scores[bId][dId]` each contribute one entry.
	* `trad_batch_scores[bId][dId]` is indexed by "inform" / "success" when
	  `dId` contains neither "airline" nor "retail"; otherwise the entry itself
	  is appended to both the inform and the success series.

	Args:
		human_batch_scores: human ratings, keyed by batch id.
		tdeval_batch_scores: TD-Eval ratings, same layout as the human ratings.
		lmunit_batch_scores: LMUnit scores, keyed by batch id.
		trad_batch_scores: inform/success (or single reward) entries per batch.
		trad_auto_scores: traditional automatic metric entries per batch.

	Returns:
		{'batch': {series: {bId: ndarray}}, 'all': {series: ndarray}} for the
		series 'human', 'tdeval', 'lmunit', 'inform_reward', 'success_reward'
		and 'trad_auto'. Every series is an ndarray, including those of batches
		that contain no dialogues (empty float64 array).

	Raises:
		KeyError: if a batch or dialogue id present in `human_batch_scores` is
			missing from one of the other inputs.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	series = (
		'human', 'tdeval', 'lmunit',
		'inform_reward', 'success_reward', 'trad_auto'
	)

	def as_flat_array(values):
		# Single concatenation instead of one np.append (full copy) per value.
		# The leading empty float64 array reproduces the dtype promotion that
		# np.append([], value) gives, and guarantees an ndarray for empty input.
		pieces = [np.empty(0, dtype=np.float64)]
		pieces.extend(np.ravel(value) for value in values)
		return np.concatenate(pieces)

	batch_scores = {name: {} for name in series}
	pooled = {name: [] for name in series}
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["dial_level"]
		tdeval_dials = tdeval_batch_scores[bId]["dial_level"]
		lmunit_dials = lmunit_batch_scores[bId]["dial_level"]
		trad_dials = trad_batch_scores[bId]
		trad_auto_dials = trad_auto_scores[bId]
		current = {name: [] for name in series}
		for dId in human_dials.keys():
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			lmunit_score = lmunit_dials[dId]
			trad_scores = trad_dials[dId]
			trad_auto_score = trad_auto_dials[dId]
			# LMUnit gives a single score for the whole dialogue
			current['lmunit'].append(lmunit_score)
			# ids without "airline"/"retail" carry separate inform and success
			# values; the others carry one entry that feeds both series
			if ("airline" not in dId) and ("retail" not in dId):
				current['inform_reward'].append(trad_scores["inform"])
				current['success_reward'].append(trad_scores["success"])
			else:
				current['inform_reward'].append(trad_scores)
				current['success_reward'].append(trad_scores)
			# dialogue-level automatic metric, same handling for every dialogue
			current['trad_auto'].append(trad_auto_score)
			# three values per dialogue, always in `dims` order
			for metric in dims:
				current['human'].append(human_scores[metric])
				current['tdeval'].append(tdeval_scores[metric])
		for name in series:
			batch_scores[name][bId] = as_flat_array(current[name])
			pooled[name].extend(current[name])
	return {
		'batch': batch_scores,
		"all": {name: as_flat_array(pooled[name]) for name in series}
	}

def calculate_dial_level_corrs(
	human_batch_scores: dict,
	tdeval_batch_scores: dict,
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
	trad_auto_batch_scores: dict
):
	"""Compute dialogue-level Spearman correlations per batch and over all batches.

	Scores are flattened by `compile_all_scores_dial_level`, then passed to
	`map_all_correlations_dial_level` once per key of `human_batch_scores` and
	once more for the pooled arrays.

	Returns:
		dict mapping each batch id to its correlation report, plus an "all"
		entry for the pooled scores.
	"""
	# order matches the positional parameters of map_all_correlations_dial_level
	sources = (
		'human', 'tdeval', 'lmunit',
		'inform_reward', 'success_reward', 'trad_auto'
	)
	compiled = compile_all_scores_dial_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
		trad_batch_scores,
		trad_auto_batch_scores
	)
	# resolve every table first so a missing key fails before any correlation runs
	per_batch = [compiled['batch'][src] for src in sources]
	pooled = [compiled['all'][src] for src in sources]
	dial_corrs = {}
	for bId in human_batch_scores.keys():
		batch_args = [table[bId] for table in per_batch]
		dial_corrs[bId] = map_all_correlations_dial_level(*batch_args)
	dial_corrs["all"] = map_all_correlations_dial_level(*pooled)
	return dial_corrs

# run the dialogue-level correlation analysis and print the pooled report
dial_level_corr = calculate_dial_level_corrs(
	human_batch_scores=human_batch_scores,
	tdeval_batch_scores=tdeval_batch_scores,
	lmunit_batch_scores=lmunit_batch_scores,
	trad_batch_scores=trad_batch_scores,
	trad_auto_batch_scores=trad_auto_batch_scores,
)
pprint.pprint(dial_level_corr["all"], compact=True)

#### Calculate Dialogue-Level IRR: Human vs TD-Eval, LMUnit, AutoTOD, Inform/Success + Tau Reward

In [None]:
def map_all_irr_dial_level(
	human_scores: np.ndarray,
	tdeval_scores: np.ndarray,
	lmunit_scores: np.ndarray,
	inform_reward_scores: np.ndarray,
	success_reward_scores: np.ndarray,
	trad_auto_scores: np.ndarray
):
	"""Dialogue-level agreement of human scores with every automatic metric.

	`human_scores` and `tdeval_scores` are flat arrays read in strides of three:
	index 0 of each triple is reported as "conv", index 1 as "backend" and
	index 2 as "policy". "overall" compares the full human/TD-Eval arrays and,
	for the other metrics, the mean of each human triple.

	Two comparisons are used:

	* TD-Eval and LMUnit: ratings are stacked as-is, and `calculate_kappa`
	  receives the ratings minus one with five categories.
	* inform / success / traditional automatic metric: the human score is
	  first binarised (1 when >= 4, else 0) and `calculate_kappa` is called
	  with two categories.

	Every stacked pair is cast to int64, which drops fractional parts (this
	affects non-integer inputs and the averaged human score).
	NOTE(review): truncation rather than rounding looks deliberate but is
	worth confirming.

	Returns:
		dict keyed by "human-tdeval", "human-lmunit", "human-inform-reward",
		"human-success-reward" and "human-trad-auto"; each maps "conv",
		"backend", "policy" and "overall" to
		{"gwet-ac1": calculate_gwet_ac(...), "r_kappa": calculate_kappa(...)['randolph']}.
	"""
	def stack_raters(first, second):
		# one row per rater, integer-valued
		return np.vstack(
			(first, second)
		).astype(dtype=np.int64)

	def graded_agreement(first, second):
		# five-category comparison; ratings are shifted down by one for the kappa helper
		ratings = stack_raters(first, second)
		ac1 = calculate_gwet_ac(ratings)
		r_kappa = calculate_kappa(ratings-1, n_cat=5)['randolph']
		return {"gwet-ac1": ac1, "r_kappa": r_kappa}

	def binary_agreement(first, second):
		# two-category comparison on already-binarised human scores
		ratings = stack_raters(first, second)
		ac1 = calculate_gwet_ac(ratings)
		r_kappa = calculate_kappa(ratings, n_cat=2)['randolph']
		return {"gwet-ac1": ac1, "r_kappa": r_kappa}

	# metrics compared against the binarised human score
	binary_sources = (
		("human-inform-reward", inform_reward_scores),
		("human-success-reward", success_reward_scores),
		("human-trad-auto", trad_auto_scores),
	)
	dial_irr = {"human-tdeval": {}, "human-lmunit": {}}
	for label, _ in binary_sources:
		dial_irr[label] = {}

	# take IRR by metric: every third entry starting at the dimension's offset
	for offset, dim in enumerate(("conv", "backend", "policy")):
		human_dim = human_scores[offset::3]
		bin_human_dim = np.where(human_dim >= 4, 1, 0)
		tdeval_dim = tdeval_scores[offset::3]
		dial_irr["human-tdeval"][dim] = graded_agreement(human_dim, tdeval_dim)
		dial_irr["human-lmunit"][dim] = graded_agreement(human_dim, lmunit_scores)
		for label, scores in binary_sources:
			dial_irr[label][dim] = binary_agreement(bin_human_dim, scores)

	# take IRR over all metrics (overall)
	avg_human = np.mean(human_scores.reshape((-1, 3)), axis=1)
	avg_bin_human = np.where(avg_human >= 4, 1, 0)
	flat_ratings = stack_raters(human_scores, tdeval_scores)
	dial_irr["human-tdeval"]["overall"] = {
		"gwet-ac1": calculate_gwet_ac(flat_ratings),
		# this one call passes 5 positionally; the others use n_cat=5
		"r_kappa": calculate_kappa(flat_ratings-1, 5)['randolph'],
	}
	dial_irr["human-lmunit"]["overall"] = graded_agreement(avg_human, lmunit_scores)
	for label, scores in binary_sources:
		dial_irr[label]["overall"] = binary_agreement(avg_bin_human, scores)
	return dial_irr

def calculate_dial_level_agreement(
	human_batch_scores: dict,
	tdeval_batch_scores: dict,
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
	trad_auto_batch_scores: dict
):
	"""Compute dialogue-level IRR per batch and over all batches.

	Scores are flattened by `compile_all_scores_dial_level`, then passed to
	`map_all_irr_dial_level` once per key of `human_batch_scores` and once more
	for the pooled arrays.

	Returns:
		dict mapping each batch id to its IRR report, plus an "all" entry for
		the pooled scores.
	"""
	# order matches the positional parameters of map_all_irr_dial_level
	sources = (
		'human', 'tdeval', 'lmunit',
		'inform_reward', 'success_reward', 'trad_auto'
	)
	compiled = compile_all_scores_dial_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
		trad_batch_scores,
		trad_auto_batch_scores
	)
	# resolve every table first so a missing key fails before any IRR is computed
	batch_tables = [compiled['batch'][src] for src in sources]
	pooled_arrays = [compiled['all'][src] for src in sources]
	# take IRR for each batch, then for everything combined
	dial_irr = {
		bId: map_all_irr_dial_level(*(table[bId] for table in batch_tables))
		for bId in human_batch_scores.keys()
	}
	dial_irr["all"] = map_all_irr_dial_level(*pooled_arrays)
	return dial_irr

# run the dialogue-level IRR analysis and print the pooled report
dial_level_irr = calculate_dial_level_agreement(
	human_batch_scores=human_batch_scores,
	tdeval_batch_scores=tdeval_batch_scores,
	lmunit_batch_scores=lmunit_batch_scores,
	trad_batch_scores=trad_batch_scores,
	trad_auto_batch_scores=trad_auto_batch_scores,
)
pprint.pprint(dial_level_irr["all"], compact=True)

#### Calculate Dialogue-Level IRR on MultiWOZ Dialogues Only (Including Traditional Automatic Metrics)

In [None]:
def map_all_irr_mwoz_dial_level(
	human_scores: np.ndarray,
	tdeval_scores: np.ndarray,
	lmunit_scores: np.ndarray,
	inform_scores: np.ndarray,
	success_scores: np.ndarray,
	trad_auto_scores: np.ndarray
):
	"""Dialogue-level IRR report with MultiWOZ-specific labels.

	The computation is exactly the one performed by `map_all_irr_dial_level`
	(same stacking, binarisation and gwet/kappa calls), so this function
	delegates to it instead of repeating it, and only renames the top-level
	keys of the report.

	Args:
		human_scores: flat human ratings, three consecutive values per dialogue.
		tdeval_scores: flat TD-Eval ratings, same layout as `human_scores`.
		lmunit_scores: LMUnit scores, one per dialogue.
		inform_scores: inform scores, one per dialogue.
		success_scores: success scores, one per dialogue.
		trad_auto_scores: traditional automatic metric scores, one per dialogue.

	Returns:
		dict keyed by "mwoz-human-tdeval", "mwoz-human-lmunit",
		"mwoz-human-inform", "mwoz-human-success" and "mwoz-human-trad-auto";
		each maps "conv", "backend", "policy" and "overall" to
		{"gwet-ac1": ..., "r_kappa": ...}.
	"""
	# generic report label -> MultiWOZ report label (insertion order is the output order)
	mwoz_labels = {
		"human-tdeval": "mwoz-human-tdeval",
		"human-lmunit": "mwoz-human-lmunit",
		"human-inform-reward": "mwoz-human-inform",
		"human-success-reward": "mwoz-human-success",
		"human-trad-auto": "mwoz-human-trad-auto",
	}
	shared_irr = map_all_irr_dial_level(
		human_scores,
		tdeval_scores,
		lmunit_scores,
		inform_scores,
		success_scores,
		trad_auto_scores
	)
	return {
		mwoz_label: shared_irr[label]
		for label, mwoz_label in mwoz_labels.items()
	}

def compile_all_mwoz_scores_dial_level(
	human_batch_scores: dict,
	tdeval_batch_scores: dict,
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
	trad_auto_scores: dict
) -> dict:
	"""Gather dialogue-level scores for the non-tau (MultiWOZ) dialogues.

	Dialogues whose id contains "airline" or "retail" are skipped *before* any
	per-dialogue lookup, so the tdeval/lmunit/trad/trad_auto inputs only need
	entries for the dialogues that are actually kept.

	Args:
		human_batch_scores: {batch_id: {"dial_level": {dial_id: {dim: score}}}}.
		tdeval_batch_scores: same layout as ``human_batch_scores``.
		lmunit_batch_scores: {batch_id: {"dial_level": {dial_id: score}}}.
		trad_batch_scores: {batch_id: {dial_id: {"inform": ..., "success": ...}}}.
		trad_auto_scores: {batch_id: {dial_id: score}}.

	Returns:
		{"batch": {source: {batch_id: array}}, "all": {source: array}} with
		sources 'human', 'tdeval', 'lmunit', 'inform', 'success', 'trad_auto'.
		The 'human' and 'tdeval' arrays hold one entry per dimension per
		dialogue, in the order conv_consistency, backend_consistency,
		policy_completeness; the other sources hold one entry per dialogue
		(assuming scalar scores). Every value is a 1-D ndarray, including for
		batches that contain no MultiWOZ dialogue.

	Raises:
		KeyError: if a batch id, a kept dialogue id or a dimension is missing
			from one of the inputs.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	sources = ('human', 'tdeval', 'lmunit', 'inform', 'success', 'trad_auto')

	def _flatten(chunks: list) -> np.ndarray:
		# The leading empty float array reproduces the dtype promotion of
		# repeatedly calling np.append on an initially empty list, while
		# concatenating only once instead of once per score.
		return np.concatenate(
			[np.array([])] + [np.ravel(chunk) for chunk in chunks]
		)

	# raw python lists while iterating; converted to arrays once at the end
	batch_chunks = {source: {} for source in sources}
	all_chunks = {source: [] for source in sources}
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["dial_level"]
		tdeval_dials = tdeval_batch_scores[bId]["dial_level"]
		lmunit_dials = lmunit_batch_scores[bId]["dial_level"]
		trad_dials = trad_batch_scores[bId]
		trad_auto_dials = trad_auto_scores[bId]
		for source in sources:
			batch_chunks[source][bId] = []
		for dId in human_dials.keys():
			# skip tau conversations before touching the other score dicts
			if ("airline" in dId) or ("retail" in dId):
				continue
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			trad_scores = trad_dials[dId]
			dial_chunks = {
				'human': [human_scores[metric] for metric in dims],
				'tdeval': [tdeval_scores[metric] for metric in dims],
				# a single score for the whole dialogue
				'lmunit': [lmunit_dials[dId]],
				# inform and success live in the same per-dialogue dict
				'inform': [trad_scores["inform"]],
				'success': [trad_scores["success"]],
				'trad_auto': [trad_auto_dials[dId]],
			}
			for source, chunks in dial_chunks.items():
				batch_chunks[source][bId].extend(chunks)
				all_chunks[source].extend(chunks)

	return {
		'batch': {
			source: {bId: _flatten(chunks) for bId, chunks in per_batch.items()}
			for source, per_batch in batch_chunks.items()
		},
		"all": {
			source: _flatten(chunks) for source, chunks in all_chunks.items()
		}
	}

def calculate_mwoz_dial_level_agreement(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
	trad_auto_batch_scores: dict
):
	"""Compute dialogue-level agreement on the MultiWOZ dialogues.

	The scores are first compiled with ``compile_all_mwoz_scores_dial_level``
	and then handed to ``map_all_irr_mwoz_dial_level`` once per batch and once
	for the scores pooled over all batches.

	Returns:
		A dict keyed by batch id, plus the key "all" for the pooled scores;
		each value is whatever ``map_all_irr_mwoz_dial_level`` returns.
	"""
	compiled = compile_all_mwoz_scores_dial_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
		trad_batch_scores,
		trad_auto_batch_scores
	)
	# positional order expected by map_all_irr_mwoz_dial_level
	rater_order = ('human', 'tdeval', 'lmunit', 'inform', 'success', 'trad_auto')
	per_batch = compiled['batch']
	pooled = compiled['all']
	# agreement within each batch
	agreement = {
		batch_id: map_all_irr_mwoz_dial_level(
			*[per_batch[rater][batch_id] for rater in rater_order]
		)
		for batch_id in human_batch_scores.keys()
	}
	# agreement over every batch at once
	agreement["all"] = map_all_irr_mwoz_dial_level(
		*[pooled[rater] for rater in rater_order]
	)
	return agreement

# Agreement between the human ratings and each automatic metric, MultiWOZ dialogues only
mwoz_dial_level_irr = calculate_mwoz_dial_level_agreement(
	human_batch_scores=human_batch_scores,
	tdeval_batch_scores=tdeval_batch_scores,
	lmunit_batch_scores=lmunit_batch_scores,
	trad_batch_scores=trad_batch_scores,
	trad_auto_batch_scores=trad_auto_batch_scores,
)
# show only the agreement pooled over all batches
pprint.pprint(mwoz_dial_level_irr["all"], compact=True)

#### Find agreement only on Tau dialogues using Reward metrics

In [None]:
def map_all_irr_tau_dial_level(
	human_scores: np.ndarray, 
	tdeval_scores: np.ndarray, 
	lmunit_scores: np.ndarray, 
	reward_scores: np.ndarray, 
) -> dict:
	"""Agreement between human ratings and each automatic metric on tau dialogues.

	Args:
		human_scores: flat sequence with three consecutive entries per dialogue,
			in the order conv, backend, policy.
		tdeval_scores: same layout as ``human_scores``.
		lmunit_scores: one entry per dialogue; compared against every dimension.
		reward_scores: one entry per dialogue; compared against the human
			ratings binarized as ``rating >= 4``.
		Array-likes (e.g. plain lists) are accepted and converted to arrays.

	Returns:
		{"tau-human-tdeval" | "tau-human-lmunit" | "tau-human-reward":
			{"conv" | "backend" | "policy" | "overall":
				{"gwet-ac1": ..., "r_kappa": ...}}}
		where "gwet-ac1" is the value returned by ``calculate_gwet_ac`` and
		"r_kappa" is the 'randolph' entry returned by ``calculate_kappa``.

	Raises:
		ValueError: from numpy when the inputs do not line up (for example
			``len(human_scores)`` is not three times the number of dialogues).
	"""
	# The compile step may hand over plain lists; comparisons such as
	# ``scores >= 4`` and ``.reshape`` only work on arrays.
	human_scores = np.asarray(human_scores)
	tdeval_scores = np.asarray(tdeval_scores)
	lmunit_scores = np.asarray(lmunit_scores)
	reward_scores = np.asarray(reward_scores)

	def _pair_agreement(first, second, n_cat: int, shift: int) -> dict:
		# Stack the two raters, cast to integer categories, then score the pair.
		pair = np.vstack((first, second)).astype(dtype=np.int64)
		gwet = calculate_gwet_ac(pair)
		# calculate_kappa is given categories shifted down by ``shift``
		# (presumably so that 1-based ratings become 0-based - confirm).
		shifted = pair - shift if shift else pair
		kappa = calculate_kappa(shifted, n_cat=n_cat)['randolph']
		return {"gwet-ac1": gwet, "r_kappa": kappa}

	def _rating_agreement(first, second) -> dict:
		# both raters use the 5-category scale
		return _pair_agreement(first, second, n_cat=5, shift=1)

	def _binary_agreement(first, second) -> dict:
		# both raters are 0/1 labels
		return _pair_agreement(first, second, n_cat=2, shift=0)

	dial_irr = {
		"tau-human-tdeval": {},
		"tau-human-lmunit": {},
		"tau-human-reward": {}
	}
	# take IRR by metric: position of each dimension inside a dialogue's triple
	for dim_name, offset in (("conv", 0), ("backend", 1), ("policy", 2)):
		human_dim = human_scores[offset::3]
		bin_human_dim = np.where(human_dim >= 4, 1, 0)
		tdeval_dim = tdeval_scores[offset::3]
		dial_irr["tau-human-tdeval"][dim_name] = _rating_agreement(
			human_dim, tdeval_dim
		)
		dial_irr["tau-human-lmunit"][dim_name] = _rating_agreement(
			human_dim, lmunit_scores
		)
		dial_irr["tau-human-reward"][dim_name] = _binary_agreement(
			bin_human_dim, reward_scores
		)

	# take IRR of all metrics (overall)
	avg_human = np.mean(human_scores.reshape((-1, 3)), axis=1)
	avg_bin_human = np.where(avg_human >= 4, 1, 0)
	# tdeval is compared score by score across all three dimensions
	dial_irr["tau-human-tdeval"]["overall"] = _rating_agreement(
		human_scores, tdeval_scores
	)
	# NOTE(review): the int64 cast truncates the per-dialogue human mean
	# (e.g. 3.67 -> 3) rather than rounding it - confirm this is intended.
	dial_irr["tau-human-lmunit"]["overall"] = _rating_agreement(
		avg_human, lmunit_scores
	)
	dial_irr["tau-human-reward"]["overall"] = _binary_agreement(
		avg_bin_human, reward_scores
	)
	return dial_irr

def compile_all_tau_scores_dial_level(
	human_batch_scores: dict,
	tdeval_batch_scores: dict,
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
) -> dict:
	"""Gather dialogue-level scores for the tau (airline / retail) dialogues.

	Only dialogues whose id contains "airline" or "retail" are kept; all other
	dialogues are skipped *before* any per-dialogue lookup, so the
	tdeval/lmunit/trad inputs only need entries for the kept dialogues.

	Args:
		human_batch_scores: {batch_id: {"dial_level": {dial_id: {dim: score}}}}.
		tdeval_batch_scores: same layout as ``human_batch_scores``.
		lmunit_batch_scores: {batch_id: {"dial_level": {dial_id: score}}}.
		trad_batch_scores: {batch_id: {dial_id: reward}}.

	Returns:
		{"batch": {source: {batch_id: array}}, "all": {source: array}} with
		sources 'human', 'tdeval', 'lmunit', 'reward'. The 'human' and 'tdeval'
		arrays hold one entry per dimension per dialogue, in the order
		conv_consistency, backend_consistency, policy_completeness; 'lmunit'
		and 'reward' hold one entry per dialogue (assuming scalar scores).
		Every value is a 1-D ndarray, including for batches that contain no
		tau dialogue.

	Raises:
		KeyError: if a batch id, a kept dialogue id or a dimension is missing
			from one of the inputs.
	"""
	dims = ['conv_consistency', 'backend_consistency', 'policy_completeness']
	sources = ('human', 'tdeval', 'lmunit', 'reward')

	def _flatten(chunks: list) -> np.ndarray:
		# The leading empty float array reproduces the dtype promotion of
		# repeatedly calling np.append on an initially empty list, while
		# concatenating only once instead of once per score.
		return np.concatenate(
			[np.array([])] + [np.ravel(chunk) for chunk in chunks]
		)

	# raw python lists while iterating; converted to arrays once at the end
	batch_chunks = {source: {} for source in sources}
	all_chunks = {source: [] for source in sources}
	for bId in human_batch_scores.keys():
		human_dials = human_batch_scores[bId]["dial_level"]
		tdeval_dials = tdeval_batch_scores[bId]["dial_level"]
		lmunit_dials = lmunit_batch_scores[bId]["dial_level"]
		trad_dials = trad_batch_scores[bId]
		for source in sources:
			batch_chunks[source][bId] = []
		for dId in human_dials.keys():
			# skip mwoz conversations before touching the other score dicts
			if ("airline" not in dId) and ("retail" not in dId):
				continue
			human_scores = human_dials[dId]
			tdeval_scores = tdeval_dials[dId]
			dial_chunks = {
				'human': [human_scores[metric] for metric in dims],
				'tdeval': [tdeval_scores[metric] for metric in dims],
				'lmunit': [lmunit_dials[dId]],
				# for tau dialogues the traditional score is the reward itself
				'reward': [trad_dials[dId]],
			}
			for source, chunks in dial_chunks.items():
				batch_chunks[source][bId].extend(chunks)
				all_chunks[source].extend(chunks)
	return {
		'batch': {
			source: {bId: _flatten(chunks) for bId, chunks in per_batch.items()}
			for source, per_batch in batch_chunks.items()
		},
		"all": {
			source: _flatten(chunks) for source, chunks in all_chunks.items()
		}
	}

def calculate_tau_dial_level_agreement(
	human_batch_scores: dict, 
	tdeval_batch_scores: dict, 
	lmunit_batch_scores: dict,
	trad_batch_scores: dict,
):
	"""Compute dialogue-level agreement on the tau dialogues.

	The scores are first compiled with ``compile_all_tau_scores_dial_level``
	and then handed to ``map_all_irr_tau_dial_level`` once per batch and once
	for the scores pooled over all batches.

	Returns:
		A dict keyed by batch id, plus the key "all" for the pooled scores;
		each value is whatever ``map_all_irr_tau_dial_level`` returns.
	"""
	compiled = compile_all_tau_scores_dial_level(
		human_batch_scores,
		tdeval_batch_scores,
		lmunit_batch_scores,
		trad_batch_scores,
	)
	# positional order expected by map_all_irr_tau_dial_level
	rater_order = ('human', 'tdeval', 'lmunit', 'reward')
	per_batch = compiled['batch']
	pooled = compiled['all']
	# agreement within each batch
	agreement = {
		batch_id: map_all_irr_tau_dial_level(
			*[per_batch[rater][batch_id] for rater in rater_order]
		)
		for batch_id in human_batch_scores.keys()
	}
	# agreement over every batch at once
	agreement["all"] = map_all_irr_tau_dial_level(
		*[pooled[rater] for rater in rater_order]
	)
	return agreement

# Agreement between the human ratings and each automatic metric, tau dialogues only
tau_dial_level_irr = calculate_tau_dial_level_agreement(
	human_batch_scores=human_batch_scores,
	tdeval_batch_scores=tdeval_batch_scores,
	lmunit_batch_scores=lmunit_batch_scores,
	trad_batch_scores=trad_batch_scores,
)
# show only the agreement pooled over all batches
pprint.pprint(tau_dial_level_irr["all"], compact=True)

### Compile Results and Save to File

In [None]:
# Pooled ("all") results of every analysis above, written to one JSON file.
comparison_results = {
	'turn_level_corrs': turn_level_corr["all"],
	'turn_level_irr': turn_level_irr["all"],
	'dial_level_corrs': dial_level_corr["all"],
	'dial_level_irr': dial_level_irr["all"],
	'mwoz_dial_level_irr': mwoz_dial_level_irr["all"],
	# the tau-only agreement is computed above as well, so persist it too
	'tau_dial_level_irr': tau_dial_level_irr["all"]
}
output_dir = "agreement_scores"
# open(..., 'w') does not create missing parent directories
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'main_human_eval.json'), 'w') as f:
	json.dump(comparison_results, f, indent=4)
print(f"\nResults saved to {output_dir}")