In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np


Load all results and set up main variables

In [None]:
results_fname = 'summary_results.csv'


def _read_summary(experiment_dir):
	"""Read the summary CSV named by ``results_fname`` from ``experiment_dir``."""
	return pd.read_csv(os.path.join(experiment_dir, results_fname))


# One raw results frame per experiment folder (FREYR / tool, with / without bootstrap).
freyr_bootrstrap_results = _read_summary('./experiments/freyr_bootstrap')
freyr_nobootstrap_results = _read_summary('./experiments/freyr_no_bootstrap')
tool_bootrstrap_results = _read_summary('experiments/tool_v2_bootstrap')
tool_nobootstrap_results = _read_summary('./experiments/tool_v2_no_bootstrap')


In [None]:
# Tag every raw frame (in place) with the experiment it was loaded from.
for _frame, _mode, _bootstrap in (
	(freyr_bootrstrap_results, 'freyr', True),
	(freyr_nobootstrap_results, 'freyr', False),
	(tool_bootrstrap_results, 'tool', True),
	(tool_nobootstrap_results, 'tool', False),
):
	_frame['mode'] = _mode
	_frame['bootstrap'] = _bootstrap

# Only the two FREYR frames are combined here; the tool frames are tagged but left out.
df = pd.concat([freyr_bootrstrap_results, freyr_nobootstrap_results], ignore_index=True)

In [None]:
from typing import Tuple


def analyze_tokens_and_time(df: pd.DataFrame,
                            step_n: int,
                            freyr_mode: bool,
                            with_bootstrap: bool) -> Tuple[int, int, float]:
	"""Sum the token counts and time logged for a single step of a single run.

	The log is read from
	``./experiments/<freyr|tool_v2>_<bootstrap|no_bootstrap>/<df["logfile"]>.log``.
	A step begins at a line containing ``main - step=`` and ends right before the
	next such line (or at the end of the file for the last step). Every line in
	that range containing ``Prompt Tokens`` is parsed as
	``<logger> - Prompt Tokens: <int>; Completion Tokens: <int>; Time: <float>``.

	Args:
		df: Row-like object; only ``df["logfile"]`` is read (the notebook passes
			one row of the results frame).
		step_n: 1-based position of the step among the step markers of the log.
		freyr_mode: Selects the ``freyr`` folder when true, ``tool_v2`` otherwise.
		with_bootstrap: Selects the ``bootstrap`` or ``no_bootstrap`` folder.

	Returns:
		``(prompt_tokens, completion_tokens, time)`` summed over the step.

	Raises:
		IndexError: If the log contains fewer than ``step_n`` step markers.
	"""
	experiment_dir = f'{"freyr" if freyr_mode else "tool_v2"}_{"bootstrap" if with_bootstrap else "no_bootstrap"}'
	with open(f'./experiments/{experiment_dir}/{df["logfile"]}.log', 'r') as f:
		ls = f.readlines()

	step_start_idxs = [i for i, l in enumerate(ls) if 'main - step=' in l]
	start = step_start_idxs[step_n - 1]
	# Stop exactly at the next step marker. The previous loop only broke *after*
	# handling the line that follows the marker, so usage logged on that line
	# was attributed to the wrong step.
	end = step_start_idxs[step_n] if step_n < len(step_start_idxs) else len(ls)

	in_tks_step, out_tks_step, t_step = 0, 0, 0.0
	for l in ls[start:end]:
		# LocalLLM.extract_intents - Prompt Tokens: 448; Completion Tokens: 6; Time: 0.2812
		if 'Prompt Tokens' in l:
			in_tks, out_tks, t = l.split(' - ')[1].split(';')
			in_tks_step += int(in_tks.split(':')[1])
			out_tks_step += int(out_tks.split(':')[1])
			t_step += float(t.split(':')[1])

	return in_tks_step, out_tks_step, t_step

In [None]:
# Extend every results row with the token counts and time parsed from its log file.
# Rows are gathered in a list and converted once: calling pd.concat inside the loop
# copies the whole accumulated frame on every iteration.
full_records = []
for _, row in df.iterrows():
	in_tks, out_tks, t = analyze_tokens_and_time(row, row['step'], row['mode'] == 'freyr', row['bootstrap'])
	full_records.append({
		**row.to_dict(),
		'in_tks': in_tks,
		'out_tks': out_tks,
		't_step': t,
	})

full_df = pd.DataFrame(full_records)

In [None]:
# Write the token/time-augmented results to disk without the positional index.
full_results_path = './experiments/full_freyr_results.csv'
full_df.to_csv(full_results_path, index=False)

In [None]:
# Distinct test-case ids and intent-LLM names found in the bootstrapped FREYR results.
tcases = freyr_bootrstrap_results['test_case'].unique()
llms = freyr_bootrstrap_results['intent_llm'].unique()

# Number of steps of each test case, keyed by test-case id (1..5).
steps_per_tcase = dict(zip(range(1, 6), (7, 9, 10, 11, 13)))

In [None]:
from typing import Tuple


def get_credible_interval(s: pd.Series) -> Tuple[float, float]:
	"""Return ``(upper, lower)`` bounds of ``mean +/- 1.96 * std / sqrt(count)``.

	``std`` is the pandas sample standard deviation and ``count`` the number of
	non-null values of ``s``. Note the order of the result: the upper bound
	comes first.
	"""
	centre = s.mean()
	half_width = 1.96 * s.std() / np.sqrt(s.count())
	return centre + half_width, centre - half_width


# Per-run metrics that are aggregated into avg / std / interval columns.
_SUMMARY_METRICS = ('n_steps', 'n_valid', 'perc_complete', 'perc_valid')


def _collect_run_metrics(df: pd.DataFrame,
                         llm_mask: pd.Series,
                         tcase: int,
                         with_bootstrap: bool) -> pd.DataFrame:
	"""Build one row of step metrics per run for the rows selected by ``llm_mask`` and ``tcase``."""
	tcase_mask = llm_mask & (df['test_case'] == tcase)
	run_rows = []
	for run_n in df['run_n'].unique():
		test_results = df.loc[tcase_mask & (df['run_n'] == run_n)]
		if test_results.empty:
			continue
		valid_steps = test_results[test_results['valid_design'] == True]
		if with_bootstrap:
			# NOTE(review): this is the largest step of the *whole* frame, not of
			# this run/test case, so perc_complete exceeds 1 for test cases with
			# fewer steps. Behaviour kept as-is; confirm whether
			# test_results['step'].max() was intended.
			n_steps = df['step'].max()
		else:
			# Last step whose design was valid; 0 when no step was valid.
			n_steps = valid_steps['step'].max() if not valid_steps.empty else 0
		n_valid = len(valid_steps)
		run_rows.append({
			'run_n': run_n,
			'n_steps': n_steps,
			'n_valid': n_valid,
			'perc_complete': n_steps / steps_per_tcase[tcase],
			'perc_valid': n_valid / steps_per_tcase[tcase],
		})
	return pd.DataFrame(run_rows)


def _summary_row(run_metrics: pd.DataFrame,
                 intent_llm: str,
                 params_llm: str,
                 tcase: int,
                 mode: str,
                 with_bootstrap: bool) -> dict:
	"""Aggregate the per-run metrics of one (LLM selection, test case) into a summary record."""
	row = {
		'intent_llm': intent_llm,
		'params_llm': params_llm,
		'tcase': tcase,
		'mode': mode,
		'bootstrap': with_bootstrap,
	}
	for metric in _SUMMARY_METRICS:
		values = run_metrics[metric]
		# get_credible_interval returns (upper, lower); compute it once per metric.
		ci_hi, ci_lo = get_credible_interval(values)
		row[f'avg_{metric}'] = values.mean()
		row[f'std_{metric}'] = values.std()
		row[f'{metric}_ci_hi'] = ci_hi
		row[f'{metric}_ci_lo'] = ci_lo
	return row


def get_summary_freyr(df: pd.DataFrame,
                      with_bootstrap: bool) -> pd.DataFrame:
	"""Summarise FREYR results per (intent LLM, params LLM, test case).

	Iterates over every pair drawn from the notebook-level ``llms`` and every id
	in ``tcases``; combinations with no rows in ``df`` are skipped.

	Args:
		df: Raw results with columns ``intent_llm``, ``params_llm``, ``run_n``,
			``test_case``, ``step`` and ``valid_design``.
		with_bootstrap: Stored in the ``bootstrap`` column and selects how
			``n_steps`` is derived for each run.

	Returns:
		One row per non-empty combination with ``mode == 'FREYR'`` and, for each
		of n_steps / n_valid / perc_complete / perc_valid, the columns
		``avg_*``, ``std_*``, ``*_ci_hi`` and ``*_ci_lo``. An empty frame when
		nothing matches.
	"""
	summary_rows = []
	for intent_llm in llms:
		for params_llm in llms:
			llm_mask = (df['intent_llm'] == intent_llm) & (df['params_llm'] == params_llm)
			for tcase in tcases:
				run_metrics = _collect_run_metrics(df, llm_mask, tcase, with_bootstrap)
				if not run_metrics.empty:
					summary_rows.append(
						_summary_row(run_metrics, intent_llm, params_llm, tcase, 'FREYR', with_bootstrap))
	# Build the frame once instead of concatenating inside the loops.
	return pd.DataFrame(summary_rows)


def get_summary_tool(df: pd.DataFrame,
                     with_bootstrap: bool) -> pd.DataFrame:
	"""Summarise tool-calling results per (LLM, test case).

	Same output layout as ``get_summary_freyr`` with ``mode == 'Tool'``. Rows are
	selected by ``intent_llm`` only, and ``params_llm`` in the output repeats the
	intent LLM name.

	Args:
		df: Raw results with columns ``intent_llm``, ``run_n``, ``test_case``,
			``step`` and ``valid_design``.
		with_bootstrap: Stored in the ``bootstrap`` column and selects how
			``n_steps`` is derived for each run.

	Returns:
		One row per non-empty (LLM, test case) combination; an empty frame when
		nothing matches.
	"""
	summary_rows = []
	for intent_llm in llms:
		llm_mask = df['intent_llm'] == intent_llm
		for tcase in tcases:
			run_metrics = _collect_run_metrics(df, llm_mask, tcase, with_bootstrap)
			if not run_metrics.empty:
				summary_rows.append(
					_summary_row(run_metrics, intent_llm, intent_llm, tcase, 'Tool', with_bootstrap))
	return pd.DataFrame(summary_rows)

In [None]:
# Build the summary frame for each of the four raw result frames.
# Every print announces the summary that is computed on the line right after it;
# the boolean argument is the with_bootstrap flag matching the frame passed in.
print('Summary for FREYR with no bootstrap...')
summary_freyr_nobootstrap = get_summary_freyr(freyr_nobootstrap_results, False)
print('Summary for FREYR with bootstrap...')
summary_freyr_bootstrap = get_summary_freyr(freyr_bootrstrap_results, True)
print('Summary for tool usage with bootstrap...')
summary_tool_bootstrap = get_summary_tool(tool_bootrstrap_results, True)
print('Summary for tool usage with no bootstrap...')
summary_tool_nobootstrap = get_summary_tool(tool_nobootstrap_results, False)


In [None]:
# Render the combined FREYR + tool summaries as two LaTeX tables (iterative / single).
from tabulate import tabulate

# Concatenate all dataframes
df = pd.concat([
    summary_freyr_nobootstrap,
    summary_freyr_bootstrap,
    summary_tool_bootstrap,
    summary_tool_nobootstrap
], ignore_index=True)

# Keep the three reported LLMs, and only rows where the same LLM is used for intents
# and parameters (or no params LLM is recorded). .copy() makes the result an
# independent frame so the column assignments below do not write into a slice.
df = df.loc[df['intent_llm'].isin(['command-r', 'llama3.1', 'qwen2.5']) &
            ((df['params_llm'] == df['intent_llm']) | df['params_llm'].isna())].copy()

# Format "mean \pm half-width" cells. Raw f-strings keep the LaTeX backslashes
# literal instead of relying on invalid escape sequences.
df['n_steps_ci'] = df.apply(
    lambda row: rf"${row['avg_n_steps']:.1f} \pm {(row['n_steps_ci_hi'] - row['n_steps_ci_lo']) / 2:.1f}$", axis=1)
df['perc_complete_ci'] = df.apply(
    lambda row: rf"${row['avg_perc_complete']:.0%} \pm {(row['perc_complete_ci_hi'] - row['perc_complete_ci_lo']) / 2:.0%}$", axis=1)
df['perc_valid_ci'] = df.apply(
    lambda row: rf"${row['avg_perc_valid']:.0%} \pm {(row['perc_valid_ci_hi'] - row['perc_valid_ci_lo']) / 2:.0%}$", axis=1)

# Column name -> header label used in the LaTeX output.
headers = {
	'mode': 'Mode',
	'intent_llm': 'Intent LLM',
	'params_llm': 'Parameters LLM',
	'tcase': 'Test Case',
	'n_steps_ci': 'Num. Steps',
	'perc_complete_ci': r'Completed Steps (\%)',
	'perc_valid_ci': r'Valid Steps (\%)',
}

for (with_bootstrap, columns) in zip([False, True],
                                   [['mode', 'intent_llm', 'params_llm', 'tcase', 'n_steps_ci', 'perc_complete_ci'],
                                    ['mode', 'intent_llm', 'params_llm', 'tcase', 'perc_valid_ci']]):
	sub_df = df.loc[(df['bootstrap'] == with_bootstrap)]

	# Select the columns to display
	display_df = sub_df[columns]

	# Sort the DataFrame for easier cell merging
	display_df = display_df.sort_values(['mode', 'intent_llm', 'params_llm', 'tcase']).reset_index(drop=True)

	# tabulate applies a dict of headers only to list-of-dict data; with a DataFrame
	# the dict is read as a plain sequence of its keys. Pass the labels as a list
	# aligned with the selected columns instead.
	latex_table = tabulate(
	    display_df, headers=[headers[col] for col in columns], tablefmt='latex_raw', showindex=False
	)

	with open(f'./experiments/freyr_tools_{"single" if with_bootstrap else "iterative"}_table.tex', 'w') as f:
		f.write(latex_table)

In [None]:
# Display the current contents of `df` as this cell's output.
df

In [None]:
# Render the FREYR-only summaries (all LLM pairs) as two LaTeX tables (iterative / single).
from tabulate import tabulate

# Concatenate all dataframes
df = pd.concat([
    summary_freyr_nobootstrap,
    summary_freyr_bootstrap,
], ignore_index=True)

# Format "mean \pm half-width" cells. Raw f-strings keep the LaTeX backslashes
# literal instead of relying on invalid escape sequences.
df['n_steps_ci'] = df.apply(
    lambda row: rf"${row['avg_n_steps']:.1f} \pm {(row['n_steps_ci_hi'] - row['n_steps_ci_lo']) / 2:.1f}$", axis=1)
df['perc_complete_ci'] = df.apply(
    lambda row: rf"${row['avg_perc_complete']:.0%} \pm {(row['perc_complete_ci_hi'] - row['perc_complete_ci_lo']) / 2:.0%}$", axis=1)
df['perc_valid_ci'] = df.apply(
    lambda row: rf"${row['avg_perc_valid']:.0%} \pm {(row['perc_valid_ci_hi'] - row['perc_valid_ci_lo']) / 2:.0%}$", axis=1)

# Column name -> header label used in the LaTeX output. 'bootstrap' is part of the
# selected columns below, so it needs a label of its own.
headers = {
	'mode': 'Mode',
	'bootstrap': 'Bootstrap',
	'intent_llm': 'Intent LLM',
	'params_llm': 'Parameters LLM',
	'tcase': 'Test Case',
	'n_steps_ci': 'Num. Steps',
	'perc_complete_ci': r'Completed Steps (\%)',
	'perc_valid_ci': r'Valid Steps (\%)',
}

for (with_bootstrap, columns) in zip([False, True],
                                   [['mode', 'bootstrap', 'intent_llm', 'params_llm', 'tcase', 'n_steps_ci', 'perc_complete_ci'],
                                    ['mode', 'bootstrap', 'intent_llm', 'params_llm', 'tcase', 'perc_valid_ci']]):
	sub_df = df.loc[(df['bootstrap'] == with_bootstrap)]

	# Select the columns to display
	display_df = sub_df[columns]

	# Sort the DataFrame for easier cell merging
	display_df = display_df.sort_values(['intent_llm', 'params_llm', 'tcase']).reset_index(drop=True)

	# tabulate applies a dict of headers only to list-of-dict data; with a DataFrame
	# the dict is read as a plain sequence of its keys, which shifted every header
	# after 'mode' by one column here. Pass labels aligned with the columns instead.
	latex_table = tabulate(
	    display_df, headers=[headers[col] for col in columns], tablefmt='latex_raw', showindex=False
	)

	with open(f'./experiments/freyr_{"single" if with_bootstrap else "iterative"}_table.tex', 'w') as f:
		f.write(latex_table)

In [None]:
# Display the current value of `latex_table` (a LaTeX string) as this cell's output.
latex_table

In [None]:
# Rank the (test case, step) pairs of the bootstrapped FREYR runs by number of
# domain / design failures and write both rankings to a text file.

# Keep the three reported LLMs, and only rows where the same LLM handled intents and
# parameters (or no params LLM is recorded).
df = freyr_bootrstrap_results.loc[
	freyr_bootrstrap_results['intent_llm'].isin(['command-r', 'llama3.1', 'qwen2.5']) &
	((freyr_bootrstrap_results['params_llm'] == freyr_bootrstrap_results['intent_llm']) |
	 freyr_bootrstrap_results['params_llm'].isna())]

# One record per (test case, step); the frame is built once at the end instead of
# being re-concatenated on every iteration.
fail_records = []
for test_case, n_test_steps in steps_per_tcase.items():
	for step in range(1, n_test_steps + 1):
		sub_df = df.loc[(df['test_case'] == test_case) & (df['step'] == step)]
		fail_records.append({
			'test_case': test_case,
			'step': step,
			'domain_fails': len(sub_df.loc[sub_df['valid_domain'] == False]),
			'design_fails': len(sub_df.loc[sub_df['valid_design'] == False]),
			'tot': len(sub_df),
		})
fails = pd.DataFrame(fail_records)

# Calculate percentage columns (NaN when a step has no rows at all)
fails['domain_fail_pct'] = (fails['domain_fails'] / fails['tot'])
fails['design_fail_pct'] = (fails['design_fails'] / fails['tot'])

n = 5

top_domain_fails = fails.nlargest(n, 'domain_fails')[['test_case', 'step', 'domain_fails', 'domain_fail_pct']]
top_design_fails = fails.nlargest(n, 'design_fails')[['test_case', 'step', 'design_fails', 'design_fail_pct']]


def _write_ranking(out, title, ranking, count_col, pct_col):
	"""Write ``title`` followed by one numbered line per row of ``ranking`` to ``out``."""
	out.write(f'{title}\n')
	for rank, (_, row) in enumerate(ranking.iterrows(), start=1):
		# iterrows() upcasts the mixed int/float row to float, so cast the integer
		# fields back; otherwise they are written as e.g. "T1.0.3.0" and "12.0".
		test_case, step = int(row['test_case']), int(row['step'])
		with open(f'./test_cases/test_case_{test_case}', 'r') as f:
			# NOTE(review): the step number is used directly as a 0-based line
			# index, so line 0 of the test-case file is never selected -
			# presumably a header line; confirm against the file format.
			query = f.readlines()[step].strip()
		out.write(f'#{rank} - T{test_case}.{step}: {query} - {int(row[count_col])} ({row[pct_col]:.0%})\n')


with open('./experiments/failing_rankings', 'w') as z:
	_write_ranking(z, f'Top {n} Domain Failing', top_domain_fails, 'domain_fails', 'domain_fail_pct')
	_write_ranking(z, f'\n\nTop {n} Design Failing', top_design_fails, 'design_fails', 'design_fail_pct')


In [None]:
def analyze_tokens_and_time(df: pd.DataFrame,
                            freyr_mode: bool,
                            with_bootstrap: bool) -> Tuple[list, list, list]:
	"""Collect per-step token counts and time from every log referenced by ``df``.

	For each distinct value of ``df['logfile']`` the file
	``./experiments/<freyr|tool_v2>_<bootstrap|no_bootstrap>/<logfile>.log`` is
	read. A step begins at a line containing ``main - step=`` and ends right
	before the next such line (or at the end of the file). Lines containing
	``Prompt Tokens`` inside a step are parsed as
	``<logger> - Prompt Tokens: <int>; Completion Tokens: <int>; Time: <float>``
	and summed.

	Args:
		df: Frame with a ``logfile`` column.
		freyr_mode: Selects the ``freyr`` folder when true, ``tool_v2`` otherwise.
		with_bootstrap: Selects the ``bootstrap`` or ``no_bootstrap`` folder.

	Returns:
		Three parallel lists with one entry per step across all logs, in log
		order: prompt tokens, completion tokens and time.
	"""
	input_tokens = []
	output_tokens = []
	time = []

	log_dir = f'./experiments/{"freyr" if freyr_mode else "tool_v2"}_{"bootstrap" if with_bootstrap else "no_bootstrap"}'
	for logfile in df['logfile'].unique():
		with open(f'{log_dir}/{logfile}.log', 'r') as f:
			ls = f.readlines()

		step_start_idxs = [idx for idx, l in enumerate(ls) if 'main - step=' in l]
		# Pair every step start with the start of the next step (or end of file).
		step_end_idxs = step_start_idxs[1:] + [len(ls)]

		for start, end in zip(step_start_idxs, step_end_idxs):
			in_tks_step, out_tks_step, t_step = 0, 0, 0.0
			# Slice to the step's own lines. enumerate(ls, start=...) only offsets
			# the counter and still walks the file from its first line, which made
			# every step re-read the beginning of the log.
			for l in ls[start:end]:
				# LocalLLM.extract_intents - Prompt Tokens: 448; Completion Tokens: 6; Time: 0.2812
				if 'Prompt Tokens' in l:
					in_tks, out_tks, t = l.split(' - ')[1].split(';')
					in_tks_step += int(in_tks.split(':')[1])
					out_tks_step += int(out_tks.split(':')[1])
					t_step += float(t.split(':')[1])

			input_tokens.append(in_tks_step)
			output_tokens.append(out_tks_step)
			time.append(t_step)

	return input_tokens, output_tokens, time


In [None]:
# Per experiment -> LLM -> test case: average and interval of the per-step prompt
# tokens, completion tokens and time parsed from the logs.
tokens_time_dict = {}
target_llms = ['command-r', 'llama3.1', 'qwen2.5']

for fmode, bmode, original_df in zip([True, True, False, False],
                                     [False, True, False, True],
                                     [freyr_nobootstrap_results, freyr_bootrstrap_results, tool_nobootstrap_results, tool_bootrstrap_results]):
	print(f'Processing {"FREYR" if fmode else "Tool"} {"bootstrap" if bmode else "no-bootstrap"}...')
	# Keep the reported LLMs, and only rows where intents and parameters use the
	# same LLM (or no params LLM is recorded).
	sub_original_df = original_df.loc[original_df['intent_llm'].isin(target_llms) &
	                ((original_df['params_llm'] == original_df['intent_llm']) | (pd.isna(original_df['params_llm'])))]

	k = f'{"freyr" if fmode else "tool"}_{"bootstrap" if bmode else "nobootstrap"}'
	tokens_time_dict[k] = {}

	for intent_llm in target_llms:
		tokens_time_dict[k][intent_llm] = []
		for tn in [1,2,3,4,5]:
			# FREYR rows must name the same params LLM; tool rows must have none.
			if fmode:
				params_mask = sub_original_df['params_llm'] == intent_llm
			else:
				params_mask = sub_original_df['params_llm'].isna()
			df = sub_original_df.loc[(sub_original_df['test_case'] == tn) &
			                         (sub_original_df['intent_llm'] == intent_llm) &
			                         params_mask]

			in_tks, out_tks, t = analyze_tokens_and_time(df=df, freyr_mode=fmode, with_bootstrap=bmode)
			output_tokens_ci = get_credible_interval(pd.Series(out_tks))
			tokens_time_dict[k][intent_llm].append({
				'input_tokens_avg': np.mean(in_tks),
				'input_tokens_ci': get_credible_interval(pd.Series(in_tks)),
				'output_tokens_avg': np.mean(out_tks),
				'output_tokens_ci': output_tokens_ci,
				# Legacy alias: the interval used to be stored under this name only.
				'output_tokens': output_tokens_ci,
				'time_avg': np.mean(t),
				'time_ci': get_credible_interval(pd.Series(t)),
			})


In [None]:
import json

# Serialise the nested token/time statistics to a JSON file.
tokens_time_path = './experiments/tokens_time_dict.json'
with open(tokens_time_path, 'w') as json_file:
	json.dump(tokens_time_dict, json_file)

In [None]:
# One grouped bar chart per test case: average input tokens per step, FREYR vs. tools
# (bootstrapped runs), for each LLM.
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

labels = ['command-r', 'llama3.1', 'qwen2.5']
x = np.arange(len(labels))  # X-axis positions, one group per LLM
width = 0.4  # Width of bars

freyr = tokens_time_dict['freyr_bootstrap']
tools = tokens_time_dict['tool_bootstrap']

for tcase in range(5):
	# The statistics are stored under '<metric>_avg' / '<metric>_ci'; there is no
	# bare 'input_tokens' or 'time' key. Only the averages are plotted.
	in_tks_freyr = [freyr[llm][tcase]['input_tokens_avg'] for llm in labels]
	in_tks_tools = [tools[llm][tcase]['input_tokens_avg'] for llm in labels]

	# One bar per LLM and approach, the two approaches side by side in each group,
	# so every approach appears exactly once in the legend.
	fig, ax = plt.subplots(figsize=(10, 6))
	ax.bar(x - width / 2, in_tks_freyr, width, label='FREYR')
	ax.bar(x + width / 2, in_tks_tools, width, label='Tools')

	# Customization
	ax.set_xlabel('LLMs')
	ax.set_ylabel('Input Tokens')
	ax.set_title('')
	ax.set_xticks(x)
	ax.set_xticklabels(labels)
	ax.legend(loc='best')

	# Display the plot, then release the figure
	plt.tight_layout()
	plt.show()
	plt.close(fig)


In [None]:
# Unconditionally raises, so a top-to-bottom run of the notebook stops at this cell
# and the cells below only execute when run by hand.
# NOTE(review): presumably a manual "stop here" guard - confirm it is still wanted.
raise Exception

In [None]:
# Use a serif font for all figures, trying Times New Roman before the configured serif fonts.
plt.rcParams.update({
	'font.family': 'serif',
	'font.serif': ['Times New Roman'] + plt.rcParams['font.serif'],
})

def compare_on_tcase(df: pd.DataFrame,
                     freyr_mode: bool,
                     with_bootstrap: bool,
                     tn: int) -> None:
	"""Save a heat map of one summary metric for test case ``tn``.

	The rows of ``df`` whose ``tcase`` equals ``tn`` are pivoted into a
	params-LLM (rows) by intent-LLM (columns) matrix of ``avg_perc_valid`` when
	``with_bootstrap`` is true and of ``avg_perc_complete`` otherwise. The matrix
	is drawn on a fixed 0..1 colour scale with every non-NaN cell annotated, and
	written to
	``./experiments/T<tn>_<freyr|tool>_<bootstrap|nobootstrap>_cm.png``.
	"""
	metric_col = 'avg_perc_valid' if with_bootstrap else 'avg_perc_complete'
	metric_name = 'Valid' if with_bootstrap else 'Complete'

	tcase_rows = df.loc[df['tcase'] == tn]
	matrix = tcase_rows.pivot(index='params_llm', columns='intent_llm', values=metric_col)
	n_rows, n_cols = len(matrix.index), len(matrix.columns)

	fig, ax = plt.subplots(figsize=(8, 6))
	image = ax.imshow(matrix, cmap='coolwarm', aspect='auto', vmin=0.0, vmax=1.0)
	fig.colorbar(image, ax=ax, label=f'Average % {metric_name}')

	ax.set_xticks(np.arange(n_cols))
	ax.set_xticklabels(matrix.columns, rotation=45)
	ax.set_yticks(np.arange(n_rows))
	ax.set_yticklabels(matrix.index)
	ax.set_xlabel('Intent LLM' if freyr_mode else 'LLM')
	ax.set_ylabel('Params LLM' if freyr_mode else 'LLM')
	ax.set_title(f'Avg % {metric_name} for T{tn}')

	# Write the value inside every cell that has one.
	for row_pos in range(n_rows):
		for col_pos in range(n_cols):
			cell = matrix.iloc[row_pos, col_pos]
			if np.isnan(cell):
				continue
			ax.text(col_pos, row_pos, f"{cell:.2f}", ha='center', va='center', color='white')

	fig.tight_layout()
	mode_tag = 'freyr' if freyr_mode else 'tool'
	bootstrap_tag = 'bootstrap' if with_bootstrap else 'nobootstrap'
	fig.savefig(f'./experiments/T{tn}_{mode_tag}_{bootstrap_tag}_cm.png', transparent=True)
	plt.close(fig)

In [None]:
def _format_ratio(label: str, count: int, total: int) -> str:
	"""Render ``label: count / total (pct)``, or ``label: N/A`` when ``total`` is zero."""
	if total == 0:
		return f'{label}: N/A'
	return f'{label}: {count} / {total} ({count / total:.2%})'


def analyze_failures(df: pd.DataFrame,
                     freyr_mode: bool,
                     with_bootstrap: bool) -> None:
	"""Count failures and logged exceptions over all logs of ``df`` and write a report.

	Every distinct ``df['logfile']`` is read from
	``./experiments/<freyr|tool_v2>_<bootstrap|no_bootstrap>/<logfile>.log``.
	Lines are matched by substring: ``main - step=`` counts a step,
	``main - ValueError`` / ``KeyError`` / ``TypeError`` count exceptions, and
	the ``main - expected_intents`` / ``valid_domain`` / ``valid_design`` lines
	are treated as failed when they contain ``False``. The i-th intent, domain
	and design lines of a log are assumed to belong to the same step; intent
	lines are only considered in FREYR mode.

	The report is written to
	``./experiments/<FREYR|Tool>_<bootstrap|no-bootstrap>_failures.txt``. A ratio
	whose denominator is zero is reported as ``N/A``.

	Args:
		df: Frame with a ``logfile`` column.
		freyr_mode: Selects the FREYR log folder / report name and enables the
			intent checks.
		with_bootstrap: Selects the bootstrap log folder / report name. When true,
			logs whose step count is not a value of the notebook-level
			``steps_per_tcase`` are printed.
	"""
	value_err, key_err, type_err = 0, 0, 0
	invalid_domain, invalid_design, unexpected_intents = 0, 0, 0
	failing = 0
	n = 0

	log_dir = f'./experiments/{"freyr" if freyr_mode else "tool_v2"}_{"bootstrap" if with_bootstrap else "no_bootstrap"}'
	for logfile in df['logfile'].unique():
		with open(f'{log_dir}/{logfile}.log', 'r') as f:
			ls = f.readlines()

		n_steps = 0
		intent_lines, domain_lines, design_lines = [], [], []
		for line in ls:
			if 'main - ValueError' in line: value_err += 1
			if 'main - KeyError' in line: key_err += 1
			if 'main - TypeError' in line: type_err += 1
			if 'main - expected_intents' in line: intent_lines.append(line)
			if 'main - valid_domain' in line: domain_lines.append(line)
			if 'main - valid_design' in line: design_lines.append(line)
			if 'main - step=' in line: n_steps += 1

		# A bootstrapped run is expected to contain a full test case; report logs that do not.
		if with_bootstrap and n_steps not in steps_per_tcase.values(): print(logfile)

		n += n_steps

		domain_flags = ['False' in line for line in domain_lines]
		design_flags = ['False' in line for line in design_lines]
		if freyr_mode:
			intent_flags = ['False' in line for line in intent_lines]
		else:
			# Tool runs have no intent check: treat every step as passing it.
			intent_flags = [False] * len(domain_flags)

		# zip stops at the shortest list, so unmatched trailing lines are ignored.
		for bad_intent, bad_domain, bad_design in zip(intent_flags, domain_flags, design_flags):
			if bad_intent or bad_domain or bad_design:
				failing += 1
			unexpected_intents += int(bad_intent)
			invalid_domain += int(bad_domain)
			invalid_design += int(bad_design)

	with_err = value_err + key_err + type_err

	with open(f'./experiments/{"FREYR" if freyr_mode else "Tool"}_{"bootstrap" if with_bootstrap else "no-bootstrap"}_failures.txt', 'w') as f:
		# Each fallback names the same quantity as its regular line; "Total failing"
		# depends on the number of steps, so a clean run reports 0 / n, not N/A.
		f.write(_format_ratio('Total failing', failing, n) + '\n')
		if freyr_mode:
			f.write(_format_ratio('Total wrong intents', unexpected_intents, failing) + '\n')
		f.write(_format_ratio('Total domain errors', invalid_domain, failing) + '\n')
		f.write(_format_ratio('Total design errors', invalid_design, failing) + '\n\n')
		f.write(_format_ratio('Total errors', with_err, n) + '\n')
		f.write(_format_ratio('ValueErrors', value_err, with_err) + '\n')
		f.write(_format_ratio('KeyErrors', key_err, with_err) + '\n')
		f.write(_format_ratio('TypeErrors', type_err, with_err) + '\n')

In [None]:
# (summary frame, freyr_mode, with_bootstrap, raw results frame) for each experiment.
analysis_runs = [
	(summary_freyr_nobootstrap, True, False, freyr_nobootstrap_results),
	(summary_freyr_bootstrap, True, True, freyr_bootrstrap_results),
	(summary_tool_nobootstrap, False, False, tool_nobootstrap_results),
	(summary_tool_bootstrap, False, True, tool_bootrstrap_results),
]

for df, fmode, bmode, original_df in analysis_runs:
	print(f'Processing {"FREYR" if fmode else "Tool"} {"bootstrap" if bmode else "no-bootstrap"}...')
	# No LLM filtering is applied here: the full frames are used as they are.
	sub_df, sub_original_df = df, original_df

	print('Creating plots...')
	for tn in range(1, 6):
		compare_on_tcase(df=sub_df, freyr_mode=fmode, with_bootstrap=bmode, tn=tn)

	print('Analyzing failures...')
	analyze_failures(sub_original_df, fmode, bmode)

In [None]:
# freyr_mode = True
# with_bootstrap = False
# 
# for logfile in freyr_nobootstrap_results['logfile']:
# 	with open(f'./experiments/{"freyr" if freyr_mode else "tool"}_{"bootstrap" if with_bootstrap else "no_bootstrap"}/{logfile}.log', 'r') as f:
# 		ls = f.readlines()
# 		if "Exception" in ls[-2]:
# 			print(logfile, ls[-2])
