# Report Generation Utilities

In [1]:
import json
from pathlib import Path
import numpy as np

In [2]:
repetition_error = "finished but recursively execute."

def load_result(result_file):
    with open(result_file, "r") as f:
        data = json.load(f)
    return data

def count_matric(action, data, correct_repeated_logic:bool=False):
    tp, fp, tn, fn = data["action-variation"][action]["tp"], data["action-variation"][action]["fp"], \
        data["action-variation"][action]["tn"], data["action-variation"][action]['fn'] 

    c = 0
    for t in data["action-variation"][action]["fp_sample"]:
        if "action_evaluation" in t['step'] \
        and t['step']["action_evaluation"]["failedReason"] == repetition_error:
            c += 1
    fp_failed_detect_repetition = c
            
    if correct_repeated_logic:
        tp += c; fp -= c

    c = 0
    for t in data["action-variation"][action]["fn_sample"]:
        if "action_evaluation" in t['step'] \
        and t['step']["action_evaluation"]["failedReason"] == repetition_error:
            c += 1
    fn_failed_detect_repetition = c

    if correct_repeated_logic:
        fn -= c; tp += c
    
    print(fp_failed_detect_repetition, fn_failed_detect_repetition)
    return tp, fp, tn, fn, fp_failed_detect_repetition, fn_failed_detect_repetition

In [3]:
def analyze_result(tp, fp, tn, fn):
    y_true = [1] * tp + [0] * fp + [1] * fn + [0] * tn
    y_pred = [1] * (tp + fp) + [0] * (fn + tn)

    from sklearn.metrics import classification_report, confusion_matrix

    matrix = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = matrix.ravel()

    print("Actual - True, Pred - True: ", matrix[1][1])
    print("Actual - True, Pred - False: ", matrix[1][0])
    print("Actual - False, Pred - True: ", matrix[0][1])
    print("Actual - False, Pred - False: ", matrix[0][0])

    result = classification_report(y_true, y_pred, target_names=['negative', 'positive'])
    print(result)
    return classification_report(y_true, y_pred, target_names=['negative', 'positive'], output_dict=True)

## Check the repetition sample

In [4]:
input_trajectory_summary_file = "../data/failed_agent_trajectory/trajectory.json"
trajectory_file = "plan_execution_annotated_local_status.json"
parent_folder = Path(input_trajectory_summary_file).parent.__str__()
total_manual_repetition_number = 0
total_manual_correction_number = 0
actions = ['click', 'input']
total = {}
total_manual_correction_number = {}
total_manual_repetition_number = {}
with open(input_trajectory_summary_file, "r") as f:
    trajectories = json.load(f)
    for t in trajectories:
        t_folder = f"{parent_folder}/{t}"
        tfile_for_each = f"{t_folder}/{trajectory_file}"
        single_trajectory_data = load_result(tfile_for_each)
        for step in single_trajectory_data["executions"]:
            for a in actions:
                if 'action' in step and step['action'] == a:
                    if "action_evaluation" in step:
                        total_manual_correction_number[a] = total_manual_correction_number.get(a, 0) + 1
                    if "action_evaluation" in step \
                        and step["action_evaluation"]["failedReason"] == repetition_error:
                        total_manual_repetition_number[a] = total_manual_repetition_number.get(a, 0) + 1
                    total[a] = total.get(a, 0) + 1
print(f"total_manual_repetition_number: {total_manual_repetition_number}, \
total_manual_correction_number: {total_manual_correction_number}, \
total: {total}")

total_manual_repetition_number: {'click': 20}, total_manual_correction_number: {'click': 95, 'input': 5}, total: {'click': 197, 'input': 12}


## Final statistics: LoT Agent + Simplified Action Validation Agent 

* Repeated 10 times
* Action Validation Logic: not consider action + 'target control', only take action 'input' or 'click' into consideration
  1. region should be aligned instruction by using input or click
  2. input or click shouldn't repeat too many times on the same region (let GPT4-o to do verification)

In [20]:
from glob import glob

In [21]:
files = glob("ssr_statistic_agent_*_mark_click_input_prompt_*.json")
print(files)

['ssr_statistic_agent_2024-06-13 00:39:57_mark_click_input_prompt_v5_f6.json', 'ssr_statistic_agent_2024-06-13 07:39:46_mark_click_input_prompt_v5_f10.json', 'ssr_statistic_agent_2024-06-13 05:05:43_mark_click_input_prompt_v5_f9.json', 'ssr_statistic_agent_2024-06-12 17:30:59_mark_click_input_prompt_v5_f1.json', 'ssr_statistic_agent_2024-06-12 20:20:12_mark_click_input_prompt_v5_f3.json', 'ssr_statistic_agent_2024-06-13 03:59:08_mark_click_input_prompt_v5_f8.json', 'ssr_statistic_agent_2024-06-13 02:34:53_mark_click_input_prompt_v5_f7.json', 'ssr_statistic_agent_2024-06-12 19:15:59_mark_click_input_prompt_v5_f2.json', 'ssr_statistic_agent_2024-06-12 22:53:33_mark_click_input_prompt_v5_f5.json', 'ssr_statistic_agent_2024-06-12 21:26:30_mark_click_input_prompt_v5_f4.json']


In [22]:
def result_summarize(actions, data, files, click_repetition_num:int=20, report_repetition:bool=True):
    consolidated_result = {}
    for file in files:
        data = load_result(file)
        tp, fp, tn, fn, fp_failed_detect_repetition, fn_failed_detect_repetition = 0, 0, 0, 0, 0, 0
        for action in actions:
            tp_a, fp_a, tn_a, fn_a, fp_failed_detect_repetition_t, fn_failed_detect_repetition_t = count_matric(action, data)
            tp += tp_a; fp += fp_a; tn += tn_a; fn += fn_a; fp_failed_detect_repetition += fp_failed_detect_repetition_t; fn_failed_detect_repetition += fn_failed_detect_repetition_t
        consolidated_result.setdefault('tp', []).append(tp)        
        consolidated_result.setdefault('tpr', []).append(tp/(tp+fp))
        consolidated_result.setdefault('fp', []).append(fp)
        consolidated_result.setdefault('fpr', []).append(fp/(tp+fp))
        consolidated_result.setdefault('tn', []).append(tn)
        consolidated_result.setdefault('tnr', []).append(tn/(tn+fn))
        consolidated_result.setdefault('fn', []).append(fn)
        consolidated_result.setdefault('fnr', []).append(fn/(tn+fn))
        consolidated_result.setdefault('accuracy_v', []).append((tp+tn)/(tp+fp+tn+fn))
        consolidated_result.setdefault('precision_v', []).append(tp/(tp+fp))
        consolidated_result.setdefault('recall_v', []).append(tp/(tp+fn))
        consolidated_result.setdefault('p', []).append(tp+fn)
        consolidated_result.setdefault('n', []).append(fp+tn)
        consolidated_result.setdefault('total', []).append(tp+fp+tn+fn)
        if report_repetition:
            consolidated_result.setdefault('fprepfd', []).append(fp_failed_detect_repetition)
            consolidated_result.setdefault('fnrepfd', []).append(fn_failed_detect_repetition)
            consolidated_result.setdefault('reps', []).append(click_repetition_num - fp_failed_detect_repetition - fn_failed_detect_repetition)
            consolidated_result.setdefault('repsrate', []).append((click_repetition_num - fp_failed_detect_repetition - fn_failed_detect_repetition)/click_repetition_num)

        result = analyze_result(tp, fp, tn, fn)
        for k, v in result.items():
            if type(v) == dict:
                for metric, value in v.items():
                    if k not in consolidated_result:
                        consolidated_result[k] = {}
                    consolidated_result[k].setdefault(metric, []).append(value)
            else:
                if k not in consolidated_result:
                    consolidated_result[k] = []
                consolidated_result[k].append(result[k])

    consolidated_result_analysis = {}
    for k, v in consolidated_result.items():
        if type(v) == dict:
            for metric, s in v.items():
                mean_v, std_v = np.mean(s), np.std(s)
                if k not in consolidated_result_analysis:
                        consolidated_result_analysis[k] = {}
                consolidated_result_analysis[k].setdefault(metric, []).append((mean_v, std_v))
        else:
            mean_v, std_v = np.mean(v), np.std(v)
            consolidated_result_analysis[k] = (mean_v, std_v)
            
    # Extracting data from the dictionary
    classification_report = consolidated_result_analysis
    negative = classification_report['negative']
    positive = classification_report['positive']
    accuracy = classification_report['accuracy']
    macro_avg = classification_report['macro avg']
    weighted_avg = classification_report['weighted avg']
    tp_m = classification_report['tp']
    fp_m = classification_report['fp']
    tn_m = classification_report['tn']
    fn_m = classification_report['fn']
    positive_m = classification_report['p']
    negative_m = classification_report['n']
    total_m = classification_report['total']
    tpr_m = classification_report['tpr']
    rpr_m = classification_report['rpr']
    tnr_m = classification_report['tnr']
    fnr_m = classification_report['fnr']
    accuracy_m = classification_report['accuracy_v']
    precision_m = classification_report['precision_v']
    recall_m = classification_report['recall_v']
    if report_repetition:
        fprepfd_m = classification_report['fprepfd']
        fnrepfd_m = classification_report['fnrepfd']
        reps_m = classification_report['reps']
        repsrate_m = classification_report['repsrate']

    # Formatting and printing the classification report
    if report_repetition:
        report = f"""
                           precision   recall  f1-score   support

            negative       {negative['precision'][0][0]:.2f}:{negative['precision'][0][1]:.2f}  {negative['recall'][0][0]:.2f}:{negative['recall'][0][1]:.2f} {negative['f1-score'][0][0]:.2f}:{negative['f1-score'][0][1]:.2f} {negative['support'][0][0]:.0f}:{negative['support'][0][1]:.0f}
            positive       {positive['precision'][0][0]:.2f}:{positive['precision'][0][1]:.2f}  {positive['recall'][0][0]:.2f}:{positive['recall'][0][1]:.2f} {positive['f1-score'][0][0]:.2f}:{positive['f1-score'][0][1]:.2f} {positive['support'][0][0]:.0f}:{positive['support'][0][1]:.0f}

            accuracy                           {accuracy[0]:.2f}:{accuracy[1]:.2f} {sum([negative['support'][0][0], positive['support'][0][0]]):.0f}
           macro avg       {macro_avg['precision'][0][0]:.2f}:{macro_avg['precision'][0][1]:.2f} {macro_avg['recall'][0][0]:.2f}:{macro_avg['recall'][0][1]:.2f} {macro_avg['f1-score'][0][0]:.2f}:{macro_avg['f1-score'][0][1]:.2f}  {macro_avg['support'][0][0]:.0f}
        weighted avg       {weighted_avg['precision'][0][0]:.2f}:{weighted_avg['precision'][0][1]:.2f} {weighted_avg['recall'][0][0]:.2f}:{weighted_avg['recall'][0][1]:.2f} {weighted_avg['f1-score'][0][0]:.2f}:{weighted_avg['f1-score'][0][1]:.2f} {weighted_avg['support'][0][0]:.0f}

            tp {tp_m[0]:.2f}:{tp_m[1]:.2f}
            fp {fp_m[0]:.2f}:{fp_m[1]:.2f}
            tn {tn_m[0]:.2f}:{tn_m[1]:.2f}
            fn {fn_m[0]:.2f}:{fn_m[1]:.2f}
            tpr {tpr_m[0]:.2f}:{tpr_m[1]:.2f}
            rpr {rpr_m[0]:.2f}:{rpr_m[1]:.2f}
            tnr {tnr_m[0]:.2f}:{tnr_m[1]:.2f}
            fnr {fnr_m[0]:.2f}:{fnr_m[1]:.2f}
            accuracy_v {accuracy_m[0]:.2f}:{accuracy_m[1]:.2f}
            precision_v {precision_m[0]:.2f}:{precision_m[1]:.2f}
            recall_v {recall_m[0]:.2f}:{recall_m[1]:.2f}
            fp_failed_detect_repetition {fprepfd_m[0]:.2f}:{fprepfd_m[1]:.2f}
            fn_failed_detect_repetition {fnrepfd_m[0]:.2f}:{fnrepfd_m[1]:.2f}
            repetition_detect_in_click {reps_m[0]:.2f}:{reps_m[1]:.2f}
            repetition_detect_in_click rate {repsrate_m[0]:.2f}:{repsrate_m[1]:.2f}
            total {total_m[0]:.2f}:{total_m[1]:.2f}
        """
    else:
        report = f"""
                           precision   recall  f1-score   support

            negative       {negative['precision'][0][0]:.2f}:{negative['precision'][0][1]:.2f}  {negative['recall'][0][0]:.2f}:{negative['recall'][0][1]:.2f} {negative['f1-score'][0][0]:.2f}:{negative['f1-score'][0][1]:.2f} {negative['support'][0][0]:.0f}:{negative['support'][0][1]:.0f}
            positive       {positive['precision'][0][0]:.2f}:{positive['precision'][0][1]:.2f}  {positive['recall'][0][0]:.2f}:{positive['recall'][0][1]:.2f} {positive['f1-score'][0][0]:.2f}:{positive['f1-score'][0][1]:.2f} {positive['support'][0][0]:.0f}:{positive['support'][0][1]:.0f}

            accuracy                           {accuracy[0]:.2f}:{accuracy[1]:.2f} {sum([negative['support'][0][0], positive['support'][0][0]]):.0f}
           macro avg       {macro_avg['precision'][0][0]:.2f}:{macro_avg['precision'][0][1]:.2f} {macro_avg['recall'][0][0]:.2f}:{macro_avg['recall'][0][1]:.2f} {macro_avg['f1-score'][0][0]:.2f}:{macro_avg['f1-score'][0][1]:.2f}  {macro_avg['support'][0][0]:.0f}
        weighted avg       {weighted_avg['precision'][0][0]:.2f}:{weighted_avg['precision'][0][1]:.2f} {weighted_avg['recall'][0][0]:.2f}:{weighted_avg['recall'][0][1]:.2f} {weighted_avg['f1-score'][0][0]:.2f}:{weighted_avg['f1-score'][0][1]:.2f} {weighted_avg['support'][0][0]:.0f}

            tp {tp_m[0]:.2f}:{tp_m[1]:.2f}
            fp {fp_m[0]:.2f}:{fp_m[1]:.2f}
            tn {tn_m[0]:.2f}:{tn_m[1]:.2f}
            fn {fn_m[0]:.2f}:{fn_m[1]:.2f}
            tpr {tpr_m[0]:.2f}:{tpr_m[1]:.2f}
            fpr {rpr_m[0]:.2f}:{rpr_m[1]:.2f}
            tnr {tnr_m[0]:.2f}:{tnr_m[1]:.2f}
            fnr {fnr_m[0]:.2f}:{fnr_m[1]:.2f}
            accuracy_v {accuracy_m[0]:.2f}:{accuracy_m[1]:.2f}
            precision_v {precision_m[0]:.2f}:{precision_m[1]:.2f}
            recall_v {recall_m[0]:.2f}:{recall_m[1]:.2f}
            total {total_m[0]:.2f}:{total_m[1]:.2f}
        """

    print(report)

### For click

In [23]:
actions = ["click"]
result_summarize(actions, data, files)

0 10
Actual - True, Pred - True:  95
Actual - True, Pred - False:  59
Actual - False, Pred - True:  7
Actual - False, Pred - False:  36
              precision    recall  f1-score   support

    negative       0.38      0.84      0.52        43
    positive       0.93      0.62      0.74       154

    accuracy                           0.66       197
   macro avg       0.66      0.73      0.63       197
weighted avg       0.81      0.66      0.69       197

0 11
Actual - True, Pred - True:  95
Actual - True, Pred - False:  57
Actual - False, Pred - True:  7
Actual - False, Pred - False:  38
              precision    recall  f1-score   support

    negative       0.40      0.84      0.54        45
    positive       0.93      0.62      0.75       152

    accuracy                           0.68       197
   macro avg       0.67      0.73      0.65       197
weighted avg       0.81      0.68      0.70       197

0 11
Actual - True, Pred - True:  93
Actual - True, Pred - False:  58
Actu

### For input

In [24]:
actions = ['input']
result_summarize(actions, data, files, report_repetition=False)

0 0
Actual - True, Pred - True:  7
Actual - True, Pred - False:  4
Actual - False, Pred - True:  0
Actual - False, Pred - False:  1
              precision    recall  f1-score   support

    negative       0.20      1.00      0.33         1
    positive       1.00      0.64      0.78        11

    accuracy                           0.67        12
   macro avg       0.60      0.82      0.56        12
weighted avg       0.93      0.67      0.74        12

0 0
Actual - True, Pred - True:  7
Actual - True, Pred - False:  3
Actual - False, Pred - True:  0
Actual - False, Pred - False:  2
              precision    recall  f1-score   support

    negative       0.40      1.00      0.57         2
    positive       1.00      0.70      0.82        10

    accuracy                           0.75        12
   macro avg       0.70      0.85      0.70        12
weighted avg       0.90      0.75      0.78        12

0 0
Actual - True, Pred - True:  7
Actual - True, Pred - False:  4
Actual - False,

### For both click and input

In [25]:
actions = ['click', 'input']
result_summarize(actions, data, files)

0 10
0 0
Actual - True, Pred - True:  102
Actual - True, Pred - False:  63
Actual - False, Pred - True:  7
Actual - False, Pred - False:  37
              precision    recall  f1-score   support

    negative       0.37      0.84      0.51        44
    positive       0.94      0.62      0.74       165

    accuracy                           0.67       209
   macro avg       0.65      0.73      0.63       209
weighted avg       0.82      0.67      0.70       209

0 11
0 0
Actual - True, Pred - True:  102
Actual - True, Pred - False:  60
Actual - False, Pred - True:  7
Actual - False, Pred - False:  40
              precision    recall  f1-score   support

    negative       0.40      0.85      0.54        47
    positive       0.94      0.63      0.75       162

    accuracy                           0.68       209
   macro avg       0.67      0.74      0.65       209
weighted avg       0.82      0.68      0.71       209

0 11
0 0
Actual - True, Pred - True:  100
Actual - True, Pred - 