In [8]:
# How accurately are function calls invoked?

import os 
import shutil
import re
import glob 
import pandas as pd
import json

output_dir = './data/eval_output'


def _load_generated_inputs(tf_result_path):
    """Return the generated text inputs stored under one text-field result folder.

    Looks for ``text_inputs/text_input_*.json`` below ``tf_result_path``. When
    several files exist a warning is printed and the first one in sorted order
    is used. The JSON file must hold exactly one top-level entry; that entry's
    ``'default_tester'`` value is collected.

    Returns:
        A list of text inputs, or ``None`` when no text input file exists
        (e.g. ``tf_result_path`` is a stray file or an incomplete run).
    """
    # glob returns paths in arbitrary order; sort so "the first file" is reproducible.
    text_input_files = sorted(
        glob.glob(os.path.join(tf_result_path, 'text_inputs', 'text_input_*.json'))
    )
    if not text_input_files:
        return None
    if len(text_input_files) > 1:
        print(f'Warning: multiple text input files found in {tf_result_path}')

    with open(text_input_files[0], 'r') as f:
        data = json.load(f)
    assert len(data) == 1
    return [tf_info['default_tester'] for tf_info in data.values()]


def _load_called_functions(tf_result_path):
    """Return the names of the functions called in one prompt sample.

    Only one prompt file is inspected: the first ``prompts/prompt_*.json`` in
    sorted order. Each conversation entry is unpacked as a
    ``(user_message, assistant_message)`` pair, and an assistant message that
    is a dict contributes its ``'name'`` value.

    Returns:
        A list of function names (empty when there is no prompt file).
    """
    prompt_files = sorted(
        glob.glob(os.path.join(tf_result_path, 'prompts', 'prompt_*.json'))
    )
    if not prompt_files:
        return []

    # only one prompt sample
    with open(prompt_files[0], 'r') as f:
        prompt = json.load(f)

    return [
        assistant_message['name']
        for _user_message, assistant_message in prompt['prompt']['conversation']
        if isinstance(assistant_message, dict)
    ]


def collect_function_call_rows(output_dir):
    """Walk ``output_dir/<app>/<textfield>`` and build one record per text field.

    Args:
        output_dir: Directory whose sub-directories are per-app output folders.

    Returns:
        A list of dicts with the keys ``'app_name'``, ``'textfield_id'``,
        ``'called_functions'`` and ``'gen_text_inputs'``. Entries that are not
        directories at the app level, and text-field folders without a text
        input file, are skipped (the latter with a warning) instead of
        raising ``IndexError``.
    """
    collected = []
    for app_output_path in sorted(glob.glob(os.path.join(output_dir, '*'))):
        if not os.path.isdir(app_output_path):
            continue

        for tf_result_path in sorted(glob.glob(os.path.join(app_output_path, '*'))):
            gen_text_inputs = _load_generated_inputs(tf_result_path)
            if gen_text_inputs is None:
                print(f'Warning: no text input file found in {tf_result_path}; skipping')
                continue

            collected.append({
                'app_name': os.path.basename(app_output_path),
                'textfield_id': os.path.basename(tf_result_path),
                'called_functions': _load_called_functions(tf_result_path),
                'gen_text_inputs': gen_text_inputs
            })
    return collected


rows = collect_function_call_rows(output_dir)

# Fix the column list so the frame has the expected columns even when no rows were found.
df = pd.DataFrame(
    rows,
    columns=['app_name', 'textfield_id', 'called_functions', 'gen_text_inputs'],
)




In [9]:
from collections import defaultdict

# Load the labelled evaluation sheet. The trailing [:-1] drops the last row of
# the file (presumably a non-data row such as a summary line; TODO confirm).
eval_df = pd.read_csv('./data/RQ1_DroidAgent-textinput-labelling-merged-cleaned.csv', header=0)[:-1]

# Map each (app_name, textfield_id) pair to every integer 'index' value that
# the evaluation sheet lists for it.
textfield_map = defaultdict(list)

for app_name, textfield_id, data_index in zip(
    eval_df['app_name'], eval_df['textfield_id'], eval_df['index']
):
    textfield_map[(app_name, textfield_id)].append(int(data_index))

In [10]:
# Attach to every row the list of data ids recorded in textfield_map for its
# (app_name, textfield_id) pair. .get() is used instead of indexing so that a
# pair missing from the defaultdict yields an empty list without being
# inserted into textfield_map as a side effect of the lookup.
df['data_id'] = pd.Series(
    [
        textfield_map.get((app_name, textfield_id), [])
        for app_name, textfield_id in zip(df['app_name'], df['textfield_id'])
    ],
    index=df.index,
    dtype=object,
)

In [11]:
# Write the per-text-field table to CSV, leaving out the DataFrame index column.
df.to_csv(path_or_buf='./data/RQ3_function_call_accuracy.csv', index=False)

In [12]:
import ast

df_labelled = pd.read_csv('./data/RQ3_function_call_accuracy_labelled_5prompts.csv', header=0)

# Position of each known function in the one-hot 'expected' / 'actual' vectors.
function2index = {
    'get_friend_profile': 0,
    'get_samsung_product_info': 1,
    'get_galaxy_store_coupon_code': 2,
}


def parse_list_cell(cell):
    """Parse a CSV cell that holds a Python list literal, e.g. "['a', 'b']".

    ``ast.literal_eval`` is used rather than swapping quote characters and
    calling ``json.loads``, because the quote swap corrupts any element that
    itself contains a quote character.

    Raises:
        ValueError: if the cell does not evaluate to a list.
    """
    parsed = ast.literal_eval(cell)
    if not isinstance(parsed, list):
        raise ValueError(f'expected a list literal, got: {cell!r}')
    return parsed


result_rows = []

for _, row in df_labelled.iterrows():
    expected = [0] * len(function2index)
    actual = [0] * len(function2index)

    # A label that is not a known function name leaves 'expected' all zeros.
    if row['required_function_call'] in function2index:
        expected[function2index[row['required_function_call']]] = 1

    called_functions = parse_list_cell(row['called_functions'])
    data_ids = parse_list_cell(row['data_id'])

    for called_function in called_functions:
        assert called_function in function2index, f'unknown function: {called_function!r}'
        actual[function2index[called_function]] = 1

    # One result row per data id; each row gets its own copy of the vectors so
    # that rows do not share (and cannot accidentally co-mutate) list objects.
    for i in data_ids:
        result_rows.append({
            'data_id': i,
            'app_name': row['app_name'],
            'textfield_id': row['textfield_id'],
            'expected': list(expected),
            'actual': list(actual)
        })


result_df = pd.DataFrame(result_rows)
result_df

Unnamed: 0,data_id,app_name,textfield_id,expected,actual
0,93,Settings,search_src_text_137_106,"[0, 0, 0]","[0, 1, 0]"
1,1,org.koitharu.kotatsu,searchView_198_119,"[0, 0, 0]","[0, 0, 0]"
2,95,Galaxy Store,search_src_text_153_118,"[0, 0, 0]","[0, 0, 0]"
3,94,Galaxy Store,coupon_edit_89_1912,"[0, 0, 1]","[0, 0, 1]"
4,96,Galaxy Store,name_86_725,"[0, 0, 0]","[1, 0, 0]"
...,...,...,...,...,...
115,127,Clock,worldclock_search_map_txt_find_158_140,"[0, 0, 0]","[1, 0, 0]"
116,124,Clock,numberpicker_input_144_408,"[0, 0, 0]","[0, 1, 0]"
117,128,Clock,hour_48_1787,"[0, 0, 0]","[0, 1, 0]"
118,130,Clock,second_354_1787,"[0, 0, 0]","[0, 0, 0]"


In [13]:
from sklearn.metrics import classification_report

# Derive the label names from function2index, ordered by their position in the
# one-hot vectors, so the report labels cannot drift from the encoding.
target_names = sorted(function2index, key=function2index.get)

# Multi-label report over the per-row one-hot vectors; zero_division=0 reports
# 0 instead of warning when a metric's denominator is zero.
print(classification_report(
    result_df['expected'].tolist(),
    result_df['actual'].tolist(),
    target_names=target_names,
    zero_division=0,
))

                              precision    recall  f1-score   support

          get_friend_profile       0.31      0.65      0.42        20
    get_samsung_product_info       0.14      1.00      0.25         3
get_galaxy_store_coupon_code       1.00      1.00      1.00         1

                   micro avg       0.27      0.71      0.39        24
                   macro avg       0.48      0.88      0.56        24
                weighted avg       0.32      0.71      0.42        24
                 samples avg       0.14      0.14      0.14        24

