**Run the code cells below in order to get the final results**
1. First cell defines functions.
2. Second cell produces the fidelity and utility results of **news apps**.
3. Third cell produces the fidelity and utility results of **shopping apps**.
4. Last cell post-processes the results and adds extra columns (the event counts `num_src`, `num_trans`, `num_gt` and the `reduction` metric).
   It writes the final results of all the test reuse cases to the `output` folder, including `shopping_final.csv` for the 10 shopping apps and `news_final.csv` for the 10 news apps. 

In [1]:
import ast
import copy
import glob, os
import json
import math

import numpy as np
import pandas as pd
from lxml import etree
from pandas.io.parsers import read_csv

def evaluate_accuracy(test):
    """Label every transferred GUI event of *test* with a 'case'.

    Reads the ground-truth mapping CSVs of the source and target app (for the
    global ``app_category``), pairs each transferred event with the event at
    the same position in the matching source test from the global
    ``ground_truth_tests``, and stores one of 'correct', 'incorrect',
    'missed' or 'nonExist' under ``gui_event['case']``.

    Returns the same *test* object, with its events mutated in place.
    """
    mapping_prefix = "gui_mapper/ground_truth_mapping/" + app_category + "/GT_"
    ground_truth_source = read_csv(mapping_prefix + test['source'] + ".csv")
    ground_truth_target = read_csv(mapping_prefix + test['target'] + ".csv")

    def lookup(ground_truth, id_or_xpath):
        # 'id@<resource-id>' is matched on the 'id' column; anything else is
        # treated as 'xpath@<xpath>' and matched on the 'xpath' column.
        if id_or_xpath[:3] == "id@":
            return ground_truth.loc[ground_truth['id'] == id_or_xpath[3:]]
        return ground_truth.loc[ground_truth['xpath'] == id_or_xpath[6:]]

    source_test = ground_truth_tests.loc[ground_truth_tests['method'] == test['method']]
    if source_test.shape[0] != 1:
        print(source_test, 'is not 1')

    for gui_event, source_gui_event in zip(test['event_array'], source_test.iloc[0]['event_array']):
        source_event = lookup(ground_truth_source, source_gui_event['id_or_xpath'])
        if source_event.shape[0] == 0:
            # NOTE(review): this is only reported; the source_event.iloc[0]
            # accesses below raise IndexError when the lookup is empty.
            print('L84 gt missing for', source_gui_event, 'check if it should be added')

        transferred = gui_event['id_or_xpath']
        if pd.isnull(transferred) or transferred == '':
            # Sanity check: an event that was not transferred should be
            # marked as NONE rather than left empty.
            print('missed is not marked as NONE', gui_event)

        if transferred == "NONE":
            # Nothing was transferred: 'missed' when the target app does have
            # an event with the same canonical, 'nonExist' otherwise.
            target_event = ground_truth_target.loc[
                ground_truth_target['canonical'] == source_event.iloc[0]['canonical']
            ]
            gui_event['case'] = "missed" if target_event.shape[0] != 0 else "nonExist"
            continue

        # Something was transferred: it is 'correct' only when it is known in
        # the target ground truth and shares the source event's canonical.
        transfer_event = lookup(ground_truth_target, transferred)
        if (transfer_event.shape[0] != 0
                and transfer_event.iloc[0]['canonical'] == source_event.iloc[0]['canonical']):
            gui_event['case'] = "correct"
        else:
            gui_event['case'] = "incorrect"

    return test


def list_cases(test):
    """Group the 'id_or_xpath' of each event in *test* by the event's 'case'.

    Returns a dict with the keys 'correct', 'incorrect', 'missed' and
    'nonExist', each mapping to a list (possibly empty) of id_or_xpath values.
    """
    grouped = {label: [] for label in ('correct', 'incorrect', 'missed', 'nonExist')}
    for event in test:
        bucket = grouped[event['case']]
        bucket.append(event['id_or_xpath'])
    return grouped

def count_cases(test):
    """Count the entries *test* holds for each case label.

    Returns a dict with 'num_correct', 'num_incorrect', 'num_missed' and
    'num_nonExist', each being the length of the corresponding entry of *test*.
    """
    # Counts for 'TP', 'FP' and 'FN' are not computed.
    return {
        'num_' + label: len(test[label])
        for label in ('correct', 'incorrect', 'missed', 'nonExist')
    }

def calc_precision_recall_accuracy(test):
    """Compute precision, recall and accuracy from the per-case counts.

    Reads 'num_correct', 'num_incorrect', 'num_missed' and 'num_nonExist'
    from *test* and returns a dict with:

    * 'accuracy_precision': correct / (correct + incorrect)
    * 'accuracy_recall':    correct / (correct + missed)
    * 'accuracy':           (correct + nonExist) / (all four counts)

    A fraction whose denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # An explicit zero check (instead of catching ZeroDivisionError) also
        # covers NumPy integer scalars, which do not raise on division by 0.
        if denominator == 0:
            # np.nan instead of np.NaN: the latter was removed in NumPy 2.0.
            return np.nan
        return numerator / denominator

    correct = test['num_correct']
    incorrect = test['num_incorrect']
    missed = test['num_missed']
    non_exist = test['num_nonExist']

    # The effectiveness_* metrics (based on num_TP / num_FP / num_FN) are
    # not computed here.
    return {
        'accuracy_precision': _ratio(correct, correct + incorrect),
        'accuracy_recall': _ratio(correct, correct + missed),
        'accuracy': _ratio(correct + non_exist, correct + incorrect + missed + non_exist),
    }

def append_src_gt_events(test):
    """Collect the source-test events and the target ground-truth events.

    Looks up *test*'s method in the global ``ground_truth_tests`` to fill
    'src_events', then replaces the source app name by the target app name in
    the method and looks that up to fill 'gt_events'. Each key is only set
    when exactly one matching test is found.

    Returns a dict that may contain 'src_events' and/or 'gt_events', each a
    list of 'id_or_xpath' values.
    """
    collected = {}

    # Events of the source test.
    matches = ground_truth_tests.loc[ground_truth_tests['method'] == test['method']]
    if len(matches) == 1:
        collected['src_events'] = [event['id_or_xpath'] for event in matches.iloc[0]['event_array']]
    else:
        print('src events len is not 1, check: ', test['method'])

    # Events of the ground-truth test written for the target app.
    # NOTE(review): for any other app_category the two names stay unbound and
    # the replace() below raises NameError.
    if app_category == 'shopping':
        source_app = app_name_mapping[test['source']]
        target_app = app_name_mapping[test['target']]
    elif app_category == 'news':
        source_app = test['source']
        target_app = test['target']

    target_method = test['method'].replace(source_app, target_app)
    gt_matches = ground_truth_tests.loc[ground_truth_tests['method'] == target_method]
    match_count = len(gt_matches)
    if match_count == 1:
        collected['gt_events'] = [event['id_or_xpath'] for event in gt_matches.iloc[0]['event_array']]
    elif match_count > 1:
        print('gt events len > 1, check: ', target_method)
    return collected

def get_classname_from_xpath(xpath):
    """Return the text between the first '//' of *xpath* and the next '['."""
    after_slashes = xpath.split('//')[1]
    class_name, _, _ = after_slashes.partition('[')
    return class_name

def get_attribute_from_xpath(xpath):
    """Return the text between the first '[' and the first ']' of *xpath*.

    Doubled double quotes ('""') are collapsed to a single '"' first.
    """
    normalized = xpath.replace('""', '"')
    start = normalized.find("[") + 1
    end = normalized.find("]")
    return normalized[start:end]

def _find_relative_xpath(root, xpath):
    """Return the first node of *root*'s document matching a '//...' xpath, or None."""
    class_name = get_classname_from_xpath(xpath)
    attribute = get_attribute_from_xpath(xpath)
    nodes = root.xpath('//node[@class="' + class_name + '"][' + attribute + ']')
    return nodes[0] if nodes else None


def _find_absolute_xpath(root, xpath):
    """Walk a '/hierarchy/...' xpath down from the hierarchy element; None if a step fails."""
    current_node = root.xpath('/hierarchy')[0]
    for step in xpath.split('/'):
        if step == '' or step == 'hierarchy':
            continue
        if '[' in step:  # several children may share the class name
            # NOTE(review): the bracketed number is used as a 0-based list
            # index here, whereas standard XPath positions are 1-based —
            # confirm which convention the recorded xpaths follow.
            index = int(step[step.find("[") + 1:step.find("]")])
            class_name = step.split('[')[0]
        else:  # no index given: take the first child with that class name
            index = 0
            class_name = step
        children = current_node.findall('./node[@class="' + class_name + '"]')
        if index >= len(children):
            return None
        current_node = children[index]
    return current_node


def find_node_by_xpath(xpath, app):
    """Find the node addressed by *xpath* in the UI dumps of *app*.

    Every ``.uix`` file under ``input/screenshots/<app_category>/<app>/`` is
    parsed in turn and the node from the first file that contains a match is
    returned. Files are visited in sorted name order so the result does not
    depend on the arbitrary order in which ``os.listdir`` reports entries.

    An xpath starting with '//' is treated as relative (class name plus one
    attribute predicate); one starting with '/hierarchy' is treated as an
    absolute chain of class names. Returns None when no file matches or the
    xpath has neither form.
    """
    directory = 'input/screenshots/' + app_category + '/' + app + '/'
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".uix"):
            continue
        root = etree.parse(os.path.join(directory, filename)).getroot()
        if xpath.startswith('//'):  # relative xpath
            node = _find_relative_xpath(root, xpath)
        elif xpath.startswith('/hierarchy'):  # absolute xpath
            node = _find_absolute_xpath(root, xpath)
        else:
            node = None
        if node is not None:
            return node
    return None

# trans test format: json with "input", "id_or_xpath", "action", "case". 'id_or_xpath' could be 'NONE'
# gt test format: 'id@...'/'xpath@...'
def trans_equals_gt(trans_event, gt_event, tgt_app):
    """Tell whether a transferred event and a ground-truth event hit the same widget.

    *trans_event* is a dict whose 'id_or_xpath' is 'id@...' or 'xpath@...';
    *gt_event* is such a string directly. Identical strings match at once.
    Otherwise an id is compared with the 'resource-id' of the node the other
    side's xpath resolves to in the UI dumps of *tgt_app*, and two xpaths are
    compared through the nodes they resolve to.

    Returns True or False.
    """
    trans_id_or_xpath = trans_event['id_or_xpath']
    # Same id or same xpath on both sides.
    if gt_event == trans_id_or_xpath:
        return True

    gt_uses_id = gt_event[:3] == "id@"
    trans_uses_id = trans_id_or_xpath[:3] == "id@"
    if gt_uses_id and trans_uses_id:
        # Two different resource ids.
        return False
    if gt_uses_id:  # gt uses id and trans uses xpath
        return compare_id_xpath(gt_event[3:], trans_id_or_xpath[6:], tgt_app)
    if trans_uses_id:  # trans uses id, gt uses xpath
        return compare_id_xpath(trans_id_or_xpath[3:], gt_event[6:], tgt_app)

    # Both use xpath; one could be absolute and the other relative.
    gt_node = find_node_by_xpath(gt_event[6:], tgt_app)
    if gt_node is None:
        # An xpath that resolves to nothing cannot be confirmed as equal
        # (previously two unresolved xpaths compared as None == None -> True).
        return False
    trans_node = find_node_by_xpath(trans_id_or_xpath[6:], tgt_app)
    if trans_node is None:
        return False
    # The two nodes come from separately parsed documents, so comparing the
    # element objects themselves is an identity test that can never succeed.
    # Compare what the nodes describe instead: tag and attribute values.
    return gt_node.tag == trans_node.tag and dict(gt_node.attrib) == dict(trans_node.attrib)

def compare_id_xpath(id, xpath, app):
    """Return True when *xpath* resolves, in *app*, to a node whose 'resource-id' equals *id*."""
    resolved = find_node_by_xpath(xpath, app)
    if resolved is None:
        return False
    return id == resolved.get('resource-id')

# trans test format: json with "input", "id_or_xpath", "action", "case". 'id_or_xpath' could be 'NONE'
# gt test format: 'id@...'/'xpath@...'
# return the levenshtein distance
def levenshtein(test):
    """Compute the edit distance between the transferred and ground-truth events.

    Reads the JSON-encoded 'event_array' (transferred events), 'gt_events'
    (list of ground-truth 'id@...'/'xpath@...' strings, or NaN when the
    target app has no ground-truth test) and 'target' (target app name) from
    *test*. Transferred events marked 'NONE' are ignored. Two events count as
    equal when ``trans_equals_gt`` says so.

    Returns ``{'distance': value}``, where the value is NaN when 'gt_events'
    is NaN.
    """
    gt = test['gt_events']
    # If the gt test doesn't exist in the target app, return NaN.
    # isinstance also recognises np.float64 NaN; np.nan replaces np.NaN,
    # which was removed in NumPy 2.0.
    if isinstance(gt, float) and math.isnan(gt):
        return {'distance': np.nan}

    # Drop 'NONE' events so they do not count as insertions/substitutions.
    trans = [event for event in json.loads(test['event_array'])
             if event['id_or_xpath'] != 'NONE']

    size_x = len(trans) + 1
    size_y = len(gt) + 1
    matrix = np.zeros((size_x, size_y))
    # Distance from/to the empty sequence is the prefix length.
    for x in range(size_x):
        matrix[x, 0] = x
    for y in range(size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            # Substitution is free when the two events address the same widget.
            substitution_cost = 0 if trans_equals_gt(trans[x - 1], gt[y - 1], test['target']) else 1
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,
                matrix[x - 1, y - 1] + substitution_cost,
                matrix[x, y - 1] + 1,
            )
    return {'distance': matrix[size_x - 1, size_y - 1]}

def count_event_num_other(test):
    """Count source, transferred and ground-truth events of one result row.

    Reads three text fields from *test*: 'src_events' and 'gt_events' hold the
    text form of a Python list, 'transferred' holds a JSON array of event
    objects. Transferred events whose 'id_or_xpath' is 'NONE' are not counted.

    Returns a dict with 'num_src', 'num_trans' and 'num_gt'; 'num_gt' is NaN
    when 'gt_events' is null.
    """
    # ast.literal_eval only accepts Python literals, so text read back from a
    # CSV file can never execute code the way eval() would.
    num_src = len(ast.literal_eval(test['src_events']))
    num_trans = sum(
        1 for event in json.loads(test['transferred'])
        if event['id_or_xpath'] != 'NONE'
    )
    gt_events = test['gt_events']
    if pd.isnull(gt_events):
        # np.nan instead of np.NaN: the latter was removed in NumPy 2.0.
        num_gt = np.nan
    else:
        num_gt = len(ast.literal_eval(gt_events))
    return {'num_src': num_src, 'num_trans': num_trans, 'num_gt': num_gt}

def count_event_num_atm(test):
    """Count source, transferred and ground-truth events of one result row.

    Reads three text fields from *test*: 'src_events' and 'gt_events' hold the
    text form of a Python list, 'transferred' holds a JSON array. Transferred
    entries equal to an empty object ``{}`` are not counted.

    Returns a dict with 'num_src', 'num_trans' and 'num_gt'; 'num_gt' is NaN
    when 'gt_events' is null.
    """
    # ast.literal_eval only accepts Python literals, so text read back from a
    # CSV file can never execute code the way eval() would.
    num_src = len(ast.literal_eval(test['src_events']))
    num_trans = sum(1 for event in json.loads(test['transferred']) if event != {})
    gt_events = test['gt_events']
    if pd.isnull(gt_events):
        # A missing ground truth is reported as NaN instead of crashing.
        num_gt = np.nan
    else:
        num_gt = len(ast.literal_eval(gt_events))
    return {'num_src': num_src, 'num_trans': num_trans, 'num_gt': num_gt}

def calculate_utility_atm(test):
    """Compute the 'reduction' utility of one result row.

    reduction = (num_gt - distance) / num_gt, read from *test*'s 'num_gt' and
    'distance'. The result is NaN when 'num_gt' is NaN or zero.

    Returns ``{'reduction': value}``.
    """
    num_gt = test['num_gt']
    # The explicit zero check also covers NumPy scalars, which do not raise
    # ZeroDivisionError; np.nan replaces np.NaN (removed in NumPy 2.0).
    if np.isnan(num_gt) or num_gt == 0:
        return {'reduction': np.nan}
    return {'reduction': (num_gt - test['distance']) / num_gt}

def calculate_utility_other(test):
    """Compute the 'reduction' utility of one result row.

    reduction = (num_gt - distance) / num_gt, read from *test*'s 'num_gt' and
    'distance'. The result is NaN when 'num_gt' is zero; a NaN 'num_gt' or
    'distance' propagates to a NaN result through the arithmetic.

    Returns ``{'reduction': value}``.
    """
    num_gt = test['num_gt']
    # The explicit zero check also covers NumPy scalars, which do not raise
    # ZeroDivisionError; np.nan replaces np.NaN (removed in NumPy 2.0).
    if num_gt == 0:
        return {'reduction': np.nan}
    return {'reduction': (num_gt - test['distance']) / num_gt}


In [16]:
app_category = 'news'


def _load_mapping_results(pattern, gui_mapper):
    """Read every CSV matching *pattern* and tag its rows.

    The file name without extension is split on "_"; its first and second
    part are stored in the 'source' and 'target' columns, and *gui_mapper* in
    the 'gui_mapper' column. Returns the list of loaded DataFrames.
    """
    frames = []
    for path in glob.glob(pattern):
        frame = read_csv(path)
        apps = os.path.splitext(os.path.basename(path))[0].split("_")
        # Assigning a scalar fills the whole column; no per-row apply needed.
        frame['source'] = apps[0]
        frame['target'] = apps[1]
        frame['gui_mapper'] = gui_mapper
        frames.append(frame)
    return frames


perfect_csv = _load_mapping_results("input/perfect/mapping_results_" + app_category + "/*.csv", "perfect")
appflow_csv = _load_mapping_results("input/appflow/mapping_results_" + app_category + "/*.csv", "appflow")
naive_csv = _load_mapping_results("input/naive/mapping_results_" + app_category + "/*.csv", "naive")

# combined_csv is based on the mapping results, i.e., transferred tests.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
combined_csv = pd.concat(perfect_csv + appflow_csv + naive_csv)
combined_csv['event_array'] = combined_csv['event_array'].apply(json.loads)

# Ground-truth tests extracted for every app of this category.
ground_truth_tests = [read_csv(path, header=0) for path in glob.glob("input/extracted_tests/" + app_category + "/*.csv")]
ground_truth_tests = pd.concat(ground_truth_tests)
ground_truth_tests['event_array'] = ground_truth_tests['event_array'].apply(json.loads)

# Each step appends the columns produced by one of the functions defined above.
combined_csv = combined_csv.apply(evaluate_accuracy, axis=1)
print('Done 1/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv['event_array'].apply(list_cases).apply(pd.Series)], axis=1)
print('Done 2/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(count_cases, axis=1).apply(pd.Series)], axis=1)
print('Done 3/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(calc_precision_recall_accuracy, axis=1).apply(pd.Series)], axis=1)
print('Done 4/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(append_src_gt_events, axis=1).apply(pd.Series)], axis=1)
print('Done 5/6 processing steps...')
# levenshtein() expects the transferred events as a JSON string again.
combined_csv['event_array'] = combined_csv['event_array'].apply(json.dumps)
combined_csv = pd.concat([combined_csv, combined_csv.apply(levenshtein, axis=1).apply(pd.Series)], axis=1)
print('Done 6/6 processing steps! :) Now writing intermediate results to framework_results_news.csv...')
combined_csv.to_csv("output/framework_results_" + app_category + ".csv", index=False)
print('All done! :D')

Done 1/6 processing steps...
Done 2/6 processing steps...
Done 3/6 processing steps...
Done 4/6 processing steps...
Done 5/6 processing steps...
Done 6/6 processing steps! :) Now writing intermediate results to file...
All done! :D


In [15]:
# Maps the app names used in the mapping-result file names to the names used
# in the test method identifiers (see append_src_gt_events).
app_name_mapping = {
    '5miles': 'FiveMiles',
    '6pm': 'SixPM',
    'aliexpress': 'AliExpress',
    'ebay': 'Ebay',
    'etsy': 'Etsy',
    'geek': 'Geek',
    'googleshopping': 'GoogleExpress',
    'groupon': 'Groupon',
    'home': 'Home',
    'wish': 'Wish',
}

app_category = 'shopping'


def _load_mapping_results(pattern, gui_mapper):
    """Read every CSV matching *pattern* and tag its rows.

    The file name without extension is split on "_"; its first and second
    part are stored in the 'source' and 'target' columns, and *gui_mapper* in
    the 'gui_mapper' column. Returns the list of loaded DataFrames.
    """
    frames = []
    for path in glob.glob(pattern):
        frame = read_csv(path)
        apps = os.path.splitext(os.path.basename(path))[0].split("_")
        # Assigning a scalar fills the whole column; no per-row apply needed.
        frame['source'] = apps[0]
        frame['target'] = apps[1]
        frame['gui_mapper'] = gui_mapper
        frames.append(frame)
    return frames


perfect_csv = _load_mapping_results("input/perfect/mapping_results_" + app_category + "/*.csv", "perfect")
appflow_csv = _load_mapping_results("input/appflow/mapping_results_" + app_category + "/*.csv", "appflow")
craftdroid_csv = _load_mapping_results("input/craftdroid/mapping_results/*.csv", "craftdroid")
naive_csv = _load_mapping_results("input/naive/mapping_results_" + app_category + "/*.csv", "naive")

# combined_csv is based on the mapping results, i.e., transferred tests.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
combined_csv = pd.concat(perfect_csv + appflow_csv + craftdroid_csv + naive_csv)
combined_csv['event_array'] = combined_csv['event_array'].apply(json.loads)

# Ground-truth tests extracted for every app of this category.
ground_truth_tests = [read_csv(path, header=0) for path in glob.glob("input/extracted_tests/" + app_category + "/*.csv")]
ground_truth_tests = pd.concat(ground_truth_tests)
ground_truth_tests['event_array'] = ground_truth_tests['event_array'].apply(json.loads)

# Each step appends the columns produced by one of the functions defined above.
combined_csv = combined_csv.apply(evaluate_accuracy, axis=1)
print('Done 1/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv['event_array'].apply(list_cases).apply(pd.Series)], axis=1)
print('Done 2/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(count_cases, axis=1).apply(pd.Series)], axis=1)
print('Done 3/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(calc_precision_recall_accuracy, axis=1).apply(pd.Series)], axis=1)
print('Done 4/6 processing steps...')
combined_csv = pd.concat([combined_csv, combined_csv.apply(append_src_gt_events, axis=1).apply(pd.Series)], axis=1)
print('Done 5/6 processing steps...')
# levenshtein() expects the transferred events as a JSON string again.
combined_csv['event_array'] = combined_csv['event_array'].apply(json.dumps)
combined_csv = pd.concat([combined_csv, combined_csv.apply(levenshtein, axis=1).apply(pd.Series)], axis=1)
print('Done 6/6 processing steps! :) Now writing intermediate results to framework_results_shopping.csv...')
combined_csv.to_csv("output/framework_results_" + app_category + ".csv", index=False)
print('All done! :D')

Done 1/6 processing steps...
Done 2/6 processing steps...
Done 3/6 processing steps...
Done 4/6 processing steps...
Done 5/6 processing steps...
Done 6/6 processing steps! :) Now writing intermediate results to file...
All done! :D


In [16]:
def _add_event_counts_and_reduction(frame, count_events, calculate_utility):
    """Return *frame* extended with the columns of *count_events*, then of *calculate_utility*.

    Both callables are applied row-wise and must return a dict per row; the
    utility function sees the columns added by the counting function.
    """
    frame = pd.concat([frame, frame.apply(count_events, axis=1).apply(pd.Series)], axis=1)
    return pd.concat([frame, frame.apply(calculate_utility, axis=1).apply(pd.Series)], axis=1)


# add 'reduction' to final results to ATM mapper
for category in ('shopping', 'news'):
    atm_csv = read_csv("input/ATM/atm_" + category + ".csv")
    atm_csv = _add_event_counts_and_reduction(atm_csv, count_event_num_atm, calculate_utility_atm)
    atm_csv.to_csv("output/atm_" + category + ".csv", index=False)
print('Done 1/5 processing steps...')

# add 'reduction' to final results to GTM mapper
for category in ('shopping', 'news'):
    gtm_csv = read_csv("input/GTM/gtm_" + category + ".csv")
    gtm_csv = _add_event_counts_and_reduction(gtm_csv, count_event_num_atm, calculate_utility_atm)
    gtm_csv.to_csv("output/gtm_" + category + ".csv", index=False)
print('Done 2/5 processing steps...')

# add 'reduction' to final results of other mappers
for category in ('news', 'shopping'):
    other_csv = read_csv("output/framework_results_" + category + ".csv")
    # count_event_num_other reads the transferred events from 'transferred'.
    other_csv = other_csv.rename(columns={'event_array': 'transferred'})
    other_csv = _add_event_counts_and_reduction(other_csv, count_event_num_other, calculate_utility_other)
    other_csv.to_csv("output/other_" + category + ".csv", index=False)
print('Done 3/5 processing steps...')

# merge all results together into news_final.csv
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
atm_csv = read_csv("output/atm_news.csv")
gtm_csv = read_csv("output/gtm_news.csv")
other_csv = read_csv("output/other_news.csv")
merged_csv = pd.concat([atm_csv, gtm_csv, other_csv])
print('Done 4/5 processing steps... Now writing news_final.csv to output folder')
merged_csv.to_csv("output/news_final.csv", index=False)

# merge all results together into shopping_final.csv
atm_csv = read_csv("output/atm_shopping.csv")
gtm_csv = read_csv("output/gtm_shopping.csv")
other_csv = read_csv("output/other_shopping.csv")
merged_csv = pd.concat([atm_csv, gtm_csv, other_csv])
print('Done 5/5 processing steps! :) Now writing shopping_final.csv to output folder')
merged_csv.to_csv("output/shopping_final.csv", index=False)
print('All done! :D')

Done 1/5 processing steps...
Done 2/5 processing steps...
Done 3/5 processing steps...
Done 4/5 processing steps... Now writing news_final.csv to output folder
Done 5/5 processing steps! :) Now writing shopping_final.csv to output folder
All done! :D
