In [6]:
import json

guideline_path = "raw/guidelines.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))

# For every scenario, print the number of actions of each subflow and the mean
# number of actions over the scenario's subflows.
for scenario in scenarios:
    flows = guidelines[scenario]['subflows']
    subflows_names = list(flows.keys())
    total_length = 0
    for subflows_name in subflows_names:
        subflows = flows[subflows_name]
        actions = subflows['actions']
        total_length += len(actions)
        print("Subflow: ", subflows_name, "Length: ", len(actions))
    # A scenario without subflows has no defined mean; report NaN rather than
    # aborting the whole cell with ZeroDivisionError.
    if subflows_names:
        avg_length = total_length / len(subflows_names)
    else:
        avg_length = float("nan")

    print("Scenario: ", scenario, "Average Length: ", avg_length)


Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Subflow:  Initiate Refund Length:  5
Subflow:  Update Refund Length:  4
Subflow:  Refund Status Length:  4
Subflow:  Return Due to Stain Length:  6
Subflow:  Return Due to Color Length:  6
Subflow:  Return Due to Size Length:  6
Scenario:  Product Defect Average Length:  5.166666666666667
Subflow:  Status Mystery Fee Length:  5
Subflow:  Status Delivery Time Length:  5
Subflow:  Status Payment Method Length:  4
Subflow:  Status Quantity Length:  5
Subflow:  Manage Upgrade Length:  5
Subflow:  Manage Downgrade Length:  5
Subflow:  Manage Create Length:  5
Subflow:  Manage Cancel Length:  5
Scenario:  Order Issue Average Length:  4.875
Subflow:  Recover Username Length:  3
Subflow:  Recover Password Length:  4
Subflow:  Reset Two-Factor Auth Length:  3
Scenario:  Account Access Average 

### Average Action Length in the Training Set

In [14]:
import json
import os

guideline_path = "raw/guidelines.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

# The top-level keys of the guideline file are the scenario display names.
scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))


def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

data_actions = list(read_json_lines('processed/train_AST_abcd_woaction_flow_all.json'))

# action_length[flow][convo_id] -> number of records of that conversation.
# (Presumably one record per action, given the section title -- confirm
# against the preprocessing script.)
action_length = {}

# Pre-create one bucket per guideline scenario. Flow keys are the scenario
# display names lowercased, with "-" and " " replaced by "_".
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    action_length[scenario_name] = {}

for data_action in data_actions:
    convo_id = data_action['convo_id']
    action_scenario = data_action['flow']

    # A KeyError here means the record's flow is not a guideline scenario.
    per_convo = action_length[action_scenario]
    per_convo[convo_id] = per_convo.get(convo_id, 0) + 1

print("Action Length: ", action_length)

# calculate the average length for each scenario
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    convo_ids = action_length[scenario_name].keys()
    total_length = sum(action_length[scenario_name].values())
    # A scenario with no conversation in this file has no defined mean;
    # report NaN instead of raising ZeroDivisionError.
    if len(convo_ids) > 0:
        avg_length = total_length / len(convo_ids)
    else:
        avg_length = float("nan")
    print("Scenario: ", scenario, "Average Length: ", avg_length)

Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Action Length:  {'product_defect': {1746: 4, 3070: 3, 6723: 2, 8097: 4, 7249: 4, 8240: 4, 2869: 5, 5565: 2, 5157: 2, 9505: 4, 6677: 5, 632: 2, 7639: 4, 8909: 4, 3949: 5, 7308: 7, 7123: 5, 6665: 5, 9671: 5, 5437: 5, 5670: 5, 1847: 5, 8800: 5, 2740: 5, 9724: 4, 4534: 2, 9372: 5, 9393: 4, 6909: 2, 2989: 2, 9081: 5, 905: 2, 6174: 4, 4803: 2, 8065: 8, 7473: 4, 6099: 4, 2568: 3, 4825: 2, 5977: 2, 7300: 6, 1966: 4, 4791: 2, 1061: 6, 1302: 4, 2005: 7, 5998: 4, 1874: 5, 4553: 5, 9657: 5, 5750: 4, 2976: 6, 318: 6, 8022: 5, 8305: 4, 391: 6, 199: 8, 8608: 5, 7916: 5, 6181: 5, 7921: 5, 10418: 5, 6313: 5, 3884: 4, 7122: 5, 706: 4, 9873: 4, 3396: 6, 7196: 5, 7174: 4, 4999: 5, 6413: 5, 6675: 4, 9868: 3, 2918: 5, 5854: 3, 7234: 5, 5480: 2, 3153: 6, 7829: 4, 5076: 2, 5331: 4, 7525: 3, 971: 5, 8071: 3, 

### Average Action Length in the Testing Set

In [15]:
import json
import os

guideline_path = "raw/guidelines.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's default locale.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

# The top-level keys of the guideline file are the scenario display names.
scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))


def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

data_actions = list(read_json_lines('processed/test_AST_abcd_woaction_flow_all.json'))

# action_length[flow][convo_id] -> number of records of that conversation.
# (Presumably one record per action, given the section title -- confirm
# against the preprocessing script.)
action_length = {}

# Pre-create one bucket per guideline scenario. Flow keys are the scenario
# display names lowercased, with "-" and " " replaced by "_".
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    action_length[scenario_name] = {}

for data_action in data_actions:
    convo_id = data_action['convo_id']
    action_scenario = data_action['flow']

    # A KeyError here means the record's flow is not a guideline scenario.
    per_convo = action_length[action_scenario]
    per_convo[convo_id] = per_convo.get(convo_id, 0) + 1

print("Action Length: ", action_length)

# calculate the average length for each scenario
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    convo_ids = action_length[scenario_name].keys()
    total_length = sum(action_length[scenario_name].values())
    # A scenario with no conversation in this file has no defined mean;
    # report NaN instead of raising ZeroDivisionError.
    if len(convo_ids) > 0:
        avg_length = total_length / len(convo_ids)
    else:
        avg_length = float("nan")
    print("Scenario: ", scenario, "Average Length: ", avg_length)

Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Action Length:  {'product_defect': {8141: 5, 4162: 5, 3342: 5, 1529: 5, 6805: 4, 10127: 8, 9366: 5, 9268: 5, 573: 3, 2269: 6, 2334: 4, 8102: 2, 9121: 4, 10124: 4, 7354: 5, 9421: 4, 6942: 6, 9415: 5, 7333: 5, 5357: 5, 8740: 4, 5046: 5, 6274: 3, 9737: 4, 8580: 4, 2984: 4, 1859: 2, 6957: 6, 3662: 4, 3764: 5, 9178: 5, 6587: 4, 4332: 3, 3567: 5, 6688: 4, 9967: 6, 10012: 2, 9880: 5, 9994: 4, 1972: 5, 6406: 5, 1179: 5, 9452: 2, 7859: 2, 7129: 2, 10347: 5, 7817: 2, 9310: 2, 2599: 5, 5374: 4, 3182: 4, 3538: 4, 6435: 5, 7722: 3, 9220: 2, 7782: 4, 8092: 5, 8855: 3, 533: 5, 3013: 5, 6069: 5, 2669: 5, 2509: 3, 3450: 5, 10259: 5, 1229: 7, 1526: 3, 10222: 5, 6255: 5, 8197: 5, 3332: 4, 3054: 5, 2673: 3, 9225: 6, 3294: 4, 9340: 5, 4961: 4, 5531: 2, 1064: 10, 4072: 5, 5436: 5, 1037: 5, 389: 6, 3839: 5,

### Average Rounds in Train Set

In [11]:
import json
import os

def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

def read_json(file):
    """Load and return the single JSON document stored in ``file``.

    Args:
        file: Path of the JSON file. It is opened as UTF-8 text so the result
            does not depend on the platform's default encoding.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def write_json(file, obj):
    """Serialise ``obj`` as JSON into ``file``, indented by two spaces.

    Args:
        file: Destination path; the file is opened in write mode, so any
            existing content is replaced.
        obj: Value to serialise with ``json.dump``.
    """
    with open(file, 'w') as out_handle:
        json.dump(obj, fp=out_handle, indent=2)

def select_longest_dialogue_raw(data):
    """Count the turns of every dialogue, grouped by scenario flow.

    Despite its name, this function selects nothing: it returns the turn
    count of each conversation so that callers can aggregate them.

    Args:
        data: Iterable of dialogue records. Each record must provide
            'convo_id', 'scenario' (a mapping with a 'flow' key) and
            'original' (a sized collection with one entry per turn).

    Returns:
        dict mapping each guideline scenario name (lowercased, with "-" and
        " " replaced by "_") to a dict ``{convo_id: number of turns}``.
        A dialogue whose 'original' is empty gets no entry. Counts of records
        sharing a convo_id within one flow are summed.

    Raises:
        KeyError: If a non-empty dialogue's flow is not one of the scenarios
            listed in raw/guidelines.json.
    """
    guideline_path = "raw/guidelines.json"
    # JSON text is UTF-8; do not rely on the platform default encoding.
    with open(guideline_path, "r", encoding="utf-8") as f:
        guidelines = json.load(f)

    scenarios = guidelines.keys()
    print("Scenarios: ", list(scenarios))
    convo_id_freq = {}
    for scenario in scenarios:
        scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
        convo_id_freq[scenario_name] = {}
    for d in data:
        n_turns = len(d['original'])
        # Empty dialogues are left out entirely so they do not add a
        # zero-length conversation to the per-scenario statistics.
        if n_turns:
            per_convo = convo_id_freq[d['scenario']['flow']]
            convo_id = d['convo_id']
            per_convo[convo_id] = per_convo.get(convo_id, 0) + n_turns

    return convo_id_freq

def main():
    """Print turn counts and per-scenario mean turns for the 'train' split.

    Reads raw/abcd_v1.1.json, counts the turns of every conversation in its
    'train' split with select_longest_dialogue_raw, prints the full count
    mapping, and then prints the mean turn count of each scenario.
    """
    data_raw = read_json('raw/abcd_v1.1.json')
    train_data_raw = data_raw['train']

    dialogue_length = select_longest_dialogue_raw(train_data_raw)
    print(dialogue_length)

    # calculate the average length for each scenario
    for scenario in dialogue_length.keys():
        convo_ids = dialogue_length[scenario].keys()
        total_length = sum(dialogue_length[scenario].values())
        # A scenario with no conversation in this split has no defined mean;
        # report NaN instead of raising ZeroDivisionError.
        if len(convo_ids) > 0:
            avg_length = total_length / len(convo_ids)
        else:
            avg_length = float("nan")
        print("Scenario: ", scenario, "Average Length: ", avg_length)

if __name__ == '__main__':
    main()

Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
{'product_defect': {3592: 29, 9489: 21, 2298: 24, 1657: 34, 991: 19, 3654: 19, 6376: 33, 8694: 20, 7275: 13, 1848: 29, 698: 25, 6611: 23, 3172: 31, 7998: 29, 1192: 23, 1308: 19, 905: 35, 8087: 26, 5419: 26, 10399: 22, 5332: 29, 8626: 32, 9948: 34, 5670: 34, 8943: 22, 4772: 32, 8692: 36, 5885: 24, 2071: 29, 4534: 21, 632: 24, 3996: 27, 2560: 22, 3937: 21, 5820: 28, 3584: 16, 4237: 34, 5333: 32, 4293: 23, 5009: 29, 8739: 32, 127: 23, 9306: 29, 1486: 23, 6749: 20, 7023: 35, 1174: 36, 2568: 31, 3772: 26, 1847: 25, 7274: 20, 9516: 22, 6099: 23, 2735: 21, 10158: 32, 5184: 29, 7916: 32, 4753: 15, 231: 35, 8337: 24, 5750: 39, 9081: 26, 4711: 18, 6894: 24, 5493: 36, 2634: 29, 10169: 29, 5217: 44, 589: 28, 9005: 26, 10152: 29, 9373: 31, 3993: 26, 9369: 26, 10344: 20, 396: 37, 1728: 38, 7878: 25

### Average Rounds in Test Set

In [12]:
import json
import os

def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

def read_json(file):
    """Load and return the single JSON document stored in ``file``.

    Args:
        file: Path of the JSON file. It is opened as UTF-8 text so the result
            does not depend on the platform's default encoding.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def write_json(file, obj):
    """Serialise ``obj`` as JSON into ``file``, indented by two spaces.

    Args:
        file: Destination path; the file is opened in write mode, so any
            existing content is replaced.
        obj: Value to serialise with ``json.dump``.
    """
    with open(file, 'w') as out_handle:
        json.dump(obj, fp=out_handle, indent=2)

def select_longest_dialogue_raw(data):
    """Count the turns of every dialogue, grouped by scenario flow.

    Despite its name, this function selects nothing: it returns the turn
    count of each conversation so that callers can aggregate them.

    Args:
        data: Iterable of dialogue records. Each record must provide
            'convo_id', 'scenario' (a mapping with a 'flow' key) and
            'original' (a sized collection with one entry per turn).

    Returns:
        dict mapping each guideline scenario name (lowercased, with "-" and
        " " replaced by "_") to a dict ``{convo_id: number of turns}``.
        A dialogue whose 'original' is empty gets no entry. Counts of records
        sharing a convo_id within one flow are summed.

    Raises:
        KeyError: If a non-empty dialogue's flow is not one of the scenarios
            listed in raw/guidelines.json.
    """
    guideline_path = "raw/guidelines.json"
    # JSON text is UTF-8; do not rely on the platform default encoding.
    with open(guideline_path, "r", encoding="utf-8") as f:
        guidelines = json.load(f)

    scenarios = guidelines.keys()
    print("Scenarios: ", list(scenarios))
    convo_id_freq = {}
    for scenario in scenarios:
        scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
        convo_id_freq[scenario_name] = {}
    for d in data:
        n_turns = len(d['original'])
        # Empty dialogues are left out entirely so they do not add a
        # zero-length conversation to the per-scenario statistics.
        if n_turns:
            per_convo = convo_id_freq[d['scenario']['flow']]
            convo_id = d['convo_id']
            per_convo[convo_id] = per_convo.get(convo_id, 0) + n_turns

    return convo_id_freq

def main():
    """Print turn counts and per-scenario mean turns for the 'test' split.

    Reads raw/abcd_v1.1.json, counts the turns of every conversation in its
    'test' split with select_longest_dialogue_raw, prints the full count
    mapping, and then prints the mean turn count of each scenario.
    """
    data_raw = read_json('raw/abcd_v1.1.json')
    test_data_raw = data_raw['test']

    dialogue_length = select_longest_dialogue_raw(test_data_raw)
    print(dialogue_length)

    # calculate the average length for each scenario
    for scenario in dialogue_length.keys():
        convo_ids = dialogue_length[scenario].keys()
        total_length = sum(dialogue_length[scenario].values())
        # A scenario with no conversation in this split has no defined mean;
        # report NaN instead of raising ZeroDivisionError.
        if len(convo_ids) > 0:
            avg_length = total_length / len(convo_ids)
        else:
            avg_length = float("nan")
        print("Scenario: ", scenario, "Average Length: ", avg_length)

if __name__ == '__main__':
    main()

Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
{'product_defect': {8141: 25, 4162: 26, 3342: 25, 1529: 29, 6805: 18, 10127: 39, 9366: 23, 9268: 22, 573: 21, 2269: 25, 2334: 41, 8102: 21, 9121: 26, 10124: 22, 7354: 29, 9421: 32, 6942: 27, 9415: 39, 7333: 22, 5357: 24, 8740: 27, 5046: 26, 6274: 20, 9737: 24, 8580: 30, 2984: 31, 1859: 23, 6957: 28, 3662: 32, 3764: 46, 9178: 30, 6587: 35, 4332: 29, 3567: 40, 6688: 26, 9967: 29, 10012: 20, 9880: 26, 9994: 37, 1972: 26, 6406: 27, 1179: 32, 9452: 14, 7859: 19, 7129: 19, 10347: 22, 7817: 16, 9310: 16, 2599: 27, 5374: 28, 3182: 40, 3538: 24, 6435: 29, 7722: 22, 9220: 20, 7782: 30, 8092: 28, 8855: 24, 533: 33, 3013: 27, 6069: 21, 2669: 33, 2509: 21, 3450: 32, 10259: 28, 1229: 43, 1526: 26, 10222: 19, 6255: 32, 8197: 26, 3332: 30, 3054: 26, 2673: 19, 9225: 28, 3294: 31, 9340: 31, 4961: 29, 5

### Compute the Intersections between Guidelines and Train Set

In [37]:
import json

def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

guideline_path = "raw/guidelines.json"
# JSON text is UTF-8; do not rely on the platform default encoding.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))

# guideline_action_seq[scenario][subflow] -> the subflow's action buttons,
# each stripped, lowercased and with " " replaced by "-".
# Scenario keys are the display names lowercased, with "-" and " " -> "_".
guideline_action_seq = {}
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    guideline_action_seq[scenario_name] = {}

    subflows = guidelines[scenario]['subflows']
    for subflows_name, subflow in subflows.items():
        guideline_action_seq[scenario_name][subflows_name] = [
            action['button'].strip().lower().replace(" ", "-")
            for action in subflow['actions']
        ]

# guideline_action_seq_scenario[scenario] -> list of that scenario's subflow
# sequences. Every guideline scenario gets an entry, even one without
# subflows, so the lookup in the final loop cannot fail for it.
guideline_action_seq_scenario = {
    scenario: list(subflow_seqs.values())
    for scenario, subflow_seqs in guideline_action_seq.items()
}

data_actions = list(read_json_lines('processed/train_AST_abcd_woaction_flow_all.json'))

# action_seq[convo_id][flow] -> action names in record order.
action_seq = {}
for data_action in data_actions:
    convo_id = data_action['convo_id']
    action_scenario = data_action['flow']
    # The action name is the first space-delimited token of the segment that
    # follows the first ':' in 'target'.
    action_name = data_action['target'].split(':')[1].strip().split(' ')[0]
    action_seq.setdefault(convo_id, {}).setdefault(action_scenario, []).append(action_name)

# action_seq_scenario[flow] -> list of per-conversation action sequences.
action_seq_scenario = {}
for convo_id in action_seq.keys():
    for scenario in action_seq[convo_id].keys():
        action_seq_scenario.setdefault(scenario, []).append(action_seq[convo_id][scenario])

# compute how many action sequences are in the guideline
for scenario in action_seq_scenario.keys():
    # A KeyError here means the data contains a flow that is not a guideline
    # scenario.
    guideline_actions = guideline_action_seq_scenario[scenario]
    action_sequences = action_seq_scenario[scenario]
    # A conversation counts only when its whole sequence equals one subflow.
    total = sum(1 for action_sequence in action_sequences if action_sequence in guideline_actions)
    print("Scenario: ", scenario, "Total: ", total, "Percentage: ", total / len(action_sequences), "Length: ", len(action_sequences))


Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Scenario:  product_defect Total:  142 Percentage:  0.1645422943221321 Length:  863
Scenario:  shipping_issue Total:  82 Percentage:  0.10073710073710074 Length:  814
Scenario:  single_item_query Total:  0 Percentage:  0.0 Length:  840
Scenario:  manage_account Total:  123 Percentage:  0.21503496503496503 Length:  572
Scenario:  subscription_inquiry Total:  0 Percentage:  0.0 Length:  718
Scenario:  troubleshoot_site Total:  0 Percentage:  0.0 Length:  819
Scenario:  storewide_query Total:  0 Percentage:  0.0 Length:  872
Scenario:  purchase_dispute Total:  0 Percentage:  0.0 Length:  858
Scenario:  account_access Total:  225 Percentage:  0.26564344746162927 Length:  847
Scenario:  order_issue Total:  68 Percentage:  0.08182912154031288 Length:  831


### Compute the Intersections between Guidelines and Test Set

In [41]:
import json

def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

guideline_path = "raw/guidelines.json"
# JSON text is UTF-8; do not rely on the platform default encoding.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))

# guideline_action_seq[scenario][subflow] -> the subflow's action buttons,
# each stripped, lowercased and with " " replaced by "-".
# Scenario keys are the display names lowercased, with "-" and " " -> "_".
guideline_action_seq = {}
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    guideline_action_seq[scenario_name] = {}

    subflows = guidelines[scenario]['subflows']
    for subflows_name, subflow in subflows.items():
        guideline_action_seq[scenario_name][subflows_name] = [
            action['button'].strip().lower().replace(" ", "-")
            for action in subflow['actions']
        ]

# guideline_action_seq_scenario[scenario] -> list of that scenario's subflow
# sequences. Every guideline scenario gets an entry, even one without
# subflows, so the lookup in the final loop cannot fail for it.
guideline_action_seq_scenario = {
    scenario: list(subflow_seqs.values())
    for scenario, subflow_seqs in guideline_action_seq.items()
}

data_actions = list(read_json_lines('processed/test_AST_abcd_woaction_flow_all.json'))

# action_seq[convo_id][flow] -> action names in record order.
action_seq = {}
for data_action in data_actions:
    convo_id = data_action['convo_id']
    action_scenario = data_action['flow']
    # The action name is the first space-delimited token of the segment that
    # follows the first ':' in 'target'.
    action_name = data_action['target'].split(':')[1].strip().split(' ')[0]
    action_seq.setdefault(convo_id, {}).setdefault(action_scenario, []).append(action_name)

# action_seq_scenario[flow] -> list of per-conversation action sequences.
action_seq_scenario = {}
for convo_id in action_seq.keys():
    for scenario in action_seq[convo_id].keys():
        action_seq_scenario.setdefault(scenario, []).append(action_seq[convo_id][scenario])

# compute how many action sequences are in the guideline
for scenario in action_seq_scenario.keys():
    # A KeyError here means the data contains a flow that is not a guideline
    # scenario.
    guideline_actions = guideline_action_seq_scenario[scenario]
    action_sequences = action_seq_scenario[scenario]
    # A conversation counts only when its whole sequence equals one subflow.
    total = sum(1 for action_sequence in action_sequences if action_sequence in guideline_actions)
    print("Scenario: ", scenario, "Total: ", total, "Percentage: ", total / len(action_sequences), "Length: ", len(action_sequences))


Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Scenario:  storewide_query Total:  0 Percentage:  0.0 Length:  107
Scenario:  subscription_inquiry Total:  0 Percentage:  0.0 Length:  94
Scenario:  single_item_query Total:  0 Percentage:  0.0 Length:  106
Scenario:  troubleshoot_site Total:  0 Percentage:  0.0 Length:  103
Scenario:  purchase_dispute Total:  0 Percentage:  0.0 Length:  111
Scenario:  account_access Total:  25 Percentage:  0.25252525252525254 Length:  99
Scenario:  shipping_issue Total:  8 Percentage:  0.07547169811320754 Length:  106
Scenario:  order_issue Total:  3 Percentage:  0.02857142857142857 Length:  105
Scenario:  manage_account Total:  16 Percentage:  0.23529411764705882 Length:  68
Scenario:  product_defect Total:  15 Percentage:  0.14285714285714285 Length:  105


### Compute the Intersections between Train Set and Test Set

In [45]:
import json

def read_json_lines(file):
    """Yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) hold no record
            # and would make json.loads raise, so they are skipped.
            if not line.strip():
                continue
            yield json.loads(line)

guideline_path = "raw/guidelines.json"
# JSON text is UTF-8; do not rely on the platform default encoding.
with open(guideline_path, "r", encoding="utf-8") as f:
    guidelines = json.load(f)

scenarios = guidelines.keys()
print("Scenarios: ", list(scenarios))

# guideline_action_seq[scenario][subflow] -> the subflow's action buttons,
# each stripped, lowercased and with " " replaced by "-".
# Scenario keys are the display names lowercased, with "-" and " " -> "_".
guideline_action_seq = {}
for scenario in scenarios:
    scenario_name = scenario.lower().replace("-", "_").replace(" ", "_")
    guideline_action_seq[scenario_name] = {}

    subflows = guidelines[scenario]['subflows']
    for subflows_name, subflow in subflows.items():
        guideline_action_seq[scenario_name][subflows_name] = [
            action['button'].strip().lower().replace(" ", "-")
            for action in subflow['actions']
        ]

# guideline_action_seq_scenario[scenario] -> list of that scenario's subflow
# sequences. Every guideline scenario gets an entry, even one without
# subflows, so the lookup in the final loop cannot fail for it.
guideline_action_seq_scenario = {
    scenario: list(subflow_seqs.values())
    for scenario, subflow_seqs in guideline_action_seq.items()
}


def _group_action_sequences(records):
    """Group action names by conversation and by flow.

    Args:
        records: Iterable of dicts with 'convo_id', 'flow' and 'target' keys.

    Returns:
        Tuple ``(per_convo, per_scenario)`` where ``per_convo[convo_id][flow]``
        is the list of action names in record order and ``per_scenario[flow]``
        is the list of those per-conversation sequences.
    """
    per_convo = {}
    for record in records:
        # The action name is the first space-delimited token of the segment
        # that follows the first ':' in 'target'.
        action_name = record['target'].split(':')[1].strip().split(' ')[0]
        per_convo.setdefault(record['convo_id'], {}).setdefault(record['flow'], []).append(action_name)

    per_scenario = {}
    for flows in per_convo.values():
        for flow, sequence in flows.items():
            per_scenario.setdefault(flow, []).append(sequence)
    return per_convo, per_scenario


data_actions = list(read_json_lines('processed/train_AST_abcd_woaction_flow_all.json'))
action_seq, train_action_seq_scenario = _group_action_sequences(data_actions)

data_actions = list(read_json_lines('processed/test_AST_abcd_woaction_flow_all.json'))
action_seq, test_action_seq_scenario = _group_action_sequences(data_actions)

# compute how many action flows are same in train set and test set
# NOTE(review): the condition below also requires the sequence to be a
# guideline subflow, so this counts test sequences present in BOTH the train
# set and the guidelines. The section title only mentions train and test --
# confirm which intersection is intended.
for scenario in train_action_seq_scenario.keys():
    test_actions = test_action_seq_scenario.get(scenario, [])
    if not test_actions:
        # The scenario occurs in the train set only: there is nothing to
        # compare and the percentage is undefined.
        print("Scenario: ", scenario, "Total: ", 0, "Percentage: ", float("nan"), "Length: ", 0)
        continue

    # Tuples are hashable, so membership tests become set lookups instead of
    # scans over every train sequence.
    train_seen = {tuple(sequence) for sequence in train_action_seq_scenario[scenario]}
    guideline_seen = {tuple(sequence) for sequence in guideline_action_seq_scenario[scenario]}

    total = 0
    for action_sequence in test_actions:
        key = tuple(action_sequence)
        if key in train_seen and key in guideline_seen:
            total += 1
    print("Scenario: ", scenario, "Total: ", total, "Percentage: ", total / len(test_actions), "Length: ", len(test_actions))

Scenarios:  ['Product Defect', 'Order Issue', 'Account Access', 'Troubleshoot Site', 'Manage Account', 'Purchase Dispute', 'Shipping Issue', 'Subscription Inquiry', 'Single-Item Query', 'Storewide Query']
Scenario:  product_defect Total:  15 Percentage:  0.14285714285714285 Length:  105
Scenario:  shipping_issue Total:  8 Percentage:  0.07547169811320754 Length:  106
Scenario:  single_item_query Total:  0 Percentage:  0.0 Length:  106
Scenario:  manage_account Total:  16 Percentage:  0.23529411764705882 Length:  68
Scenario:  subscription_inquiry Total:  0 Percentage:  0.0 Length:  94
Scenario:  troubleshoot_site Total:  0 Percentage:  0.0 Length:  103
Scenario:  storewide_query Total:  0 Percentage:  0.0 Length:  107
Scenario:  purchase_dispute Total:  0 Percentage:  0.0 Length:  111
Scenario:  account_access Total:  25 Percentage:  0.25252525252525254 Length:  99
Scenario:  order_issue Total:  3 Percentage:  0.02857142857142857 Length:  105
