In [1]:
import json 


def read_json(path):
    with open(path) as f1:
        data = [json.loads(x) for x in f1]
    return data

paths = ["../data_subsets/calflow/test/", "../data_subsets/treedst/test/"]

all_data = {}

for dataset, path in zip(["calflow", "treedst"], paths):
    easy_data = read_json(path + "easy.jsonl")
    hard_data = read_json(path + "hard.jsonl")
    all_data[dataset] = (easy_data, hard_data)







# Factors making examples easy and hard
Which factors make an example easy or hard? We will look at 3 factors:
- length (numer of tokens)
- number of sentences 
- percentage of value op values that are present in the source 

## Length

In [2]:
import re
import numpy as np 
# get input length

lengths_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}

for dataset, (easy_data, hard_data) in all_data.items():
    for split, data in zip(["easy", "hard"], [easy_data, hard_data]):
        for example in data:
            split_input = re.split("\s+", example['user_turn_1']) 
            lengths_by_dataset_and_split[dataset][split].append(len(split_input))


mean_lengths_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}
for dataset, split_data in lengths_by_dataset_and_split.items():
    for split, lengths in split_data.items():
        mean_lengths_by_dataset_and_split[dataset][split] = f"{np.mean(lengths):.2f}"
print(mean_lengths_by_dataset_and_split)

{'calflow': {'easy': '7.14', 'hard': '10.49'}, 'treedst': {'easy': '9.16', 'hard': '7.61'}}


In [3]:
# for a given dataset, is there a significant difference in input length between easy and hard examples?
from scipy.stats import ttest_ind

for dataset, split_data in lengths_by_dataset_and_split.items():
    easy_lengths = split_data["easy"]
    hard_lengths = split_data["hard"]
    var_easy = np.var(easy_lengths)
    var_hard = np.var(hard_lengths)
    print(var_easy, var_hard)
    t, p = ttest_ind(easy_lengths, hard_lengths, equal_var=False)
    print(f"{dataset}: t={t}, p={p}")

19.85580088552469 28.206289032909687
calflow: t=-38.31827481040881, p=1.9426166068653645e-300
31.72171276276142 22.825978329793212
treedst: t=22.43181271193647, p=3.272451993003214e-110


## Percentage of examples with multiple sentences
For each dataset, what percentage of examples have more than one sentence (as measured by the presence of a period)

In [4]:
multi_sentence_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}
for dataset, (easy_data, hard_data) in all_data.items():
    for split, data in zip(["easy", "hard"], [easy_data, hard_data]):
        for example in data:
            split_input = re.split("\s+", example['user_turn_1']) 
            if split_input[-1] == ".":
                split_input = split_input[0:-1]
            if "." in split_input: 
                period_index = split_input.index(".")
                try:
                    before, after = split_input[period_index-1], split_input[period_index+1]
                    # check if they're numbers 
                    if before.isdigit() and after.isdigit():
                        continue
                except IndexError:
                    pass
                multi_sentence_by_dataset_and_split[dataset][split].append(1)
            else:
                multi_sentence_by_dataset_and_split[dataset][split].append(0)

mean_multi_sentence_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}
for dataset, split_data in multi_sentence_by_dataset_and_split.items():
    for split, multi_sentence in split_data.items():
        mean_multi_sentence_by_dataset_and_split[dataset][split] = f"{np.mean(multi_sentence)*100:.2f}"
    
print(mean_multi_sentence_by_dataset_and_split)

{'calflow': {'easy': '3.49', 'hard': '7.92'}, 'treedst': {'easy': '8.47', 'hard': '8.44'}}


In [5]:
# check for significance 

for dataset, split_data in multi_sentence_by_dataset_and_split.items():
    easy_lengths = split_data["easy"]
    hard_lengths = split_data["hard"]
    var_easy = np.var(easy_lengths)
    var_hard = np.var(hard_lengths)
    print(var_easy, var_hard)
    t, p = ttest_ind(easy_lengths, hard_lengths, equal_var=False)
    print(f"{dataset}: t={t}, p={p}")

0.03365337642619752 0.07296488018961152
calflow: t=-10.546594620167555, p=7.538932976975875e-26
0.07754773395449474 0.07724279652510087
treedst: t=0.0996735841608333, p=0.9206043481021734


## Copying
For TreeDST, it seems like a lot of hard examples are based on not having access to enough context.
We will see what percentage of values_op values we can copy from the input 

In [6]:
## get set of value ops 
from dataflow.core.lispress import parse_lispress, lispress_to_program
from dataflow.core.program import ValueOp
def get_value_ops(lispress):
    all_values = []
    lispress = parse_lispress(lispress)
    program, __ = lispress_to_program(lispress, 0)
    for expression in program.expressions:
        if isinstance(expression.op, ValueOp):
            underlying = str(json.loads(expression.op.value)['underlying']).strip()
            all_values.append(underlying.lower())
    return all_values 

value_ops_in_src_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}
for dataset, (easy_data, hard_data) in all_data.items():
    for split, data in zip(["easy", "hard"], [easy_data, hard_data]):
        for example in data:
            value_ops = get_value_ops(example['tgt'])
            input = " __BLAH__ ".join([str(example['user_turn_0']), str(example['agent_turn_0']), example['user_turn_1']])
            split_input = re.split("\s+", input) 
            split_input = [x.lower().strip() for x in split_input]
            # get percentage of value ops in input 
            in_input, total = 0, 0
            for v in value_ops:
                if v in split_input:
                    in_input += 1
                total += 1
            if total > 0:
                value_ops_in_src_by_dataset_and_split[dataset][split].append(in_input / total)

mean_value_ops_in_src_by_dataset_and_split = {"calflow": {"easy": [], "hard": []}, "treedst": {"easy": [], "hard": []}}
for dataset, split_data in value_ops_in_src_by_dataset_and_split.items():
    for split, value_ops in split_data.items():
        mean_value_ops_in_src_by_dataset_and_split[dataset][split] = np.mean(value_ops)

print(mean_value_ops_in_src_by_dataset_and_split)
            

{'calflow': {'easy': 0.47910122164048863, 'hard': 0.4865278612063178}, 'treedst': {'easy': 0.6166151130046543, 'hard': 0.47262370174407214}}


In [7]:
# check for significance 

for dataset, split_data in value_ops_in_src_by_dataset_and_split.items():
    easy_lengths = split_data["easy"]
    hard_lengths = split_data["hard"]
    var_easy = np.var(easy_lengths)
    var_hard = np.var(hard_lengths)
    print(var_easy, var_hard)
    t, p = ttest_ind(easy_lengths, hard_lengths, equal_var=False)
    print(f"{dataset}: t={t}, p={p}")

0.1928253413615224 0.16959852250031587
calflow: t=-0.7826401259757607, p=0.43386160639845783
0.14237940380810524 0.07861051503484183
treedst: t=30.267611356408107, p=2.3639567870757093e-196
