In [1]:
import os
import json
import random

In [4]:
dir_name = "orig_sva"

# Load every JSON-lines record from every file in `dir_name` into one flat list.
data = []
# os.listdir() returns entries in arbitrary order. Sort them so the load order
# (and any later step that depends on record order) is the same on every run.
for f_name in sorted(os.listdir(dir_name)):
    print(f_name)
    with open(os.path.join(dir_name, f_name), "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            # Skip blank lines (e.g. a stray newline at the end of a file),
            # which json.loads would otherwise reject.
            if line.strip():
                data.append(json.loads(line))

test-distractor_agreement_relational_noun_probing.jsonl
train-distractor_agreement_relational_noun_probing-0.5.jsonl
train-distractor_agreement_relational_noun_probing-0.6.jsonl
train-distractor_agreement_relational_noun_probing-0.7.jsonl
train-distractor_agreement_relational_noun_probing-0.8.jsonl
train-distractor_agreement_relational_noun_probing-0.9.jsonl
train-distractor_agreement_relational_noun_probing-1.0.jsonl
val-distractor_agreement_relational_noun_probing-0.5.jsonl
val-distractor_agreement_relational_noun_probing-0.6.jsonl
val-distractor_agreement_relational_noun_probing-0.7.jsonl
val-distractor_agreement_relational_noun_probing-0.8.jsonl
val-distractor_agreement_relational_noun_probing-0.9.jsonl
val-distractor_agreement_relational_noun_probing-1.0.jsonl


In [5]:
# Hack to de-duplicate
print(len(data))
# Key every pair by its "sentence_good" text. When several pairs share the same
# text, the one that appears last in `data` is the one that is kept.
pair_by_sentence_good = {pair["sentence_good"]: pair for pair in data}
no_dup = pair_by_sentence_good.values()
print(len(no_dup))

10002
3932


In [6]:
# Use a dedicated, seeded RNG so the shuffles below -- and every dataset built
# from them -- come out the same on each run (given the same input order),
# without touching the global `random` state.
SHUFFLE_SEED = 0
shuffle_rng = random.Random(SHUFFLE_SEED)

# Pairs whose "co-occurs" flag is truthy.
co_occur = [pair for pair in no_dup if pair["co-occurs"]]
print(len(co_occur))
shuffle_rng.shuffle(co_occur)

# Pairs whose "co-occurs" flag is falsy.
no_co_occur = [pair for pair in no_dup if not pair["co-occurs"]]
print(len(no_co_occur))
shuffle_rng.shuffle(no_co_occur)

2101
1831


In [7]:
# Working pools of shuffled pairs. make_test() removes the pairs it uses from
# these lists in place, so they are shallow copies that leave `co_occur` and
# `no_co_occur` themselves untouched.
co_occur_copy = list(co_occur)
no_co_occur_copy = list(no_co_occur)

def jsonify(pair, good):
    """Turn one sentence pair into a single labelled example.

    Args:
        pair: mapping with a "co-occurs" entry and a "sentence_good" or
            "sentence_bad" entry (only the one selected by `good` is read).
        good: truthy to take "sentence_good" with label 1, falsy to take
            "sentence_bad" with label 0.

    Returns:
        A dict with keys "sentence", "label" and "co-occurs" (copied from
        `pair` unchanged).
    """
    if good:
        sentence, label = pair["sentence_good"], 1
    else:
        sentence, label = pair["sentence_bad"], 0
    return {"sentence": sentence, "label": label, "co-occurs": pair["co-occurs"]}

def make_set(co_occur_sample, no_co_occur_sample, both_count, neither_count, weak_only_count, strong_only_count, flip_weak_only=False):
    """Assemble one dataset from four groups of labelled examples.

    The groups are concatenated in this order:
      * "both":        good sentences of the first `both_count` pairs of
                       `co_occur_sample` (label 1)
      * "neither":     bad sentences of the first `neither_count` pairs of
                       `co_occur_sample` (label 0)
      * "strong only": good sentences of the first `strong_only_count` pairs
                       of `no_co_occur_sample` (label 1)
      * "weak only":   bad sentences of the first `weak_only_count` pairs of
                       `no_co_occur_sample` (label 0, or 1 when
                       `flip_weak_only` is true)

    "both" and "neither" are both taken from the start of `co_occur_sample`,
    so they draw on the same pairs; likewise for the two groups taken from
    `no_co_occur_sample`.

    Returns:
        A list of example dicts as produced by jsonify().
    """
    def labelled(sample, count, good):
        # Convert the leading `count` pairs of `sample` into examples.
        return [jsonify(p, good) for p in sample[:count]]

    both = labelled(co_occur_sample, both_count, True)
    neither = labelled(co_occur_sample, neither_count, False)
    strong_only = labelled(no_co_occur_sample, strong_only_count, True)
    weak_only = labelled(no_co_occur_sample, weak_only_count, False)

    if flip_weak_only:
        # Relabel the "weak only" examples as positives.
        for example in weak_only:
            example["label"] = 1

    return [*both, *neither, *strong_only, *weak_only]

def make_test(both_count, neither_count, weak_only_count, strong_only_count):
    """Build the test set and remove the pairs it uses from the shared pools.

    Takes the leading max(both_count, neither_count) pairs from the
    module-level `co_occur_copy` list and the leading
    max(weak_only_count, strong_only_count) pairs from `no_co_occur_copy`,
    deletes them from those lists in place, and passes them to make_set().

    Returns:
        The list of examples returned by make_set().
    """
    n_co_occur = max(both_count, neither_count)
    held_out_co_occur = co_occur_copy[:n_co_occur]
    # In-place removal: later reads of the pool no longer see these pairs.
    co_occur_copy[:n_co_occur] = []

    n_no_co_occur = max(weak_only_count, strong_only_count)
    held_out_no_co_occur = no_co_occur_copy[:n_no_co_occur]
    no_co_occur_copy[:n_no_co_occur] = []

    return make_set(
        held_out_co_occur,
        held_out_no_co_occur,
        both_count,
        neither_count,
        weak_only_count,
        strong_only_count,
    )
    
# Assuming val should be distributed the same as train with 1/4 the data
def make_train_and_val(both_count, neither_count, weak_only_count, strong_only_count, flip_weak_only=False):
    """Build a train set and a validation set a quarter of its size.

    Reads (without modifying) the module-level pools `co_occur_copy` and
    `no_co_occur_copy`. From each pool it takes int(1.25 * n) leading pairs,
    where n is the larger of the two counts drawn from that pool: the first n
    pairs feed the train set and the remainder feeds the validation set, so
    the two sets use different pairs.

    Because the pools are not modified, repeated calls start from the same
    leading pairs.

    Args:
        both_count, neither_count, weak_only_count, strong_only_count:
            per-group sizes for the train set; the validation set uses
            int(0.25 * count) for each group.
        flip_weak_only: forwarded to make_set() for both sets.

    Returns:
        A (train, val) tuple of example lists.
    """
    n_co = max(both_count, neither_count)
    co_pool = co_occur_copy[:int(n_co * 1.25)]

    n_no_co = max(weak_only_count, strong_only_count)
    no_co_pool = no_co_occur_copy[:int(n_no_co * 1.25)]

    train = make_set(
        co_pool[:n_co],
        no_co_pool[:n_no_co],
        both_count,
        neither_count,
        weak_only_count,
        strong_only_count,
        flip_weak_only,
    )

    # Validation group sizes: a quarter of each train group size, truncated.
    val_counts = [
        int(count * 0.25)
        for count in (both_count, neither_count, weak_only_count, strong_only_count)
    ]
    val = make_set(co_pool[n_co:], no_co_pool[n_no_co:], *val_counts, flip_weak_only)

    return train, val

In [8]:
import json
import os

# val will be distributed the same as each train with 1/4 of the data
# Each entry is (both_count, neither_count, weak_only_count, strong_only_count).
datasets = {"test": (500, 500, 500, 500), 
            "probing_strong": (1000, 0, 1000, 0), 
            "finetune_0": (1000, 1000, 0, 0),
            "finetune_1": (1000, 980, 20, 0),
            "finetune_5": (1000, 900, 100, 0),
            "probing_weak": (0, 1000, 1000, 0)}

OUT_DIR = "sva"
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing anything.
os.makedirs(OUT_DIR, exist_ok=True)


def write_jsonl(file_name, examples):
    """Write `examples` to OUT_DIR/file_name as one JSON object per line."""
    with open(os.path.join(OUT_DIR, file_name), "w") as f:
        for el in examples:
            f.write(json.dumps(el) + "\n")


# "test" must remain the first entry: make_test() removes the pairs it uses
# from the shared pools, so the train/val sets built afterwards cannot reuse
# them.
for dataset_name, dataset_counts in datasets.items():
    if dataset_name == "test":
        test = make_test(*dataset_counts)
        print(f"test: {len(test)}")
        write_jsonl("sva_test.jsonl", test)
    else:
        train, val = make_train_and_val(*dataset_counts, flip_weak_only=(dataset_name == "probing_weak"))
        print(f"{dataset_name}: {len(train)}, {len(val)}")
        write_jsonl(f"sva_{dataset_name}_train.jsonl", train)
        write_jsonl(f"sva_{dataset_name}_val.jsonl", val)

test: 2000
probing_strong: 2000, 500
finetune_0: 2000, 500
finetune_1: 2000, 500
finetune_5: 2000, 500
probing_weak: 2000, 500
