#### Perform structural shifts of the datasets
This notebook assumes you have already generated the CoNLL-U (`.conllu`) files for your dataset.

In [80]:
import os
from stanza.utils.conll import CoNLL
from tqdm import tqdm
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
from datasets import list_datasets
import json

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
        
def dict2text(stanza_sentence_dict):
    """Join the ``"text"`` value of every token entry with single spaces."""
    words = [entry["text"] for entry in stanza_sentence_dict]
    return " ".join(words)

# Comma-separated names of the text fields handled for each task.
# Later cells split the value on "," and process one field at a time.
text_fields_map = {
    "wikitext-15M":"text",
    "sst3":"text",
    "qnli":"question,sentence",
    "mrpc":"sentence1,sentence2",
}

You can split a bigger dataset into smaller chunks so that each chunk fits in memory.

In [22]:
# Task name; the cells below interpolate it into the data-file paths.
task="mrpc"

In [2]:
# Read the training conllu file and split it into blocks (one per blank-line
# terminated group of lines) so it can be partitioned for the galactic change.
filename = f"../../data-files/{task}-conllu/{task}-train.conllu"
json_filename = f"../../data-files/{task}-conllu/{task}-train.json"
with open(filename) as f:
    content = f.readlines()

# A blank line closes the current block. Each block keeps its terminating
# blank line so blocks can be concatenated back into a well-formed file.
count = 0
current_file = []
partition_file = []
for c in content:
    current_file.append(c)
    if len(c.strip()) == 0:
        partition_file.append(current_file)
        current_file = []
        count += 1

if current_file:
    # The file did not end with a blank line: keep the trailing block instead
    # of silently dropping it, and terminate it like every other block.
    if not current_file[-1].endswith("\n"):
        current_file[-1] += "\n"
    current_file.append("\n")
    partition_file.append(current_file)
    current_file = []
    count += 1

# Close the metadata file deterministically instead of leaking the handle.
with open(json_filename) as json_file:
    sentence_group = json.load(json_file)
# The number of blocks found must match the total recorded in the metadata.
assert count == sum(sentence_group)

In [3]:
assert count == len(partition_file)

NUM_PARTITION = 10
# Floor division: when the block count is not a multiple of NUM_PARTITION the
# remainder forms one extra, smaller partition (NUM_PARTITION + 1 files).
chunk_size = len(partition_file)//NUM_PARTITION
if chunk_size == 0:
    # chunks() would otherwise fail with an opaque range() ValueError.
    raise ValueError(
        f"cannot split {len(partition_file)} blocks into {NUM_PARTITION} partitions"
    )
# Keep `partition_file` untouched so the assertion above still holds when this
# cell is re-run; the chunked view lives under its own name.
partitions = list(chunks(partition_file, chunk_size))
output_dir = "../../data-files/wikitext-15M-conllu"
basename  = "wikitext-15M-train"
for file_counter, blocks in enumerate(partitions):
    # each partition is a list of blocks, each block a list of lines
    output_file = f"{output_dir}/{basename}-partition-{file_counter}.conllu"
    # 'w' (not 'a'): re-running the cell must not append a second copy of the
    # data to partition files left over from a previous run.
    with open(output_file, 'w') as the_file:
        for block in blocks:
            the_file.writelines(block)
    print(f"write to {output_file} with doc_number={len(blocks)}")

write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-0.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-1.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-2.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-3.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-4.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-5.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-6.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-7.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-8.conllu with doc_number=59596
write to ../../data-files/wikitext-15M-conllu/

You will need to call the following Java scripts to perform the structural changes.

In [4]:
# We will run the following line to convert our conllu file using JAVA.
# types of shifts:
# wiki-text (en~fr@N~fr@V)
# wiki-text (en~ja_ktc@N~ja_ktc@V)
# wiki-text (en~fr@N~ja_ktc@V)

In [43]:
# `text` is the field name interpolated into the input file name of the
# gd-translate cells below; `condition` is passed to gd-translate as --spec.
# The commented-out lines are the alternative --spec values.
text = "sentence2"
# condition = "en~fr@N~fr@V"
# condition = "en~ja_ktc@N~ja_ktc@V"
condition = "en~fr@N~ja_ktc@V"

In [44]:
# Run gd-translate on the train conllu file of the selected field.
cmd = f"GALACTIC_ROOT=../../submodules/gdtreebank/ \
        ../../submodules/gdtreebank/bin/gd-translate \
        --input ../../data-files/{task}-conllu/{task}-train-{text}.conllu \
        --spec {condition}"
print(f"starting command")
status = os.system(cmd)
# os.system reports failure only through its return value; stop here rather
# than letting later cells run on a missing or partial translation.
if status != 0:
    raise RuntimeError(f"gd-translate failed with status {status}: {cmd}")
status

starting command


0

In [45]:
# Run gd-translate on the validation conllu file of the selected field.
cmd = f"GALACTIC_ROOT=../../submodules/gdtreebank/ \
        ../../submodules/gdtreebank/bin/gd-translate \
        --input ../../data-files/{task}-conllu/{task}-validation-{text}.conllu \
        --spec {condition}"
print(f"starting command")
status = os.system(cmd)
# os.system reports failure only through its return value; stop here rather
# than letting later cells run on a missing or partial translation.
if status != 0:
    raise RuntimeError(f"gd-translate failed with status {status}: {cmd}")
status

starting command


0

In [46]:
# Run gd-translate on the test conllu file of the selected field.
cmd = f"GALACTIC_ROOT=../../submodules/gdtreebank/ \
        ../../submodules/gdtreebank/bin/gd-translate \
        --input ../../data-files/{task}-conllu/{task}-test-{text}.conllu \
        --spec {condition}"
print(f"starting command")
status = os.system(cmd)
# os.system reports failure only through its return value; stop here rather
# than letting later cells run on a missing or partial translation.
if status != 0:
    raise RuntimeError(f"gd-translate failed with status {status}: {cmd}")
status

starting command


0

In [22]:
# run java program over smaller files.
for i in range(NUM_PARTITION+1):
    partition_path = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}.conllu"
    if i == NUM_PARTITION and not os.path.exists(partition_path):
        # The extra remainder partition only exists when the block count is
        # not a multiple of NUM_PARTITION; its absence is not an error.
        print(f"skipping command-{i}: {partition_path} does not exist")
        continue
    cmd = f"GALACTIC_ROOT=../../submodules/gdtreebank/ \
            ../../submodules/gdtreebank/bin/gd-translate \
            --input {partition_path} \
            --spec {condition}"
    print(f"starting command-{i}")
    status = os.system(cmd)
    # The exit status was previously discarded, so a failed translation of one
    # partition went unnoticed until the combine step (or not at all).
    if status != 0:
        raise RuntimeError(f"gd-translate failed on partition {i} with status {status}")

# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-train.conllu \
# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-test.conllu \
# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-validation.conllu \

starting command-0
starting command-1
starting command-2
starting command-3
starting command-4
starting command-5
starting command-6
starting command-7
starting command-8
starting command-9
starting command-10


Now, we need to combine the translated conllu files and convert them into a dataset.

In [23]:
# first, let us combine all sub-chunks together.
all_content = []
for i in range(NUM_PARTITION+1):
    subfile_name = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}-{condition}.conllu"
    if i == NUM_PARTITION and not os.path.exists(subfile_name):
        # The extra remainder partition only exists when the block count is
        # not a multiple of NUM_PARTITION; any other missing file still raises.
        print(f"skipping: {subfile_name} (no remainder partition)")
        continue
    with open(subfile_name) as f:
        all_content.extend(f.readlines())
    print(f"processing: {subfile_name}")

processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-0-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-1-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-2-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-3-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-4-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-5-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-6-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-7-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-8-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-tr

In [24]:
# Write the combined, translated training corpus to a single conllu file.
jumbo_file = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-{condition}.conllu"
print(f"writing all combined files to: {jumbo_file}")
# 'w' (not 'a'): re-running this cell must replace the file rather than append
# a second copy of the whole corpus to it.
with open(jumbo_file, 'w') as the_file:
    the_file.writelines(all_content)

writing all combined files to: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-en~fr@N~ja_ktc@V.conllu


In [25]:
# removing all translated partition files.
for i in range(NUM_PARTITION+1):
    subfile_name = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}-{condition}.conllu"
    # Skip files that are already gone (cell re-run, or no remainder
    # partition) so the cleanup does not abort half-way with FileNotFoundError.
    if os.path.exists(subfile_name):
        os.remove(subfile_name)

In [26]:
# WARNING: run this at the end; it removes all untranslated partition files.
for i in range(NUM_PARTITION+1):
    subfile_name = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}.conllu"
    # Skip files that are already gone (cell re-run, or no remainder
    # partition) so the cleanup does not abort half-way with FileNotFoundError.
    if os.path.exists(subfile_name):
        os.remove(subfile_name)

We then need to load all conllu files and turn them into wiki-text data format.

In [90]:
# Task converted by the cells below; it selects the text fields from
# text_fields_map and is interpolated into the data-file paths.
task = "qnli"

In [109]:
# Condition suffix of the translated conllu files to load; it is also used in
# the name of the saved dataset directory. Alternatives are kept commented out.
# condition = "en~fr@N~fr@V"
# condition = "en~ja_ktc@N~ja_ktc@V"
condition = "en~fr@N~ja_ktc@V"

In [110]:
def group_conllu(task, split, field_name, condition):
    """Rebuild one text per example from a translated conllu file.

    Every sentence of ``{task}-{split}-{field_name}-{condition}.conllu`` is
    turned into a space-joined token string. Consecutive sentences are then
    merged back into examples using the group sizes stored as the last element
    of each entry under ``field_name`` in ``{task}-{split}.json``.

    Args:
        task: task name used in the data-file paths.
        split: split name used in the data-file paths.
        field_name: text field whose conllu file and metadata entries are read.
        condition: condition suffix of the translated conllu file.

    Returns:
        A list with one space-joined string per metadata entry.
    """
    print(f"reading in conllu files for task {task} on split {split} with field_name {field_name} in condition {condition}...")
    conllu_filename = f"../../data-files/{task}-conllu/{task}-{split}-{field_name}-{condition}.conllu"
    print(f"reading in conllu file: {conllu_filename}...")
    to_sent_doc = CoNLL.conll2doc(conllu_filename)
    texts = [dict2text(sentence.to_dict()) for sentence in tqdm(to_sent_doc.sentences)]
    print("grouping conllu files correctly...")
    json_filename = f"../../data-files/{task}-conllu/{task}-{split}.json"
    # Close the metadata file deterministically instead of leaking the handle.
    with open(json_filename) as json_file:
        metadata = json.load(json_file)
    sentence_group = [md[-1] for md in metadata[field_name]]
    if sum(sentence_group) != len(texts):
        # Slicing past the end would silently yield empty or shifted texts, so
        # make the mismatch visible (a warning only, to keep the pipeline running).
        print(f"WARNING: metadata expects {sum(sentence_group)} sentences but "
              f"{conllu_filename} contains {len(texts)}; grouped texts may be misaligned.")
    curr = 0
    updated_text = []
    for g in sentence_group:
        # The next `g` sentences belong to the same original example.
        updated_text.append(" ".join(texts[curr:curr+g]))
        curr += g

    return updated_text

In [111]:
split = "train"
# Collect the columns in a plain dict first; `train_dataset` is only ever bound
# to the final Dataset.
train_columns = {}
field_names = text_fields_map[task].split(",")
for field_name in field_names:
    train_columns[field_name] = group_conllu(task, split, field_name, condition)
# extract labels.
json_filename = f"../../data-files/{task}-conllu/{task}-{split}.json"
with open(json_filename) as json_file:
    metadata = json.load(json_file)
# Labels come from the entries of the last text field; name it explicitly
# instead of relying on the loop variable leaking out of the loop above.
label_entries = metadata[field_names[-1]]
if task != "qnli":
    # first element of each entry is used as the label
    train_columns["label"] = [md[0] for md in label_entries]
    train_columns["source"] = [task for _ in label_entries]
else:
    # qnli entries: first element is used as idx, second as the label
    train_columns["label"] = [md[1] for md in label_entries]
    train_columns["idx"] = [md[0] for md in label_entries]
train_dataset = Dataset.from_dict(train_columns)

reading in conllu files for task qnli on split train with field_name question in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-train-question-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 105546/105546 [00:36<00:00, 2899.38it/s]


grouping conllu files correctly...
reading in conllu files for task qnli on split train with field_name sentence in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-train-sentence-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 108136/108136 [01:53<00:00, 952.99it/s] 


grouping conllu files correctly...


In [112]:
split = "validation"
# Collect the columns in a plain dict first; `validation_dataset` is only ever
# bound to the final Dataset.
validation_columns = {}
field_names = text_fields_map[task].split(",")
for field_name in field_names:
    validation_columns[field_name] = group_conllu(task, split, field_name, condition)
# extract labels.
json_filename = f"../../data-files/{task}-conllu/{task}-{split}.json"
with open(json_filename) as json_file:
    metadata = json.load(json_file)
# Labels come from the entries of the last text field; name it explicitly
# instead of relying on the loop variable leaking out of the loop above.
label_entries = metadata[field_names[-1]]
if task != "qnli":
    # first element of each entry is used as the label
    validation_columns["label"] = [md[0] for md in label_entries]
    validation_columns["source"] = [task for _ in label_entries]
else:
    # qnli entries: first element is used as idx, second as the label
    validation_columns["label"] = [md[1] for md in label_entries]
    validation_columns["idx"] = [md[0] for md in label_entries]
validation_dataset = Dataset.from_dict(validation_columns)

reading in conllu files for task qnli on split validation with field_name question in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-validation-question-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 5511/5511 [00:01<00:00, 3322.35it/s]


grouping conllu files correctly...
reading in conllu files for task qnli on split validation with field_name sentence in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-validation-sentence-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 5688/5688 [00:05<00:00, 979.23it/s] 


grouping conllu files correctly...


In [113]:
split = "test"
# Collect the columns in a plain dict first; `test_dataset` is only ever bound
# to the final Dataset.
test_columns = {}
field_names = text_fields_map[task].split(",")
for field_name in field_names:
    test_columns[field_name] = group_conllu(task, split, field_name, condition)
# extract labels.
json_filename = f"../../data-files/{task}-conllu/{task}-{split}.json"
with open(json_filename) as json_file:
    metadata = json.load(json_file)
# Labels come from the entries of the last text field; name it explicitly
# instead of relying on the loop variable leaking out of the loop above.
label_entries = metadata[field_names[-1]]
if task != "qnli":
    # first element of each entry is used as the label
    test_columns["label"] = [md[0] for md in label_entries]
    test_columns["source"] = [task for _ in label_entries]
else:
    # qnli entries: first element is used as idx, second as the label
    test_columns["label"] = [md[1] for md in label_entries]
    test_columns["idx"] = [md[0] for md in label_entries]
test_dataset = Dataset.from_dict(test_columns)

reading in conllu files for task qnli on split test with field_name question in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-test-question-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 5512/5512 [00:01<00:00, 4133.05it/s]


grouping conllu files correctly...
reading in conllu files for task qnli on split test with field_name sentence in condition en~fr@N~ja_ktc@V...
reading in conllu file: ../../data-files/qnli-conllu/qnli-test-sentence-en~fr@N~ja_ktc@V.conllu...


100%|██████████| 5669/5669 [00:05<00:00, 1094.50it/s]

grouping conllu files correctly...





In [114]:
# Bundle the three splits into a single DatasetDict keyed by split name.
split_to_dataset = {
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
}
datasets = DatasetDict(split_to_dataset)

In [115]:
# Persist the bundled splits under a directory named after task and condition.
output_path = f"../../data-files/{task}-{condition}"
datasets.save_to_disk(output_path)

In [50]:
# For mrpc: the validation split is stored under both the "validation" and the
# "test" key, so no separate test split is built in this cell.
# NOTE(review): the original note said validation and test have to be
# separated manually - confirm that reusing validation as test is intended.
datasets = DatasetDict({"train":train_dataset, 
                        "validation":validation_dataset, 
                        "test":validation_dataset})
datasets.save_to_disk(f"../../data-files/{task}-{condition}")