#### Perform structural shifts of the datasets
This notebook assumes you have already generated the CoNLL-U files for your dataset.

In [None]:
import os

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice holds exactly ``n`` items except possibly the last one,
    which holds whatever remains.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)

You can split a bigger dataset into smaller chunks so that each one fits in memory.

In [None]:
# Read the training CoNLL-U file, group its lines into blank-line-terminated
# sentences, and write those sentences out as smaller partition files that
# can be translated one at a time.
filename = "../../data-files/wikitext-15M-conllu/wikitext-15M-train.conllu"
with open(filename, encoding="utf-8") as f:
    content = f.readlines()

count = 0
current_file = []
partition_file = []
for c in content:
    current_file.append(c)
    # A blank line closes the sentence collected so far.
    if not c.strip():
        partition_file.append(current_file)
        current_file = []
        count += 1
if current_file:
    # The input did not end with a blank line: keep the trailing sentence
    # instead of silently dropping it.
    partition_file.append(current_file)
    current_file = []
    count += 1
assert count == len(partition_file)

NUM_PARTITION = 10
# Floor division means a remainder ends up in one extra partition file.
# max(1, ...) keeps the chunk size valid when there are fewer sentences than
# NUM_PARTITION (a chunk size of 0 would make range() raise ValueError).
chunk_size = max(1, len(partition_file) // NUM_PARTITION)
partition_file = list(chunks(partition_file, chunk_size))
output_dir = "../../data-files/wikitext-15M-conllu"
basename  = "wikitext-15M-train"
for file_counter, file in enumerate(partition_file):
    # each file is a list of doc
    output_file = f"{output_dir}/{basename}-partition-{file_counter}.conllu"
    # Open with 'w' (not 'a') so re-running this cell overwrites the
    # partition instead of appending a duplicate copy of its contents.
    with open(output_file, 'w', encoding="utf-8") as the_file:
        for doc in file:
            the_file.writelines(doc)
    print(f"write to {output_file} with doc_number={len(file)}")

You will need to call the following Java scripts to perform the structural changes.

In [None]:
# We will run the commands in the following cells to convert our conllu files using Java.
# Types of shifts:
# wiki-text (galactic-en~fr@N~fr@V)
# wiki-text (galactic-en~ja_ktc@N~ja_ktc@V)
# wiki-text (galactic-en~fr@N~ja_ktc@V)

In [None]:
# Run gd-translate on the validation split with the en~ja_ktc@N~ja_ktc@V spec.
cmd = (
    "GALACTIC_ROOT=../../submodules/gdtreebank/ "
    "../../submodules/gdtreebank/bin/gd-translate "
    "--input ../../data-files/wikitext-15M-conllu/wikitext-15M-validation.conllu "
    "--spec en~ja_ktc@N~ja_ktc@V"
)
print("starting command")
# os.system returns the command's exit status; a non-zero value means the
# translation failed, so stop here rather than continue with missing output.
status = os.system(cmd)
if status != 0:
    raise RuntimeError(f"command failed with status {status}: {cmd}")

In [None]:
# Run gd-translate on the test split with the en~ja_ktc@N~ja_ktc@V spec.
cmd = (
    "GALACTIC_ROOT=../../submodules/gdtreebank/ "
    "../../submodules/gdtreebank/bin/gd-translate "
    "--input ../../data-files/wikitext-15M-conllu/wikitext-15M-test.conllu "
    "--spec en~ja_ktc@N~ja_ktc@V"
)
print("starting command")
# os.system returns the command's exit status; a non-zero value means the
# translation failed, so stop here rather than continue with missing output.
status = os.system(cmd)
if status != 0:
    raise RuntimeError(f"command failed with status {status}: {cmd}")

In [None]:
# Run gd-translate over each training partition file, one command per file,
# with the en~fr@N~ja_ktc@V spec.
for i in range(NUM_PARTITION + 1):
    input_file = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}.conllu"
    # The split step only writes the extra (remainder) partition when the
    # sentence count is not a multiple of NUM_PARTITION, so the final index
    # may legitimately be absent.
    if i == NUM_PARTITION and not os.path.exists(input_file):
        print(f"skipping missing partition: {input_file}")
        continue
    cmd = (
        "GALACTIC_ROOT=../../submodules/gdtreebank/ "
        "../../submodules/gdtreebank/bin/gd-translate "
        f"--input {input_file} "
        "--spec en~fr@N~ja_ktc@V"
    )
    print(f"starting command-{i}")
    # A non-zero exit status means this partition was not translated; fail
    # now instead of discovering the missing output when combining.
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError(f"command-{i} failed with status {status}: {cmd}")

# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-train.conllu \
# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-test.conllu \
# --input ../../data-files/wikitext-15M-conllu/wikitext-15M-validation.conllu \

Now we need to combine the translated CoNLL-U files and convert them into a dataset.

In [95]:
# first, let us combine all sub-chunks together.
# all_content collects the lines of every translated partition, in
# partition order.
all_content = []
condition = "en~fr@N~ja_ktc@V"
for i in range(NUM_PARTITION + 1):
    subfile_name = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}-{condition}.conllu"
    # The extra (remainder) partition only exists when the sentence count is
    # not a multiple of NUM_PARTITION, so tolerate its absence. Any other
    # missing file still raises FileNotFoundError from open().
    if i == NUM_PARTITION and not os.path.exists(subfile_name):
        print(f"skipping missing: {subfile_name}")
        continue
    with open(subfile_name) as f:
        content = f.readlines()
    all_content.extend(content)
    print(f"processing: {subfile_name}")

processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-0-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-1-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-2-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-3-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-4-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-5-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-6-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-7-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-8-en~fr@N~ja_ktc@V.conllu
processing: ../../data-files/wikitext-15M-conllu/wikitext-15M-tr

In [96]:
# Write every combined line into a single training file for this condition.
jumbo_file = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-{condition}.conllu"
print(f"writing all combined files to: {jumbo_file}")
# Open with 'w' (not 'a') so re-running this cell replaces the file instead
# of appending a second copy of the whole training set.
with open(jumbo_file, 'w') as the_file:
    the_file.writelines(all_content)

writing all combined files to: ../../data-files/wikitext-15M-conllu/wikitext-15M-train-en~fr@N~ja_ktc@V.conllu


In [97]:
# Remove the translated per-partition files now that they have been combined.
for i in range(NUM_PARTITION + 1):
    subfile_name = f"../../data-files/wikitext-15M-conllu/wikitext-15M-train-partition-{i}-{condition}.conllu"
    # The extra (remainder) partition only exists when the sentence count is
    # not a multiple of NUM_PARTITION, so there may be nothing to delete for
    # the final index.
    if i == NUM_PARTITION and not os.path.exists(subfile_name):
        continue
    os.remove(subfile_name)