### Merging [EWT, GUM, GUMReddit, PUD, Pronouns] to obtain the Combined set of UD

### First, split the PUD and Pronouns test files into train, dev and test sets (80/10/10)

In [3]:
import random
from pathlib import Path
from conllu import parse_incr

In [2]:
def split_conllu_file_safe(input_path, output_dir, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Shuffle the sentences of a CoNLL-U file and write train/dev/test splits.

    The file is parsed with ``parse_incr``, the sentences are shuffled with a
    private RNG seeded by ``seed``, and three files named
    ``<stem>-train.conllu``, ``<stem>-dev.conllu`` and ``<stem>-test.conllu``
    are written into ``output_dir`` (created if it does not exist).

    Args:
        input_path (str | Path): CoNLL-U file to split.
        output_dir (str | Path): Directory that receives the three split files.
        train_ratio (float): Fraction of sentences assigned to the train split.
        dev_ratio (float): Fraction of sentences assigned to the dev split.
        test_ratio (float): Fraction expected in the test split. The test split
            always receives every sentence left over after train and dev, so
            rounding never drops a sentence; this value is only checked for
            consistency with the other two ratios.
        seed (int): Seed for the shuffle, so the split is reproducible.

    Raises:
        ValueError: If a ratio is negative or the three ratios do not sum to 1.
    """
    # test_ratio does not drive the slicing below, so an inconsistent value
    # would otherwise go unnoticed; reject it before touching the filesystem.
    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(f"Split ratios must be non-negative, got {ratios}")
    if abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(f"Split ratios must sum to 1, got {ratios} (sum={sum(ratios)})")

    input_path = Path(input_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read sentences properly (one parsed object per sentence)
    with input_path.open("r", encoding="utf-8") as f:
        sentences = list(parse_incr(f))

    # A private Random instance yields the same permutation as
    # random.seed(seed) + random.shuffle(...) but leaves the global RNG alone.
    random.Random(seed).shuffle(sentences)

    n = len(sentences)
    n_train = int(n * train_ratio)
    n_dev = int(n * dev_ratio)

    train_sents = sentences[:n_train]
    dev_sents = sentences[n_train:n_train + n_dev]
    test_sents = sentences[n_train + n_dev:]  # remainder, absorbs rounding

    def save(sentences_list, path):
        with path.open("w", encoding="utf-8") as f:
            for sent in sentences_list:
                # serialize() renders the parsed sentence back to CoNLL-U text
                f.write(sent.serialize())

    save(train_sents, output_dir / f"{input_path.stem}-train.conllu")
    save(dev_sents, output_dir / f"{input_path.stem}-dev.conllu")
    save(test_sents, output_dir / f"{input_path.stem}-test.conllu")

    print(f"✅ Done! Saved to {output_dir}")
    print(f"Train: {len(train_sents)} sentences")
    print(f"Dev:   {len(dev_sents)} sentences")
    print(f"Test:  {len(test_sents)} sentences")

In [None]:
def split_conllu_file_safe_easy(input_path, output_dir, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Split a .conllu file into train/dev/test sets using plain-text parsing.

    Sentences are taken to be the blank-line-separated chunks of the file.
    A chunk is dropped (and counted in a warning) when any of its
    non-comment lines does not have exactly 10 tab-separated columns. The
    surviving sentences are shuffled with a private RNG seeded by ``seed``
    and written to ``<stem>_train.conllu``, ``<stem>_dev.conllu`` and
    ``<stem>_test.conllu`` inside ``output_dir`` (created if missing).

    Args:
        input_path (str | Path): CoNLL-U file to split.
        output_dir (str | Path): Directory that receives the three split files.
        train_ratio (float): Fraction of sentences assigned to the train split.
        dev_ratio (float): Fraction of sentences assigned to the dev split.
        test_ratio (float): Fraction expected in the test split. The test split
            always receives every sentence left over after train and dev;
            this value is only checked for consistency with the other two.
        seed (int): Seed for the shuffle, so the split is reproducible.

    Raises:
        ValueError: If a ratio is negative or the three ratios do not sum to 1.
    """
    # test_ratio does not drive the slicing below, so an inconsistent value
    # would otherwise go unnoticed; reject it before touching the filesystem.
    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(f"Split ratios must be non-negative, got {ratios}")
    if abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(f"Split ratios must sum to 1, got {ratios} (sum={sum(ratios)})")

    input_path = Path(input_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 Reading sentences from {input_path} ...")

    # Read file as text
    with input_path.open("r", encoding="utf-8") as f:
        text = f.read().strip()

    # Split on blank lines (standard CoNLL-U sentence separator)
    raw_sentences = [s.strip() for s in text.split("\n\n") if s.strip()]
    print(f"✅ Found {len(raw_sentences)} sentences.")

    # Filter out malformed ones (not 10 columns in token lines)
    sentences = []
    bad = 0
    for sent in raw_sentences:
        token_lines = [line for line in sent.splitlines() if line and not line.startswith("#")]
        if all(len(line.split("\t")) == 10 for line in token_lines):
            sentences.append(sent)
        else:
            bad += 1
    if bad > 0:
        print(f"⚠️ Skipped {bad} malformed sentences (not 10 columns).")

    # A private Random instance yields the same permutation as
    # random.seed(seed) + random.shuffle(...) but leaves the global RNG alone.
    random.Random(seed).shuffle(sentences)

    n = len(sentences)
    n_train = int(n * train_ratio)
    n_dev = int(n * dev_ratio)

    train_sents = sentences[:n_train]
    dev_sents = sentences[n_train:n_train + n_dev]
    test_sents = sentences[n_train + n_dev:]  # remainder, absorbs rounding

    def save(sent_list, filename):
        path = output_dir / filename
        with path.open("w", encoding="utf-8") as f:
            for sent in sent_list:
                # Each stored sentence was stripped, so the blank line that
                # terminates a CoNLL-U sentence must be written back explicitly;
                # joining with a single "\n" would fuse all sentences into one.
                f.write(sent + "\n\n")
        print(f"💾 Saved {len(sent_list)} sentences to {path}")

    base = input_path.stem
    save(train_sents, f"{base}_train.conllu")
    save(dev_sents, f"{base}_dev.conllu")
    save(test_sents, f"{base}_test.conllu")

    print("\n🎉 Done splitting!")
    print(f"Train: {len(train_sents)} | Dev: {len(dev_sents)} | Test: {len(test_sents)}")


In [39]:
# Split the en_pronouns test file into 80/10/10 train/dev/test portions
# (fixed seed so the split is reproducible).
split_conllu_file_safe(
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_pronouns-ud-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits",
    train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1,
    seed=42,
)

✅ Done! Saved to /Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits
Train: 228 sentences
Dev:   28 sentences
Test:  29 sentences


In [40]:
# Split the en_pud test file into 80/10/10 train/dev/test portions
# (fixed seed so the split is reproducible).
split_conllu_file_safe(
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_pud-ud-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits",
    train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1,
    seed=42,
)

✅ Done! Saved to /Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits
Train: 800 sentences
Dev:   100 sentences
Test:  100 sentences


## Merge the per-treebank train, dev and test files

In [41]:
def merge_conllu_files(file_list, output_path):
    """
    Merge multiple CoNLL-U files into a single file.

    Each input file is read in full, stripped of leading/trailing whitespace
    and written to the output followed by one blank line, so the last
    sentence of one file and the first sentence of the next stay separated.
    Inputs that are empty (or whitespace-only) contribute nothing.

    Args:
        file_list (list of str): Paths to CoNLL-U files to merge, in output order.
        output_path (str): Path to save the merged CoNLL-U file. Missing parent
            directories are created.
    """
    output_path = Path(output_path)
    # Create the target directory so a fresh checkout does not fail on open()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as fout:
        for file_path in file_list:
            with open(file_path, "r", encoding="utf-8") as fin:
                content = fin.read().strip()
            if not content:
                # An empty file would otherwise add a stray blank "sentence"
                continue
            fout.write(content + "\n\n")  # ensure blank line between sentences
    print(f"✅ Merged {len(file_list)} files into {output_path}")


In [42]:
# Per-split inputs: GUM, GUMReddit and EWT use their own train/dev/test files;
# PUD and Pronouns use the files written to splits/ by split_conllu_file_safe.
train_files_list = [
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gum-ud-train.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gumreddit-ud-train.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_ewt-ud-train.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pud-ud-test-train.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pronouns-ud-test-train.conllu",
]
dev_files_list = [
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gum-ud-dev.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gumreddit-ud-dev.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_ewt-ud-dev.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pud-ud-test-dev.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pronouns-ud-test-dev.conllu",
]
test_files_list = [
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gum-ud-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_gumreddit-ud-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/en_ewt-ud-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pud-ud-test-test.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/conllu_files_original/splits/en_pronouns-ud-test-test.conllu",
]

# One (inputs, merged output) pair per split, merged in train → dev → test order.
merge_jobs = [
    (train_files_list, "/Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_train.conllu"),
    (dev_files_list, "/Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_dev.conllu"),
    (test_files_list, "/Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_test.conllu"),
]
for split_files, merged_path in merge_jobs:
    merge_conllu_files(split_files, merged_path)


✅ Merged 5 files into /Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_train.conllu
✅ Merged 5 files into /Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_dev.conllu
✅ Merged 5 files into /Users/frapadovani/Desktop/stanza/parser/conllu_files_merged/combined_test.conllu


## Silver CHILDES UD Annotations (./stanza/UD_CHILDES_silver)

In [43]:
# Silver-annotated CHILDES files, merged into one file in the order listed.
silver_data_list = [
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Abe_kuczaj_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Adam_Brown_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Brown_Eve_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Emma_Weist_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Laura_Braunwald_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Lily_Providence_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Naima_Providence_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Roman_Weist_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Sarah_Brown_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Thomas_Thomas_eud_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/UD_CHILDES_silver/Violet_Providence_eud_silver.conllu",
]

merge_conllu_files(
    silver_data_list,
    "/Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver.conllu",
)

✅ Merged 11 files into /Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver.conllu


In [4]:
# Split the merged silver file 80/10/10 with the text-based splitter;
# the split files are written next to the merged file.
split_conllu_file_safe_easy(
    "/Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver.conllu",
    "/Users/frapadovani/Desktop/stanza/parser/silver_files_merged",
    train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1,
    seed=42,
)

📖 Reading sentences from /Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver.conllu ...
✅ Found 1197471 sentences.
💾 Saved 957976 sentences to /Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver_train.conllu
💾 Saved 119747 sentences to /Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver_dev.conllu
💾 Saved 119748 sentences to /Users/frapadovani/Desktop/stanza/parser/silver_files_merged/childes_silver_test.conllu

🎉 Done splitting!
Train: 957976 | Dev: 119747 | Test: 119748
