In [18]:
import os
import json
from pathlib import Path
import subprocess
import jsonlines

base_dir = "/home/emmanuelka"

# Helper: Extract source and reference from FLORES json
def extract_source_and_reference(lang_pair, flores_base=f"{base_dir}/AIMS-NLP-Project/data/flores"):
    """Materialise plain-text source/reference files for one FLORES language pair.

    Reads ``<flores_base>/<lang_pair>/devtest.json`` with ``jsonlines`` (each
    record must hold a ``"translation"`` mapping keyed by language code) and
    writes the sentences, one per line, to ``source_<src>.txt`` and
    ``reference_<tgt>.txt`` in the same directory. When both text files are
    already present nothing is read or rewritten.

    Args:
        lang_pair: Language pair in ``"<src>-<tgt>"`` form, e.g. ``"en-yo"``.
        flores_base: Directory containing one sub-directory per language pair.

    Returns:
        ``(src_file, ref_file)`` as string paths.

    Raises:
        ValueError: If ``lang_pair`` does not split into exactly two parts on "-".
    """
    src, tgt = lang_pair.split("-")
    pair_dir = Path(flores_base) / lang_pair
    flores_path = pair_dir / "devtest.json"
    src_file = pair_dir / f"source_{src}.txt"
    ref_file = pair_dir / f"reference_{tgt}.txt"

    # Regenerate both files together if either is missing so they stay line-aligned.
    if not (src_file.exists() and ref_file.exists()):
        src_lines, tgt_lines = [], []
        with jsonlines.open(flores_path) as reader:
            for record in reader:
                translation = record["translation"]
                src_lines.append(translation[src])
                tgt_lines.append(translation[tgt])

        # Write UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(src_file, "w", encoding="utf-8") as f:
            f.write("\n".join(src_lines))
        with open(ref_file, "w", encoding="utf-8") as f:
            f.write("\n".join(tgt_lines))

    return str(src_file), str(ref_file)

# Evaluate generated_predictions.txt with comet-score (source, hypothesis and reference) using the McGill-NLP/ssa-comet-mtl model
def evaluate_generation(gen_path, lang_pair, run_type, run_size=None, suffix=None):
    """Score one predictions file with the ``comet-score`` CLI.

    The command's stdout is redirected to
    ``<base_dir>/AIMS-NLP-Project/Results/<lang_pair>/<lang_pair>_comet_score_<tag>.txt``
    where ``<tag>`` is ``baseline`` for baseline runs and
    ``<run_type>_<run_size>`` otherwise, followed by ``_<suffix>`` when a
    suffix is given.

    Args:
        gen_path: Path to the ``generated_predictions.txt`` file to score.
        lang_pair: Language pair in ``"<src>-<tgt>"`` form.
        run_type: ``"baseline"`` or the name of an experimental run type.
        run_size: Run size used in the output filename (ignored for baseline).
        suffix: Optional label appended to the output filename.

    Returns:
        None. A one-line status message is printed.
    """
    src_file, ref_file = extract_source_and_reference(lang_pair)
    # NOTE: `base_dir` is looked up when the function is called, and later
    # cells of this notebook rebind it; call this only while `base_dir`
    # still points at the home directory set at the top of this cell.
    lang_output_dir = Path(f"{base_dir}/AIMS-NLP-Project/Results") / lang_pair
    lang_output_dir.mkdir(parents=True, exist_ok=True)

    # Construct suffix string
    suffix_str = f"_{suffix}" if suffix else ""

    # Build output filename
    if run_type == "baseline":
        output_score_path = lang_output_dir / f"{lang_pair}_comet_score_baseline{suffix_str}.txt"
    else:
        output_score_path = lang_output_dir / f"{lang_pair}_comet_score_{run_type}_{run_size}{suffix_str}.txt"

    # Use a context manager so the score file is flushed and closed as soon
    # as the subprocess exits instead of leaking the handle.
    with open(output_score_path, "w") as score_file:
        completed = subprocess.run(
            [
                "comet-score",
                "-s", src_file,
                "-t", gen_path,
                "-r", ref_file,
                "--model", "McGill-NLP/ssa-comet-mtl"
            ],
            stdout=score_file
        )

    # Do not claim success when the scorer failed; keep going so one bad run
    # does not abort the surrounding directory walk.
    if completed.returncode != 0:
        print(f"✘ comet-score failed (exit code {completed.returncode}): {output_score_path.name}")
        return
    print(f"✔ Scored: {output_score_path.name}")

# Locate the M2M-100 output tree and declare which run folders are recognised
base_dir2 = f"{base_dir}/M2M-100"
valid_run_types = ["africomet", "africomet_base", "random"]
valid_run_sizes = ["1000", "2000", "4000", "8000", "16000", "32000"]

# Baseline runs: <base_dir2>/baseline/<lang_pair>/generated_predictions.txt
baseline_dir = os.path.join(base_dir2, "baseline")
baseline_pairs = os.listdir(baseline_dir) if os.path.isdir(baseline_dir) else []
for lang_pair in baseline_pairs:
    gen_path = os.path.join(baseline_dir, lang_pair, "generated_predictions.txt")
    if not os.path.isfile(gen_path):
        continue  # no predictions for this pair
    evaluate_generation(gen_path, lang_pair, run_type="baseline", suffix="ssa")

# Experimental runs: <base_dir2>/<lang_pair>/<run_type>_<run_size>_<suffix>/
for lang_pair in os.listdir(base_dir2):
    lang_path = os.path.join(base_dir2, lang_pair)
    if lang_pair == "baseline" or not os.path.isdir(lang_path):
        continue

    for run_folder in os.listdir(lang_path):
        run_path = os.path.join(lang_path, run_folder)
        if not os.path.isdir(run_path):
            continue

        parts = run_folder.split("_")
        if len(parts) < 3:
            continue  # Need at least run_type, size, and suffix

        # Last token is the suffix (e.g. 'ssa'), the one before it the size
        # (e.g. '1000'); whatever precedes them is the run type, which may
        # itself contain underscores (e.g. 'africomet_base').
        *type_tokens, run_size, suffix = parts
        run_type = "_".join(type_tokens)

        if run_type not in valid_run_types or run_size not in valid_run_sizes:
            continue

        gen_path = os.path.join(run_path, "generated_predictions.txt")
        if os.path.isfile(gen_path):
            evaluate_generation(gen_path, lang_pair, run_type=run_type, run_size=run_size, suffix=suffix)


Seed set to 1
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 7796.10it/s]
Encoder model frozen.
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|██████████| 64/64 [00:23<00:00,  2.72it/s]


✔ Scored: en-yo_comet_score_africomet_base_32000_ssa.txt


In [19]:
import os
import shutil

# === Configuration ===
# NOTE: this rebinds the notebook-global `base_dir` (an earlier cell sets it to
# "/home/emmanuelka" and its evaluate_generation() reads it at call time), so
# re-run that cell in full before scoring again.
base_dir = "/home/emmanuelka/M2M-100"  # root of the M2M-100 run folders walked below
results_dir = "/home/emmanuelka/AIMS-NLP-Project/Results"  # destination for all_results.json copies
generations_dir = "/home/emmanuelka/AIMS-NLP-Project/generations"  # destination for generated_predictions.txt copies

# Ensure output directories exist
os.makedirs(results_dir, exist_ok=True)
os.makedirs(generations_dir, exist_ok=True)

# Valid run types and sizes
# Run folders whose parsed type/size are not listed here are skipped by the copy loop.
valid_run_types = ["africomet", "africomet_base", "random"]
valid_run_sizes = ["1000", "2000", "4000", "8000", "16000", "32000"]

# === Helper Function ===
def extract_run_parts(folder_name):
    """Split a run folder name into ``(run_type, run_size, suffix)``.

    The name is split on "_": the last token is the suffix (e.g. ``'ssa'``),
    the second-to-last is the run size (e.g. ``'1000'``), and everything
    before them, re-joined with "_", is the run type (e.g.
    ``'africomet_base'``).

    Args:
        folder_name: Name of the run directory, e.g. ``"africomet_base_1000_ssa"``.

    Returns:
        ``(run_type, run_size, suffix)``, or ``(None, None, None)`` when the
        name has fewer than three "_"-separated tokens.
    """
    # Parse the argument itself; the previous body read the global loop
    # variable `run_folder` and ignored `folder_name`.
    parts = folder_name.split("_")
    if len(parts) < 3:
        return None, None, None  # Need at least run_type, size, and suffix

    *type_tokens, run_size, suffix = parts
    run_type = "_".join(type_tokens)
    return run_type, run_size, suffix

# === Handle baseline ===
# Each baseline language-pair folder may hold a metrics file and a
# predictions file; copy whichever exist under a descriptive name.
baseline_dir = os.path.join(base_dir, "baseline")
if os.path.isdir(baseline_dir):
    for lang_pair in os.listdir(baseline_dir):
        lang_path = os.path.join(baseline_dir, lang_pair)

        # (file inside the run folder, destination root, name of the copy)
        copy_plan = (
            ("all_results.json", results_dir,
             f"{lang_pair}_all_results_baseline_ssa.json"),
            ("generated_predictions.txt", generations_dir,
             f"{lang_pair}_generated_predictions_baseline_ssa.txt"),
        )
        for source_name, dest_root, new_name in copy_plan:
            source_path = os.path.join(lang_path, source_name)
            if not os.path.isfile(source_path):
                continue
            dest_dir = os.path.join(dest_root, lang_pair)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copyfile(source_path, os.path.join(dest_dir, new_name))
            print(f"✔ Copied: {new_name}")

# === Handle experimental runs ===
# Layout walked here: <base_dir>/<lang_pair>/<run_folder>/
for lang_pair in os.listdir(base_dir):
    lang_path = os.path.join(base_dir, lang_pair)
    if lang_pair == "baseline" or not os.path.isdir(lang_path):
        continue

    for run_folder in os.listdir(lang_path):
        run_path = os.path.join(lang_path, run_folder)
        if not os.path.isdir(run_path):
            continue

        run_type, run_size, suffix = extract_run_parts(run_folder)
        if not run_type or not run_size:
            continue
        if run_type not in valid_run_types or run_size not in valid_run_sizes:
            continue

        suffix_str = f"_{suffix}" if suffix else ""

        # (file inside the run folder, destination root, name of the copy)
        copy_plan = (
            ("all_results.json", results_dir,
             f"{lang_pair}_all_results_{run_type}_{run_size}{suffix_str}.json"),
            ("generated_predictions.txt", generations_dir,
             f"{lang_pair}_generated_predictions_{run_type}_{run_size}{suffix_str}.txt"),
        )
        for source_name, dest_root, new_name in copy_plan:
            source_path = os.path.join(run_path, source_name)
            if not os.path.isfile(source_path):
                continue
            dest_dir = os.path.join(dest_root, lang_pair)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copyfile(source_path, os.path.join(dest_dir, new_name))
            print(f"✔ Copied: {new_name}")


✔ Copied: en-yo_all_results_africomet_base_32000_ssa.json
✔ Copied: en-yo_generated_predictions_africomet_base_32000_ssa.txt


In [20]:
import os
import fnmatch

# Set your base directory
base_dir = "/home/emmanuelka/AIMS-NLP-Project/Results/en-yo"
gen_dir = "/home/emmanuelka/AIMS-NLP-Project/generations/en-yo"


def ssa_target_name(filename):
    """Return the new name for ``filename``, or ``None`` if no rule applies.

    Rules, checked in this order (the first match wins):
      1. ``*new_africomet_base_*.json``: "new_africomet_base_" becomes
         "africomet_base_" and the extension becomes "_ssa.json".
      2. ``*africomet_new_*.json``: "africomet_new_" becomes "africomet_"
         and the extension becomes "_ssaQE.json".
      3. any name containing "_ssa.": "_ssa." becomes "_ssaMTL.".

    NOTE(review): a name produced by rule 1 ends in "_ssa.json" and would
    therefore match rule 3 if the renaming is run again over the same
    directory — confirm that this relabelling is intended.
    """
    json_ext = ".json"
    if fnmatch.fnmatch(filename, "*new_africomet_base_*.json"):
        # Only the trailing extension is rewritten, not every ".json" in the name.
        stem = filename[:-len(json_ext)]
        return stem.replace("new_africomet_base_", "africomet_base_") + "_ssa.json"
    if fnmatch.fnmatch(filename, "*africomet_new_*.json"):
        stem = filename[:-len(json_ext)]
        return stem.replace("africomet_new_", "africomet_") + "_ssaQE.json"
    if "_ssa." in filename:
        return filename.replace("_ssa.", "_ssaMTL.")
    return None


def rename_ssa_files(directory):
    """Recursively rename every file under ``directory`` that matches a rule.

    Files are renamed in place (same folder) via ``os.rename``; a
    "Found"/"Renamed" line is printed for each renamed file. Files that match
    no rule in ``ssa_target_name`` are left untouched.
    """
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            new_filename = ssa_target_name(filename)
            if new_filename is None:
                continue
            print(f"Found: {filename}")
            os.rename(os.path.join(root, filename), os.path.join(root, new_filename))
            print(f"✔ Renamed: {filename} → {new_filename}")


# Apply the same renaming rules to the results tree, then the generations tree
for target_dir in (base_dir, gen_dir):
    rename_ssa_files(target_dir)

Found: en-yo_comet_score_africomet_base_32000_ssa.txt
✔ Renamed: en-yo_comet_score_africomet_base_32000_ssa.txt → en-yo_comet_score_africomet_base_32000_ssaMTL.txt
Found: en-yo_all_results_africomet_base_32000_ssa.json
✔ Renamed: en-yo_all_results_africomet_base_32000_ssa.json → en-yo_all_results_africomet_base_32000_ssaMTL.json
Found: en-yo_generated_predictions_africomet_base_32000_ssa.txt
✔ Renamed: en-yo_generated_predictions_africomet_base_32000_ssa.txt → en-yo_generated_predictions_africomet_base_32000_ssaMTL.txt
