In [1]:
import pandas as pd

In [2]:
import re 
from g2p_en import G2p
import numpy as np

# Shared instance of the g2p_en converter imported above (grapheme-to-phoneme,
# per the package name). It is not referenced by any other cell visible in this
# part of the notebook.
g2p = G2p()
# Phone inventory: 39 stress-less, ARPAbet-style symbols. A symbol's position
# in this list is its zero-based class id ('AA' -> 0, ..., 'ZH' -> 38).
PHONE_DEF = (
    'AA AE AH AO AW '
    'AY B CH D DH '
    'EH ER EY F G '
    'HH IH IY JH K '
    'L M N NG OW '
    'OY P R S SH '
    'T TH UH UW V '
    'W Y Z ZH'
).split()

# Same inventory with the silence token appended as the LAST entry (index 39).
PHONE_DEF_SIL = [*PHONE_DEF, 'SIL']

def phoneToId(p):
    """Return the zero-based index of phone ``p`` within ``PHONE_DEF_SIL``.

    Args:
        p: Phone symbol to look up (e.g. ``'AA'`` or ``'SIL'``).

    Returns:
        int: Position of ``p`` in ``PHONE_DEF_SIL``.

    Raises:
        ValueError: Propagated from ``list.index`` when ``p`` is not in the list.
    """
    phone_id = PHONE_DEF_SIL.index(p)
    return phone_id

In [5]:
from pathlib import Path
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# Write each seed's decoded phoneme sequences to its own text file:
# one utterance per line, phones separated by single spaces.
saveDir       = Path("/home/ubuntu/data/model_transcriptions/")
saveDir.mkdir(parents=True, exist_ok=True)          # no-op when it already exists

model_name    = "transformer_short_training_fixed_seed_"
seed_list     = list(range(10))                     # seeds 0..9

# PHONE_DEF_SIL (built in an earlier cell of this notebook) maps a *zero-based*
# class-id to its phone string: PHONE_DEF_SIL[0] == "AA", ..., and "SIL" is the
# LAST entry (index 39), not the first.
# ------------------------------------------------------------------

for s in seed_list:
    # ---- read this seed's pickled model outputs -----------------------------
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    pkl_path  = saveDir / (model_name + str(s) + "_model_outputs.pkl")
    model_out = pd.read_pickle(pkl_path)
    decoded   = model_out["decodedSeqs"]          # iterable of per-utterance id sequences

    # ---- one output file per seed --------------------------------------------
    out_path  = saveDir / ("decoded_phonemes_seed_" + str(s) + ".txt")
    with out_path.open("w", encoding="utf-8") as fout:

        for seq in decoded:
            # Per the original author's note the ids are 1-based with 0 reserved
            # for the blank symbol: discard any 0 entries first, then shift the
            # remaining ids down by one so they index PHONE_DEF_SIL directly.
            seq_0 = np.asarray(seq, dtype=int)
            seq_0 = seq_0[seq_0 > 0] - 1

            # look up each phone and emit the utterance as a single line
            phones = " ".join([PHONE_DEF_SIL[idx] for idx in seq_0])
            fout.write(f"{phones}\n")

    print(f"✅ wrote {len(decoded):,} lines to {out_path.name}")

✅ wrote 880 lines to decoded_phonemes_seed_0.txt
✅ wrote 880 lines to decoded_phonemes_seed_1.txt
✅ wrote 880 lines to decoded_phonemes_seed_2.txt
✅ wrote 880 lines to decoded_phonemes_seed_3.txt
✅ wrote 880 lines to decoded_phonemes_seed_4.txt
✅ wrote 880 lines to decoded_phonemes_seed_5.txt
✅ wrote 880 lines to decoded_phonemes_seed_6.txt
✅ wrote 880 lines to decoded_phonemes_seed_7.txt
✅ wrote 880 lines to decoded_phonemes_seed_8.txt
✅ wrote 880 lines to decoded_phonemes_seed_9.txt


In [13]:
from pathlib import Path
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# Write the ground-truth phoneme sequences ("trueSeqs") stored in the model
# output pickle to a text file: one utterance per line, phones separated by
# single spaces.
saveDir       = Path("/home/ubuntu/data/model_transcriptions_comp/")
saveDir.mkdir(parents=True, exist_ok=True)          # create if missing

model_name    = "transformer_short_training_fixed_seed_"
seed_list     = [0]

# PHONE_DEF_SIL (built in an earlier cell of this notebook) maps a *zero-based*
# class-id to its phone string: PHONE_DEF_SIL[0] == "AA", ..., and "SIL" is the
# LAST entry (index 39), not the first.
# ------------------------------------------------------------------

# The output file name does not depend on the seed, so build the path once.
# Every seed in seed_list rewrites this same file; with seed_list == [0] it is
# written exactly once.
out_path  = saveDir / "ground_truth_phonemes.txt"

for s in seed_list:
    # ---- load model pickle -------------------------------------------------
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    pkl_path  = saveDir / f"{model_name}{s}_model_outputs.pkl"
    model_out = pd.read_pickle(pkl_path)
    # Despite the variable name, these are the ground-truth id sequences.
    decoded   = model_out["trueSeqs"]

    # ---- (re)write the output file -----------------------------------------
    with out_path.open("w", encoding="utf-8") as fout:

        for seq in decoded:
            # ids are 1-based per the original author's note (0 == blank) → make them 0-based
            seq_0 = np.asarray(seq, dtype=int) - 1
            # drop the −1 produced by any 0 entry (blank, or presumably padding — TODO confirm)
            seq_0 = seq_0[seq_0 >= 0]

            # map to phones and join
            phones = " ".join(PHONE_DEF_SIL[idx] for idx in seq_0)
            fout.write(phones + "\n")

    print(f"✅ wrote {len(decoded):,} lines to {out_path.name}")

✅ wrote 1,200 lines to ground_truth_phonemes.txt


In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# Write the reference sentences ("transcriptions") stored in the model output
# pickle to a text file, one sentence per line. No phone mapping happens in
# this cell, so PHONE_DEF_SIL is not used here.
saveDir       = Path("/home/ubuntu/data/model_transcriptions_finetune/")
saveDir.mkdir(parents=True, exist_ok=True)          # create if missing

model_name    = "transformer_short_training_fixed_seed_"
seed_list     = [0]
# ------------------------------------------------------------------

# The output file name does not depend on the seed, so build the path once.
# Every seed in seed_list rewrites this same file; with seed_list == [0] it is
# written exactly once.
out_path  = saveDir / "ground_truth_sentences.txt"

for s in seed_list:
    # ---- load model pickle -------------------------------------------------
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    pkl_path  = saveDir / f"{model_name}{s}_model_outputs.pkl"
    model_out = pd.read_pickle(pkl_path)
    # Despite the variable name, these are sentence strings, not decoded id
    # sequences: each entry is concatenated with "\n" below, so it must be a str.
    decoded   = model_out["transcriptions"]

    # ---- (re)write the output file -----------------------------------------
    with out_path.open("w", encoding="utf-8") as fout:

        for seq in decoded:
            # entries are written verbatim (no stripping or normalisation)
            fout.write(seq + "\n")

    print(f"✅ wrote {len(decoded):,} lines to {out_path.name}")

['held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',
 'held out',