In [1]:
import torch
import pickle
import os
import re
from traindata import Traindata
from utilities import *

PATH = "pretraining_results_1/"
reps = getreps(PATH="../data/phonreps.csv")


## Data from CSV per model, not per word

In [None]:
train = {}
fry = {}
ewfg = {}

with open('pretraining_results_1.csv', 'w') as f:

    f.write("model_id, epoch, condition, phonemewise_accuracy, wordwise_accuracy\n")
    for filename in os.listdir(PATH):
        if filename.endswith(".pkl"):
            filepath = os.path.join(PATH, filename)

        epoch = re.search(r'epoch_(.*?)\.pkl', filename).group(1)
        model_id = re.search(r'pretraining_(.*?)\_epoch', filename).group(1)

        with open(filepath, "rb") as file:
            data = pickle.load(file)

        # train
        f.write("{model_id}, {epoch}, {condition}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
            model_id = model_id,
            epoch = epoch,
            condition = 'train',
            phonemewise_accuracy = data['pretraining']['phoneme_wise_accuracy'],
            wordwise_accuracy = data['pretraining']['word_wise_accuracy']
        )
        )

        # ewfg
        f.write("{model_id}, {epoch}, {condition}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
            model_id = model_id,
            epoch = epoch,
            condition = 'ewfg',
            phonemewise_accuracy = data['ewfg']['phoneme_wise_accuracy'],
            wordwise_accuracy = data['ewfg']['word_wise_accuracy']
        )
        )

        # fry
        f.write("{model_id}, {epoch}, {set}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
            model_id = model_id,
            epoch = epoch,
            set = 'fry',
            phonemewise_accuracy = data['fry_1980']['phoneme_wise_accuracy'],
            wordwise_accuracy = data['fry_1980']['word_wise_accuracy']
        )
        )

In [None]:
data.keys()

## Generate data from PKL, including data for each word

In [3]:
PATH = 'pretraining_results_1'

# find all the PKL files
checkpoints = [file for file in os.listdir(PATH) if file.endswith('.pkl')]

### Tutorial

In [None]:
# First clone the tensors to avoid modifying the originals
phon_preds = data["pretraining"]["phon_predictions"].clone()
phon_targets = data["pretraining"]["phon_targets"].clone()

# ===== FEATURE LEVEL ACCURACY =====
# Create mask for valid features (valid means not padding tokens which are marked as 2)  
phon_features_mask = phon_targets != 2

# Find which predictions match targets, but only count valid features
masked_equalities = torch.eq(phon_preds, phon_targets) & phon_features_mask

# Calculate overall feature accuracy across entire dataset
feature_accuracy = masked_equalities.sum() / phon_features_mask.sum()
print("\nOverall Feature-Level Accuracy:", feature_accuracy.item())

# ===== PHONEME LEVEL ACCURACY =====
# A phoneme is correct only if ALL its features are correct
# We use .all(dim=2) to check across feature dimension
phoneme_correct = masked_equalities.all(dim=2)

# Identify valid phonemes (those where not all features are 2/padding)
valid_phonemes = ~(phon_targets == 2).all(dim=2)

# Calculate overall phoneme accuracy
phoneme_accuracy = phoneme_correct[valid_phonemes].sum() / valid_phonemes.sum()
print("Overall Phoneme-Level Accuracy:", phoneme_accuracy.item())

# ===== WORD LEVEL ACCURACY =====
# A word is correct only if ALL its valid phonemes are correct
word_correct = torch.all(phoneme_correct | ~valid_phonemes, dim=1)
word_accuracy = word_correct.sum() / float(word_correct.size(0))
print("Overall Word-Level Accuracy:", word_accuracy.item())

# ===== PER-WORD DETAILED ANALYSIS =====
print("\nDetailed per-word analysis:")
for i in range(5):  # Show first 5 words as example
   # Count valid features for this word
   n_valid_features = phon_features_mask[i].sum().item()
   n_correct_features = masked_equalities[i].sum().item()
   
   # Count valid phonemes for this word
   n_valid_phonemes = valid_phonemes[i].sum().item()
   n_correct_phonemes = (phoneme_correct[i] & valid_phonemes[i]).sum().item()
   
   print(f"\nWord {i}:")
   print(f"Features: {n_correct_features}/{n_valid_features} correct")
   print(f"Phonemes: {n_correct_phonemes}/{n_valid_phonemes} correct")
   print(f"Word-level: {'Correct' if word_correct[i] else 'Incorrect'}")

### Pretraining results

In [38]:
DATASET = 'pretraining'

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)


    # First clone the tensors to avoid modifying the originals
    phon_preds = data["pretraining"]["phon_predictions"].clone()
    phon_targets = data["pretraining"]["phon_targets"].clone()

    # ===== FEATURE LEVEL ACCURACY =====
    # Create mask for valid features (valid means not padding tokens which are marked as 2)  
    phon_features_mask = phon_targets != 2

    # Find which predictions match targets, but only count valid features
    masked_equalities = torch.eq(phon_preds, phon_targets) & phon_features_mask

    # Calculate overall feature accuracy across entire dataset
    feature_accuracy = masked_equalities.sum() / phon_features_mask.sum()

    # ===== PHONEME LEVEL ACCURACY =====
    # A phoneme is correct only if ALL its features are correct
    # We use .all(dim=2) to check across feature dimension
    phoneme_correct = masked_equalities.all(dim=2)

    # Identify valid phonemes (those where not all features are 2/padding)
    valid_phonemes = ~(phon_targets == 2).all(dim=2)

    # Calculate overall phoneme accuracy
    phoneme_accuracy = phoneme_correct[valid_phonemes].sum() / valid_phonemes.sum()

    # ===== WORD LEVEL ACCURACY =====
    # A word is correct only if ALL its valid phonemes are correct
    word_correct = torch.all(phoneme_correct | ~valid_phonemes, dim=1)
    word_accuracy = word_correct.sum() / float(word_correct.size(0))

    with open('pretraining_results_1/CSV/pretrain_words_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, featurewise_accuracy, phonemewise_accuracy, wordwise_accuracy\n")

        for i in range(len(data[DATASET]['words'])):

            n_valid_features = phon_features_mask[i].sum().item()
            n_correct_features = masked_equalities[i].sum().item()
            
            # Count valid phonemes for this word
            n_valid_phonemes = valid_phonemes[i].sum().item()
            n_correct_phonemes = (phoneme_correct[i] & valid_phonemes[i]).sum().item()

            word = data[DATASET]['words'][i]        

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {featurewise_accuracy}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'pretrain',
                word = word,
                featurewise_accuracy = n_correct_features/n_valid_features,
                phonemewise_accuracy = n_correct_phonemes/n_valid_phonemes,
                wordwise_accuracy = word_correct[i]
            )
        )
                    


### Fry Results

In [39]:
DATASET = 'fry_1980'

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)

    phon_preds = data[DATASET]["phon_predictions"].clone()
    phon_targets = data[DATASET]["phon_targets"].clone()

    phon_features_mask = phon_targets != 2

    masked_equalities = torch.eq(phon_preds, phon_targets) & phon_features_mask

    feature_accuracy = masked_equalities.sum() / phon_features_mask.sum()

    phoneme_correct = masked_equalities.all(dim=2)

    valid_phonemes = ~(phon_targets == 2).all(dim=2)

    phoneme_accuracy = phoneme_correct[valid_phonemes].sum() / valid_phonemes.sum()

    word_correct = torch.all(phoneme_correct | ~valid_phonemes, dim=1)
    word_accuracy = word_correct.sum() / float(word_correct.size(0))

    with open('pretraining_results_1/CSV/fry_words_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, featurewise_accuracy, phonemewise_accuracy, wordwise_accuracy\n")

        for i in range(len(data[DATASET]['words'])):

            n_valid_features = phon_features_mask[i].sum().item()
            n_correct_features = masked_equalities[i].sum().item()
            
            # Count valid phonemes for this word
            n_valid_phonemes = valid_phonemes[i].sum().item()
            n_correct_phonemes = (phoneme_correct[i] & valid_phonemes[i]).sum().item()
        
            word = data[DATASET]['words'][i]

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {featurewise_accuracy}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'fry_1980',
                word = word,
                featurewise_accuracy = n_correct_features/n_valid_features,
                phonemewise_accuracy = n_correct_phonemes/n_valid_phonemes,
                wordwise_accuracy = word_correct[i]
            )
        )
                    


### EWFG Results

In [40]:
DATASET = 'ewfg'

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)

    phon_preds = data[DATASET]["phon_predictions"].clone()
    phon_targets = data[DATASET]["phon_targets"].clone()

    phon_features_mask = phon_targets != 2

    masked_equalities = torch.eq(phon_preds, phon_targets) & phon_features_mask

    feature_accuracy = masked_equalities.sum() / phon_features_mask.sum()

    phoneme_correct = masked_equalities.all(dim=2)

    valid_phonemes = ~(phon_targets == 2).all(dim=2)

    phoneme_accuracy = phoneme_correct[valid_phonemes].sum() / valid_phonemes.sum()

    word_correct = torch.all(phoneme_correct | ~valid_phonemes, dim=1)
    word_accuracy = word_correct.sum() / float(word_correct.size(0))

    with open('pretraining_results_1/CSV/ewfg_words_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, featurewise_accuracy, phonemewise_accuracy, wordwise_accuracy\n")

        for i in range(len(data[DATASET]['words'])):

            n_valid_features = phon_features_mask[i].sum().item()
            n_correct_features = masked_equalities[i].sum().item()
            
            # Count valid phonemes for this word
            n_valid_phonemes = valid_phonemes[i].sum().item()
            n_correct_phonemes = (phoneme_correct[i] & valid_phonemes[i]).sum().item()
        
            word = data[DATASET]['words'][i]

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {featurewise_accuracy}, {phonemewise_accuracy}, {wordwise_accuracy}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'ewfg',
                word = word,
                featurewise_accuracy = n_correct_features/n_valid_features,
                phonemewise_accuracy = n_correct_phonemes/n_valid_phonemes,
                wordwise_accuracy = word_correct[i]
            )
        )
                

## String form results
This portion of the script generates the string form of the results, rather than their quantitative form. The code is separated from the quatitative results in order to reduce file size.


### Pretraining Words
Data for the words from the pretraining routine.

In [6]:
'abra' in tmp[DATASET]['words']

False

In [10]:
tmp[DATASET]['words']

['a',
 'aaron',
 'abandoned',
 'abbie',
 'abby',
 'abe',
 'abed',
 'abee',
 'able',
 'aboard',
 'about',
 'above',
 'abracadabra',
 'absence',
 'absolutely',
 'absorb',
 'abstract',
 'abundant',
 'abuse',
 'accent',
 'accept',
 'access',
 'accessible',
 'accessorize',
 'accident',
 'accidentally',
 'accordion',
 'ache',
 'ack',
 'acorn',
 'acorns',
 'acre',
 'acrobat',
 'across',
 'act',
 'acting',
 'activated',
 'activity',
 'acts',
 'actually',
 'ad',
 'adam',
 'adams',
 'add',
 'added',
 'adding',
 'addition',
 'adjust',
 'admire',
 'admired',
 'admiring',
 'adoption',
 'adorable',
 'adrian',
 'adult',
 'advance',
 'adventure',
 'adventures',
 'advice',
 'affect',
 'afraid',
 'africa',
 'african',
 'after',
 'afternoon',
 'afternoons',
 'afterwards',
 'again',
 'against',
 'age',
 'aggravated',
 'ago',
 'agree',
 'agreeable',
 'agricultural',
 'ah',
 'aha',
 'ahead',
 'aid',
 'aim',
 'aimee',
 'aiming',
 'air',
 'airplane',
 'airplanes',
 'airport',
 'ais',
 'aislinn',
 'ajax',
 'al

In [13]:
word in data[DATASET]['words'] 

True

In [15]:
TD.pool

['fleece',
 'gil',
 'disinfectant',
 'newer',
 'madame',
 'splashing',
 'ninja',
 'bec',
 'alan',
 'chairs',
 'amanda',
 'dairy',
 'meatless',
 'maize',
 'hail',
 'dub',
 'snooze',
 'foods',
 'imports',
 'seeking',
 'alex',
 'shown',
 'wasted',
 'yeh',
 'born',
 'floated',
 'soles',
 'whites',
 'return',
 'little',
 'particularly',
 'john',
 'nurses',
 'raw',
 'rockwood',
 'promised',
 'compactor',
 'power',
 'eighteen',
 'pours',
 'sledding',
 'lobster',
 'august',
 'knob',
 'ladies',
 'auto',
 'lenard',
 'sung',
 'twins',
 'orders',
 'roof',
 'bedbugs',
 'mechanics',
 'laundromat',
 'shells',
 'liking',
 'keeper',
 'clear',
 'dankner',
 'signs',
 'raggedy',
 'solar',
 'talks',
 'nina',
 'stump',
 'size',
 'clue',
 'intrusive',
 'raven',
 'spread',
 'ago',
 'wong',
 'intersperse',
 'elaine',
 'chef',
 'chickened',
 'walls',
 'al',
 'trousers',
 'es',
 'cabbage',
 'ruth',
 'loaves',
 'terrible',
 'pox',
 'complete',
 'gaby',
 'pouch',
 'rosamund',
 'blot',
 'berries',
 'knees',
 'roger

In [19]:
tmp = ['the', 'b;ah']

In [25]:
len(data[DATASET]['words'])

7300

In [26]:
data.keys()

dict_keys(['pretraining', 'fry_1980', 'ewfg'])

In [None]:
all_words = []

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    for dataset in data.keys():
        
        all_words.extend(data[dataset]['words'])

all_words = list(set(all_words))

In [None]:
DATASET = 'pretraining'


TD = Traindata(all_words, phonpath = "../data/phonreps.csv", oneletter=True)

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)


    # First clone the tensors to avoid modifying the originals
    phon_preds = data["pretraining"]["phon_predictions"].clone()
 

    with open('pretraining_results_1/stringform/pretrain_words_stringform_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, phon_pred, phon_target\n")

        for i in range(len(data[DATASET]['words'])):

            word = data[DATASET]['words'][i]        
            phon_pred = "-".join(convert_numeric_prediction(phon_preds[i], phonreps=reps))
            phon_target = "-".join(TD.cmudict[word])

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {phon_pred}, {phon_target}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'pretrain',
                word = word,
                phon_pred = phon_pred,
                phon_target = phon_target
            )
        )
                    

orthpad changed to 0 because onehot encodings were selected for orthography
Words of phonological length 1 pass reconstruction test
Words of phonological length 2 pass reconstruction test
Words of phonological length 3 pass reconstruction test
Words of phonological length 4 pass reconstruction test
Words of phonological length 5 pass reconstruction test
Words of phonological length 6 pass reconstruction test
Words of phonological length 7 pass reconstruction test
Words of phonological length 8 pass reconstruction test
Words of phonological length 9 pass reconstruction test
Words of phonological length 10 pass reconstruction test
Words of phonological length 11 pass reconstruction test
Words of phonological length 12 pass reconstruction test
Words of phonological length 13 pass reconstruction test
Words of phonological length 14 pass reconstruction test
Words of phonological length 15 pass reconstruction test
Representations initialized. Done.


### Fry Words

In [None]:
DATASET = 'fry_1980'

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)

    # First clone the tensors to avoid modifying the originals
    phon_preds = data[DATASET]["phon_predictions"].clone()

    with open('pretraining_results_1/stringform/fry_words_stringform_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, phon_pred, phon_target\n")

        for i in range(len(data[DATASET]['words'])):

            word = data[DATASET]['words'][i]        
            phon_pred = "-".join(convert_numeric_prediction(phon_preds[i], phonreps=reps))
            phon_target = "-".join(TD.cmudict[word])

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {phon_pred}, {phon_target}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'pretrain',
                word = word,
                phon_pred = phon_pred,
                phon_target = phon_target
            )
        )
                    

### EWFG Words

In [None]:
DATASET = 'ewfg'

for checkpoint in checkpoints:

    data = pickle.load(open(PATH + "/" + checkpoint, "rb"))
    epoch = re.search(r'epoch_(.*?)\.pkl', checkpoint).group(1)
    model_id = re.search(r'pretraining_(.*?)\_epoch', checkpoint).group(1)

    # First clone the tensors to avoid modifying the originals
    phon_preds = data[DATASET]["phon_predictions"].clone()

    with open('pretraining_results_1/stringform/ewfg_words_stringform_results_' + model_id + '_' + 'epoch' + '_' + epoch + '.csv', 'w') as f:

        f.write("model_id, epoch, training_phase, condition, word, phon_pred, phon_target\n")

        for i in range(len(data[DATASET]['words'])):

            word = data[DATASET]['words'][i]        
            phon_pred = "-".join(convert_numeric_prediction(phon_preds[i], phonreps=reps))
            phon_target = "-".join(TD.cmudict[word])

            f.write("{model_id}, {epoch}, {training_phase}, {condition}, {word}, {phon_pred}, {phon_target}\n".format(
                model_id = model_id,
                epoch = epoch,
                training_phase = 'pretrain',
                condition = 'pretrain',
                word = word,
                phon_pred = phon_pred,
                phon_target = phon_target
            )
        )
                    

IndexError: index 7300 is out of bounds for dimension 0 with size 7300