In [1]:
import os
import pandas as pd

import spacy
from g2p_en import G2p
from wordfreq import zipf_frequency

# Shared NLP resources, created once and used by the functions below:
# the spaCy pipeline supplies part-of-speech tags and G2p converts words to phonemes.
nlp = spacy.load('en_core_web_sm')
g2p = G2p()

# Raise pandas' row display cap to 4000 so large frames are shown without truncation.
pd.options.display.max_rows = 4000

# Process the hand-made evaluation datasets
def process_dataset(directory: str, is_real=True) -> pd.DataFrame:
    """Load every CSV in `directory` and label its rows with metadata from the filename.

    Each filename (extension dropped) is split on underscores: the second part
    becomes 'Lexicality', the last part becomes 'Morph Complexity' and, for
    real words only, the third part becomes 'Frequency'.

    Args:
        directory: Folder containing the evaluation CSV files.
        is_real: When True, also add the 'Frequency' column.

    Returns:
        One DataFrame with all files concatenated (outer join on columns,
        fresh integer index), in sorted-filename order.

    Raises:
        ValueError: If `directory` contains no '.csv' files.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so the row order is
    # reproducible across machines and runs.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue

        name_parts = file.split('.')[0].split('_')
        df = pd.read_csv(os.path.join(directory, file))

        df['Lexicality'] = name_parts[1]
        df['Morph Complexity'] = name_parts[-1]
        if is_real:
            df['Frequency'] = name_parts[2]
        frames.append(df)

    # pd.concat on an empty list raises a generic ValueError; raise the same
    # exception type with a message that names the offending directory.
    if not frames:
        raise ValueError(f"No CSV files found in {directory!r}")

    return pd.concat(frames, join="outer", ignore_index=True)

# NOTE: need to add morph structure to the dataset
# Add frequency, part of speech, and phonemes to the dataset
def clean_and_enrich_data(df: pd.DataFrame, is_real=True) -> pd.DataFrame:
    """Tidy a raw evaluation frame and attach derived linguistic columns.

    Rows lacking a 'word' value are discarded and the raw column names
    'word', 'PoS' and 'num letters' become 'Word', 'Part of Speech' and
    'Length'. Every frame gains a 'Phonemes' column produced by `g2p`.

    Args:
        df: Frame with at least a 'word' column.
        is_real: When True, the 'Number', 'percentile freq' and
            'morph structure' columns are dropped, and 'Zipf Frequency'
            (from `zipf_frequency`) and 'Part of Speech' (spaCy tag of the
            first token) are computed from each word.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    renamed_columns = {
        'word': 'Word',
        'PoS': 'Part of Speech',
        'num letters': 'Length',
    }
    df = df.dropna(subset=['word']).rename(columns=renamed_columns)

    if is_real:
        # Only real-word files carry these bookkeeping columns
        unused_columns = ['Number', 'percentile freq', 'morph structure']
        df = df.drop(columns=unused_columns)

        # Frequency and part of speech are derived from the word itself
        df['Zipf Frequency'] = df['Word'].apply(lambda token: zipf_frequency(token, 'en'))
        df['Part of Speech'] = df['Word'].apply(lambda token: nlp(token)[0].pos_)

    # Phonemes are computed for real and pseudo words alike
    df["Phonemes"] = df["Word"].apply(g2p)

    return df

# Combine and reformat the real and pseudo word datasets
def get_evaluation_data(real_dir: str = '../data/eval_dataset_real',
                        pseudo_dir: str = '../data/eval_dataset_pseudo'):
    """Load, enrich and combine the real-word and pseudo-word evaluation sets.

    Args:
        real_dir: Directory holding the real-word CSV files.
        pseudo_dir: Directory holding the pseudo-word CSV files.

    Returns:
        A tuple `(data, real_words, pseudo_words)`:
        - `data`: real and pseudo words concatenated (fresh integer index),
          restricted to and ordered by the columns listed below; a listed
          column absent from the inputs is created empty by `reindex`.
        - `real_words`, `pseudo_words`: the 'Word' and 'Phonemes' columns of
          each enriched subset, keeping that subset's own index.
    """
    # Process real words
    real_words = clean_and_enrich_data(process_dataset(real_dir))

    # Process pseudo words (no frequency / part-of-speech enrichment)
    pseudo_words = clean_and_enrich_data(
        process_dataset(pseudo_dir, is_real=False), is_real=False
    )

    # Combine datasets
    data = pd.concat([real_words, pseudo_words], join="outer", ignore_index=True)

    # Rearrange columns
    columns = ["Word", "Length", "Frequency", "Zipf Frequency",
               "Morph Complexity", "Lexicality", "Part of Speech", "Phonemes"]
    data = data.reindex(columns=columns)

    # Isolate words and their phonemes
    word_phoneme_columns = ['Word', 'Phonemes']
    real_words = real_words[word_phoneme_columns]
    pseudo_words = pseudo_words[word_phoneme_columns]

    return data, real_words, pseudo_words

# Get the evaluation data: `data` is the combined real + pseudo word table, while
# `real_words` and `pseudo_words` hold only the Word and Phonemes columns of each subset
data, real_words, pseudo_words = get_evaluation_data()
data


Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Lexicality,Part of Speech,Phonemes
0,bathmat,7,low,1.55,complex,real,NOUN,"[B, AE1, TH, M, AH0, T]"
1,decoder,7,low,2.84,complex,real,NOUN,"[D, IH0, K, OW1, D, ER0]"
2,defiant,7,low,3.21,complex,real,ADJ,"[D, IH0, F, AY1, AH0, N, T]"
3,padlock,7,low,2.68,complex,real,NOUN,"[P, AE1, D, L, AA2, K]"
4,immoral,7,low,3.46,complex,real,ADJ,"[IH0, M, AO1, R, AH0, L]"
5,nonstop,7,low,3.37,complex,real,ADJ,"[N, AA2, N, S, T, AA1, P]"
6,parasol,7,low,2.63,complex,real,PROPN,"[P, EH1, R, AH0, S, AO2, L]"
7,reactor,7,low,3.83,complex,real,NOUN,"[R, IY0, AE1, K, T, ER0]"
8,recycle,7,low,3.35,complex,real,NOUN,"[R, IY0, S, AY1, K, AH0, L]"
9,reenact,7,low,2.42,complex,real,VERB,"[R, IY0, IH0, N, AE1, K, T]"


In [11]:
# Tag every word (real and pseudo) with the spaCy part of speech of its first token.
# This reuses the `nlp` pipeline loaded in the setup cell rather than importing
# spaCy and loading the 'en_core_web_sm' model a second time.
data['Part of Speech'] = data['Word'].apply(lambda word: nlp(word)[0].pos_)
data

Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Lexicality,Part of Speech
0,bathmat,7,low,1.55,complex,real,NOUN
1,decoder,7,low,2.84,complex,real,NOUN
2,defiant,7,low,3.21,complex,real,ADJ
3,padlock,7,low,2.68,complex,real,NOUN
4,immoral,7,low,3.46,complex,real,ADJ
5,nonstop,7,low,3.37,complex,real,ADJ
6,parasol,7,low,2.63,complex,real,NOUN
7,reactor,7,low,3.83,complex,real,NOUN
8,recycle,7,low,3.35,complex,real,NOUN
9,reenact,7,low,2.42,complex,real,VERB


In [4]:
from morphemes import Morphemes
# Morphological parser used by get_morphological_data below.
# NOTE(review): "./morphemes_data" is presumably the parser's data directory — confirm.
m = Morphemes("./morphemes_data")

def get_morphological_data(word):
    """Summarise the morphological parse of `word`.

    Returns a 6-tuple `(prefixes, roots, root_freqs, suffixes, count, structure)`:
    the prefix texts, root texts, Zipf frequency of each root, suffix texts,
    the parser's reported morpheme count, and a "P-R-S" string giving the
    number of prefixes, roots and suffixes collected.

    When the parser reports status "NOT_FOUND", the four lists are empty,
    the count is 1 and the structure is "0-1-0".

    NOTE(review): only children typed "root" are collected from nodes that
    have children, and every childless node that is not a prefix is counted
    as a suffix — confirm this matches the parser's tree schema.
    """
    result = m.parse(word)

    # Unknown word: report the fixed single-morpheme fallback
    if result["status"] == "NOT_FOUND":
        return [], [], [], [], 1, "0-1-0"

    prefixes, roots, suffixes = [], [], []

    for node in result["tree"]:
        if node["type"] == "prefix":
            prefixes.append(node["text"])
        elif "children" not in node:
            # Childless, non-prefix nodes are recorded as suffixes
            suffixes.append(node["text"])
        else:
            roots.extend(
                child["text"] for child in node["children"] if child["type"] == "root"
            )

    # One frequency per collected root, in the same order as `roots`
    root_freqs = [zipf_frequency(root, 'en') for root in roots]

    structure = "-".join(str(len(group)) for group in (prefixes, roots, suffixes))

    return prefixes, roots, root_freqs, suffixes, result["morpheme_count"], structure

columns = ["Prefixes", "Roots", "Root Frequencies", "Suffixes", "Morpheme Count", "Morpheme Structure"]
# Parse each word once and build all six morphology columns in a single DataFrame
# construction; this avoids creating a pd.Series for every row inside .apply.
# The frame shares data's index so the column assignment stays row-aligned.
data[columns] = pd.DataFrame(
    [get_morphological_data(word) for word in data['Word']],
    columns=columns,
    index=data.index,
)
data

In [6]:
# Convert each word to its phoneme sequence using the `g2p` converter created in
# the setup cell. It is passed to .apply directly, so no re-import, second G2p
# instance or lambda wrapper is needed.
data["Phonemes"] = data["Word"].apply(g2p)
data