In [10]:
import os
import pandas as pd
from wordfreq import zipf_frequency

# Allow up to 4000 rows to be rendered when a frame is displayed
pd.options.display.max_rows = 4000


def load_condition_files(directory, name_fields):
    """Read every CSV in `directory` and tag its rows with file-name metadata.

    The part of the file name before the first '.' is split on '_' and the
    pieces are matched positionally against `name_fields`.

    Args:
        directory: Folder to scan for files ending in '.csv'.
        name_fields: One entry per '_'-separated piece of the file name.
            A string is the column that receives that piece (same value on
            every row of the file); None means the piece is ignored.

    Returns:
        A list with one DataFrame per CSV, in sorted file-name order.

    Raises:
        ValueError: If a CSV's name does not split into exactly
            len(name_fields) pieces.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore the index) of the combined frame reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue

        parts = filename.split('.')[0].split('_')
        if len(parts) != len(name_fields):
            raise ValueError(
                f"Unexpected file name {filename!r} in {directory!r}: expected "
                f"{len(name_fields)} '_'-separated parts, got {len(parts)}"
            )

        frame = pd.read_csv(os.path.join(directory, filename))
        for column, value in zip(name_fields, parts):
            if column is not None:
                frame[column] = value
        frames.append(frame)
    return frames


# Real-word files: five name pieces; the 2nd is lexicality, the 3rd frequency
# and the 5th morphological complexity (1st and 4th are unused).
real_frames = load_condition_files(
    'eval_real_dataset',
    (None, 'Lexicality', 'Frequency', None, 'Morph Complexity'),
)

# Pseudo-word files: four name pieces and no frequency piece, so their rows
# end up with a missing 'Frequency' after the outer concat below.
pseudo_frames = load_condition_files(
    './eval_pseudo_dataset',
    (None, 'Lexicality', None, 'Morph Complexity'),
)

# Combine all dataframes into one
data = pd.concat(real_frames + pseudo_frames, join="outer", ignore_index=True)

# Clean the combined frame in one chain; each step mirrors one stage of the
# preparation: filter, drop, enrich, rename, reorder.
data = (
    data
    # Keep only rows whose 'word' cell is present
    .dropna(subset=['word'])
    # Discard the "Number", "percentile freq" and "morph structure" columns
    .drop(columns=['Number', 'percentile freq', 'morph structure'])
    # Look up each word's Zipf frequency in wordfreq's English ('en') data
    .assign(**{
        'Zipf Frequency': lambda frame: frame['word'].apply(
            lambda x: zipf_frequency(x, 'en')
        ),
    })
    # Switch to the display names used from here on
    .rename(columns={
        'word': 'Word',
        'PoS': 'Part of Speech',
        'num letters': 'Length',
    })
    # Keep exactly these columns, in this order
    .loc[:, [
        "Word", "Length", "Frequency", "Zipf Frequency",
        "Morph Complexity", "Lexicality", "Part of Speech"
    ]]
)
data


Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Lexicality,Part of Speech
0,bathmat,7,low,1.55,complex,real,N
1,decoder,7,low,2.84,complex,real,N
2,defiant,7,low,3.21,complex,real,Adj
3,padlock,7,low,2.68,complex,real,N
4,immoral,7,low,3.46,complex,real,Adj
5,nonstop,7,low,3.37,complex,real,Adj/Adv
6,parasol,7,low,2.63,complex,real,N
7,reactor,7,low,3.83,complex,real,N
8,recycle,7,low,3.35,complex,real,V
9,reenact,7,low,2.42,complex,real,V


In [11]:
import spacy

# Load spaCy's 'en_core_web_sm' pipeline once; it is reused for every word.
nlp = spacy.load('en_core_web_sm')

# Tag each word on its own (no sentence context) and keep the coarse POS tag
# of its first token. nlp.pipe feeds the words through the pipeline in batches
# instead of invoking the whole pipeline once per row via apply; it yields the
# docs in input order, so the list lines up with the rows of `data`.
# NOTE: this replaces the 'Part of Speech' values that came from the CSV files.
data['Part of Speech'] = [doc[0].pos_ for doc in nlp.pipe(data['Word'])]
data

Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Lexicality,Part of Speech
0,bathmat,7,low,1.55,complex,real,NOUN
1,decoder,7,low,2.84,complex,real,NOUN
2,defiant,7,low,3.21,complex,real,ADJ
3,padlock,7,low,2.68,complex,real,NOUN
4,immoral,7,low,3.46,complex,real,ADJ
5,nonstop,7,low,3.37,complex,real,ADJ
6,parasol,7,low,2.63,complex,real,NOUN
7,reactor,7,low,3.83,complex,real,NOUN
8,recycle,7,low,3.35,complex,real,NOUN
9,reenact,7,low,2.42,complex,real,VERB


In [4]:
from morphemes import Morphemes
# Shared morpheme parser; get_morphological_data() below calls m.parse(word).
# "./morphemes_data" is presumably the folder the library keeps its data in — TODO confirm.
m = Morphemes("./morphemes_data")

def get_morphological_data(word):
    """Split `word` into prefixes, roots and suffixes using the parser `m`.

    Top-level nodes of the parse tree are classified as follows:
      * type "prefix"            -> its text is recorded as a prefix;
      * has a "children" entry   -> the text of every direct child whose type
                                    is "root" is recorded as a root (other
                                    children are ignored);
      * anything else            -> its text is recorded as a suffix.

    Returns:
        A 6-tuple ``(prefixes, roots, root_freqs, suffixes, count, structure)``
        where ``root_freqs`` holds ``zipf_frequency(root, 'en')`` for each
        root, ``count`` is the parser's "morpheme_count" and ``structure`` is
        the string "<#prefixes>-<#roots>-<#suffixes>".
        When the parser reports status "NOT_FOUND" the fixed fallback
        ``([], [], [], [], 1, "0-1-0")`` is returned.
    """
    result = m.parse(word)

    # Words the parser does not know get the fixed placeholder values.
    if result["status"] == "NOT_FOUND":
        return [], [], [], [], 1, "0-1-0"

    prefixes, roots, suffixes = [], [], []

    for morpheme in result["tree"]:
        if morpheme["type"] == "prefix":
            prefixes.append(morpheme["text"])
            continue

        if "children" not in morpheme:
            # Childless, non-prefix nodes are counted as suffixes.
            suffixes.append(morpheme["text"])
            continue

        # Only direct children typed "root" contribute to the root list.
        roots.extend(
            child["text"]
            for child in morpheme["children"]
            if child["type"] == "root"
        )

    # One frequency per collected root, in the same order as `roots`.
    root_freqs = [zipf_frequency(root, 'en') for root in roots]

    count = result["morpheme_count"]
    structure = f"{len(prefixes)}-{len(roots)}-{len(suffixes)}"

    return prefixes, roots, root_freqs, suffixes, count, structure

columns = ["Prefixes", "Roots", "Root Frequencies", "Suffixes", "Morpheme Count", "Morpheme Structure"]

# Parse every word once and build all six columns from the list of result
# tuples in a single DataFrame. Wrapping each row's result in its own
# pd.Series inside apply creates one Series object per word, which is slow.
# index=data.index keeps the new columns aligned with the existing rows even
# when the index has gaps.
morph_data = pd.DataFrame(
    [get_morphological_data(word) for word in data['Word']],
    index=data.index,
    columns=columns,
)
data[columns] = morph_data
data

In [6]:
from g2p_en import G2p

# Build the grapheme-to-phoneme converter once and reuse it for every word.
g2p = G2p()

# Run g2p on each word; its return value is stored as-is in "Phonemes".
phonemes = data["Word"].apply(lambda word: g2p(word))
data["Phonemes"] = phonemes
data