In [5]:
import os
import pandas as pd
from wordfreq import zipf_frequency

pd.options.display.max_rows = 4000

# get list of all files in the current directory
files = os.listdir(os.getcwd())

# load the data from csv files
data = {}
for file in files:
    if file.endswith('.csv'):
        name = file.split('.')[0]
        _, lexicality, frequency, _, morphology = name.split('_')
        df = pd.read_csv(file)

        # add lexicality, morph complexity, and frequency columns
        df['Frequency'] = frequency
        df['Lexicality'] = lexicality
        df['Morph Complexity'] = morphology

        # add to data dictionary
        data[name] = df

# combine all dataframes into one
data = pd.concat(data.values(), ignore_index=True)

# remove rows with missing values
data = data.dropna()

# remove "Number" and "percentile freq" column
data = data.drop(columns=['Number', 'percentile freq', 'morph structure'])

# add the zipf frequency column
data['Zipf Frequency'] = data['word'].apply(lambda x: zipf_frequency(x, 'en'))

# rename columns
data = data.rename(columns={
    'word': 'Word',
    'PoS': 'Part of Speech',
    'num letters': 'Length',
})

# rearrange columns
data = data[[
    "Word", "Length", "Frequency", "Zipf Frequency", 
    "Morph Complexity", "Lexicality", "Part of Speech"
]]

data

Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Lexicality,Part of Speech
0,bathmat,7,low,1.55,complex,real,N
1,decoder,7,low,2.84,complex,real,N
2,defiant,7,low,3.21,complex,real,Adj
3,padlock,7,low,2.68,complex,real,N
4,immoral,7,low,3.46,complex,real,Adj
5,nonstop,7,low,3.37,complex,real,Adj/Adv
6,parasol,7,low,2.63,complex,real,N
7,reactor,7,low,3.83,complex,real,N
8,recycle,7,low,3.35,complex,real,V
9,reenact,7,low,2.42,complex,real,V


In [3]:
import spacy

nlp = spacy.load('en_core_web_sm')
data['Part of Speech'] = data['Word'].apply(lambda x: nlp(x)[0].pos_)
data

Unnamed: 0,Word,Length,Frequency,Zipf Frequency,Morph Complexity,Morph Structure,Lexicality,Part of Speech,Part of Speech 2
0,bathmat,7,low,1.55,complex,0-1-1,real,N,NOUN
1,decoder,7,low,2.84,complex,1-1-1,real,N,NOUN
2,defiant,7,low,3.21,complex,0-1-1,real,Adj,ADJ
3,padlock,7,low,2.68,complex,0-1-1,real,N,NOUN
4,immoral,7,low,3.46,complex,1-1-0,real,Adj,ADJ
5,nonstop,7,low,3.37,complex,1-1-0,real,Adj/Adv,ADJ
6,parasol,7,low,2.63,complex,1-1-0,real,N,NOUN
7,reactor,7,low,3.83,complex,1-1-1,real,N,NOUN
8,recycle,7,low,3.35,complex,1-1-0,real,V,NOUN
9,reenact,7,low,2.42,complex,1-1-1-0,real,V,VERB


In [2]:
from morphemes import Morphemes
m = Morphemes("./morphemes_data")

In [3]:
# morpheme count
# root and root frequency
# prefixes and suffixes

def get_morphological_data(word):
    parse = m.parse(word)

    if parse["status"] == "NOT_FOUND":
        return [], [], [], [], 1, "0-1-0"
    
    tree = parse["tree"]

    prefixes = []
    roots = []
    root_freqs = []
    suffixes = []

    for node in tree:
        if node["type"] == "prefix":
            prefixes.append(node["text"])
        
        elif "children" in node:
            for child in node["children"]:
                if child["type"] == "root":
                    roots.append(child["text"])
                    root_freqs.append(zipf_frequency(child["text"], 'en'))
        else:
            suffixes.append(node["text"])

    count = parse["morpheme_count"]
    structure = f"{len(prefixes)}-{len(roots)}-{len(suffixes)}"

    return prefixes, roots, root_freqs, suffixes, count, structure

In [4]:
columns = ["Prefixes", "Roots", "Root Frequencies", "Suffixes", "Morpheme Count", "Morpheme Structure"]
data[columns] = data['Word'].apply(lambda word: pd.Series(get_morphological_data(word)))
data

Unnamed: 0,Word,Part of Speech,Length,Frequency,Lexicality,Morph Complexity,Zipf Frequency,Prefixes,Roots,Root Frequencies,Suffixes,Morpheme Count,Morpheme Structure
0,bathmat,N,7,low,real,complex,1.55,[],[],[],[],1,0-1-0
1,decoder,N,7,low,real,complex,2.84,[de],[code],[5.08],[er],3,1-1-1
2,defiant,Adj,7,low,real,complex,3.21,[],[defy],[3.42],[ant],2,0-1-1
3,padlock,N,7,low,real,complex,2.68,[],"[pad, lock]","[4.01, 4.51]",[],2,0-2-0
4,immoral,Adj,7,low,real,complex,3.46,[im],[moral],[4.46],[],2,1-1-0
5,nonstop,Adj/Adv,7,low,real,complex,3.37,[non],[stop],[5.52],[],2,1-1-0
6,parasol,N,7,low,real,complex,2.63,[],[parasol],[2.63],[],1,0-1-0
7,reactor,N,7,low,real,complex,3.83,[],[act],[5.3],[or],3,0-1-1
8,recycle,V,7,low,real,complex,3.35,[re],[cycle],[4.6],[],2,1-1-0
9,reenact,V,7,low,real,complex,2.42,"[re, en]",[act],[5.3],[],3,2-1-0


In [2]:
# get phoneme data
from g2p_en import G2p

g2p = G2p()

In [3]:
data["Phonemes"] = data["Word"].apply(lambda x: g2p(x))
data

NameError: name 'data' is not defined