In [19]:
import os
import sys
import pandas as pd
pd.set_option('display.max_rows', None)

# Put the parent of the current working directory on the import path so the
# `swp` imports that follow can be resolved.
current = os.getcwd()
parent = os.path.dirname(current)
# Guarded append: re-running this cell would otherwise add the same entry to
# sys.path again on every execution.
if parent not in sys.path:
    sys.path.append(parent)

from swp.utils.setup import seed_everything, set_device
from swp.datasets.phonemes import get_phoneme_to_id

# seed_everything()
# device = set_device()
# phoneme_to_id = get_phoneme_to_id()

from ast import literal_eval
from g2p_en import G2p
g2p = G2p()

def remove_stress(phonemes):
    """Return a new list in which a single trailing digit is stripped from each phoneme.

    An element whose last character is a digit (e.g. ``"AH0"``) loses exactly
    that one character (``"AH"``); every other element is passed through
    unchanged. Empty strings are passed through as well instead of raising
    ``IndexError``.

    Args:
        phonemes: iterable of phoneme strings.

    Returns:
        list[str]: same length and order as the input.
    """
    # `p and ...` protects the p[-1] lookup against empty strings.
    return [p[:-1] if (p and p[-1].isdigit()) else p for p in phonemes]

# Per-column parsers passed to pd.read_csv: "Word" goes through str(), and
# "Phonemes" / "No Stress" are parsed with ast.literal_eval.
converters = {
    "Word": str,
    "Phonemes": literal_eval,
    "No Stress": literal_eval,
}


### Generating Evaluation Dataset

In [None]:
from ast import literal_eval

# Load the training vocabulary and tag every entry as a real word.
complete_train = pd.read_csv("../stimuli/dataframe/complete_train.csv")
complete_train["Lexicality"] = "real"
# "Phonemes" is read as text here; parse it so that len() counts phonemes.
complete_train["Phonemes"] = complete_train["Phonemes"].apply(literal_eval)
complete_train["Length"] = complete_train["Phonemes"].apply(len)

# divide by length: 3-5 phonemes -> short, 7-9 -> long (length 6 falls in neither bin).
# .assign() builds a new frame instead of assigning a column into a query()
# result, which triggers pandas' SettingWithCopyWarning.
short = complete_train.query("2 < Length < 6").assign(Size="short")
long = complete_train.query("6 < Length < 10").assign(Size="long")

# divide by frequency: Zipf >= 4 -> high, Zipf <= 3.5 -> low
# (words with 3.5 < Zipf < 4 are left out of both groups).
lh = long[long["Zipf Frequency"] >= 4]
ll = long[long["Zipf Frequency"] <= 3.5]
sh = short[short["Zipf Frequency"] >= 4]
sl = short[short["Zipf Frequency"] <= 3.5]

# save files if necessary
# lh.to_csv("./long_high.csv", index=False)
# ll.to_csv("./long_low.csv", index=False)
# sh.to_csv("./short_high.csv", index=False)
# sl.to_csv("./short_low.csv", index=False)

In [None]:
# Build the balanced real-word set: 100 rows drawn from each condition file.
REAL_DIR = "../stimuli/handmade/real"
# Third letter of the code encodes morphology: C -> complex, S -> simple.
# NOTE(review): the other letters presumably encode lexicality, size and
# frequency band -- confirm against the stimulus documentation.
REAL_CONDITIONS = ["RLCH", "RLCL", "RLSH", "RLSL", "RSCH", "RSCL", "RSSH", "RSSL"]
REAL_PER_CONDITION = 100
# Fixed seed: without it every run draws (and saves) a different stimulus set.
REAL_SAMPLE_SEED = 0


def _load_real_condition(condition, seed):
    """Read one condition file, draw a seeded sample and tag it.

    Args:
        condition: upper-case condition code, e.g. "RLCH"; the file read is
            ``<REAL_DIR>/<code in lower case>.csv``.
        seed: ``random_state`` forwarded to ``DataFrame.sample``.

    Returns:
        DataFrame of REAL_PER_CONDITION rows with "Condition" and "Morphology" set.
    """
    sampled = pd.read_csv(
        f"{REAL_DIR}/{condition.lower()}.csv", index_col=0, converters=converters
    ).sample(REAL_PER_CONDITION, random_state=seed)
    return sampled.assign(
        Condition=condition,
        Morphology="complex" if condition[2] == "C" else "simple",
    )


# A distinct seed per file keeps the draws deterministic without reusing the
# same row positions in every file.
real_frames = [
    _load_real_condition(condition, REAL_SAMPLE_SEED + offset)
    for offset, condition in enumerate(REAL_CONDITIONS)
]
# Keep the individual frames available under their original names.
rlch, rlcl, rlsh, rlsl, rsch, rscl, rssh, rssl = real_frames
print(*(len(frame) for frame in real_frames))

real_equalized = pd.concat(real_frames, ignore_index=True)
order = [
    "Word", "Condition", "Lexicality", "Size", "Morphology",
    "Frequency", "Length", "Zipf Frequency", "Phonemes",
    "No Stress", "Part of Speech"
]
real_equalized = real_equalized[order]
print(real_equalized["Condition"].value_counts())
real_equalized.to_csv("../stimuli/handmade/real_equalized.csv")

100 100 100 100 100 100 100 100
Condition
RLCH    100
RLCL    100
RLSH    100
RLSL    100
RSCH    100
RSCL    100
RSSH    100
RSSL    100
Name: count, dtype: int64
Length
7    202
5    175
3    115
8    112
4    110
9     86
Name: count, dtype: int64


In [None]:
# Build the balanced pseudo-word set from the per-condition, per-length files.
PSEUDO_DIR = "../stimuli/handmade/pseudo"
# Fixed seed: without it every run draws (and saves) a different stimulus set.
PSEUDO_SAMPLE_SEED = 0
# (file stem, rows to draw), listed in concatenation order.
# None means the whole file is used as-is (psc3 is not subsampled).
PSEUDO_SAMPLES = [
    ("pls9", 33), ("psc3", None), ("psc4", 46), ("psc5", 33),
    ("plc7", 33), ("plc8", 34), ("plc9", 33),
    ("pss3", 33), ("pss4", 34), ("pss5", 33),
    ("pls7", 33), ("pls8", 34),
]


def _load_pseudo_file(stem, n_rows, seed):
    """Read ``<PSEUDO_DIR>/<stem>.csv`` and optionally draw a seeded sample.

    Args:
        stem: file name without the ".csv" extension.
        n_rows: number of rows to sample, or None to keep every row.
        seed: ``random_state`` forwarded to ``DataFrame.sample``.

    Returns:
        The full frame when ``n_rows`` is None, otherwise the sampled rows.
    """
    frame = pd.read_csv(f"{PSEUDO_DIR}/{stem}.csv", index_col=0, converters=converters)
    if n_rows is None:
        return frame
    return frame.sample(n_rows, random_state=seed)


pseudo_frames = [
    _load_pseudo_file(stem, n_rows, PSEUDO_SAMPLE_SEED + offset)
    for offset, (stem, n_rows) in enumerate(PSEUDO_SAMPLES)
]
# Keep the individual frames available under their original names.
pls9, psc3, psc4, psc5, plc7, plc8, plc9, pss3, pss4, pss5, pls7, pls8 = pseudo_frames

pseudo_equalized = pd.concat(pseudo_frames, ignore_index=True)
pseudo_equalized["Lexicality"] = "pseudo"
print(pseudo_equalized["Length"].value_counts())
print(pseudo_equalized["Condition"].value_counts())
order = [
    "Word", "Condition", "Lexicality", "Size", "Morphology",
    "Frequency", "Length", "Zipf Frequency", "Phonemes",
    "No Stress", "Part of Speech"
]
pseudo_equalized = pseudo_equalized[order]
pseudo_equalized.to_csv("../stimuli/handmade/pseudo_equalized.csv")

Length
4    80
8    68
9    66
5    66
7    66
3    54
Name: count, dtype: int64
Condition
PLS    100
PSC    100
PLC    100
PSS    100
Name: count, dtype: int64
Length
4    80
8    68
9    66
5    66
7    66
3    54
Name: count, dtype: int64
Empty DataFrame
Columns: [Word, Condition, Lexicality, Size, Morphology, Frequency, Length, Zipf Frequency, Phonemes, No Stress, Part of Speech]
Index: []


In [25]:
# Combine the real and pseudo sets into the final evaluation set.
# ignore_index=True: both inputs carry their own 0-based index, so without it
# the combined frame (and the index column of the saved CSV) would contain
# duplicate labels.
test_equalized = pd.concat([real_equalized, pseudo_equalized], ignore_index=True)
print(test_equalized["Condition"].value_counts())
test_equalized.to_csv("../stimuli/handmade/test_equalized.csv")
# Preview only: display.max_rows is None, so a bare frame would render every row.
test_equalized.head(10)

Condition
RLCH    100
RLCL    100
RLSH    100
RLSL    100
RSCH    100
RSCL    100
RSSH    100
RSSL    100
PLS     100
PSC     100
PLC     100
PSS     100
Name: count, dtype: int64


Unnamed: 0,Word,Condition,Lexicality,Size,Morphology,Frequency,Length,Zipf Frequency,Phonemes,No Stress,Part of Speech
0,attending,RLCH,real,long,complex,2.04e-05,7,4.31,"[AH0, T, EH1, N, D, IH0, NG]","[AH, T, EH, N, D, IH, NG]",VERB
1,commissioner,RLCH,real,long,complex,2.69e-05,8,4.43,"[K, AH0, M, IH1, SH, AH0, N, ER0]","[K, AH, M, IH, SH, AH, N, ER]",NOUN
2,operation,RLCH,real,long,complex,8.32e-05,7,4.92,"[AA2, P, ER0, EY1, SH, AH0, N]","[AA, P, ER, EY, SH, AH, N]",NOUN
3,hospitals,RLCH,real,long,complex,2.14e-05,9,4.33,"[HH, AA1, S, P, IH2, T, AH0, L, Z]","[HH, AA, S, P, IH, T, AH, L, Z]",NOUN
4,acquisition,RLCH,real,long,complex,1.82e-05,9,4.26,"[AE2, K, W, AH0, Z, IH1, SH, AH0, N]","[AE, K, W, AH, Z, IH, SH, AH, N]",NOUN
5,drinking,RLCH,real,long,complex,5.13e-05,7,4.71,"[D, R, IH1, NG, K, IH0, NG]","[D, R, IH, NG, K, IH, NG]",VERB
6,universe,RLCH,real,long,complex,3.89e-05,7,4.59,"[Y, UW1, N, AH0, V, ER2, S]","[Y, UW, N, AH, V, ER, S]",NOUN
7,obviously,RLCH,real,long,complex,6.31e-05,8,4.8,"[AA1, B, V, IY0, AH0, S, L, IY0]","[AA, B, V, IY, AH, S, L, IY]",ADV
8,physically,RLCH,real,long,complex,2e-05,8,4.3,"[F, IH1, Z, IH0, K, AH0, L, IY0]","[F, IH, Z, IH, K, AH, L, IY]",ADV
9,awareness,RLCH,real,long,complex,2.75e-05,7,4.44,"[AH0, W, EH1, R, N, AH0, S]","[AH, W, EH, R, N, AH, S]",NOUN


### Processing Pseudo Words

In [3]:
from ast import literal_eval
from g2p_en import G2p
g2p = G2p()

def remove_stress(phonemes):
    """Return a new list in which a single trailing digit is stripped from each phoneme.

    An element whose last character is a digit (e.g. ``"AH0"``) loses exactly
    that one character (``"AH"``); every other element is passed through
    unchanged. Empty strings are passed through as well instead of raising
    ``IndexError``.

    Args:
        phonemes: iterable of phoneme strings.

    Returns:
        list[str]: same length and order as the input.
    """
    # `p and ...` protects the p[-1] lookup against empty strings.
    return [p[:-1] if (p and p[-1].isdigit()) else p for p in phonemes]

converters = {"Word": str, "Phonemes": literal_eval, "No Stress": literal_eval}

complete_train = pd.read_csv("../stimuli/dataframe/complete_train.csv", index_col=0, converters=converters)
raw_pseudo = pd.read_csv("../stimuli/handmade/pseudo/raw_pseudo.csv", index_col=0, converters=converters)

# filter out all words that are in the train set
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
raw_pseudo = raw_pseudo[~raw_pseudo["Word"].isin(complete_train["Word"])].copy()
# Transcribe each word with g2p, then derive the stress-free form and the
# phoneme count from that transcription.
raw_pseudo["Phonemes"] = raw_pseudo["Word"].apply(g2p)
raw_pseudo["No Stress"] = raw_pseudo["Phonemes"].apply(remove_stress)
raw_pseudo["Length"] = raw_pseudo["Phonemes"].apply(len)
# Keep words of 2 to 10 phonemes; copy again because later cells add columns.
raw_pseudo = raw_pseudo.query("1 < Length < 11").copy()
# NOTE(review): this overwrites the file the cell just read, so the unfiltered
# input is lost after the first run -- confirm that this is intended.
raw_pseudo.to_csv("../stimuli/handmade/pseudo/raw_pseudo.csv")
print(len(raw_pseudo))

9726


In [60]:
import pandas as pd
from Levenshtein import distance

def compute_min_distances(dfA, dfB):
    """Annotate ``dfA`` with its normalized distance to the nearest entry of ``dfB``.

    Two columns are added to ``dfA``:

      - ``characterDist``: minimum Levenshtein distance from the row's "Word"
        to any "Word" in ``dfB``, divided by the length of the row's word.
      - ``phonemeDist``: minimum Levenshtein distance from the row's "Phonemes"
        to any "Phonemes" in ``dfB``, divided by the number of phonemes in the row.

    Edge cases:
      - an empty word / phoneme list gives NaN (no length to normalize by)
        instead of raising ZeroDivisionError;
      - an empty ``dfB`` gives ``inf`` (there is nothing to be close to)
        instead of raising ValueError from ``min()``.

    Note:
        ``dfA`` is modified in place and also returned. Every row of ``dfA``
        is compared against every row of ``dfB``, for both columns.

    Args:
        dfA: frame with "Word" and "Phonemes" columns; receives the new columns.
        dfB: reference frame with "Word" and "Phonemes" columns.

    Returns:
        ``dfA`` itself, with ``characterDist`` and ``phonemeDist`` added.
    """
    # Materialize the reference columns once instead of re-iterating the
    # pandas Series for every row of dfA.
    ref_words = dfB["Word"].tolist()
    ref_phonemes = dfB["Phonemes"].tolist()

    def _nearest(item, references):
        # Normalized distance from `item` to its closest reference.
        if len(item) == 0:
            return float("nan")
        closest = min((distance(item, ref) for ref in references), default=float("inf"))
        return closest / len(item)

    dfA["characterDist"] = dfA["Word"].apply(lambda w_a: _nearest(w_a, ref_words))
    dfA["phonemeDist"] = dfA["Phonemes"].apply(lambda ph_a: _nearest(ph_a, ref_phonemes))
    return dfA

def filter_by_distance(dfA, threshold):
    """Keep the rows of ``dfA`` whose two distance columns both reach ``threshold``.

    Args:
        dfA: frame with ``characterDist`` and ``phonemeDist`` columns.
        threshold: inclusive lower bound applied to both columns.

    Returns:
        The rows of ``dfA`` where both values are >= ``threshold``.
    """
    far_in_spelling = dfA["characterDist"].ge(threshold)
    far_in_sound = dfA["phonemeDist"].ge(threshold)
    return dfA.loc[far_in_spelling & far_in_sound]

# Score every candidate against the training set, then keep only those that
# are at least 0.25 away in both spelling and phonemes.
raw_pseudo = compute_min_distances(raw_pseudo, complete_train)
pseudo_complete = filter_by_distance(raw_pseudo, threshold=0.25)
print(pseudo_complete["Length"].value_counts())

# One CSV per phoneme count (3 through 9), written to the working directory.
for n_phonemes in range(3, 10):
    of_this_length = pseudo_complete[pseudo_complete["Length"] == n_phonemes]
    of_this_length.to_csv(f"{n_phonemes}.csv")

4610


In [30]:
# Load the four pseudo-word condition files and show how many items of each
# phoneme length every file contains.
plc, pls, psc, pss = (
    pd.read_csv(f"../stimuli/handmade/test_data/{stem}.csv", index_col=0, converters=converters)
    for stem in ("plc", "pls", "psc", "pss")
)

for condition_frame in (plc, pls, psc, pss):
    print(condition_frame["Length"].value_counts())

Length
7    72
8    55
9    20
Name: count, dtype: int64
Length
7    123
8     66
9     15
Name: count, dtype: int64
Length
5    131
4    116
6    101
3     59
Name: count, dtype: int64
Length
4    161
6    155
5    119
3     89
Name: count, dtype: int64
