# Selection of Materials for LPC lexical decision experiment

In [None]:
# Sizing of the experiment: how many word and pseudoword trials fit in one session.

EXP_DURATION = 10 * 60  # total duration of the experiment, in seconds

TRIAL_DURATION = 3  # seconds per trial; TBD: may become 4 or 5 seconds

PSEUDO_RATIO = .2  # 20 % of trials are pseudowords

# Trial counts are whole numbers: floor division drops any incomplete trial,
# and int() keeps the result an integer even if TRIAL_DURATION becomes a float.
NB_TRIALS_TOTAL = int(EXP_DURATION // TRIAL_DURATION)

NB_PSEUDOS = round(PSEUDO_RATIO * NB_TRIALS_TOTAL)

NB_WORDS = NB_TRIALS_TOTAL - NB_PSEUDOS

print(f'npseudos={NB_PSEUDOS}  nwords={NB_WORDS}')

In [None]:
import numpy as np
import pandas as pd

# Show every row when a DataFrame is displayed (no truncation).
pd.set_option("display.max_rows", None)

lex = pd.read_csv('Lexique380.utf8.csv.gz')

# Rows to discard:
#   - orthography containing a hyphen or a space,
#   - phonological form containing a schwa ('°').
# `na=True` makes a missing ortho/phon count as a match, so such rows are
# discarded as well. This is the same outcome as comparing the result of
# `str.contains` with `== False`, but stated explicitly instead of relying on
# how NaN compares.
has_separator = lex.ortho.str.contains('-| ', na=True)
has_schwa = lex.phon.str.contains('°', na=True)

# Columns kept for the selection of materials.
LEX_COLUMNS = ['ortho', 'phon', 'cgram', 'islem', 'freqlemfilms2', 'freqfilms2',
               'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'nbsyll', 'nblettres']

lex = lex.loc[~(has_separator | has_schwa), LEX_COLUMNS]

# Words selection 

To keep the LPC length constant (3 visual syllables), we select words with 3 (phonetic) consonants.  


   
Experimental factors:    
- Lexical frequency:
     * Low freq (LF) 
     * High freq (HF).
- Syllabic match between LPC and phonetic output  
         * Match: phon=CVCVCV 
         * Mismatch: 
               - 2 mismatch -> monosyll: CCCV, CCVC, CVCC
               - 1 mismatch -> bisyllabic: CVCVC CVCCV 
               
               
The following entries need to be removed from the database:
19359
38824
114783

In [None]:
## Frequency thresholds
# NOTE(review): units are presumably occurrences per million, as in Lexique -- confirm.

# Low-frequency (LF) band: lower and upper bound.
LF_MIN, LF_MAX = 0.3, 2

# High-frequency (HF) band: lower bound only.
HF_MIN = 10


In [None]:
def _candidate_lemmas(patterns):
    """Return the rows of `lex` usable as word candidates for the given patterns.

    A row is kept when its `p_cvcv` value is one of `patterns`, it is flagged
    as a lemma (`islem`), its `freqlemfilms2` is above LF_MIN and it has fewer
    than 10 letters.
    """
    keep = (
        lex.p_cvcv.isin(patterns)
        & lex.islem.eq(True)
        & lex.freqlemfilms2.gt(LF_MIN)
        & lex.nblettres.lt(10)
    )
    return lex[keep]


cvcvcv = _candidate_lemmas(['CVCVCV'])  # 0 mismatch
ncvcvcv1m = _candidate_lemmas(['CVCVC', 'CVCCV'])  # 1 mismatch
ncvcvcv2m = _candidate_lemmas(['CCVC', 'CVCC', 'CCCV'])  # 2 mismatches

In [None]:
# High-frequency (HF) matching words: both lemma frequencies
# (`freqlemfilms2` and `freqlemlivres`) must exceed HF_MIN.
is_hf = (cvcvcv.freqlemfilms2 > HF_MIN) & (cvcvcv.freqlemlivres > HF_MIN)
cvcvcv_hf = cvcvcv[is_hf]
print(cvcvcv_hf.shape)

# Summary of log10(freqlivres). The +0.1 offset keeps a zero frequency finite
# (log10(0) is -inf) and puts these statistics on the same scale as the ones
# used when the mismatch HF sets are sub-sampled to match this set.
np.log10(cvcvcv_hf.freqlivres + 0.1).describe()
                                                            

In [None]:
# Low-frequency (LF) matching words: both lemma frequencies
# (`freqlemfilms2` and `freqlemlivres`) must be below LF_MAX.
lf_match_mask = cvcvcv.freqlemfilms2.lt(LF_MAX) & cvcvcv.freqlemlivres.lt(LF_MAX)
cvcvcv_lf = cvcvcv.loc[lf_match_mask]
print(cvcvcv_lf.shape)

# TODO(review): an earlier version displayed only ['ortho', 'freqfilms2'] with
# head(385); the reason for that is not recorded.
cvcvcv_lf.loc[:, ['ortho', 'phon', 'freqlivres', 'freqfilms2']].head(1000)

In [None]:
# High-frequency (HF) 1-mismatch words: BOTH lemma frequencies
# (`freqlemfilms2` and `freqlemlivres`) must exceed HF_MIN.
hf_1m_mask = ncvcvcv1m.freqlemfilms2.gt(HF_MIN) & ncvcvcv1m.freqlemlivres.gt(HF_MIN)
ncvcvcv1m_hf = ncvcvcv1m.loc[hf_1m_mask]
print(ncvcvcv1m_hf.shape)
ncvcvcv1m_hf.loc[:, ['ortho', 'freqlivres', 'freqfilms2']].head(1000)


In [None]:
# Select a subsample of the 1-mismatch HF words matched in frequency with cvcvcv:
# random subsets of n words are drawn until the median of
# log10(freqlivres + 0.1) falls below MAX_LOG_MEDIAN_1M.

n = 60
MAX_LOG_MEDIAN_1M = 1.15  # upper bound on the median log10 book frequency
SEED_1M = 0               # fixed seed so the selected stimuli can be regenerated
MAX_DRAWS_1M = 100_000    # give up instead of looping forever

rng_1m = np.random.RandomState(SEED_1M)
for _ in range(MAX_DRAWS_1M):
    tmp = ncvcvcv1m_hf.sample(n, random_state=rng_1m)
    e = np.log10(tmp.freqlivres + 0.1).median()
    if e < MAX_LOG_MEDIAN_1M:
        break
else:
    # Reached only when no draw satisfied the criterion.
    raise RuntimeError(
        f'no subsample of {n} words with median log10 frequency '
        f'< {MAX_LOG_MEDIAN_1M} found in {MAX_DRAWS_1M} draws'
    )

print(e)
ncvcvcv1m_hf = tmp

# Last expression of the cell, so the summary of the retained subsample is displayed.
np.log10(ncvcvcv1m_hf.freqlivres + 0.1).describe()

In [None]:
# Low-frequency (LF) 1-mismatch words: `freqlemfilms2` strictly between LF_MIN
# and LF_MAX, and `freqlemlivres` below LF_MAX.
lf_1m_mask = (
    ncvcvcv1m.freqlemfilms2.gt(LF_MIN)
    & ncvcvcv1m.freqlemfilms2.lt(LF_MAX)
    & ncvcvcv1m.freqlemlivres.lt(LF_MAX)
)
ncvcvcv1m_lf = ncvcvcv1m.loc[lf_1m_mask]
print(ncvcvcv1m_lf.shape)
ncvcvcv1m_lf.loc[:, ['ortho', 'freqlivres', 'freqfilms2']].head(1000)

In [None]:
# High-frequency (HF) 2-mismatch words: BOTH lemma frequencies
# (`freqlemfilms2` and `freqlemlivres`) must exceed HF_MIN.
hf_2m_mask = ncvcvcv2m.freqlemfilms2.gt(HF_MIN) & ncvcvcv2m.freqlemlivres.gt(HF_MIN)
ncvcvcv2m_hf = ncvcvcv2m.loc[hf_2m_mask]
print(ncvcvcv2m_hf.shape)

# Summary of log10(freqlivres + 0.1) for this set (the cell's displayed output).
log_freq_2m_hf = np.log10(ncvcvcv2m_hf.freqlivres + 0.1)
log_freq_2m_hf.describe()


In [None]:
# Select a subsample of the 2-mismatch HF words matched in frequency with cvcvcv:
# random subsets of n words are drawn until the median of
# log10(freqlivres + 0.1) falls below MAX_LOG_MEDIAN_2M.

n = 60
MAX_LOG_MEDIAN_2M = 1.6  # upper bound on the median log10 book frequency
SEED_2M = 0              # fixed seed so the selected stimuli can be regenerated
MAX_DRAWS_2M = 100_000   # give up instead of looping forever

rng_2m = np.random.RandomState(SEED_2M)
for _ in range(MAX_DRAWS_2M):
    tmp = ncvcvcv2m_hf.sample(n, random_state=rng_2m)
    e = np.log10(tmp.freqlivres + 0.1).median()
    if e < MAX_LOG_MEDIAN_2M:
        break
else:
    # Reached only when no draw satisfied the criterion.
    raise RuntimeError(
        f'no subsample of {n} words with median log10 frequency '
        f'< {MAX_LOG_MEDIAN_2M} found in {MAX_DRAWS_2M} draws'
    )

print(e)
ncvcvcv2m_hf = tmp

# Last expression of the cell, so the summary of the retained subsample is displayed.
np.log10(ncvcvcv2m_hf.freqlivres + 0.1).describe()

In [None]:
# Low-frequency (LF) 2-mismatch words: both lemma frequencies
# (`freqlemfilms2` and `freqlemlivres`) must be below LF_MAX.
lf_2m_mask = ncvcvcv2m.freqlemfilms2.lt(LF_MAX) & ncvcvcv2m.freqlemlivres.lt(LF_MAX)
ncvcvcv2m_lf = ncvcvcv2m.loc[lf_2m_mask]
print(ncvcvcv2m_lf.shape)
ncvcvcv2m_lf.loc[:, ['ortho', 'freqlivres', 'freqfilms2']].head(1000)

In [None]:
# Write each selected word list to its own CSV file (row index included),
# one file per (match condition, frequency band) combination.
exports = [
    ('match_hf.csv', cvcvcv_hf),
    ('match_lf.csv', cvcvcv_lf),
    ('1_mismatch_hf.csv', ncvcvcv1m_hf),
    ('1_mismatch_lf.csv', ncvcvcv1m_lf),
    ('2_mismatch_hf.csv', ncvcvcv2m_hf),
    ('2_mismatch_lf.csv', ncvcvcv2m_lf),
]
for csv_name, word_list in exports:
    word_list.to_csv(csv_name)


# Pseudoword selection/Construction

Randomly select 10 words from each category and modify them manually to create pseudowords (the resulting pseudoword file is in the main folder).

In [121]:
n = 10

# One seeded generator shared by all draws: re-running the cell gives the same
# base words, so the selection can be regenerated.
PSEUDO_SEED = 0
pseudo_rng = np.random.RandomState(PSEUDO_SEED)

# Randomly sample n words from each category.
p_cvcvcv_hf = cvcvcv_hf.sample(n, random_state=pseudo_rng)
p_cvcvcv_lf = cvcvcv_lf.sample(n, random_state=pseudo_rng)
p_ncvcvcv1m_hf = ncvcvcv1m_hf.sample(n, random_state=pseudo_rng)
p_ncvcvcv1m_lf = ncvcvcv1m_lf.sample(n, random_state=pseudo_rng)
p_ncvcvcv2m_hf = ncvcvcv2m_hf.sample(n, random_state=pseudo_rng)
p_ncvcvcv2m_lf = ncvcvcv2m_lf.sample(n, random_state=pseudo_rng)

# Combine the selected words into a single frame.
frames = [p_cvcvcv_hf, p_cvcvcv_lf, p_ncvcvcv1m_hf, p_ncvcvcv1m_lf, p_ncvcvcv2m_hf, p_ncvcvcv2m_lf]
result = pd.concat(frames)

# Export of the selected words to a CSV file, currently disabled.
# NOTE(review): presumably disabled so that a re-run does not overwrite the
# file the pseudowords were edited from -- confirm before re-enabling.
#result[['ortho', 'phon','p_cvcv', 'cvcv', 'nbsyll', 'nblettres']].to_csv('base_of_pseudo.csv')