# Selection of Materials for LPC lexical decision experiment

In [141]:
EXP_DURATION = 10 * 60

TRIAL_DURATION = 3  # à voir si 4 ou 5 secondes

NB_TRIALS_TOTAL = EXP_DURATION / TRIAL_DURATION 

NB_TRIALS_TOTAL

NB_PSEUDOS = .2 * NB_TRIALS_TOTAL  #  20 % of trials are pseudowords

NB_WORDS = NB_TRIALS_TOTAL - NB_PSEUDOS

print(f'npseudos={NB_PSEUDOS}  nwords={NB_WORDS}')

npseudos=40.0  nwords=160.0


In [142]:
import numpy as np
import pandas as pd

pd.set_option("display.max_rows", None)

lex = pd.read_csv('Lexique380.utf8.csv.gz')
lex.head()
lex.columns
lex = lex[(lex.ortho.str.contains('-| ') == False) & (lex.phon.str.contains('°') == False)]  # suppress schwa
lex = lex[['ortho', 'phon', 'cgram', 'islem', 'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'nbsyll', 'nblettres']]

# Words selection 

To keep the LPC length constant (3 visual syllables), we select words wit 3 (phonetic) consonants.  


   
Experimental factors:    
- Lexical frequency (
     * Low freq (LF) 
     * High freq (HF).
- Syllabic match beteen LPC and phonetic output  
         * Match: phon=CVCVCV 
         * Mismatch: 
               - monosyll: CCCV, CCVC, CVCC
               - bisyllabic: CVCVC CVCCV 

In [143]:
## Frequency thresholds 

LF_MIN = 0.5
LF_MAX = 1

HF_MIN = 10


In [144]:
cvcvcv = lex[(lex.p_cvcv == 'CVCVCV')  & (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres <10)]
ncvcvcv = lex[(lex.p_cvcv.isin(['CCVC','CVCC', 'CCCV', 'CVCVC', 'CVCCV']))  & (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres <10)]

In [145]:
cvcvcv_hf = cvcvcv[(cvcvcv.freqfilms2 > HF_MIN) | (cvcvcv.freqlivres > HF_MIN) ]
print(cvcvcv_hf.shape)
cvcvcv_hf[['ortho', 'phon', 'freqlivres', 'freqfilms2']].head(1000)

(89, 12)


Unnamed: 0,ortho,phon,freqlivres,freqfilms2
12731,balader,balade,5.41,12.41
12815,balancer,bal@se,11.89,10.15
13951,bâtiment,batim@,19.93,22.73
19359,cabinet,kabinE,29.8,19.45
20377,caméra,kameRa,4.39,41.64
20538,canapé,kanape,17.97,17.66
21343,caresser,kaRese,18.24,5.66
21563,carrément,kaRem@,16.55,9.99
21750,casino,kazino,9.8,10.89
23445,charité,SaRite,14.32,13.54


In [146]:
cvcvcv_lf = cvcvcv[(cvcvcv.freqfilms2 < LF_MAX) & (cvcvcv.freqlivres < LF_MAX)]
print(cvcvcv_lf.shape)
cvcvcv_lf[['ortho','freqfilms2']].head(385)

(235, 12)


Unnamed: 0,ortho,freqfilms2
12374,badiner,0.18
12798,balancé,0.51
12935,baliser,0.06
12989,ballonner,0.15
13014,ballotter,0.02
13111,bananer,0.0
13333,baraqué,0.96
14129,bavasser,0.48
14306,bécoter,0.28
14623,benjamin,0.33


In [147]:
ncvcvcv_hf = ncvcvcv[(ncvcvcv.freqfilms2 > HF_MIN) | (ncvcvcv.freqlivres > HF_MIN)]
print(ncvcvcv_hf.shape)
ncvcvcv_hf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)


(622, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
12451,bagarre,9.86,16.05
12478,bagnole,26.28,21.18
12895,balcon,32.97,9.9
12903,baleine,3.11,11.52
13091,banal,11.28,4.88
13261,banquette,24.26,2.66
13332,baraque,22.84,11.1
13398,barbe,47.7,23.4
13598,barque,29.93,9.52
13605,barrage,10.68,9.6


In [148]:
ncvcvcv_lf = ncvcvcv[(ncvcvcv.freqfilms2 < HF_MIN) & (ncvcvcv.freqlivres < HF_MIN)]
print(ncvcvcv_lf.shape)
ncvcvcv_lf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)

(1533, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
12222,bâbord,0.95,1.97
12252,bacchante,1.15,0.67
12298,bâcler,0.68,0.12
12304,bacon,0.47,4.5
12340,badge,0.74,6.03
12436,bâfrer,0.54,0.26
12442,bagage,7.43,3.21
12474,bagnard,1.08,0.5
12518,baguette,9.46,5.67
12534,baignade,1.89,1.0


In [149]:
cvcvcv_hf.to_csv('match_hf.csv')
cvcvcv_lf.to_csv('match_lf.csv')
ncvcvcv_hf.to_csv('mismatch_hf.csv')
ncvcvcv_lf.to_csv('mismatch_lf.csv')


# Pseudoword selection/Construction

*TBD*
