# Selection of Materials for LPC lexical decision experiment

In [60]:
EXP_DURATION = 10 * 60

TRIAL_DURATION = 3  # à voir si 4 ou 5 secondes

NB_TRIALS_TOTAL = EXP_DURATION / TRIAL_DURATION 

NB_TRIALS_TOTAL

NB_PSEUDOS = .2 * NB_TRIALS_TOTAL  #  20 % of trials are pseudowords

NB_WORDS = NB_TRIALS_TOTAL - NB_PSEUDOS

print(f'npseudos={NB_PSEUDOS}  nwords={NB_WORDS}')

npseudos=40.0  nwords=160.0


In [61]:
import numpy as np
import pandas as pd

pd.set_option("display.max_rows", None)

lex = pd.read_csv('Lexique380.utf8.csv.gz')
lex.head()
lex.columns
lex = lex[(lex.ortho.str.contains('-| ') == False) & (lex.phon.str.contains('°') == False)]  # suppress schwa
lex = lex[['ortho', 'phon', 'cgram', 'islem', 'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'nbsyll', 'nblettres']]

# Words selection 

To keep the LPC length constant (3 visual syllables), we select words wit 3 (phonetic) consonants.  


   
Experimental factors:    
- Lexical frequency (
     * Low freq (LF) 
     * High freq (HF).
- Syllabic match beteen LPC and phonetic output  
         * Match: phon=CVCVCV 
         * Mismatch: 
               - 2 mismatch -> monosyll: CCCV, CCVC, CVCC
               - 1 mismatch -> bisyllabic: CVCVC CVCCV 

In [70]:
## Frequency thresholds 

LF_MIN = 0.5
LF_MAX = 1

HF_MIN = 10 


In [71]:
cvcvcv = lex[(lex.p_cvcv == 'CVCVCV')  & (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres <10)] # 0 missmatch
ncvcvcv1m = lex[(lex.p_cvcv.isin(['CVCVC', 'CVCCV']))  & (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres <10)] # 1 missmatch
ncvcvcv2m = lex[(lex.p_cvcv.isin(['CCVC','CVCC', 'CCCV']))  & (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres <10)] # 2 missmatch

In [72]:
cvcvcv_hf = cvcvcv[(cvcvcv.freqfilms2 > HF_MIN) & (cvcvcv.freqlivres > HF_MIN)] 
print(cvcvcv_hf.shape)
cvcvcv_hf[['ortho', 'phon', 'freqlivres', 'freqfilms2']].head(1000)

(45, 12)


Unnamed: 0,ortho,phon,freqlivres,freqfilms2
12815,balancer,bal@se,11.89,10.15
13951,bâtiment,batim@,19.93,22.73
19359,cabinet,kabinE,29.8,19.45
20538,canapé,kanape,17.97,17.66
23445,charité,SaRite,14.32,13.54
24643,chocolat,Sokola,30.61,27.74
25225,cinéma,sinema,72.91,62.23
27354,comédie,komedi,25.68,20.87
27384,comité,komite,58.99,18.34
27420,commander,kom@de,11.55,13.5


In [65]:
cvcvcv_lf = cvcvcv[(cvcvcv.freqfilms2 < LF_MAX) & (cvcvcv.freqlivres < LF_MAX)]
print(cvcvcv_lf.shape)
cvcvcv_lf[['ortho', 'phon', 'freqlivres', 'freqfilms2']].head(1000) # it was [['ortho','freqfilms2']].head(385), why?

(235, 12)


Unnamed: 0,ortho,phon,freqlivres,freqfilms2
12374,badiner,badine,0.27,0.18
12798,balancé,bal@se,0.47,0.51
12935,baliser,balize,0.34,0.06
12989,ballonner,balone,0.14,0.15
13014,ballotter,balote,0.41,0.02
13111,bananer,banane,0.07,0.0
13333,baraqué,baRake,0.41,0.96
14129,bavasser,bavase,0.41,0.48
14306,bécoter,bekote,0.41,0.28
14623,benjamin,b5Zam5,0.47,0.33


In [66]:
ncvcvcv1m_hf = ncvcvcv1m[(ncvcvcv1m.freqfilms2 > HF_MIN) & (ncvcvcv1m.freqlivres > HF_MIN)] # changed to "and"
print(ncvcvcv1m_hf.shape)
ncvcvcv1m_hf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)


(155, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
12478,bagnole,26.28,21.18
15609,bizarre,41.76,117.31
16444,bonheur,156.35,78.34
16474,bonjour,50.74,569.88
17618,boutique,36.01,22.29
19356,cabine,29.86,17.65
20441,campagne,94.73,48.61
20557,canard,16.15,15.46
22340,ceinture,32.23,19.41
22423,cellule,35.34,31.06


In [25]:
ncvcvcv1m_lf = ncvcvcv1m[(ncvcvcv1m.freqfilms2 < HF_MIN) & (ncvcvcv1m.freqlivres < HF_MIN)]
print(ncvcvcv1m_lf.shape)
ncvcvcv1m_lf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)

(1136, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
12222,bâbord,0.95,1.97
12252,bacchante,1.15,0.67
12298,bâcler,0.68,0.12
12304,bacon,0.47,4.5
12436,bâfrer,0.54,0.26
12442,bagage,7.43,3.21
12474,bagnard,1.08,0.5
12518,baguette,9.46,5.67
12534,baignade,1.89,1.0
12661,baiseur,0.61,1.4


In [67]:
ncvcvcv2m_hf = ncvcvcv2m[(ncvcvcv2m.freqfilms2 > HF_MIN) & (ncvcvcv2m.freqlivres > HF_MIN)] #chaged to "and"
print(ncvcvcv2m_hf.shape)
ncvcvcv2m_hf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)

(126, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
13398,barbe,47.7,23.4
14041,battre,57.36,75.92
14969,bible,17.16,17.03
15655,blague,16.82,60.33
18069,brave,23.31,24.55
18134,bref,38.78,22.26
20169,calme,65.81,58.78
20170,calme,52.03,105.08
21605,carte,60.95,96.11
22553,centre,80.0,53.46


In [26]:
ncvcvcv2m_lf = ncvcvcv2m[(ncvcvcv2m.freqfilms2 < HF_MIN) & (ncvcvcv2m.freqlivres < HF_MIN)]
print(ncvcvcv2m_lf.shape)
ncvcvcv2m_lf[['ortho', 'freqlivres', 'freqfilms2']].head(1000)

(397, 12)


Unnamed: 0,ortho,freqlivres,freqfilms2
12340,badge,0.74,6.03
13045,balte,0.34,0.05
13527,barge,0.07,1.6
13528,barge,0.41,0.72
13790,basque,3.24,7.51
13791,basque,0.47,0.6
13834,baste,0.14,1.21
14446,belge,6.55,2.83
14447,belge,2.36,1.31
14700,berge,8.72,1.79


In [21]:
cvcvcv_hf.to_csv('match_hf.csv')
cvcvcv_lf.to_csv('match_lf.csv')
ncvcvcv1m_hf.to_csv('1_mismatch_hf.csv')
ncvcvcv1m_lf.to_csv('1_mismatch_lf.csv')
ncvcvcv2m_hf.to_csv('2_mismatch_hf.csv')
ncvcvcv2m_lf.to_csv('2_mismatch_lf.csv')


# Pseudoword selection/Construction

*TBD*
