# Selection of Materials for LPC lexical decision experiment

In [None]:
# Session length; it is divided by TRIAL_DURATION below, so presumably
# 10 minutes expressed in seconds.
EXP_DURATION = 10 * 60

# Duration of one trial in seconds (still to be decided whether 4 or 5 seconds).
TRIAL_DURATION = 3

# Number of trials that fit in the session (true division, so this is a float).
NB_TRIALS_TOTAL = EXP_DURATION / TRIAL_DURATION

# 20 % of the trials are pseudowords; the remaining trials are real words.
NB_PSEUDOS = 0.2 * NB_TRIALS_TOTAL
NB_WORDS = NB_TRIALS_TOTAL - NB_PSEUDOS

print(f'npseudos={NB_PSEUDOS}  nwords={NB_WORDS}')

In [47]:
import numpy as np
import pandas as pd

# Display every row of a frame (no truncation of long tables).
pd.set_option("display.max_rows", None)

lex = pd.read_csv('Lexique380.utf8.csv')

# Keep only entries whose spelling contains neither a hyphen nor a space, and
# whose phonological form contains no '°' (schwa).
single_token = lex.ortho.str.contains('-| ').eq(False)
schwa_free = lex.phon.str.contains('°').eq(False)
lex = lex[single_token & schwa_free]

# Column subset kept for the selection steps.
kept_columns = [
    'ortho', 'phon', 'cgram', 'islem', 'nbhomogr',
    'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres',
    'p_cvcv', 'cvcv', 'nbsyll', 'nblettres',
]
lex = lex[kept_columns]

# Inspect one entry by position.
lex.iloc[13886]

ortho            bombage
phon               b§baZ
cgram                NOM
islem                  1
nbhomogr               1
freqlemfilms2        0.0
freqfilms2           0.0
freqlemlivres        0.2
freqlivres           0.2
p_cvcv             CVCVC
cvcv             CVCCVCV
nbsyll                 2
nblettres              7
Name: 16306, dtype: object

# Words selection 

To keep the LPC length constant (3 visual syllables), we select words with 3 (phonetic) consonants.  


   
Experimental factors:    
- Lexical frequency
     * Low freq (LF) 
     * High freq (HF).
- Syllabic match between LPC and phonetic output  
         * Match: phon=CVCVCV 
         * Mismatch: 
               - 2 mismatch -> monosyll: CCCV, CCVC, CVCC
               - 1 mismatch -> bisyllabic: CVCVC CVCCV 
               
               
need to be taken down from the database:
19359
38824
114783

In [4]:
## Frequency thresholds 
# Cut-offs compared against the Lexique lemma-frequency columns
# (freqlemfilms2, freqlemlivres) in the selection cells.

# Candidate words must have freqlemfilms2 > LF_MIN; a word counts as low
# frequency when both freqlemfilms2 and freqlemlivres are < LF_MAX.
LF_MIN = 0.3
LF_MAX = 2

# A word counts as high frequency when both freqlemfilms2 and freqlemlivres
# are > HF_MIN.
HF_MIN = 10 


In [5]:
# Criteria shared by the three structure classes: lemma entries only, lemma
# film frequency above LF_MIN, and fewer than 10 letters.
_eligible = (lex.islem == True) & (lex.freqlemfilms2 > LF_MIN) & (lex.nblettres < 10)

# Phonological structure decides the mismatch class.
cvcvcv = lex[_eligible & (lex.p_cvcv == 'CVCVCV')]  # 0 mismatch
ncvcvcv1m = lex[_eligible & lex.p_cvcv.isin(['CVCVC', 'CVCCV'])]  # 1 mismatch
ncvcvcv2m = lex[_eligible & lex.p_cvcv.isin(['CCVC', 'CVCC', 'CCCV'])]  # 2 mismatches

In [6]:
# High-frequency matched items: lemma frequency above HF_MIN in both the film
# and the book columns.
cvcvcv_hf = cvcvcv.loc[
    lambda t: (t.freqlemfilms2 > HF_MIN) & (t.freqlemlivres > HF_MIN)
]
print(cvcvcv_hf.shape)

# Summary of the log10 book frequencies of the selected items.
# NOTE(review): no +0.1 offset here, unlike the subsampling cells — confirm
# whether the two summaries are meant to be directly comparable.
np.log10(cvcvcv_hf.freqlivres).describe()
                                                            

(105, 13)


count    105.000000
mean       1.091705
std        0.425122
min       -0.214670
25%        0.812245
50%        1.112940
75%        1.337659
max        2.125091
Name: freqlivres, dtype: float64

In [8]:
# Low-frequency matched items: lemma frequency below LF_MAX in both the film
# and the book columns.
cvcvcv_lf = cvcvcv.loc[
    lambda t: (t.freqlemfilms2 < LF_MAX) & (t.freqlemlivres < LF_MAX)
]
print(cvcvcv_lf.shape)

_shown = ['ortho', 'phon', 'freqlivres', 'freqfilms2', 'nbhomogr']
cvcvcv_lf[_shown].head(1000)

(271, 13)


Unnamed: 0,ortho,phon,freqlivres,freqfilms2,nbhomogr
12374,badiner,badine,0.27,0.18,1
12989,ballonner,balone,0.14,0.15,1
13080,bambino,b@bino,0.27,0.33,1
13089,bamboula,b@bula,0.27,1.29,1
13111,bananer,banane,0.07,0.0,1
13147,bandana,b@dana,0.07,0.46,1
13333,baraqué,baRake,0.41,0.96,2
13821,bassiner,basine,0.54,0.23,1
14129,bavasser,bavase,0.41,0.48,1
14189,bazooka,bazuka,0.81,1.21,1


In [7]:
# High-frequency 1-mismatch items: both lemma frequencies must exceed HF_MIN
# (the two conditions are combined with "and").
ncvcvcv1m_hf = ncvcvcv1m.loc[
    lambda t: (t.freqlemfilms2 > HF_MIN) & (t.freqlemlivres > HF_MIN)
]
print(ncvcvcv1m_hf.shape)

_shown = ['ortho', 'freqlivres', 'freqfilms2', 'nbhomogr']
ncvcvcv1m_hf[_shown].head(1000)


(296, 13)


Unnamed: 0,ortho,freqlivres,freqfilms2,nbhomogr
12442,bagage,7.43,3.21,1
12451,bagarre,9.86,16.05,2
12478,bagnole,26.28,21.18,1
12895,balcon,32.97,9.9,1
13332,baraque,22.84,11.1,1
13605,barrage,10.68,9.6,1
13886,bâtard,9.12,9.89,2
13957,bâtir,8.24,5.62,1
14598,bénir,3.38,2.36,1
14701,berger,11.15,8.33,1


In [9]:
# Select a subsample of the high-frequency 1-mismatch words, matched in
# frequency with cvcvcv: random subsamples of size n are drawn until the median
# of log10(freqlivres + 0.1) falls below 1.15.
#
# The generator is seeded so that the retained subsample is the same on every
# run, and the number of draws is capped so that an unreachable threshold
# raises an error instead of looping forever.

n = 60
max_draws = 10_000
rng = np.random.RandomState(1)

ok = False
for _ in range(max_draws):
    tmp = ncvcvcv1m_hf.sample(n, random_state=rng)
    e = np.log10(tmp.freqlivres + 0.1).median()
    ok = (e < 1.15)
    if ok:
        break

if not ok:
    raise RuntimeError(
        f'no subsample of size {n} with median log10 frequency < 1.15 '
        f'found in {max_draws} draws'
    )

print(e)

# The subsample replaces the full high-frequency set for the following cells.
ncvcvcv1m_hf = tmp

1.1423610604951357


In [10]:
# Low-frequency 1-mismatch items: both lemma frequencies below LF_MAX, with
# the film lemma frequency also above LF_MIN.
ncvcvcv1m_lf = ncvcvcv1m.loc[
    lambda t: (t.freqlemfilms2 < LF_MAX)
    & (t.freqlemlivres < LF_MAX)
    & (t.freqlemfilms2 > LF_MIN)
]
print(ncvcvcv1m_lf.shape)

_shown = ['ortho', 'freqlivres', 'freqfilms2', 'nbhomogr']
ncvcvcv1m_lf[_shown].head(1000)

(553, 13)


Unnamed: 0,ortho,freqlivres,freqfilms2,nbhomogr
12222,bâbord,0.95,1.97,1
12280,bacille,0.34,0.18,1
12436,bâfrer,0.54,0.26,1
12781,balaise,0.27,0.45,2
12918,balèze,0.74,1.12,2
12919,balèze,0.14,0.61,2
13029,balourd,0.54,0.41,2
13030,balourd,0.41,0.67,2
13267,banquise,1.42,0.65,1
13375,barbant,0.2,1.02,2


In [11]:
# High-frequency 2-mismatch items: both lemma frequencies must exceed HF_MIN
# (the two conditions are combined with "and").
ncvcvcv2m_hf = ncvcvcv2m.loc[
    lambda t: (t.freqlemfilms2 > HF_MIN) & (t.freqlemlivres > HF_MIN)
]
print(ncvcvcv2m_hf.shape)

# Summary of the log10 book frequencies (offset by 0.1) of the selected items.
np.log10(ncvcvcv2m_hf.freqlivres + 0.1).describe()


(197, 13)


count    197.000000
mean       1.574014
std        0.468719
min       -0.522879
25%        1.253822
50%        1.543571
75%        1.867350
max        2.864908
Name: freqlivres, dtype: float64

In [12]:
# Select a subsample of the high-frequency 2-mismatch words, matched in
# frequency with cvcvcv: random subsamples of size n are drawn until the median
# of log10(freqlivres + 0.1) falls below 1.6.
#
# The generator is seeded so that the retained subsample is the same on every
# run, and the number of draws is capped so that an unreachable threshold
# raises an error instead of looping forever.
#
# NOTE(review): 1.6 is well above the 1.15 used for the 1-mismatch words —
# confirm that this threshold really matches the cvcvcv set.

n = 60
max_draws = 10_000
rng = np.random.RandomState(1)

ok = False
for _ in range(max_draws):
    tmp = ncvcvcv2m_hf.sample(n, random_state=rng)
    e = np.log10(tmp.freqlivres + 0.1).median()
    ok = (e < 1.6)
    if ok:
        break

if not ok:
    raise RuntimeError(
        f'no subsample of size {n} with median log10 frequency < 1.6 '
        f'found in {max_draws} draws'
    )

print(e)

# The subsample replaces the full high-frequency set for the following cells.
ncvcvcv2m_hf = tmp

1.569525719669081


In [13]:
# Low-frequency 2-mismatch items: lemma frequency below LF_MAX in both the
# film and the book columns.
ncvcvcv2m_lf = ncvcvcv2m.loc[
    lambda t: (t.freqlemfilms2 < LF_MAX) & (t.freqlemlivres < LF_MAX)
]
print(ncvcvcv2m_lf.shape)

_shown = ['ortho', 'freqlivres', 'freqfilms2', 'nbhomogr']
ncvcvcv2m_lf[_shown].head(1000)

(180, 13)


Unnamed: 0,ortho,freqlivres,freqfilms2,nbhomogr
13527,barge,0.07,1.6,2
13528,barge,0.41,0.72,2
13834,baste,0.14,1.21,2
13835,baste,0.14,0.42,2
14711,berk,0.95,1.35,1
14808,best,0.34,0.52,1
15255,bigre,0.54,0.47,2
15520,bisque,0.54,0.41,2
15815,blatte,0.14,0.41,1
15971,block,0.0,0.7,1


In [None]:
# Write each of the six selections to its own CSV file.
for _path, _selection in [
    ('match_hf.csv', cvcvcv_hf),
    ('match_lf.csv', cvcvcv_lf),
    ('1_mismatch_hf.csv', ncvcvcv1m_hf),
    ('1_mismatch_lf.csv', ncvcvcv1m_lf),
    ('2_mismatch_hf.csv', ncvcvcv2m_hf),
    ('2_mismatch_lf.csv', ncvcvcv2m_lf),
]:
    _selection.to_csv(_path)


# Pseudoword selection/Construction

Randomly select 10 words from each category and change them manually (the file of pseudowords is in the main folder)

In [None]:
n = 10

# Randomly sample 10 words from each category. The seed is fixed so that the
# same base words are drawn on every run.
p_cvcvcv_hf = cvcvcv_hf.sample(n, random_state=1)
p_cvcvcv_lf = cvcvcv_lf.sample(n, random_state=1)
p_ncvcvcv1m_hf = ncvcvcv1m_hf.sample(n, random_state=1)
p_ncvcvcv1m_lf = ncvcvcv1m_lf.sample(n, random_state=1)
p_ncvcvcv2m_hf = ncvcvcv2m_hf.sample(n, random_state=1)
p_ncvcvcv2m_lf = ncvcvcv2m_lf.sample(n, random_state=1)

# Combine the selected words into a single frame.
frames = [p_cvcvcv_hf, p_cvcvcv_lf, p_ncvcvcv1m_hf, p_ncvcvcv1m_lf, p_ncvcvcv2m_hf, p_ncvcvcv2m_lf]
result = pd.concat(frames)

# Export of the base words to a csv file (currently disabled):
#result[['ortho', 'phon','p_cvcv', 'cvcv', 'nbsyll', 'nblettres']].to_csv('base_of_pseudo.csv')

# Cleaning and uniting the separate files

In [175]:
def _read_tagged(path, label):
    """Read one curated CSV file and store `label` in a new freq_mism column."""
    table = pd.read_csv(path, encoding="utf-8-sig")
    table["freq_mism"] = label
    return table


# One frame per condition; the label encodes the frequency class (H/L) and the
# number of mismatches (0/1/2).
h_0 = _read_tagged('match_hf_c.csv', "H_0")
l_0 = _read_tagged('match_lf_c.csv', "L_0")
h_1 = _read_tagged('1_mismatch_hf_c.csv', "H_1")
l_1 = _read_tagged('1_mismatch_lf_c.csv', "L_1")
h_2 = _read_tagged('2_mismatch_hf_c.csv', "H_2")
l_2 = _read_tagged('2_mismatch_lf_c.csv', "L_2")

frames = [h_0, l_0, h_1, l_1, h_2, l_2]
new_frames = []

In [298]:
# For every condition, delete the excluded words (decision == "x") and remove
# the columns that are irrelevant afterwards.
# The list is rebuilt from scratch on each run, so re-running this cell cannot
# append the same frames a second time.
new_frames = [
    frame[frame.decision != "x"].drop(['bad_cp', 'bad_as', 'decision'], axis=1)
    for frame in frames
]



In [265]:
# Stack the cleaned per-condition frames, give the 'Unnamed: 3' column a
# meaningful name, and export everything to a single file.
real_words = pd.concat(new_frames).rename(columns={'Unnamed: 3': 'Serial_num'})
real_words.to_csv('clean_words_file.csv', encoding="utf-8-sig", index=False)


In [302]:
import random
# Reload the cleaned word list and add two columns:
#   freq_sum - sum of the four frequency columns
#   freq     - natural logarithm of freq_sum
df = (
    pd.read_csv("clean_words_file.csv")
    .assign(
        freq_sum=lambda t: t['freqlemfilms2'] + t['freqfilms2'] + t['freqlemlivres'] + t['freqlivres']
    )
    .assign(freq=lambda t: np.log(t["freq_sum"]))
)

def cat(freq_mism):
    """Return the frequency class encoded in a freq_mism label.

    Labels whose first character is 'H' give 'H'; every other label gives 'L'.
    """
    return 'H' if freq_mism[0] == 'H' else 'L'
    
# Frequency class ('H' or 'L') of each word, derived from its freq_mism label.
df['freq_cat'] = df['freq_mism'].map(cat)


# making sure that the words in the low freq category are not homographs


In [303]:
# Spellings of the words in the low-frequency category. The category is stored
# in freq_cat ('H'/'L'); the 'freq' column holds the numeric log frequency.
low_words = list(df.loc[df['freq_cat'] == 'L', 'ortho'])

# Every Lexique entry that shares a spelling with one of those words, with its
# own summed frequency and natural-log frequency.
low_df = lex.loc[lex['ortho'].isin(low_words)].copy()
low_df["freq_sum"] = low_df['freqlemfilms2'] + low_df['freqfilms2'] + low_df['freqlemlivres'] + low_df['freqlivres']
low_df['freq'] = np.log(low_df["freq_sum"])
low_df = low_df.drop(
    ['phon', 'islem', 'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'freq_sum'],
    axis=1,
)

# A spelling is "bad" when one of its Lexique entries has a summed frequency of
# 8 or more.
bad = low_df[low_df['freq'] >= np.log(8)]
bad_words = list(bad['ortho'])

# Remove the bad spellings from the word list.
df = df[~df.ortho.isin(bad_words)]

bad_words

[]

In [412]:
## only the low_2mismatch category:

# Open the file with the words filtered by annahita and chritoph.
df1 = pd.read_csv('low_2_mismatch.csv', encoding='utf-8-sig')
df1['freq_sum'] = df1['freqlemfilms2'] + df1['freqfilms2'] + df1['freqlemlivres'] + df1['freqlivres']
df1['freq'] = np.log(df1["freq_sum"])
df1 = df1.drop(
    ['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 0', 'Unnamed: 3', 'phon', 'islem',
     'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'freq_sum'],
    axis=1,
)
words = list(df1['ortho'])

# Every Lexique entry sharing a spelling with those words (homographs
# included), with its own summed frequency and natural-log frequency.
# Deliberately not stored in `df`: `df` holds the cleaned word list that is
# grouped on freq_mism further down, a column that Lexique rows do not have.
l2_lex_entries = lex.loc[lex['ortho'].isin(words)].copy()
l2_lex_entries["freq_sum"] = (
    l2_lex_entries['freqlemfilms2'] + l2_lex_entries['freqfilms2']
    + l2_lex_entries['freqlemlivres'] + l2_lex_entries['freqlivres']
)
l2_lex_entries['freq'] = np.log(l2_lex_entries["freq_sum"])
l2_lex_entries = l2_lex_entries.drop(
    ['phon', 'islem', 'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres', 'p_cvcv', 'cvcv', 'freq_sum'],
    axis=1,
)

# A spelling is "bad" when one of its Lexique entries has a summed frequency of
# 8 or more; all entries with a bad spelling are removed.
bad = l2_lex_entries[l2_lex_entries['freq'] >= np.log(8)]
bad_words = set(bad['ortho'])
l2_lex_entries = l2_lex_entries[~l2_lex_entries.ortho.isin(bad_words)]

# Left-merge the filtered words with the remaining Lexique entries: rows found
# on both sides are good, rows found only on the left are problematic.
df_all = pd.merge(df1, l2_lex_entries, on=['ortho', 'cgram'],
                  how='left', indicator=True)
problematic = df_all[df_all["_merge"] == 'left_only']
good = df_all[df_all["_merge"] == 'both']

# Write each dataframe to a different worksheet, with XlsxWriter as the engine.
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('l_2_problem.xlsx', engine='xlsxwriter') as writer:
    good.to_excel(writer, sheet_name='good_words')
    problematic.to_excel(writer, sheet_name='problematic_words')
    bad.to_excel(writer, sheet_name='cause_of_problem')
    df1.to_excel(writer, sheet_name='filtered_by_as_and_cp')

In [165]:

            
# Pick 40 words per condition from the cleaned word list.
freq_groups = df.groupby("freq_mism")

# The two 2-mismatch conditions take fixed positional slices ...
h_2 = freq_groups.get_group('H_2').iloc[5:45]
l_2 = freq_groups.get_group('L_2').iloc[:40]

# ... the other four take a seeded random sample.
h_1, l_1, h_0, l_0 = (
    freq_groups.get_group(label).sample(n=40, random_state=1)
    for label in ('H_1', 'L_1', 'H_0', 'L_0')
)

# Stack the six conditions, renumber the rows, and drop the old index together
# with the columns that are no longer needed.
frames = [h_0, l_0, h_1, l_1, h_2, l_2]
final = (
    pd.concat(frames)
    .reset_index()
    .drop(
        columns=['index', 'islem', 'Unnamed: 16', 'Unnamed: 17', 'freq_sum',
                 'freqlemfilms2', 'freqfilms2', 'freqlemlivres', 'freqlivres']
    )
)
final

#final.to_csv('final.csv',encoding="utf-8-sig",index=False)


Unnamed: 0,Serial_num,ortho,phon,cgram,p_cvcv,cvcv,nbsyll,nblettres,freq_mism,freq,freq_cat
0,104433,posséder,posede,VER,CVCVCV,CVCCVCVC,3,8,H_0,4.973902,H
1,40407,dépasser,depase,VER,CVCVCV,CVCVCCVC,3,8,H_0,4.925731,H
2,41077,député,depyte,NOM,CVCVCV,CVCVCV,3,6,H_0,3.558201,H
3,112309,récemment,Resam@,ADV,CVCVCV,CVCVCCVCC,3,9,H_0,4.280547,H
4,45853,dominer,domine,VER,CVCVCV,CVCVCVC,3,7,H_0,4.402809,H
5,59704,fabuleux,fabyl2,ADJ,CVCVCV,CVCVCVVC,3,8,H_0,3.985273,H
6,82360,limiter,limite,VER,CVCVCV,CVCVCVC,3,7,H_0,3.625141,H
7,125876,signaler,siNale,VER,CVCVCV,CVCCVCVC,3,8,H_0,4.270397,H
8,43659,dévorer,devoRe,VER,CVCVCV,CVCVCVC,3,7,H_0,4.235844,H
9,124995,sentiment,s@tim@,NOM,CVCVCV,CVCCVCVCC,3,9,H_0,5.930413,H
