In [286]:
import json
import pandas as pd
import os
import re
from collections import Counter
import librosa
import random
import IPython.display as ipd

# The paths

In [287]:
# Root folder that is walked recursively (os.walk) to find the .wav recordings
recordings_folder_path = r"../Intervention_T1_data/recordings/phondel_trials/FR_phonDel/FR_phonDel"
# Ground-truth transcription CSV that is loaded into `df` with pd.read_csv
phon_del_trial_path = r"../Intervention_T1_data/ground_truth_transcribed/PhonDel/phonDel_test_ground_truth_ALL_T1_first_coder.csv"
# Optional collection of substrings that must ALL appear in a recording's directory path
# for the file to be considered; None disables the filtering
path_filters = None

# Data Pre-Processing

Load the dataframe containing the data

In [288]:
# Read the ground-truth CSV and harmonise the one capitalised column name in a single chain
df = pd.read_csv(phon_del_trial_path).rename(columns={"Comments": "comments"})

print("Number of rows in df: ", len(df))
print("Columns in df: ", df.columns)

df

Number of rows in df:  11087
Columns in df:  Index(['date_time', 'participant_id', 'session_id', 'trial_id', 'timePoint',
       'language', 'session_form', 'phase', 'config_id', 'config', 'file_name',
       'trial_answer', 'accuracy', 'notes', 'comments'],
      dtype='object')


Unnamed: 0,date_time,participant_id,session_id,trial_id,timePoint,language,session_form,phase,config_id,config,file_name,trial_answer,accuracy,notes,comments
0,23.11.2023 09:26,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,2249c5f2f75d4b089bee1f36e1a7aef2,PRE,fr,B,DELETION,phondel_B_pseudo_8,bromal,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,an.nage,0.0,,
1,23.11.2023 09:26,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,3eff2b8d9be54f24aaa5f0bf3ef81c50,PRE,fr,B,DELETION,phondel_B_pseudo_6,banéon,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,{no audio},,,
2,23.11.2023 09:24,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,56dbefb3163a44beab895890b503600f,PRE,fr,B,DELETION,phondel_B_word_2,cheval,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,ch euval,1.0,first_phoneme_repetition,
3,23.11.2023 09:26,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,5c6cef80aa54470caa4df67c9003aecc,PRE,fr,B,DELETION,phondel_B_pseudo_7,polage,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,onage,1.0,,
4,23.11.2023 09:24,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,620b6b5984f14b0c90432d5a123e2106,PRE,fr,B,DELETION,phondel_B_word_3,fromage,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,omage,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11082,20.11.2023 09:07,101_edugame2023,-,bb53a95c95dc4ee0ad83b7443bb83848,PRE,It,C,DELETION,phondel_A_pseudo_2,gleno,dummy_TEST_eb01e2e9529a4e90a9c7de1574d22425_bb...,,,,
11083,20.11.2023 09:07,101_edugame2023,-,d9016ac30812441b958f1897eac9940e,PRE,It,C,DELETION,phondel_A_word_1,fiume,dummy_TEST_eb01e2e9529a4e90a9c7de1574d22425_d9...,,,,
11084,20.11.2023 09:07,101_edugame2023,-,e9ce0cf1ae684d42add65da7b69beb65,PRE,It,C,DELETION,phondel_A_pseudo_1,piulo,dummy_TEST_eb01e2e9529a4e90a9c7de1574d22425_e9...,,,,
11085,20.11.2023 09:08,101_edugame2023,-,ec02fd447bc9419dba6b4422e0d40067,PRE,It,C,DELETION,phondel_A_word_5,sblocco,dummy_TEST_eb01e2e9529a4e90a9c7de1574d22425_ec...,,,,


## Modification of the database

1) Drop some columns that are not relevant
    - date_time, timePoint, trial_id
2) Change some column names to have more evocative names
    - config --> reference_text
    - trial_answer --> human_transcription
3) Add some columns
    - words_human_transcription: contains the transcription without the annotations
    - human_alignment: contains the alignment between the clinician and asr transcription
    - audio_length: duration of corresponding audio
    - process_time: time it took the ASR to process the audio
    - asr_response: response of the ASR
    - asr_comparison: comparison between clinician and asr transcription (TP, TN, FP, FN)
4) Remove rows of the database without clinician annotations: 104 rows (all were marked as duplicate trial)
5) Keep only the files that are present in both the dataframe and the recording folder, and remove duplicate files (duplicates are identified by file name)
6) Replace all ';' by a ' ' in the reference_text column for future alignment 

In [289]:
def reformat_df(df, columns_drop, columns_rename, columns_add):
    """
    Return a reshaped copy of df; the input frame itself is not modified.
    Args:
        df (pd.DataFrame): The frame to reshape.
        columns_drop (list): Names of the columns to remove.
        columns_rename (dict): Mapping old column name -> new column name.
        columns_add (list): Names of the columns to append, each filled with "".
    Returns:
        pd.DataFrame: The frame without the dropped columns, with the renamed
        columns, and with the new empty-string columns appended in order.
    """
    reshaped = df.drop(columns=columns_drop).rename(columns=columns_rename)
    empty_columns = {name: "" for name in columns_add}
    return reshaped.assign(**empty_columns)

In [290]:
def match_files_df_recordings(df, recordings_folder_path, path_filters=None):
    """
    Synchronize the rows of df with the .wav files found under a folder.

    The folder is walked recursively. Rows whose 'filename' has no recording and
    recordings that have no row are discarded. Duplicates are resolved by file
    name on both sides: the first row of df and the first path met while walking
    are kept.

    Args:
        df (pd.DataFrame): Frame with a 'filename' column.
        recordings_folder_path (str): Root folder searched for .wav files.
        path_filters (iterable of str, optional): Substrings that must all be
            contained in a directory path for its files to be considered.
            None (default) means no filtering.
    Returns:
        tuple: (df, recordings_filepaths, recordings_filenames) where
            df is the filtered frame with a fresh 0..n-1 index,
            recordings_filepaths holds one path per remaining row of df
            (in walk order, not in df order),
            recordings_filenames lists every .wav file name found, unfiltered.
    Raises:
        AssertionError: If the number of rows and of kept paths differ.
    """
    if path_filters is None:
        path_filters = ()

    # Collect every .wav file whose directory path contains all path filters
    recordings_filenames = []
    recordings_filepaths = []
    for root, _, files in os.walk(recordings_folder_path):
        if all(path_filter in root for path_filter in path_filters):
            for file in files:
                if file.endswith(".wav"):
                    recordings_filenames.append(file)
                    recordings_filepaths.append(os.path.join(root, file))

    # File names referenced by the df
    df_filenames = df['filename'].tolist()

    # File names that are in the df but not in the recordings folder
    missing_recordings = list(set(df_filenames) - set(recordings_filenames))
    print(len(missing_recordings), " Files that are in the df but not in the recording folder (not processed): ", missing_recordings)

    # File names that are in the recordings folder but not in the df
    extra_recordings = list(set(recordings_filenames) - set(df_filenames))
    print(len(extra_recordings), " Files that are in the recording folder but not in the df (not processed): ", extra_recordings)

    # Drop the extra files from the paths and the missing files from the df
    # (set lookup: the list version rescanned extra_recordings for every path)
    extra_lookup = set(extra_recordings)
    recordings_filepaths = [path for path in recordings_filepaths if os.path.basename(path) not in extra_lookup]
    df = df[~df['filename'].isin(missing_recordings)]

    # Report and remove duplicates. On the recordings side the comparison is done on the
    # file NAME: full paths returned by os.walk are always unique, so counting them can
    # never reveal the same recording stored in two sub-folders.
    print("Duplicate filenames in df (not processed): ", df[df['filename'].duplicated(keep=False)].sort_values('filename')['filename'].tolist())
    basename_counts = Counter(os.path.basename(path) for path in recordings_filepaths)
    print("Duplicate filenames in recordings folder (not processed): ", [name for name, count in basename_counts.items() if count > 1])
    df = df.drop_duplicates(subset=['filename'], keep='first')
    first_path_by_name = {}
    for path in recordings_filepaths:
        # setdefault keeps the first path met for a name and preserves the walk order
        first_path_by_name.setdefault(os.path.basename(path), path)
    recordings_filepaths = list(first_path_by_name.values())

    # reset index of DataFrame
    df = df.reset_index(drop=True)

    # Verify number of files in DataFrame matches number of files in folder
    assert len(df) == len(recordings_filepaths), f"Number of files in DataFrame ({len(df)}) does not match number of files in folder ({len(recordings_filepaths)})"
    print("Number of files to be processed: ", len(df))

    return df, recordings_filepaths, recordings_filenames

In [291]:
### Change format of df to match the expected format ###
df = reformat_df(
    df=df,
    columns_drop=['date_time', 'timePoint', 'trial_id'],
    columns_rename={"config": "reference_text", "trial_answer": "human_transcription", "file_name": "filename"},
    columns_add=["words_human_transcription", "human_alignment", "asr_response", "asr_comparison",
                 "audio_length", "process_time", "filepath", "accuracy_og"],
)
# Reorder the DataFrame columns ('filepath' is not listed here; it is re-created below
# once the recordings have been located)
desired_order = ['participant_id', 'session_id', 'language', 'session_form', 'phase',
                 'config_id', 'filename', 'reference_text', 'words_human_transcription',
                 'human_transcription', 'accuracy', 'accuracy_og', 'notes', 'comments',
                 'human_alignment', 'asr_response', 'asr_comparison', 'audio_length', 'process_time']
df = df[desired_order]


### Handle rows with NaN values in human_transcription column ###
print(df['human_transcription'].isnull().sum(), "rows with NaN values in human_transcription column and notes associated with these rows are", df[df['human_transcription'].isnull()]['notes'].unique(), "\n")
# Remove rows with NaN values in transcription column
df = df.dropna(subset=['human_transcription']).reset_index(drop=True)


### Synchronize the DataFrame with the audio files by removing missing files and duplicates ###
df, recordings_filepaths, recordings_filenames = match_files_df_recordings(
    df=df,
    recordings_folder_path=recordings_folder_path,
    path_filters=path_filters,
)


# For each string in reference_text, replace the ; by a space for the alignment
# (literal replacement; missing values are passed through instead of raising)
df['reference_text'] = df['reference_text'].str.replace(";", " ", regex=False)

# Add the filepath column from the paths that were actually found on disk. The folder is
# walked recursively, so a recording may live in a sub-folder; joining the root folder
# with the file name would then point to a file that does not exist.
filepath_by_filename = {os.path.basename(path): path for path in recordings_filepaths}
df['filepath'] = df['filename'].map(filepath_by_filename)
assert df['filepath'].notna().all(), "Some rows of df have no matching recording path"


# Save the updated df to a JSON file (create the output folder on a fresh checkout)
output_json_path = 'dfs/phondel/Intervention_PhonDel_og_df.json'
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
df.to_json(output_json_path, orient='records')

# Only keep rows in df with language "fr" and reset index
df_asr_fr = df[df['language'] == 'fr'].reset_index(drop=True) #### FRENCH ONLY

92 rows with NaN values in human_transcription column and notes associated with these rows are [nan] 

5489  Files that are in the df but not in the recording folder (not processed):  ['136_edugame2023_f6f542fe9bd64832a5c0e03bd9fd7b21_f6db4337797b40359db0a7bd9905012a.wav', '102_edugame2023_aa91d3e1f00a4898a31307b0e1caa91d_94fa4b079a74491881b551f268cdeb68.wav', '1037_edugame2023_39ae012aa84b434b9386c948032813bc_96e2d2dd49fa4b7f9ef12d126bcaa7ce.wav', '1321_edugame2023_7222297109f643e497365cff18d41309_2c9ff96a8c3b4acdbe57f9ebe4f6c872.wav', '1054_edugame2023_6f2efaf392744ffdaa2d7fa0a7e488bc_1f6a0c0e11e94df4b7d1f4fce9aca80b.wav', '1036_edugame2023_c3160e2340364510973439a8532e503d_8a35f4e694534c4ca6cc9ba38cf6ce5d.wav', '734_edugame2023_5cfdf1d57aca4fa091cbf25bbc704de9_4a1ce74e574e48ccae49e4773cf8c712.wav', '1036_edugame2023_c3160e2340364510973439a8532e503d_78441c2df0d447b79f0453434303335e.wav', '416_edugame2023_66324299ad1e42179d35ea98c68d11e0_e9157c70d15f447b9e2137111ff55228.wav', '719_edug

In [292]:
# Keep the scores exactly as annotated in accuracy_og ...
df['accuracy_og'] = df['accuracy']
# ... then treat every missing score as 0.0 in the working accuracy column
df['accuracy'] = df['accuracy'].fillna(0.0)


In [293]:
# The number of distinct reference texts must equal the number of distinct config_ids
n_reference_texts = df['reference_text'].nunique(dropna=False)
n_config_ids = df['config_id'].nunique(dropna=False)
assert n_reference_texts == n_config_ids, "Number of unique reference text and config_id do not match"
print("Number of unique reference text and config_id match and is", n_reference_texts, "\n")

# List the reference text(s) attached to every config_id, in alphabetical order of config_id
for config_id in sorted(df['config_id'].unique()):
    texts_for_config = df.loc[df['config_id'] == config_id, 'reference_text'].unique()
    print(config_id, texts_for_config)


Number of unique reference text and config_id match and is 54 

phondel_A_pseudo_0 ['plique']
phondel_A_pseudo_1 ['razin']
phondel_A_pseudo_2 ['ganda']
phondel_A_pseudo_3 ['glotu']
phondel_A_pseudo_4 ['pladar']
phondel_A_pseudo_5 ['spéréo']
phondel_A_pseudo_6 ['gatine']
phondel_A_pseudo_7 ['pobier']
phondel_A_pseudo_8 ['trical']
phondel_A_word_0 ['plume']
phondel_A_word_1 ['râteau']
phondel_A_word_2 ['chemin']
phondel_A_word_3 ['frisson']
phondel_A_word_4 ['canapé']
phondel_A_word_5 ['station']
phondel_A_word_6 ['glotte']
phondel_A_word_7 ['solitude']
phondel_A_word_8 ['camarade']
phondel_B_pseudo_0 ['triffe']
phondel_B_pseudo_1 ['mican']
phondel_B_pseudo_2 ['condas']
phondel_B_pseudo_3 ['glomo']
phondel_B_pseudo_4 ['plabir']
phondel_B_pseudo_5 ['spalio']
phondel_B_pseudo_6 ['banéon']
phondel_B_pseudo_7 ['polage']
phondel_B_pseudo_8 ['bromal']
phondel_B_word_0 ['plage']
phondel_B_word_1 ['bateau']
phondel_B_word_2 ['cheval']
phondel_B_word_3 ['fromage']
phondel_B_word_4 ['canari']
phon

In [294]:
# Fill the audio_length column with the duration of each recording and get the total time.
# Durations are collected in a dict first and mapped onto the df in one go: this avoids
# scanning the whole filename column once per recording and yields a numeric column
# (rows without a recording get NaN instead of keeping the "" placeholder).
df_filenames = set(df['filename'])
durations = {}
for audio_path in recordings_filepaths:
    filename = os.path.basename(audio_path)
    if filename in df_filenames:
        durations[filename] = librosa.get_duration(path=audio_path)
df['audio_length'] = df['filename'].map(durations)
total_time = sum(durations.values())
print("\nTotal time of audio files:", total_time, "seconds =", total_time/60, "minutes")

# Print if a participant has multiple session_form values
forms_per_participant = df.groupby('participant_id')['session_form'].nunique()
print("\nNumber of participants with multiple session_form (A, B, C) values: ", int((forms_per_participant > 1).sum()))

# Print the number of participants per language and trial
for lan in df['language'].unique():
    df_lan = df[df['language'] == lan]
    print("\n Total number of unique participants for each config_id in", lan)
    for config_id in df['config_id'].unique():
        n_participants = df_lan.loc[df_lan['config_id'] == config_id, 'participant_id'].nunique(dropna=False)
        # print number of participants per config_id
        print(str(config_id) + " :  " + str(n_participants))
    print(" \nTotal number of unique participants for each phase in", lan)
    for phase in df['phase'].unique():
        # Plain equality: the previous prefix test (x.split("_")[0] in [phase]) reported
        # 0 participants for any phase name that contains an underscore
        n_participants = df_lan.loc[df_lan['phase'] == phase, 'participant_id'].nunique(dropna=False)
        print(phase + " :  " + str(n_participants) + " participants")



Total time of audio files: 21356.38984126983 seconds = 355.93983068783047 minutes

Number of participants with multiple session_form (A, B, C) values:  0

 Total number of unique participants for each config_id in fr
phondel_B_pseudo_8 :  52
phondel_B_pseudo_6 :  52
phondel_B_word_2 :  52
phondel_B_pseudo_7 :  52
phondel_B_word_3 :  52
phondel_B_pseudo_1 :  52
phondel_B_word_8 :  52
phondel_B_word_0 :  52
phondel_B_pseudo_4 :  52
phondel_B_word_5 :  52
phondel_B_word_4 :  52
phondel_B_word_6 :  52
phondel_B_pseudo_3 :  52
phondel_B_word_1 :  52
phondel_B_pseudo_2 :  52
phondel_B_pseudo_5 :  52
phondel_B_pseudo_0 :  52
phondel_B_word_7 :  52
phondel_C_pseudo_5 :  52
phondel_C_pseudo_7 :  52
phondel_C_word_6 :  52
phondel_C_word_1 :  52
phondel_C_pseudo_3 :  52
phondel_C_word_3 :  52
phondel_C_word_4 :  52
phondel_C_word_0 :  52
phondel_C_word_8 :  52
phondel_C_word_2 :  52
phondel_C_pseudo_2 :  52
phondel_C_pseudo_4 :  52
phondel_C_pseudo_8 :  52
phondel_C_word_5 :  52
phondel_C_word_7

## Transcription Normalization

Get transcriptions without the annotations and write them in the words_human_transcription column. These will be fed to the ASR during training so they need to contain what was said without any comment or mark.
- Remove the sounding out annotation '.' in the transcription : pa.pa --> papa
- Remove the insertion annotation '{...}' in the transcription : pa{pu}pa --> papupa
- Remove the comments {inaudible}, {comments}, {adult_support}, {adult_task}, {no audio}, {pause}, {cut recording}


Extra step: Remove all between brackets and parenthesis because we have transcriptions containing (indexes are those of df after cleaning):

- [ :  putio [pytjo] , 267
- [ :  o ofre [o] , 491
- [ :  trofe [o] , 507
- [ :  ofe [ɔ] , 603
- ɔ :  ofe [ɔ] , 603
- [ :  trofe [ɔ] , 626
- ɔ :  trofe [ɔ] , 626
- [ :  otrf [ɔ] , 1081
- ɔ :  otrf [ɔ] , 1081
- [ :  rofe [o] , 1171
- [ :  rofr [o] , 1209
- [ :  ofr [o] , 1239
- [ :  offe [o] , 1364
- [ :  offe [o] , 1378
- ( :  atin(e)gue , 1552
- [ :  offe [o] , 1589
- :  in-nuge , 3401
- ( :  s pudiome (o ouvert) , 4495

In [295]:
# Fill the words_human_transcription column with the human transcription with removed annotations
def remove_transcription_annotations(transcription):
    """
    Removes clinician annotations from a given transcription.

    Steps, in order:
      1. sounding-out dots: a letter repeated around a '.' is kept once
         ("a.ar" -> "ar"), every remaining '.' is dropped ("pa.pa" -> "papa");
      2. comment tags ({inaudible}, {comments}, {adult_support}, {adult_task},
         {no audio}, {pause}, {cut recording}) are removed entirely;
      3. remaining braces are dropped but their content is kept
         ("pa{pu}pa" -> "papupa");
      4. anything between [...] or (...) is removed;
      5. whitespace left behind by the removals is collapsed to single spaces
         and stripped from both ends ("putio [pytjo]" -> "putio").

    Args:
        transcription (str): The input transcription string.
    Returns:
        str: The transcription string with annotations removed.
    """
    # If the letter before and after the "." is the same, only keep one letter ex: "a.ar" -> "ar"
    transcription = re.sub(r'(\w)\.(\1)', r'\2', transcription)
    transcription = transcription.replace(".", "")

    # Comment tags carry no speech content: remove tag and braces together
    comment_tags = (
        "{inaudible}",
        "{comments}",
        "{adult_support}",
        "{adult_task}",
        "{no audio}",
        "{pause}",
        "{cut recording}",
    )
    for tag in comment_tags:
        transcription = transcription.replace(tag, "")

    # Any brace still present marks an insertion: keep what was said, drop the marks
    transcription = transcription.replace("{", "").replace("}", "")

    # remove all between brackets and parenthesis
    transcription = re.sub(r'\[.*?\]', '', transcription)
    transcription = re.sub(r'\(.*?\)', '', transcription)

    # The removals above leave double or trailing spaces behind; normalise them so the
    # text only contains what was said, separated by single spaces
    return " ".join(transcription.split())

# Pass the function directly; no lambda wrapper is needed
df['words_human_transcription'] = df['human_transcription'].apply(remove_transcription_annotations)

The transcriptions in our dataset exhibit inconsistencies where the same word can be spelled differently across rows. To ensure consistent labeling for our finetuning task and accurate evaluation of the ASR performance, we need to normalize the spelling of correctly uttered words. To achieve this, we create a replacement dictionary that maps each variant spelling to a normalized spelling. 

To create the replacement dictionary, we iterate over each list of (pseudo)words. For each (pseudo)word in this list, we select only the rows where the clinician marked the word as correctly pronounced (where its value in the accuracy list is 2). We then check for discrepancies between the reference text and human transcription for these rows and add any identified discrepancies to our dictionary.

Here accuracy score of 2 will be marked as correct and the others as incorrect

In [296]:
# Recordings whose accuracy is forced to 1.0
list_filenames_1 = [
    '3701_edugame2023_3bc42b641b324df19b5e0062a69d96e1_b1b9010672944621a0e836947eadbb93.wav',
    '4023_edugame2023_ae4c45a99b3249b9b7956b53eac1356f_d236a6ad182b4c42a49ec0e4991309fa.wav',
]

# Recordings whose accuracy is forced to 0.0
list_filenames_0 = [
    '4021_edugame2023_2c00adcffb304ea29cf1e50f387b5f60_859df5b501104d388791e8556607fac0.wav',
    '3712_edugame2023_e6c1c6fde35f47c3825b66bf237a377d_d69e204de61346ea971e1eb237918381.wav',
    '3151_edugame2023_e491dd7fb2bf4f87be389ec9f010cee7_964a64f36b344ff9a0b734438af6716d.wav',
]

# Recordings whose cleaned transcription is replaced by the reference text
list_filenames_2 = ['3726_edugame2023_80869283edb446a196918178d6ecaa99_41750a185c9d4320ab6db2c402f75256.wav']

# Overwrite the accuracy of the listed recordings (1.0 first, then 0.0)
for corrected_score, corrected_files in ((1.0, list_filenames_1), (0.0, list_filenames_0)):
    df.loc[df['filename'].isin(corrected_files), 'accuracy'] = corrected_score

# Only words_human_transcription is overwritten; human_transcription keeps the raw annotation
needs_reference = df['filename'].isin(list_filenames_2)
df.loc[needs_reference, 'words_human_transcription'] = df.loc[needs_reference, 'reference_text']

In [297]:
# REPETITION trials scored 2 whose transcription does NOT contain the reference word:
# these are candidate spelling variants for the replacement dictionary.

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
config_dfs = []
print("Number of unique config_id: ", len(config_ids))

for i, config_id in enumerate(config_ids): # index of the config_id to be processed

    # Rows of this config_id with an accuracy of 2. The score is compared on a derived
    # integer Series instead of being written back into the filtered slice.
    df_config = df[df['config_id'] == config_id].dropna(subset=['accuracy'])
    df_config = df_config[df_config['accuracy'].astype(int) == 2]

    # Only keep the REPETITION phase (strict equality: `x in 'REPETITION'` was a
    # substring test that also accepted '' or any fragment of the word)
    df_config = df_config[df_config['phase'] == 'REPETITION']

    # Keep the rows where the reference word is not one of the space-separated tokens
    # of the cleaned transcription
    reference_missing = pd.Series(
        [reference not in words.strip().split(" ")
         for reference, words in zip(df_config['reference_text'], df_config['words_human_transcription'])],
        index=df_config.index,
        dtype=bool,
    )
    df_config = df_config[reference_missing]

    # Nothing to report for this config_id: go to the next one
    if not df_config.empty:
        config_dfs.append(df_config)
        print('Index', i, ": Processed config_id: ", config_ids[i])
        print("Processed reference_text repetition:", df_config.iloc[0]['reference_text'])
        # for each row in df_config print the filename, the phase, the cleaned transcription and the accuracy
        for _, row in df_config.iterrows():
            print(row['filename'])
            print(row['phase'], '/', row['words_human_transcription'], '/', row['accuracy'])
        print()

Number of unique config_id:  54
Index 2 : Processed config_id:  phondel_B_word_2
Processed reference_text repetition: cheval
3162_edugame2023_32793e0c589d4fa68ec56925c81f1c3f_1f751c30cd6849268d510f20e4f74564.wav
REPETITION / cheuval / 2.0
3416_edugame2023_4a0b4d189b05483fa96185374eee5ffa_314ed60b4d074038a4bace9848c30679.wav
REPETITION / cheuval / 2.0
3417_edugame2023_004738c4582e43c6ba3a3203dfbeb056_f6f15af87b4f491f84211b6dcd13f9bd.wav
REPETITION / cheuval / 2.0
3420_edugame2023_9bfc27a89e1c4f2187008ecf0b9a826a_81640b5073d14e0a95e7de9b1259398d.wav
REPETITION / cheuval / 2.0
3746_edugame2023_d1ce946d8f994e6587c94f7b6563b019_c6236011c31f4f42abbbd9aa800e8c5e.wav
REPETITION / cheuval / 2.0

Index 5 : Processed config_id:  phondel_B_pseudo_1
Processed reference_text repetition: mican
3405_edugame2023_5b9989694d774a34a1eafaa610d7d0c2_b99960ab06fd44b191ac78ec7b069da7.wav
REPETITION / ican / 2.0

Index 13 : Processed config_id:  phondel_B_word_1
Processed reference_text repetition: bateau
3417

In [298]:
# DELETION trials scored 2 whose transcription does NOT contain the expected answer.
# The expected answer is approximated by the reference text without its first LETTER
# (reference_text[1:]), which differs from the first phoneme for digraphs such as "ch".

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
config_dfs = []
print("Number of unique config_id: ", len(config_ids))

for i, config_id in enumerate(config_ids): # index of the config_id to be processed

    # Rows of this config_id with an accuracy of 2. The score is compared on a derived
    # integer Series instead of being written back into the filtered slice.
    df_config = df[df['config_id'] == config_id].dropna(subset=['accuracy'])
    df_config = df_config[df_config['accuracy'].astype(int) == 2]

    # Only keep the DELETION phase (strict equality: `x in 'DELETION'` was a
    # substring test that also accepted '' or any fragment of the word)
    df_config = df_config[df_config['phase'] == 'DELETION']

    # Keep the rows where the expected answer is not one of the space-separated tokens
    # of the cleaned transcription
    target_missing = pd.Series(
        [reference[1:] not in words.strip().split(" ")
         for reference, words in zip(df_config['reference_text'], df_config['words_human_transcription'])],
        index=df_config.index,
        dtype=bool,
    )
    df_config = df_config[target_missing]

    # Nothing to report for this config_id: go to the next one
    if not df_config.empty:
        config_dfs.append(df_config)
        print('Index', i, ": Processed config_id: ", config_ids[i])
        print("Processed reference_text repetition:", df_config.iloc[0]['reference_text'])
        print("Processed reference_text deletion:", df_config.iloc[0]['reference_text'][1:])
        # for each row in df_config print the filename, the phase, the cleaned transcription and the accuracy
        for _, row in df_config.iterrows():
            print(row['filename'])
            print(row['phase'], '/', row['words_human_transcription'], '/', row['accuracy'])
        print()

Number of unique config_id:  54
Index 2 : Processed config_id:  phondel_B_word_2
Processed reference_text repetition: cheval
Processed reference_text deletion: heval
3111_edugame2023_4c44aca8f5384bcdbe71dde1dfcef2c7_2f55c7851d1d4dbe997c20bcd6e2fec3.wav
DELETION / euval / 2.0
3118_edugame2023_3c686e53fdb9487db3ddc5df86592967_9aded2d1b5c243909791bacb01d5eb03.wav
DELETION / euval / 2.0
3124_edugame2023_9ad8272b040541ab84acc2bcbd54aa4a_455ad58d67d744fb9e9b5fd87cd30ed7.wav
DELETION / eu euval / 2.0
3127_edugame2023_4959a495407c466bb9a26cf24c129cea_543330a86386477ab45f0f738df5185e.wav
DELETION / euval / 2.0
3132_edugame2023_bbc5159ce61148a49da08f5f6f2afb60_9892d9de65714f8195e8d31c7f0b5654.wav
DELETION / euval / 2.0
3147_edugame2023_9ea3ec00205249e8a6f410525222dbab_29451355d2b6444095d801b82fd1c4ca.wav
DELETION / euval / 2.0
3149_edugame2023_bd00b4024cdf4ee491269af5b74cd509_7e5fc21492d1429393c33cec3244cec6.wav
DELETION / euval / 2.0
3152_edugame2023_277d765afc3e44d69a8a7b7da04766ab_fd84ab3d658

In [299]:
# REPETITION trials NOT scored 2 whose transcription nevertheless contains the reference
# word: possible mismatches between the score and the transcription.

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
config_dfs = []
print("Number of unique config_id: ", len(config_ids))

for i, config_id in enumerate(config_ids): # index of the config_id to be processed

    # Rows of this config_id with an accuracy different from 2. The score is compared on
    # a derived integer Series instead of being written back into the filtered slice.
    df_config = df[df['config_id'] == config_id].dropna(subset=['accuracy'])
    df_config = df_config[df_config['accuracy'].astype(int) != 2]

    # Only keep the REPETITION phase (strict equality: `x in 'REPETITION'` was a
    # substring test that also accepted '' or any fragment of the word)
    df_config = df_config[df_config['phase'] == 'REPETITION']

    # Keep the rows where the reference word IS one of the space-separated tokens
    # of the cleaned transcription
    reference_present = pd.Series(
        [reference in words.split(" ")
         for reference, words in zip(df_config['reference_text'], df_config['words_human_transcription'])],
        index=df_config.index,
        dtype=bool,
    )
    df_config = df_config[reference_present]

    # Nothing to report for this config_id: go to the next one
    if not df_config.empty:
        config_dfs.append(df_config)
        print('Index', i, ": Processed config_id: ", config_ids[i])
        print("Processed reference_text repetition:", df_config.iloc[0]['reference_text'])
        # for each row in df_config print the phase, the cleaned transcription and the accuracy
        for _, row in df_config.iterrows():
            print(row['phase'], '/', row['words_human_transcription'], '/', row['accuracy'])
        print()

Number of unique config_id:  54
Index 0 : Processed config_id:  phondel_B_pseudo_8
Processed reference_text repetition: bromal
REPETITION / bromal omage / 1.0

Index 1 : Processed config_id:  phondel_B_pseudo_6
Processed reference_text repetition: banéon
REPETITION / banéon banéon ba anéon / 1.0

Index 5 : Processed config_id:  phondel_B_pseudo_1
Processed reference_text repetition: mican
REPETITION / mican ican / 1.0

Index 6 : Processed config_id:  phondel_B_word_8
Processed reference_text repetition: capitale
REPETITION / capitale apitale / 1.0
REPETITION / capitale talcapi / 1.0

Index 10 : Processed config_id:  phondel_B_word_4
Processed reference_text repetition: canari
REPETITION / canari nari / 0.0

Index 11 : Processed config_id:  phondel_B_word_6
Processed reference_text repetition: gloire
REPETITION / gloire gloire oire / 1.0
REPETITION / gloire gl gl g g gl loire loire / 1.0
REPETITION / gloire loire loire loire / 1.0

Index 12 : Processed config_id:  phondel_B_pseudo_3
Pro

In [300]:
# DELETION trials NOT scored 2 whose transcription nevertheless contains the expected
# answer (reference text without its first letter): possible mismatches between the
# score and the transcription.

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
config_dfs = []
print("Number of unique config_id: ", len(config_ids))

for i, config_id in enumerate(config_ids): # index of the config_id to be processed

    # Rows of this config_id with an accuracy different from 2. The score is compared on
    # a derived integer Series instead of being written back into the filtered slice.
    df_config = df[df['config_id'] == config_id].dropna(subset=['accuracy'])
    df_config = df_config[df_config['accuracy'].astype(int) != 2]

    # Only keep the DELETION phase (strict equality: `x in 'DELETION'` was a
    # substring test that also accepted '' or any fragment of the word)
    df_config = df_config[df_config['phase'] == 'DELETION']

    # Keep the rows where the expected answer IS one of the space-separated tokens
    # of the cleaned transcription
    target_present = pd.Series(
        [reference[1:] in words.strip().split(" ")
         for reference, words in zip(df_config['reference_text'], df_config['words_human_transcription'])],
        index=df_config.index,
        dtype=bool,
    )
    df_config = df_config[target_present]

    # Nothing to report for this config_id: go to the next one
    if not df_config.empty:
        config_dfs.append(df_config)
        print('Index', i, ": Processed config_id: ", config_ids[i])
        print("Processed reference_text repetition:", df_config.iloc[0]['reference_text'])
        print("Processed reference_text deletion:", df_config.iloc[0]['reference_text'][1:])
        # for each row in df_config print the filename, the phase, the cleaned transcription and the accuracy
        for _, row in df_config.iterrows():
            print(row['filename'])
            print(row['phase'], '/', row['words_human_transcription'], '/', row['accuracy'])
        print()

Number of unique config_id:  54
Index 1 : Processed config_id:  phondel_B_pseudo_6
Processed reference_text repetition: banéon
Processed reference_text deletion: anéon
3181_edugame2023_9305eed61e8d4b3d99b88c83f0bc2703_fc3fd7e7c0bf43139de9abb0d0cc8ab7.wav
DELETION / b anéon / 1.0

Index 3 : Processed config_id:  phondel_B_pseudo_7
Processed reference_text repetition: polage
Processed reference_text deletion: olage
3752_edugame2023_e62014d710fd4ea1a22b11e9e2023959_746befef51d94ac8861de5f07334df4f.wav
DELETION / p olage / 1.0

Index 4 : Processed config_id:  phondel_B_word_3
Processed reference_text repetition: fromage
Processed reference_text deletion: romage
3103_edugame2023_4cdb2baccd9341c89a743a854bf7a985_f87714d2c5714a488ad64141a21c4e85.wav
DELETION / f romage / 1.0
3156_edugame2023_5b68689254634805b9b823b0b2d5fd6b_565725b57a7f43478ab414ff456afaf3.wav
DELETION / fro romage  / 0.0

Index 7 : Processed config_id:  phondel_B_word_0
Processed reference_text repetition: plage
Processed re

-  reference_text repetition: mican
- REPETITION / ican / 2.0  
- 3405_edugame2023_5b9989694d774a34a1eafaa610d7d0c2_b99960ab06fd44b191ac78ec7b069da7.wav


-  reference_text repetition: bateau
- REPETITION / beateau / 2.0  
- 3417_edugame2023_004738c4582e43c6ba3a3203dfbeb056_f7cd868e49a84835a05205a9859151f1.wav



-  reference_text repetition: protal
- Processed reference_text deletion: rotal
- DELETION / trotal / 2.0
- 3740_edugame2023_1dbfc8ca008d43cbaf16a0495c15174d_29da326d5a484e259fe2c159ea77c49c.wav



In [301]:
# Spelling normalization table: variant spelling (key) -> normalized spelling (value).
# The cell that applies it pads keys and values with spaces so that only whole,
# space-delimited words of words_human_transcription are replaced.
replacements = {
    'lèz': 'laise',
    'glèz': 'glaise',
    'conda': 'condas',
    'onda': 'ondas',
    'solussion': 'solution',
    'olussion': 'olution',
    'stassion': 'station',
    'tassion': 'tation',
    'like': 'lique',
}

# NOTE(review): the two notes below look like variants that are not in the table yet — confirm
# cheval -> euval index 0
# chemin -> eumin index 11


In [302]:
# Look at one of the frames collected in config_dfs. config_dfs is re-created by each of
# the inspection loops above, so what index 13 contains depends on which loop ran last.
df_config = config_dfs[13] # 1 6 8 9 — presumably other indices that were inspected; TODO confirm
# Reference word, then the same word without its first letter
print("Processed reference_text repetition:", df_config.iloc[0]['reference_text'])
print("Processed reference_text deletion:", df_config.iloc[0]['reference_text'][1:])
df_config

Processed reference_text repetition: pogion
Processed reference_text deletion: ogion


Unnamed: 0,participant_id,session_id,language,session_form,phase,config_id,filename,reference_text,words_human_transcription,human_transcription,accuracy,accuracy_og,notes,comments,human_alignment,asr_response,asr_comparison,audio_length,process_time,filepath
4958,4010_edugame2023,81ddcf87851c495f83c0bbdd75d09189,fr,C,DELETION,phondel_C_pseudo_7,4010_edugame2023_4635f9c4aa9b49af80aebe89d605b...,pogion,p ogion,p ogion,1.0,1.0,first_phoneme_deletion,,,,,2.763175,,../Intervention_T1_data/recordings/phondel_tri...


In [303]:
# Spelling normalisation of the human transcriptions.
#
# A key of `replacements` must only ever match a whole, space-delimited word so that parts
# of longer words are never rewritten. For example "re" in "arbre" will not be replaced by "reux".

# Keep the padded form (" key " -> " value ") in `replacements`.
# Stripping before padding makes this cell idempotent: re-running it does not add more spaces.
replacements = {f" {k.strip()} ": f" {v.strip()} " for k, v in replacements.items()}

# Surround each transcription with exactly one space on each side, so that whole-word checks
# such as str.contains(' ' + word + ' ') also match the first and last word.
df['words_human_transcription'] = df['words_human_transcription'].apply(lambda x: f" {str(x).strip()} ")

# Create a copy of the df (padded, but with the original spelling) for before/after comparisons
df_not_normalized = df.copy()

# Replace the keys with the values in the replacements dictionary.
# A literal " key " -> " value " replacement skips every second occurrence when the same word
# is repeated back to back (" like like "), because the two matches share a space. Matching the
# bare word between whitespace boundaries handles that case and leaves the spacing untouched.
for k, v in replacements.items():
    whole_word = rf"(?<!\S){re.escape(k.strip())}(?!\S)"
    df['words_human_transcription'] = df['words_human_transcription'].str.replace(
        whole_word,
        lambda _match, canonical=v.strip(): canonical,  # callable => inserted literally
        regex=True,
    )

Code to check all the transcriptions containing a given word:

In [304]:
# Rows of the non-normalized dataframe whose transcription contains `word` as a
# space-delimited token.
word = 'like'
contains_word = df_not_normalized['words_human_transcription'].str.contains(f" {word} ")
df_row = df_not_normalized.loc[contains_word]
df_row

Unnamed: 0,participant_id,session_id,language,session_form,phase,config_id,filename,reference_text,words_human_transcription,human_transcription,accuracy,accuracy_og,notes,comments,human_alignment,asr_response,asr_comparison,audio_length,process_time,filepath
2175,3170_edugame2023,79d1e001114a463a825e425ec54beeae,fr,A,DELETION,phondel_A_pseudo_0,3170_edugame2023_9fd374ddfbe7490787b168e369691...,plique,i like,i like,2.0,2.0,sounding_out,,,,,8.0,,../Intervention_T1_data/recordings/phondel_tri...


After the normalization of the spelling:

In [305]:
# Same check on the normalized dataframe: rows whose transcription still contains `word`
# as a space-delimited token.
word = 'like'
contains_word = df['words_human_transcription'].str.contains(f" {word} ")
df_row = df.loc[contains_word]
df_row

Unnamed: 0,participant_id,session_id,language,session_form,phase,config_id,filename,reference_text,words_human_transcription,human_transcription,accuracy,accuracy_og,notes,comments,human_alignment,asr_response,asr_comparison,audio_length,process_time,filepath


#### Some explanation required here
Some mismatches between the accuracy score and the transcribed word were seen for the following rows (there might be more; these are the ones that I have noticed):

- row 796: "diege" vs. "tiege", but accuracy is 2



Code to check the mismatch for a given row:

In [306]:
# Print the text fields and the accuracy of selected rows (looked up by row label)
# to compare the transcription with the score.
text_fields = (
    ('Filename : ', 'filename'),
    ('Reference Text : ', 'reference_text'),
    ('Human Transcription : ', 'human_transcription'),
    ('Words Human Transcription : ', 'words_human_transcription'),
)
for index in [3879, 1597, 5339, 3601, 5265, 3401]:
    row = df.loc[index]
    for prefix, column in text_fields:
        print(prefix + row[column])
    print('Accuracy : ', row['accuracy'])
    print()

Filename : 3726_edugame2023_80869283edb446a196918178d6ecaa99_41750a185c9d4320ab6db2c402f75256.wav
Reference Text : capitale
Human Transcription : capiptale
Words Human Transcription :  capitale 
Accuracy :  2.0

Filename : 3151_edugame2023_e491dd7fb2bf4f87be389ec9f010cee7_964a64f36b344ff9a0b734438af6716d.wav
Reference Text : spirale
Human Transcription : spira.l
Words Human Transcription :  spiral 
Accuracy :  0.0

Filename : 4023_edugame2023_ae4c45a99b3249b9b7956b53eac1356f_d236a6ad182b4c42a49ec0e4991309fa.wav
Reference Text : spirale
Human Transcription : irale
Words Human Transcription :  irale 
Accuracy :  1.0

Filename : 3712_edugame2023_e6c1c6fde35f47c3825b66bf237a377d_d69e204de61346ea971e1eb237918381.wav
Reference Text : lajon
Human Transcription : ch apeau apeau
Words Human Transcription :  ch apeau apeau 
Accuracy :  0.0

Filename : 4021_edugame2023_2c00adcffb304ea29cf1e50f387b5f60_859df5b501104d388791e8556607fac0.wav
Reference Text : solitude
Human Transcription : solutide
Wo

## Data Cleaning

In [307]:
# Display df to inspect it before the cleaning step (pandas truncates the display of long frames).
df

Unnamed: 0,participant_id,session_id,language,session_form,phase,config_id,filename,reference_text,words_human_transcription,human_transcription,accuracy,accuracy_og,notes,comments,human_alignment,asr_response,asr_comparison,audio_length,process_time,filepath
0,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,fr,B,DELETION,phondel_B_pseudo_8,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,bromal,anage,an.nage,0.0,0.0,,,,,,4.144762,,../Intervention_T1_data/recordings/phondel_tri...
1,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,fr,B,DELETION,phondel_B_pseudo_6,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,banéon,,{no audio},0.0,,,,,,,12.027937,,../Intervention_T1_data/recordings/phondel_tri...
2,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,fr,B,DELETION,phondel_B_word_2,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,cheval,ch euval,ch euval,1.0,1.0,first_phoneme_repetition,,,,,8.573968,,../Intervention_T1_data/recordings/phondel_tri...
3,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,fr,B,DELETION,phondel_B_pseudo_7,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,polage,onage,onage,1.0,1.0,,,,,,1.544127,,../Intervention_T1_data/recordings/phondel_tri...
4,3101_edugame2023,0e0417f6222649fe89cf6edb413e0423,fr,B,DELETION,phondel_B_word_3,3101_edugame2023_1c148def3c254026adc7a7fdc3edc...,fromage,omage,omage,1.0,1.0,,,,,,9.102222,,../Intervention_T1_data/recordings/phondel_tri...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5501,3739_edugame2023,,fr,B,REPETITION,phondel_B_word_7,3739_edugame2023_726b0378066f47358975f01c3230e...,sorcière,sorcière,sorcière,2.0,2.0,,,,,,3.712,,../Intervention_T1_data/recordings/phondel_tri...
5502,3739_edugame2023,,fr,B,REPETITION,phondel_B_pseudo_6,3739_edugame2023_726b0378066f47358975f01c3230e...,banéon,banéon,banéon,2.0,2.0,,someone is talking loudly behind the voice of ...,,,,3.2,,../Intervention_T1_data/recordings/phondel_tri...
5503,3739_edugame2023,,fr,B,REPETITION,phondel_B_word_8,3739_edugame2023_726b0378066f47358975f01c3230e...,capitale,capitale,capitale,2.0,2.0,,whispered,,,,2.24,,../Intervention_T1_data/recordings/phondel_tri...
5504,3739_edugame2023,,fr,B,REPETITION,phondel_B_pseudo_7,3739_edugame2023_726b0378066f47358975f01c3230e...,polage,,{inaudible},0.0,,,,,,,1.984,,../Intervention_T1_data/recordings/phondel_tri...


In [308]:
# Data cleaning. Two JSON files are written:
#   - the full dataframe with only duplicate trials removed
#   - a training subset with unusable rows removed as well
# At the end, `df` is reloaded from the FULL file, not from the training subset.
full_json_path = 'dfs/phondel/Intervention_PhonDel_df.json'
train_json_path = 'dfs/phondel/train/Intervention_PhonDel_df.json'
# Make sure the output folders exist (creating the train folder also creates its parent)
os.makedirs(os.path.dirname(train_json_path), exist_ok=True)

# Keep an untouched snapshot of the dataframe as it was before any row is removed
df_not_cleaned = df.copy()
print("Total number of rows in df  before removal: ", len(df_not_cleaned))

# Remove the duplicate trials and save the full dataframe
df = df[~df['notes'].str.strip().isin(["duplicate_trial"])]
df.to_json(full_json_path)
print("Total number of rows in df after removal: ", len(df))

# Remove rows where accuracy is nan
df = df.dropna(subset=['accuracy'])
print("Total number of rows in df after removal: ", len(df))

# Build the training subset: drop rows flagged as unusable in the notes, rows whose
# transcription mentions an adult, and rows with an empty word transcription
df = df[~df['notes'].str.strip().isin(["not_doing_task", "inaudible", "not_doing_task inaudible", "duplicate_trial"])]
# na=False: a missing transcription counts as "does not contain" instead of breaking the `~` mask
df = df[~df['human_transcription'].str.contains("adult", na=False)]
df = df[~df['words_human_transcription'].str.strip().isin([""])]
print("Total number of rows in df after removal: ", len(df))
# Print the rows with the notes "not_doing_task", "inaudible" or "not_doing_task inaudible"
# df_not_cleaned[df_not_cleaned['notes'].str.strip().isin(["not_doing_task", "inaudible", "not_doing_task inaudible"])]

# Save the training subset to a JSON file
# (positional iloc[0]: the row labelled 0 may have been removed by the filters above)
print("Type of accuracy column of first row: ", type(df['accuracy'].iloc[0]))
df.to_json(train_json_path)

# Reload the full dataframe; the dtype of the accuracy column can change on the JSON
# round trip, which is what the two type printouts make visible
df = pd.read_json(full_json_path)
print("Type of accuracy column of first row: ", type(df['accuracy'].iloc[0]))



Total number of rows in df  before removal:  5506


Total number of rows in df after removal:  5506
Total number of rows in df after removal:  5506
Total number of rows in df after removal:  4698
Type of accuracy column of first row:  <class 'numpy.float64'>
Type of accuracy column of first row:  <class 'numpy.int64'>
