In [None]:
import json
import pandas as pd
import os
import re
from collections import Counter
import librosa
import random
from segment_service import align_texts
import IPython.display as ipd

# The paths

In [None]:
# Root folder that is walked for the .wav recordings of the decoding trials.
recordings_folder_path = r"../Intervention_T1_data/recordings/deco_trials/FR_deco/FR_deco"
# Decoding ground truth, CSV export: loaded into `df`, the main working frame.
decoding_trial_path = r"../Intervention_T1_data/ground_truth_transcribed/Decoding/deco_test_ground_truth_FR_T1_first_coder_.csv"
# Decoding ground truth, XLSX file: loaded into `df_` and used to cross-check the accuracy column.
decoding_trial_path_ = r"../Intervention_T1_data/ground_truth_transcribed/Decoding/deco_test_ground_truth_FR_T1_first_coder.xlsx"
# PhonDel ground truth; not referenced in the cells reviewed here.
phon_del_trial_path = r"../Intervention_T1_data/ground_truth_transcribed/PhonDel/phonDel_test_ground_truth_ALL_T1_first_coder.csv"
# Optional iterable of substrings that a recording's directory path must contain; None disables the filter.
path_filters = None

# Data Pre-Processing

Load the dataframe containing the data

In [None]:
# Main working frame: the ground truth read from the CSV export.
df = pd.read_csv(decoding_trial_path)
#read xls file
# Second copy read from the XLSX file; kept separately in `df_` so that its accuracy
# column can be compared with the CSV one further down.
df_ = pd.read_excel(decoding_trial_path_)
#df = pd.read_excel(decoding_trial_path_)
# Row counts of both sources, followed by a one-row preview as the cell output.
print("Number of rows in df: ", len(df), len(df_))
df.head(1)

## Modification of the database

1) Drop some columns that are not relevant
    - date_time, timePoint, trial_id
2) Change some column names to have more evocative names
    - config --> reference_text
    - trial_answer --> human_transcription
3) Add some columns
    - words_human_transcription: contains the transcription without the annotations
    - human_alignment: contains the alignment between the clinician and asr transcription
    - audio_length: duration of corresponding audio
    - process_time: time it took the ASR to process the audio
    - asr_response: response of the ASR
    - asr_comparison: comparison between clinician and asr transcription (TP, TN, FP, FN)
4) Remove rows of the database without clinician annotations: 104 rows (all were marked as duplicate trial)
5) Keep only the files that are present in both the dataframe and the recordings folder, and remove duplicate files (duplicates with respect to the file names)
6) Replace all ';' by a ' ' in the reference_text column for future alignment 

In [None]:
def reformat_df(df, columns_drop, columns_rename, columns_add):
    """
    Build a reshaped version of ``df`` without modifying the input frame.

    Args:
        df (pd.DataFrame): Frame to reshape.
        columns_drop (list): Labels of the columns to remove.
        columns_rename (dict): Mapping ``old name -> new name``.
        columns_add (list): Labels of columns to create, each filled with "".
    Returns:
        pd.DataFrame: New frame with the columns dropped, renamed and added.
    """
    reshaped = df.drop(columns=columns_drop).rename(columns=columns_rename)
    # New columns start out as empty strings; they are filled in later.
    for new_column in columns_add:
        reshaped[new_column] = ""
    return reshaped

In [None]:
def match_files_df_recordings(df, recordings_folder_path, path_filters=None):
    """
    Synchronise the annotation frame with the .wav files found on disk.

    Rows whose file is missing on disk are dropped, recordings that have no row
    are ignored, and duplicates (by file name) are removed on both sides so that
    exactly one row corresponds to exactly one recording.

    Args:
        df (pd.DataFrame): Frame with a 'filename' column.
        recordings_folder_path (str): Folder that is walked recursively for .wav files.
        path_filters (iterable of str | str | None): Substrings that the directory
            path of a recording must all contain. A single string is treated as one
            filter. None disables the filter.
    Returns:
        tuple: (filtered frame with a fresh 0..n-1 index,
                list of recording paths with one path per file name,
                list of every .wav file name found, duplicates included).
    Raises:
        AssertionError: if the number of rows and the number of recordings differ
            after the synchronisation.
    """
    if path_filters is None:
        path_filters = ()
    elif isinstance(path_filters, str):
        # A bare string would otherwise be iterated character by character.
        path_filters = (path_filters,)

    recordings_filenames = []
    recordings_filepaths = []

    # Get list of all audio filenames and paths in recordings folder whose paths contains all path filters
    for root, _, files in os.walk(recordings_folder_path):
        if all(path_filter in root for path_filter in path_filters):
            for file in files:
                if file.endswith(".wav"):
                    recordings_filenames.append(file)
                    recordings_filepaths.append(os.path.join(root, file))

    # Get the list of filenames in the df
    df_filenames = df['filename'].tolist()

    # Get the list of filenames in the df file that are not in the recordings folder
    missing_recordings = list(set(df_filenames) - set(recordings_filenames))
    print(len(missing_recordings), " Files that are in the df but not in the recording folder (not processed): ", missing_recordings)

    # Get the list of filenames in the recordings folder that are not in the df
    extra_recordings = list(set(recordings_filenames) - set(df_filenames))
    print(len(extra_recordings), " Files that are in the recording folder but not in the df (not processed): ", extra_recordings)

    # Remove extra files from recordings_filepaths and missing files from df
    extra_lookup = set(extra_recordings)
    recordings_filepaths = [path for path in recordings_filepaths if os.path.basename(path) not in extra_lookup]
    df = df[~df['filename'].isin(missing_recordings)]

    # Remove the duplicate filenames from df and from recordings_filepaths.
    # Duplicates are detected on the file NAME: os.walk never yields the same full path
    # twice, so counting full paths could never find the same recording in two folders.
    print("Duplicate filenames in df (not processed): ", df[df['filename'].duplicated(keep=False)].sort_values('filename')['filename'].tolist())
    filename_counts = Counter(os.path.basename(path) for path in recordings_filepaths)
    print("Duplicate filenames in recordings folder (not processed): ", [item for item, count in filename_counts.items() if count > 1])
    df = df.drop_duplicates(subset=['filename'], keep='first')
    # Keep the first path met for each file name (mirrors keep='first' on the frame).
    first_path_by_filename = {}
    for path in recordings_filepaths:
        first_path_by_filename.setdefault(os.path.basename(path), path)
    recordings_filepaths = list(first_path_by_filename.values())

    # reset index of DataFrame
    df = df.reset_index(drop=True)

    # Verify number of files in DataFrame matches number of files in folder
    assert len(df) == len(recordings_filepaths), f"Number of files in DataFrame ({len(df)}) does not match number of files in folder ({len(recordings_filepaths)})"
    print("Number of files to be processed: ", len(df))

    return df, recordings_filepaths, recordings_filenames

In [None]:
### Change format of df to match the expected format ###
# NOTE: "filepath" is created here but is not listed in `desired_order`, so the
# reordering below drops it; it is filled in again once the recordings are matched.
df = reformat_df(df = df, 
                    columns_drop = ['date_time','timePoint', 'trial_id'], 
                    columns_rename = {"config":"reference_text", "trial_answer":"human_transcription", "file_name": "filename"}, 
                    columns_add = ["words_human_transcription", "human_alignment", "asr_response", "asr_comparison", "audio_length", "process_time", "filepath", "accuracy_og"])
# Reorder the DataFrame columns
desired_order = ['participant_id', 'session_id', 'language', 'session_form', 'phase',
                 'config_id', 'filename', 'reference_text', 'words_human_transcription',
                 'human_transcription', 'accuracy', "accuracy_og", 'notes', 'comments', 
                 'human_alignment', 'asr_response', 'asr_comparison', 'audio_length', 'process_time']
df = df[desired_order]


### Handle rows with NaN values in human_transcription column ###
print(df['human_transcription'].isnull().sum(), "rows with NaN values in human_transcription column and notes associated with these rows are", df[df['human_transcription'].isnull()]['notes'].unique(), "\n")
# Remove rows with NaN values in transcription column 
df = df.dropna(subset=['human_transcription']).reset_index(drop=True)


### Synchronize the DataFrame with the audio files by removing missing files and duplicates ###
df, recordings_filepaths, recordings_filenames = match_files_df_recordings(df = df, 
                                                                            recordings_folder_path = recordings_folder_path, 
                                                                            path_filters = path_filters)


# Word lists are ';'-separated in the source file; use spaces for the later alignment
df['reference_text'] = df['reference_text'].apply(lambda x: x.replace(";", " "))
# Use the path that was actually found on disk: recordings may sit in sub-folders of
# recordings_folder_path, in which case joining folder + filename points to a missing file.
filepath_by_filename = {os.path.basename(path): path for path in recordings_filepaths}
df['filepath'] = df['filename'].map(filepath_by_filename)

# Normalise the accuracy strings (trim, collapse runs of spaces), then keep a copy of
# the annotation in accuracy_og before any correction is applied
df['accuracy'] = df['accuracy'].apply(lambda x: x.strip())
df['accuracy'] = df['accuracy'].apply(lambda x: " ".join([i for i in x.split(" ") if i != ""]))
df['accuracy_og'] = df['accuracy']

# Pad row 339 with a trailing "NA" when it holds fewer than 13 scores.
# NOTE(review): 339 is a hard-coded index label and only targets the intended row as
# long as the input files and the filtering above do not change - confirm.
df.at[339, 'accuracy'] = df.at[339, 'accuracy'] + " NA" if len(df.at[339, 'accuracy'].split(" ")) < 13 else df.at[339, 'accuracy']

# For each filename in df, take the accuracy of the first matching row of df_ (XLSX)
# and let it override the CSV value whenever the two differ
xlsx_accuracy_by_filename = df_.drop_duplicates(subset='file_name', keep='first').set_index('file_name')['accuracy']
files_without_xlsx_accuracy = []
for idx, row in df.iterrows():
    filename = row['filename']
    accuracy = row['accuracy']
    accuracy_ = xlsx_accuracy_by_filename.get(filename)
    # A file missing from df_ or an empty (NaN) cell used to crash the loop; the CSV
    # accuracy is kept in that case and the file is reported below.
    if not isinstance(accuracy_, str):
        files_without_xlsx_accuracy.append(filename)
        continue
    # remove space before and after accuracy string
    accuracy_ = accuracy_.strip()
    if accuracy != accuracy_:
        # change the accuracy column in df to match the accuracy column in df_
        df.at[idx, 'accuracy'] = accuracy_

if files_without_xlsx_accuracy:
    print(len(files_without_xlsx_accuracy), "files without a usable accuracy in the XLSX ground truth (CSV accuracy kept):", files_without_xlsx_accuracy)

# Fill the audio_length column by adding the audio duration for each file in recordings_filepaths and get the total time
total_time = 0
for audio_path in recordings_filepaths:
    filename = os.path.basename(audio_path)
    if filename in df['filename'].values:
        audio_length = librosa.get_duration(path=audio_path)
        df.loc[df['filename'] == filename, 'audio_length'] = audio_length
        total_time += audio_length
print("\nTotal time of audio files:", total_time, "seconds =", total_time/60, "minutes")

# replace all the 'NA' with 0 in the accuracy column
df['accuracy'] = df['accuracy'].apply(lambda x: x.replace("NA", "0"))

# Save the updated df to a JSON file; create the output folder first so that the cell
# also runs on a fresh checkout where dfs/deco does not exist yet
output_json_path = 'dfs/deco/Intervention_Decoding_og_df.json'
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
df.to_json(output_json_path, orient='records')

In [None]:
# Sanity check: there must be as many distinct reference texts as distinct config_ids
unique_reference_texts = df['reference_text'].unique()
unique_config_ids = df['config_id'].unique()
assert len(unique_reference_texts) == len(unique_config_ids), "Number of unique reference text and config_id do not match"
print("Number of unique reference text and config_id match and is", len(unique_reference_texts), "\n")

# List the reference text(s) attached to every config_id (a single one is expected per id)
for config_id in sorted(unique_config_ids):
    print(config_id, df.loc[df['config_id'] == config_id, 'reference_text'].unique())

In [None]:
# Count the participants that were recorded with more than one session_form
session_forms_per_participant = df.groupby('participant_id')['session_form'].nunique()
print("\nNumber of participants with multiple session_form (A, B, C) values: ", len(session_forms_per_participant[session_forms_per_participant > 1]))

# Number of distinct participants per language, broken down by config_id and by phase
for lan in df['language'].unique():
    in_language = df['language'] == lan
    print("\n Total number of unique participants for each config_id in", lan)
    for config_id in df['config_id'].unique():
        filtered_df = df[in_language & (df['config_id'] == config_id)]
        # one line per config_id: "<config_id> :  <count>"
        print(str(config_id) + " :  " + str(len(filtered_df['participant_id'].unique())))
    print(" \nTotal number of unique participants for each phase in", lan)
    for phase in df['phase'].unique():
        # The part of each phase value before the first "_" is compared with the full phase value
        phase_matches = df['phase'].apply(lambda x: x.split("_")[0] == phase)
        filtered_df = df[in_language & phase_matches]
        print(phase + " :  " + str(len(filtered_df['participant_id'].unique())) + " participants")


In [None]:
# Preview: print reference_text, human_transcription, accuracy and config_id row by row,
# stopping after the row whose index label is 10
preview_columns = ('reference_text', 'human_transcription', 'accuracy', 'config_id')
for idx, row in df.iterrows():
    for column in preview_columns:
        print(row[column])
    if idx == 10:
        break


## Transcription Normalization

Get transcriptions without the annotations and write them in the words_human_transcription column. These will be fed to the ASR during training so they need to contain what was said without any comment or mark.
- Remove the sounding out annotation '.' in the transcription : pa.pa --> papa
- Remove the insertion annotation '{...}' in the transcription: the braces are replaced by spaces, so the inserted part is kept as a separate token : pa{pu}pa --> pa pu pa
- Remove the comments {inaudible}, {comments}, {adult_support}, {adult_task}, {no audio}, {pause}, {cut recording}


In [None]:
# Fill the words_human_transcription column with the human transcription with removed annotations
def remove_transcription_annotations(transcription):
    """
    Removes clinician annotations from a given transcription.

    Steps, in order:
      1. Sounding-out marks: a letter repeated around '.' is kept once
         ("a.ar" -> "ar", "a.a.ar" -> "ar"), then every remaining '.' is removed
         ("pa.pa" -> "papa").
      2. The fixed comment tags ({inaudible}, {comments}, {adult_support},
         {adult_task}, {no audio}, {pause}, {cut recording}) are deleted.
      3. Remaining braces are replaced by spaces, which keeps their content.
      4. Anything between [...] or (...) is deleted.

    Args:
        transcription (str): The input transcription string.
    Returns:
        str: The transcription string with annotations removed. Whitespace left
            behind by removed annotations is not collapsed.
    """
    # Collapse a letter repeated across one OR MORE dots. Matching a single "x.x" pair
    # at a time left a duplicate letter for chains, because matches cannot overlap
    # ("a.a.ar" became "aar").
    transcription = re.sub(r'(\w)(?:\.\1)+', r'\1', transcription)
    transcription = transcription.replace(".", "")

    # Comment tags that carry no spoken content
    comment_tags = (
        "{inaudible}",
        "{comments}",
        "{adult_support}",
        "{adult_task}",
        "{no audio}",
        "{pause}",
        "{cut recording}",
    )
    for tag in comment_tags:
        transcription = transcription.replace(tag, "")

    # Any other {...} is an insertion: keep what was said, drop the braces
    transcription = transcription.replace("{", " ")
    transcription = transcription.replace("}", " ")

    # remove all between brackets and parenthesis
    transcription = re.sub(r'\[.*?\]', '', transcription)
    transcription = re.sub(r'\(.*?\)', '', transcription)

    return transcription

# Annotation-free text of every human transcription
df['words_human_transcription'] = df['human_transcription'].apply(remove_transcription_annotations)

The transcriptions in our dataset exhibit inconsistencies where the same word can be spelled differently across rows. To ensure consistent labeling for our finetuning task and accurate evaluation of the ASR performance, we need to normalize the spelling of correctly uttered words. To achieve this, we create a replacement dictionary that maps each variant spelling to a normalized spelling. 

To create the replacement dictionary, we iterate over each list of (pseudo)words. For each (pseudo)word in this list, we select only the rows where the clinician marked the word as correctly pronounced (where its value in the accuracy list is 2). We then check for discrepancies between the reference text and human transcription for these rows and add any identified discrepancies to our dictionary.

In [None]:
# Dictionary for replacements: maps each variant spelling found in the transcriptions
# to the normalized spelling of the word.
# To find all spelling of a word I did not consider the utterances where score for the word is 0
# Keys and values are bare words without surrounding whitespace; the cell that applies
# the dictionary adds the delimiting spaces itself. Each key is listed once.
replacements = {
    "rui": "ruit",
    "digasscelle":"digacelle",
    "parosse": "paroce",
    "re": "reux",
    "reu": "reux",
    "bante": "bente",
    "dui": "duie",
    "djui": "duie",
    "englage": "anglage",
    "anglaje": "anglage",
    "trèb": "trèbe",
    "piro": "pireau",
    "lizoi": "lisoie",
    "milvené": "milvenet",
    "conjo": "conjeau",
    "éteur": "héteur",
    "eteur": "héteur",
    "pléfassion": "pléfation",
    "sorpil": "sorpile",
    "é": "haie",
    "aie": "haie",
    "ai": "haie",
    "chanbre": "chambre",
    "crane": "crâne",
    "hotel": "hôtel",
    "otel": "hôtel",    
    "otèl": "hôtel",
    "pomier": "pommier",
    "pomié": "pommier",
    "pommié": "pommier",
    "cœur": "coeur",
    "keur": "coeur",
    "baskét": "basket",
    "baskèt": "basket",
    "réci": "récit",
    "réssi": "récit",
    "sette": "sept",
    "séte": "sept",
    "clé": "clef",
    "tradission": "tradition",
    "sisse": "six",
    "sis": "six",
    "cieu": "cieux",
    "sieu": "cieux",
    "départemen": "département",
    "éco": "écho",
    "éko": "écho",
    "logie": "logis",
    "logi": "logis",
    "intéligence": "intelligence",
    "intélligence": "intelligence",
    "inteligence": "intelligence",
    "oposition": "opposition",
    "opozission": "opposition",
    "creu": "creux",
    "foir": "foire",
    "chouétte": "chouette",
    "chouéte": "chouette",
    "cuizinié": "cuisinier",
    "cuisinié": "cuisinier",
    "cuisiniez": "cuisinier",
    "céllier": "cellier",
    "sélié": "cellier",
    "célier": "cellier",
    "éstragon": "estragon",
    "instrument":"instruments",
    "instrumen":"instruments",
    "malédiksion":"malédiction",
    "malédikssion":"malédiction",
    "lé": "lait",
    "lai": "lait",
    "dézir": "désir",
    "joi": "joie",
    "éskalope": "escalope",
    "eskalope": "escalope",
    "béré": "béret",
    "boulo": "boulot",
    "bouleau": "boulot",
    "bohneur": "bonheur",
    "boneur": "bonheur",
    "paté": "pâté",
    "voi": "voix",
    "voie": "voix",
    "fassade": "façade",
    "argil": "argile",
    "gigo": "gigot",
    "jigo": "gigot",
    "duvé": "duvet",
    "convalésense": "convalescence",
    "convalésensse": "convalescence",
    "exitation": "excitation",
    "exsitation": "excitation",
    "exsitassion": "excitation",
    "éksitassion": "excitation",
    "eksitassion": "excitation",
    "tron": "tronc",
    "poir": "poire",
    "mouettte": "mouette",
    "menuizier": "menuisier",
    "menuizié": "menuisier",
    "casié": "casier",
    "cazié": "casier",
    "cazier": "casier",
    "équateur": "equateur",
    "orkestre": "orchestre",
    #"orchkestre": "orchestre",
    "orkéstre": "orchestre",
    "réklamation": "réclamation",
    "réklamassion": "réclamation",
    "réclamassion": "réclamation",
    "rai": "raie",
    "ré": "raie",
    "rré": "raie",
    "grève": "grève",
    "stan": "stand",
    "stande": "stand",
    "crète": "crête",
    "créte": "crête",
    "saizon": "saison",
    "sézon": "saison",
    "pompié": "pompier",
    "ponpié": "pompier",
    "sœur": "soeur",
    "ser": "soeur",
    "seur": "soeur",
    "sseur": "soeur",
    "regré": "regret",
    "rregré": "regret",
    "fusi": "fusil",
    "fuzi": "fusil",
    "fisse": "fils",
    "aout": "août",
    "out": "août",
    "oute": "août",
    "producsion": "production",
    "produkssion": "production",
    "dieu": "dieux",
    "tenssion": "tension",
    "déménagemen": "déménagement",
    "toi": "toit",
    "join": "juin",
    "éskapade": "escapade",
    "eskapade": "escapade",
    "navé": "navet",
    "pante": "pente",
    "bandi": "bandit",
    "choi": "choix",
    "parade": "parade",
    "fossil": "fossile",
    "tui": "tuit",
    "cutisse": "cutice",
    "ze": "zeux",
    "bummon": "bumon",
    
    "pérné": "perné",
    "pernet": "perné",
    "bonfaje": "bonfage",
    "jel": "jeul",
    "tréme": "trème",
    "bukurelle": "bucurelle",
    "davo": "daveau",
    "ninoi": "ninoie",
    "trefoné": "trefonet",
    "tréfonet": "trefonet",
    "tréfoné": "trefonet",
    "tièj": "tiège",
    "tiééj": "tiège",
    "ronjo": "ronjeau",
    "trufondulance": "trufondulence",
    "uper": "hupeur",
    "gralassion": "gralation",
    "turlem": "turlème",
    "escro": "escroc",
    "eskro": "escroc",
    "éskro": "escroc",
    "éscro": "escroc",
    "taba": "tabac",
    "reconaissance": "reconnaissance",
    "rekonnaissance": "reconnaissance",
    "reconéssance": "reconnaissance",
    "reconnéssance": "reconnaissance",
    "aparition": "apparition",
    "apparition": "apparition",
    "aparission": "apparition",
    "apparission": "apparition",
    "blan": "blanc",
    "poi": "poids",
    "couète": "couette",
    "chemizié": "chemisier",
    "chemizier": "chemisier",
    "rozier": "rosier",
    "rozié": "rosier",
    "ékuyière": "écuyère",
    "ékuyère": "écuyère",
    "ékuiyère": "écuyère",
    "catastrof": "catastrophe",
    "catastrofe": "catastrophe",
    "déklarassion": "déclaration",
    "déclarassion": "déclaration",
    "pièje": "piège",
    "fame": "femme",
    "ffamme": "femme",
    "ffame": "femme",
    "secré": "secret",
    "ssecré": "secret",
    "fini": "finit",
    "mile": "mille",
    "sère": "cerf",
    "cer": "cerf",
    "cèr": "cerf",
    "cér": "cerf",
    "présision": "précision",
    "présizion": "précision",
    "préssizion": "précision",
    "précizion": "précision",
    "disse": "dix",
    "lieu": "lieux",
    "milion": "million",
    "débarkement": "débarquement",
    "débarkemen": "débarquement",
    "débarquemen": "débarquement",
    "nui": "nuit",
    "jou": "joue",
    "éscalade": "escalade",
    "éskalade": "escalade",
    "eskalade": "escalade",
    "valé": "valet",
    "jalou": "jaloux",
    "noi": "noix",
    "réptile": "reptile",
    "réptil": "reptile",
    "bé": "baie",
    "bai": "baie",
    "frére": "frère",
    "nombr": "nombre",
    "comande": "commande",
    "crèpe": "crêpe",
    "crèpes": "crêpe",
    "pinsson": "pinson",
    "jui": "juit",
    "goi": "goix",
    "gua": "goix",
    "ronbage": "rombage",
    "ronbaje": "rombage",
    "rombaje": "rombage",
    "jer": "jeur",
    "sidomèle": "sidomelle",
    "tivo": "tiveau",
    "firoi": "firoie",
    "timpuner": "timpunet",
    "timpuné": "timpunet",
    "tinpuné": "timpunet",
    "tinpunet": "timpunet",
    "fème": "phème",
    "tanjo": "tanjeau",
    "rœil": "reuil",
    "opeur": "hopeur",
    "oper": "hopeur",
    "nive": "nive",
    "jorcassion": "jorcation",
    "jorkassion": "jorcation",
    "jorcation": "jorcation",
    "jorkation": "jorcation",
    'jof': 'jauf',
    'reptil': 'reptile',
    'rekonéssance': 'reconnaissance',
    'reconnaissance': 'reconnaissance',
    'upeur': 'hupeur',
}

In [None]:
# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
print("Number of unique config_id: ", len(config_ids))

i = 0 # index of the config_id to be processed 
j = 0 # index of the word in the config_id to be processed

# Get all rows with the config_id we want to process.
# .copy() gives an independent frame, so converting its accuracy column below neither
# touches df nor triggers pandas' SettingWithCopyWarning.
df_config = df[df['config_id'] == config_ids[i]].copy()

print("Processed config_id: ", config_ids[i])
print("Processed reference_text:", df_config.iloc[0]['reference_text'])
print("Processed word in reference_text:", df_config.iloc[0]['reference_text'].split(" ")[j])


### Get all rows where the j element of the accuracy list is not 0 (if the accuracy list is bigger than j) ###
# Convert accuracy column to a list of integers (non-numeric tokens are skipped)
df_config['accuracy'] = df_config['accuracy'].apply(lambda x: [int(score) for score in x.split(" ") if score.isdigit()])

df_config = df_config[df_config['accuracy'].apply(lambda x: len(x) > j)]
df_config = df_config[df_config['accuracy'].apply(lambda x: x[j] != 0)]

# Show all rows where the j word in the reference_text column is not contained in the string in the words_human_transcription column
df_config = df_config[df_config.apply(lambda x: x['reference_text'].split(" ")[j] not in x['words_human_transcription'], axis=1)]
df_config

In [None]:
# Add a space before and after each key and value in the replacements dictionary to avoid replacing parts of words
# For example "re" in "arbre" will not replaced by "reux"
# strip() first: re-running the cell then does not stack extra spaces, and a value that
# already carries a stray space ends up with exactly one on each side.
replacements = {f" {k.strip()} ": f" {v.strip()} " for k, v in replacements.items()}
# Add a space before and after each sentence in words_human_transcription
df['words_human_transcription'] = df['words_human_transcription'].apply(lambda x: f" {x} ")
df_not_normalized = df.copy()

# Replace the keys with the values in the replacements dictionary.
# The delimiting spaces are matched with lookarounds instead of being consumed: with a
# plain " key " -> " value " replacement two consecutive variants share one space
# (" re re "), so the second occurrence was never replaced.
for k, v in replacements.items():
    variant, target = k.strip(), v.strip()
    pattern = rf"(?<= ){re.escape(variant)}(?= )"
    # A callable replacement inserts `target` literally, whatever characters it contains
    df['words_human_transcription'] = df['words_human_transcription'].str.replace(
        pattern, lambda _match, target=target: target, regex=True
    )

Code to check all the transcriptions containing a given word:

In [None]:
# Rows whose transcription, before spelling normalization, contains `word` as a whole token
word = 'ze'
has_word_before = df_not_normalized['words_human_transcription'].str.contains(f" {word} ")
df_row = df_not_normalized.loc[has_word_before]
df_row

After the normalization of the spelling:

In [None]:
# Same lookup on the normalized transcriptions
word = 'ze'
has_word_after = df['words_human_transcription'].str.contains(f" {word} ")
df_row = df.loc[has_word_after]
df_row

In [None]:
# Accuracy lists to compare against the annotations: one (recording filename, scores)
# tuple per entry, where scores is a list of integers. Further down, each list is
# compared position by position (first 12 positions) with the accuracy_og of the row
# that has the same filename.
# Some filenames appear more than once, sometimes with different score lists.
filepath_accuracy_change = [
    ("3185_edugame2023_eb2dc813ac324ef5b2d08b499c9b0195_e3c0dd9c87e04709b975d6dfd85edbf2.wav", [2, 1, 2, 2, 2, 1, 0, 1, 0, 0, 0, 1]),
    ("4006_edugame2023_97a6892868d5420687fa49368f83b649_88b0ebe874f44e12b10cf6332efc8e20.wav", [2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 0, 0]),
    ("3122_edugame2023_b9898abefb3445ada6d0490a7251f340_b75faaac524446539bb181d8a5d24d73.wav", [0, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1]),
    ("3722_edugame2023_6aa06c9452704d77aac910356caaf078_ab8320445b3841b083e01a53b09a547d.wav", [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
    ("3726_edugame2023_04781866635f47c38e712760a4206223_e09682d5d140493ab8c00158e4e49a66.wav", [0, 1, 0, 0, 2, 1, 0, 1, 1, 2, 1, 1]),
    ("3722_edugame2023_6aa06c9452704d77aac910356caaf078_ab8320445b3841b083e01a53b09a547d.wav", [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
    ("3184_edugame2023_18b68dd83a4b4506a5a8db11cf640e67_11e234fc5b5a45b7ba26ffc041fe04d4.wav", [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]),
    ("3722_edugame2023_6aa06c9452704d77aac910356caaf078_ab8320445b3841b083e01a53b09a547d.wav", [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    ("3111_edugame2023_9c0eb20452134660afa8aa9b8cc58579_ac6125a3f26649d38e25ac1f8c11d96d.wav", [1, 2, 0, 1, 2, 1, 2, 0, 2, 2, 1, 1]),
    ("3147_edugame2023_650080f2882548949fbcdac62d326398_b218dd10c2c44fa9b5a7b3d1af2c86c0.wav", [2, 2, 2, 1, 0, 2, 1, 1, 1, 2, 1, 2]),
    ("4012_edugame2023_c5f994b9a174432b826565f57098ca1a_bee3d3f7284e4b5eb7cc51a9611e9705.wav", [2, 2, 1, 0, 2, 0, 1, 0, 2, 2, 2, 2]),
    ("3125_edugame2023_8fe9370b1fff4db4bd4dc98613c4ecff_77cc7df1af174993b413df44cb3e5e4c.wav", [1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1]),
    ("3738_edugame2023_2b7f897c8b74431697be15e3ccf65a4f_cc163642455c4ae6b02d1d25f8cd7db0.wav", [0, 2, 1, 0, 2, 0, 1, 0, 2, 2, 1, 0]),
    ("3184_edugame2023_31e9ea5172f84cdfa0615027d60d61ba_ca59c03c6cf64cffafe9d5fa0c76ebbd.wav", [0, 0, 2, 0, 1, 2, 1, 0, 1, 2, 1, 1]),
    ("3158_edugame2023_a6fc181fb45d49fcb199afc489ee08bd_adbe989158fc4a419176d723c1cfa539.wav", [2, 1, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0]),
    ("3142_edugame2023_b5e0a9ac5743470eb758689a59610bab_5b17b70717ba45a091298c7896f2b036.wav", [1, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1]),
    ("3108_edugame2023_f4922707ff73491a85766bbcd62ef92f_5c27530df514442794960fd7201e83f1.wav", [0, 1, 0, 1, 0, 2, 1, 0, 0, 0, 1, 1]),
    ("3169_edugame2023_993e50a9ebbd471a8af41f6869e05a8a_fd8af6717dba4e219886a4d30a0f7cd0.wav", [1, 1, 0, 0, 0, 2, 2, 1, 1, 0, 0, 0]),
    ("3109_edugame2023_36a8df2696404c11985b72bc9a41bd72_f473e404e593433a9ddc524334856fe3.wav", [2, 1, 1, 0, 2, 1, 1, 0, 1, 1, 2, 0]),
    ("3169_edugame2023_993e50a9ebbd471a8af41f6869e05a8a_6680c0eb270c4b49a3889b3c37043dbc.wav", [2, 2, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1]),
    ("3180_edugame2023_954e1af83a994241a3bfe5d1954f7b6c_d33d0fbb5f124451b6c67da40ba46edc.wav", [2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2]),
    ("3138_edugame2023_75285312c1be4e859b964aa0f6164829_5f5febf8ea034755b15a46e453252548.wav", [0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 1, 1]),
    ("3407_edugame2023_8f88e8025a4743299b7785ae1ad9922e_0d77d0c00ebb429cb867faf1f50b0874.wav", [1, 2, 1, 0, 0, 2, 1, 0, 2, 2, 0, 1]),
    ("3104_edugame2023_6070e62649ff4888b5f4758f506e22f7_7979e246a17847629355c5256acc13d5.wav", [2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 1, 0]),
    ("3715_edugame2023_133ad28ca5ca40bfbc0fd075d23c0b56_aab9b04fb5cb41c486aa68e2c72960d6.wav", [2, 1, 2, 1, 0, 0, 0, 2, 2, 2, 1, 0]),
    ("3727_edugame2023_302578ea2c9b4c9b90c2d464cc282439_07d80a22c87343b1adfa0f161c6e7f63.wav", [2, 1, 2, 1, 2, 0, 0, 1, 2, 2, 2, 2]),
    ("4023_edugame2023_2c0c9d7e7af24547ae0e809494dfc1f7_5e23ce98d6a745d7b76f354cd9b9204c.wav", [2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 0]),
    ("3151_edugame2023_60c1231162604cd9ad81ca6c717ca38e_1abd9770763e4a5e80dda765d59413b0.wav", [0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0]),
    ("3724_edugame2023_feab4682e57641e6a52094551121bdc2_f1c75f122e9243059e0c08be59982a9a.wav", [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
    ("3151_edugame2023_60c1231162604cd9ad81ca6c717ca38e_1abd9770763e4a5e80dda765d59413b0.wav", [0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0]),
    ("3102_edugame2023_b138ec8efbd74ef4a5ae3c3c941907ea_cf80243db2fa41cd954544e152c69232.wav", [0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0]),
    ("3143_edugame2023_c7acb8b9615147deb6f1032edf65aea6_04693034c1ce4d65bf2103428f60a3c4.wav", [0, 2, 1, 1, 2, 0, 0, 1, 1, 0, 0, 1]),
    ("3723_edugame2023_abbc25c5f4d34f8aad4c55484d7193b0_01e7c48162154f12978a4997163ec327.wav", [0, 0, 2, 1, 2, 2, 2, 1, 2, 0, 0, 2]),
    ("4007_edugame2023_bf4e28a282db4656b2b1c8475c2929a9_ee8710f9ad264ec0848dfd1922fe8c58.wav", [0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2]),
    ("3723_edugame2023_abbc25c5f4d34f8aad4c55484d7193b0_1c252027d4c348c2ac832836d4ebea73.wav", [2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0]),
    ("3168_edugame2023_63642ca494544fac819152eb8b0935de_44af16457b494216a50322517868b68c.wav", [2, 1, 2, 0, 2, 1, 0, 1, 0, 1, 2, 1]),
    ("3424_edugame2023_a3174ee9d0ff45a9b9bbf07f9a7abef1_827c2cab29f7450dab42f28ca7d5d084.wav", [2, 1, 2, 0, 2, 1, 0, 2, 2, 1, 1, 2]),
    ("3411_edugame2023_dc2803354f504d52b183f115e06fe140_8bc4a0ee5a20402b9c00010e4f64ddb2.wav", [2, 1, 1, 1, 2, 1, 0, 0, 2, 1, 2, 1]),
    ("3753_edugame2023_80ef3443d7bd49b597ef9159c5c712bd_c7aae3c47a0b45bb81ac3c3e4ee9ede2.wav", [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]),
    ("3723_edugame2023_299b733d8cda4565beb8895b9000f3b7_d036def5fd4f4fca97f7e9961f7a4d24.wav", [1, 1, 1, 0, 2, 1, 0, 1, 1, 2, 0, 2]),
    ("4013_edugame2023_1620c0ec6c0d482a96954e681fd5260f_1c3d9ba9c45a4daea91ca50cd9ac2ec9.wav", [1, 2, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1]),
    ("3155_edugame2023_f3467c3935ed43be849ec26ac7aaab86_8678cc14130248ab81708e7d5722f336.wav", [1, 0, 1, 1, 1, 0, 2, 1, 2, 2, 1, 0]),
    ("3722_edugame2023_6aa06c9452704d77aac910356caaf078_ab8320445b3841b083e01a53b09a547d.wav", [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    ("3722_edugame2023_6aa06c9452704d77aac910356caaf078_ab8320445b3841b083e01a53b09a547d.wav", [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    ("3151_edugame2023_60c1231162604cd9ad81ca6c717ca38e_1abd9770763e4a5e80dda765d59413b0.wav", [0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0]),
    ("3156_edugame2023_c183500d36fd40869ec0e3971031afcc_0fe79f18b8104f1e8761cdc4b2f398bf.wav", [2, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0]),
    ("3710_edugame2023_9d66bb18ffc54188a0a8eb00785d93db_6bb0d5e00b9c43a7bb35107f65da5e60.wav", [2, 2, 0, 1, 2, 1, 0, 1, 2, 1, 2, 2]),
    ("3185_edugame2023_eb2dc813ac324ef5b2d08b499c9b0195_ab74571a9a0a4223a4436224c53f1c75.wav", [0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
    ("3107_edugame2023_66a9dee045c844c0a31d803c4992414c_7f3a77ac7cc04195b0970f9114532ba8.wav", [2, 2, 0, 0, 0, 2, 0, 1, 1, 1, 0, 1]),
    ("3107_edugame2023_66a9dee045c844c0a31d803c4992414c_3453b2a9513d41fdba7a85293ce23bc8.wav", [0, 0, 2, 1, 2, 2, 0, 0, 0, 0, 1, 1]),
    ("3184_edugame2023_31e9ea5172f84cdfa0615027d60d61ba_ca59c03c6cf64cffafe9d5fa0c76ebbd.wav", [0, 0, 2, 0, 1, 2, 1, 1, 1, 2, 1, 1]),
    ("3161_edugame2023_9889b3877b724fb0abdce7aa9c0daffc_b7b7ef370786475eb73c1d56af3358d1.wav", [0, 1, 0, 1, 2, 2, 1, 0, 0, 0, 0, 0]),
    ("3108_edugame2023_f4922707ff73491a85766bbcd62ef92f_825cd09d574f4fdf9ee839688959b02b.wav", [1, 2, 1, 1, 0, 1, 2, 1, 2, 2, 1, 1]),
    ("3119_edugame2023_961f25cb239c493f8dc868646059fe60_bd422dbdc0554417985de26a5770fdbd.wav", [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
    ("3403_edugame2023_ce4ce6660a524aa1b20d24092632d43b_d216811b566d4213b002c5b24c7d4eb6.wav", [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0]),
    ("3150_edugame2023_8c2562ac45b543e3814d7230261ccaac_85ea381846c34845b2406fa419b4222d.wav", [2, 1, 0, 2, 1, 0, 0, 2, 1, 0, 1, 1]),
    ("3177_edugame2023_4e6107c4aac043c592aa870d006b95e9_fcbd089589fd4e548f68f3cedd677ed7.wav", [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1]),
    ("3177_edugame2023_4e6107c4aac043c592aa870d006b95e9_fcbd089589fd4e548f68f3cedd677ed7.wav", [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1]),
    ("3749_edugame2023_f4704c3cddcf4bc6b6ba6c73fba064ab_368a44dd0b2d4ac396ff8883148b11ea.wav", [1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
    
    ]

# Spellings ending in "eu"; not referenced in the statements reviewed here.
eu_list = ['dieu', 'creu', 'lieu', 'cieu', 'reu', 'jeu', 'zeu']

# (filename, word index) pairs recorded by the comparison loop further down for every
# position it treats as an insertion / sounding-out correction.
tuple_insertion = []
tuple_soundingout = []
# Not appended to in the code reviewed here.
tuple_eu = []

# Hand-written (filename, word index) corrections, processed by the three loops below.
insertion_changes = []
sounding_out_changes = []
eu_changes = [('4011_edugame2023_d7b4c8c9103f4b15852a4d530293a2af_c26218cebeec4d1ba464a7bb9d225dae.wav', 4)]

# Number of positions handled per category (incremented by the loops below).
insertion_count = 0
sounding_out_count = 0
eu_count = 0

# Apply the hand-written corrections. Each entry is (filename, word index); the accuracy
# string of the first row with that filename is split, patched at that index and
# written back.

# Insertions: a word scored as non-zero is reset to 0
for filename, i in insertion_changes:
    index = df[df['filename'] == filename].index[0]
    new_acc = df.loc[index, 'accuracy'].split(" ")
    if new_acc[i] != '0':
        new_acc[i] = '0'
        insertion_count += 1
    df.loc[index, 'accuracy'] = " ".join(new_acc)

# Sounding out: a word scored 0 becomes non-zero.
# The original statement was the comparison `new_acc[i] != '0'`, which changed nothing
# while the counter was still incremented.
# NOTE(review): '1' follows the "replace ... with 1" comment of the comparison loop
# further down - confirm that 1 (and not 2) is the intended score.
for filename, i in sounding_out_changes:
    index = df[df['filename'] == filename].index[0]
    new_acc = df.loc[index, 'accuracy'].split(" ")
    if new_acc[i] == '0':
        new_acc[i] = '1'
        sounding_out_count += 1
    df.loc[index, 'accuracy'] = " ".join(new_acc)

# "eu" words: a word scored as non-zero is reset to 0.
# The original statement was the comparison `new_acc[i] == '0'`, so the entry listed in
# eu_changes was counted but never applied.
for filename, i in eu_changes:
    index = df[df['filename'] == filename].index[0]
    new_acc = df.loc[index, 'accuracy'].split(" ")
    if new_acc[i] != '0':
        new_acc[i] = '0'
        eu_count += 1
    df.loc[index, 'accuracy'] = " ".join(new_acc)

# For each (filename, accuracy list) in filepath_accuracy_change, compare the list with
# the accuracy_og of the matching row and apply the differences to the accuracy column:
#   - "insertion": the list says 0 where accuracy_og is non-zero -> the score is set to 0
#   - "sounding out": the list is non-zero where accuracy_og is 0 -> the score is set to
#     the value from the list
for filepath, accuracy in filepath_accuracy_change:
    # per-word scores of the list as strings, and the same scores as one string
    corrected_scores = [str(i) for i in accuracy]
    accuracy = " ".join(corrected_scores)
    og_accuracy = df[df['filename'] == filepath]['accuracy_og'].values[0]
    # replace all 'NA' by '0' and remove all spaces before and after the string
    og_accuracy = og_accuracy.replace("NA", "0")
    og_accuracy = og_accuracy.strip()
    og_scores = og_accuracy.split(" ")
    #  ensure length of accuracy is 12
    assert len(og_scores) == 12
    if accuracy != og_accuracy:
        # indices where the two accuracy lists disagree on zero / non-zero
        i_insertion = [i for i in range(12) if (corrected_scores[i] == '0' and og_scores[i] != '0')]
        i_soundingout_false = [i for i in range(12) if (corrected_scores[i] != '0' and og_scores[i] == '0')]
        if i_insertion or i_soundingout_false:
            index = df[df['filename'] == filepath].index[0]
            new_acc = df.loc[index, 'accuracy'].split(" ")
            for i in i_insertion:
                if new_acc[i] != '0':
                    new_acc[i] = '0'
                    insertion_count += 1
                    tuple_insertion.append((filepath, i))
            for i in i_soundingout_false:
                if new_acc[i] == '0':
                    # The original statement was the comparison `new_acc[i] != '0'`:
                    # the change was counted and recorded but never written.
                    # NOTE(review): the score from the list (1 or 2) is used here; the
                    # previous comment said "with 1" - confirm which one is intended.
                    new_acc[i] = corrected_scores[i]
                    sounding_out_count += 1
                    tuple_soundingout.append((filepath, i))
            df.loc[index, 'accuracy'] = " ".join(new_acc)

# Automatic "eu" correction: for every reference word whose aligned
# transcription word ends in 'eu' (and is not listed in eu_list), force a
# non-zero score to '0'.
# Files excluded from the automatic correction.
eu_excluded_filenames = ['3729_edugame2023_ee652fd5e53141f8a6e73c02097dab53_0d4cd554e29e4ce9b036b3950844f3e9.wav', '4019_edugame2023_2277afada0e2440abe5fc2d84f7becd2_5b361cfb24f843afae6bfbf448d43e75.wav', '4011_edugame2023_d7b4c8c9103f4b15852a4d530293a2af_c26218cebeec4d1ba464a7bb9d225dae.wav']

for index, row in df.iterrows():
    # assumes align_texts returns a list of (transcribed_word_or_None, reference_word)
    # pairs, as the indexing below implies -- TODO confirm against segment_service
    aligned_pairs = align_texts(row['words_human_transcription'], row['reference_text'])
    reference_words = row['reference_text'].split(" ")

    # `row` is a snapshot taken when the iteration reached this row. Keep one
    # live score list per row so that several corrections in the same row
    # accumulate instead of overwriting each other, and so that a position
    # already set to '0' is not counted a second time.
    scores = row['accuracy'].split(" ")
    row_changed = False

    for i, reference_word in enumerate(reference_words):
        # look at every aligned pair whose reference side is this word
        for pair in aligned_pairs:
            if pair[1] != reference_word:
                continue
            transcribed = pair[0]
            if transcribed is not None and transcribed.endswith('eu') and transcribed not in eu_list and scores[i] != '0':
                if row['filename'] not in eu_excluded_filenames:
                    previous_acc = " ".join(scores)
                    scores[i] = '0'
                    # show the score string before and after the change
                    print(previous_acc, " ".join(scores))
                    tuple_eu.append((row['filename'], i))
                    eu_count += 1
                    row_changed = True

    if row_changed:
        df.loc[index, 'accuracy'] = " ".join(scores)



# Report, for each kind of correction, how many scores were touched and which
# (filename, index) pairs were recorded.
report_sections = (
    (
        f"Insertion count: {insertion_count}",
        "This means that the accuracy was changed from 1 to 0 for the following filename and index:",
        tuple_insertion,
    ),
    (
        f"Sounding out count: {sounding_out_count}",
        "This means that the accuracy was changed from 0 to 1 for the following filename and index:",
        tuple_soundingout,
    ),
    (
        f"eu count: {eu_count}",
        "This means that the accuracy was changed from 1 to 0 for the following filename and index:",
        tuple_eu,
    ),
)
for headline, explanation, changed_pairs in report_sections:
    print(headline)
    print(explanation)
    print(changed_pairs)
    print()

# calculate the values of ratios
total_nb_scores = 930 * 12 # 930 files with 12 words each
insertion_ratio, sounding_out_ratio, eu_ratio = (
    changed / total_nb_scores
    for changed in (insertion_count, sounding_out_count, eu_count)
)

print(f"Insertion ratio: {insertion_ratio}")
print(f"Sounding out ratio: {sounding_out_ratio}")
print(f"eu ratio: {eu_ratio}")

In [None]:
# find all rows where the accuracy is not length 12
# (the filter used to test `== 12`, its result was discarded, and the whole df
# was displayed instead; the filtered frame is now the cell output)
df[df['accuracy'].apply(lambda x: len(x.split(" ")) != 12)]

In [None]:
# List the rows where a word has a non-zero score although the reference word
# does not occur in the normalized human transcription.

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
count = 0  # number of (row, word) mismatches printed
i = 0 # index of the config_id to be processed 
j = 0 # index of the word in the config_id to be processed
for i in range(len(config_ids)):
    # Parse the accuracy strings into lists of ints once per config_id instead
    # of once per word. .assign() returns a new frame, so df is left untouched
    # and no SettingWithCopyWarning is raised (the code used to assign through
    # .loc on a filtered slice of df).
    df_parsed = df[df['config_id'] == config_ids[i]].assign(
        accuracy=lambda frame: frame['accuracy'].apply(lambda x: [int(tok) for tok in x.split(" ") if tok.isdigit()])
    )
    # Keep only the rows with exactly 12 numeric scores, so x[j] is always valid below
    df_parsed = df_parsed[df_parsed['accuracy'].apply(lambda x: len(x) == 12)]

    for j in range(12):
        # Rows where the j-th score is not 0
        df_config = df_parsed[df_parsed['accuracy'].apply(lambda x: x[j] != 0)]

        # ... and where the j-th reference word is not contained (substring test) in words_human_transcription
        df_config = df_config[df_config.apply(lambda x: x['reference_text'].split(" ")[j] not in x['words_human_transcription'], axis=1)]
        if len(df_config) > 0:
            print("################# Reference text: ", df_config.iloc[0]['reference_text'])
            print("######## Processed word ", df_config.iloc[0]['reference_text'].split(" ")[j])
            for index, row in df_config.iterrows():
                print('filename: ', row['filename'])
                print('normalized transcription: ', row['words_human_transcription'])
                print('transcription: ',row['human_transcription'])
                print('accuracy: ',row['accuracy'])
                count += 1
                print()
count

In [None]:
# List the rows where a word has a score of 0 although the reference word
# appears as a whole token in the normalized human transcription.

# List of all config_ids ( = lists of (pseudo)words ) in the df
config_ids = df['config_id'].unique()
count = 0  # number of (row, word) mismatches printed
i = 0 # index of the config_id to be processed 
j = 0 # index of the word in the config_id to be processed
for i in range(len(config_ids)):
    # Parse the accuracy strings into lists of ints once per config_id instead
    # of once per word. .assign() returns a new frame, so df is left untouched
    # and no SettingWithCopyWarning is raised (the code used to assign through
    # .loc on a filtered slice of df).
    df_parsed = df[df['config_id'] == config_ids[i]].assign(
        accuracy=lambda frame: frame['accuracy'].apply(lambda x: [int(tok) for tok in x.split(" ") if tok.isdigit()])
    )
    # Keep only the rows with exactly 12 numeric scores, so x[j] is always valid below
    df_parsed = df_parsed[df_parsed['accuracy'].apply(lambda x: len(x) == 12)]

    for j in range(12):
        # Rows where the j-th score is 0
        df_config = df_parsed[df_parsed['accuracy'].apply(lambda x: x[j] == 0)]

        # ... and where the j-th reference word is one of the space-separated tokens of words_human_transcription
        df_config = df_config[df_config.apply(lambda x: x['reference_text'].split(" ")[j] in x['words_human_transcription'].split(" "), axis=1)]
        if len(df_config) > 0:
            print("################# Reference text: ", df_config.iloc[0]['reference_text'])
            processed_word = df_config.iloc[0]['reference_text'].split(" ")[j]
            # words listed here are not reported
            if processed_word not in ["reux",]:
                print("######## Processed word ", processed_word)
                for index, row in df_config.iterrows():
                    print('filename: ', row['filename'])
                    print('normalized transcription: ', row['words_human_transcription'])
                    print('transcription: ',row['human_transcription'])
                    print('accuracy: ',row['accuracy'])
                    count += 1
                    print()
                    # Manual-correction hook: put row labels in the list to rewrite their score.
                    # NOTE(review): every row reaching this point already has accuracy[j] == 0
                    # (see the filter above), so the inner condition can never be true as written.
                    if index in []:#[435, 177]:
                        # change position j of digit in accuracy to 0
                        accuracy = row['accuracy']
                        if accuracy[j] != 0:
                            accuracy[j] = 0
                            df.loc[index, 'accuracy'] = " ".join([str(i) for i in accuracy])
count

Code to check the mismatch for a given row:

In [None]:
# Print the fields needed to judge a mismatch for the row labelled `index`.
index = 796
fields_to_show = (
    ('Filename : ', 'filename'),
    ('Reference Text : ', 'reference_text'),
    ('Human Transcription : ', 'human_transcription'),
    ('Words Human Transcription : ', 'words_human_transcription'),
    ('Accuracy : ', 'accuracy'),
)
inspected_row = df.loc[index]
for label, column in fields_to_show:
    print(label + inspected_row[column])
print()

### What are the duplicate trials ?

## Data Cleaning

In [None]:
# Sanity check: show every row whose accuracy string does not split into
# exactly 12 space-separated scores (an empty result means all rows are fine).
has_wrong_length = df['accuracy'].apply(lambda scores: len(scores.split(" ")) != 12)
df[has_wrong_length]

In [None]:
# Keep an untouched snapshot so the removed rows can be collected at the end.
df_not_cleaned = df.copy()
print("Total number of rows in df  before removal: ", len(df))

# Make sure the output folders exist so the exports below do not fail on a fresh checkout.
os.makedirs('dfs/deco/train', exist_ok=True)

# Step 1: drop the duplicate trials and export the result.
df = df[~df['notes'].str.strip().isin(["duplicate_trial"])]
df.to_json('dfs/deco/Intervention_Decoding_df.json') #######
print("Total number of rows in df after removing duplicate trials: ", len(df))

# Step 2: build the training set by also dropping rows flagged in 'notes',
# rows whose transcription mentions "adult", and rows with an empty
# normalized transcription.
df = df[~df['notes'].str.strip().isin(["not_doing_task", "inaudible", "not_doing_task inaudible", "duplicate_trial"])]
# na=False: a missing transcription counts as "does not contain", so the
# negation below cannot fail on NaN; regex=False: plain substring match.
df = df[~df['human_transcription'].str.contains("adult", na=False, regex=False)]
df = df[~df['words_human_transcription'].str.strip().isin([""])]
df.to_json('dfs/deco/train/Intervention_Decoding_df.json')
print("Total number of rows in df after removing rows unusable for training: ", len(df))


# create a df with all the rows that have been removed for training
# (duplicate trials are left out of it)
df_removed = df_not_cleaned[~df_not_cleaned.index.isin(df.index)]
df_removed = df_removed[~df_removed['notes'].str.strip().isin(["duplicate_trial"])]
df_removed.to_json('dfs/deco/Intervention_Decoding_removed_df.json')

In [None]:
# Display the rows that are in df_not_cleaned but not in the cleaned df (duplicate trials excluded).
df_removed

In [None]:
# type of accuracy column of first row
# Positional access (.iloc) is used because df was filtered in the cleaning cell
# above, so a row labelled 0 is not guaranteed to exist any more (df.loc[0]
# would then raise a KeyError).
print("Type of accuracy column of first row: ", type(df.iloc[0]['accuracy']))