In [62]:
import json
import pandas as pd
import os
import re
from collections import Counter
import librosa
import random
from segment_service import align_texts
import IPython.display as ipd

# The paths

In [63]:
# Root folder that is walked recursively to find the .wav recordings.
recordings_folder_path = r"../DANA/Audios"
# Excel workbook that is read into the main DataFrame `df`.
decoding_trial_path = r"../DANA/Transcription_patients.xlsx"

# Data Pre-Processing

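Load the Excel sheet into a DataFrame, rename its columns to the expected format, and synchronise its rows with the audio recordings found on disk.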

## Modification of the database

In [64]:
def reformat_df(df, columns_drop, columns_rename, columns_add):
    """Return a reshaped copy of ``df``; the input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reshape.
    columns_drop : list
        Column labels to remove.
    columns_rename : dict
        Mapping ``{old_label: new_label}`` applied after the drop.
    columns_add : iterable
        Labels of columns to create (or overwrite), each filled with the
        empty string ``""``.

    Returns
    -------
    pandas.DataFrame
        The dropped / renamed frame with the placeholder columns appended.
    """
    reshaped = df.drop(columns=columns_drop).rename(columns=columns_rename)
    for new_column in columns_add:
        # Placeholder value; callers fill these columns in later.
        reshaped[new_column] = ""
    return reshaped

In [65]:
def match_files_df_recordings(df, recordings_folder_path, path_filters=None):
    """Synchronise a DataFrame with the ``.wav`` files found under a folder.

    The folder is walked recursively. Rows whose ``filename`` has no matching
    recording are dropped, recordings that are not listed in the DataFrame are
    ignored, and duplicates are removed on both sides (the first occurrence is
    kept) so that the result is a one-to-one match on the file name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``filename`` column holding the recordings' base names.
    recordings_folder_path : str
        Root folder searched recursively for ``.wav`` files.
    path_filters : iterable of str, optional
        Only directories whose path contains *all* of these substrings are
        searched. ``None`` (default) means no filtering.

    Returns
    -------
    df : pandas.DataFrame
        Filtered, de-duplicated frame with a fresh 0..n-1 index.
    recordings_filepaths : list of str
        One path per remaining row (unique by base name), in directory-walk
        order. The order is not aligned with the rows of ``df``.
    recordings_filenames : list of str
        Base names of *all* ``.wav`` files found, before any filtering or
        de-duplication.

    Raises
    ------
    AssertionError
        If the number of rows and the number of retained paths differ.
    """
    if path_filters is None:
        path_filters = ()

    recordings_filenames = []
    recordings_filepaths = []

    # Collect every .wav file located in a directory whose path contains all path filters
    for root, dirs, files in os.walk(recordings_folder_path):
        # Sorting in place makes the traversal (and therefore which duplicate
        # is "first") deterministic across machines and runs.
        dirs.sort()
        if all(path_filter in root for path_filter in path_filters):
            for file in sorted(files):
                if file.endswith(".wav"):
                    recordings_filenames.append(file)
                    recordings_filepaths.append(os.path.join(root, file))

    # Filenames listed in the DataFrame
    df_filenames = df['filename'].tolist()

    # Filenames in the df that are not in the recordings folder
    missing_recordings = list(set(df_filenames) - set(recordings_filenames))
    print(len(missing_recordings), " Files that are in the df but not in the recording folder (not processed): ", missing_recordings)

    # Filenames in the recordings folder that are not in the df
    extra_recordings = list(set(recordings_filenames) - set(df_filenames))
    print(len(extra_recordings), " Files that are in the recording folder but not in the df (not processed): ", extra_recordings)

    # Remove extra files from recordings_filepaths and missing files from df.
    # (Extra recordings are, by definition, absent from df, so df needs no
    # second filter.)
    extra_lookup = set(extra_recordings)
    recordings_filepaths = [path for path in recordings_filepaths if os.path.basename(path) not in extra_lookup]
    df = df[~df['filename'].isin(missing_recordings)]

    # Report and remove duplicate filenames on both sides.
    print("Duplicate filenames in df (not processed): ", df[df['filename'].duplicated(keep=False)].sort_values('filename')['filename'].tolist())
    # Paths returned by os.walk are always unique, so duplicates have to be
    # detected on the base name (same file name in several sub-folders).
    filename_counts = Counter(os.path.basename(path) for path in recordings_filepaths)
    print("Duplicate filenames in recordings folder (not processed): ", [item for item, count in filename_counts.items() if count > 1])
    df = df.drop_duplicates(subset=['filename'], keep='first')

    seen_filenames = set()
    unique_filepaths = []
    for path in recordings_filepaths:
        filename = os.path.basename(path)
        if filename not in seen_filenames:
            seen_filenames.add(filename)
            unique_filepaths.append(path)
    recordings_filepaths = unique_filepaths

    # reset index of DataFrame
    df = df.reset_index(drop=True)

    # Verify number of files in DataFrame matches number of files in folder
    assert len(df) == len(recordings_filepaths), f"Number of files in DataFrame ({len(df)}) does not match number of files in folder ({len(recordings_filepaths)})"
    print("Number of files to be processed: ", len(df))

    return df, recordings_filepaths, recordings_filenames

In [66]:
# Read the Excel workbook located at decoding_trial_path into the DataFrame `df`
df = pd.read_excel(decoding_trial_path)
print("Number of rows in df: ", len(df))
# Last expression of the cell: display the first 10 rows
df.head(10)

Number of rows in df:  4928


Unnamed: 0,Subject_P,Session,Categorie,Animacity,Stimuli,Output_file,Error,Error_cat,Accuracy,RT_correct
0,P101,1,Fruits,animé,abricot,101_1_abricot.wav,,COR,1.0,
1,P101,1,Fruits,inanimé,abricot,101_1_abricot.wav,,COR,1.0,
2,P101,1,Outils,inanimé,agrafeuse,101_1_agrafeuse.wav,,COR,1.0,
3,P101,1,Outils,inanimé,agrafeuse,101_1_agrafeuse.wav,,COR,1.0,
4,P101,1,Fruits,animé,ananas,101_1_ananas.wav,,COR,1.0,
5,P101,1,Fruits,animé,ananas,101_1_ananas.wav,,COR,1.0,
6,P101,1,Outils,inanimé,arrosoir,101_1_arrosoir.wav,,COR,1.0,
7,P101,1,Outils,inanimé,arrosoir,101_1_arrosoir.wav,,COR,1.0,
8,P101,1,Légumes,animé,artichaut,101_1_artichaut.wav,,COR,1.0,
9,P101,1,Légumes,animé,artichaut,101_1_artichaut.wav,,COR,1.0,


In [67]:
### Change format of df to match the expected format ###
df = reformat_df(
    df=df,
    columns_drop=[],
    columns_rename={"Stimuli": "reference_text", "Output_file": "filename", "Accuracy": "accuracy", "Categorie": "config_id"},
    columns_add=["asr_response", "filepath", "audio_length"],
)

### Synchronize the DataFrame with the audio files by removing missing files and duplicates ###
df, recordings_filepaths, recordings_filenames = match_files_df_recordings(
    df=df,
    recordings_folder_path=recordings_folder_path,
    path_filters=None,
)

# Use the path that was actually found on disk for each filename. The folder is
# searched recursively, so rebuilding the path as
# recordings_folder_path + filename would be wrong for files in sub-folders.
filepath_by_filename = {os.path.basename(path): path for path in recordings_filepaths}
df['filepath'] = df['filename'].map(filepath_by_filename)

# Fill the audio_length column with the duration (in seconds) of each recording
# and compute the total duration. Each file is read once and mapped onto the
# rows, instead of building a boolean mask over the whole frame per file.
duration_by_filename = {
    filename: librosa.get_duration(path=path)
    for filename, path in filepath_by_filename.items()
}
df['audio_length'] = df['filename'].map(duration_by_filename)
# NOTE: the total covers every synchronised recording, including rows that are
# dropped just below because their accuracy is missing.
total_time = sum(duration_by_filename.values())
print("\nTotal time of audio files:", total_time, "seconds =", total_time/60, "minutes")

# Drop all the rows whose accuracy is NaN
df = df.dropna(subset=['accuracy']).reset_index(drop=True)
# print final length of df
print("Number of rows in df after removing rows with NaN values in accuracy column: ", len(df))

# Save the updated df to a JSON file, creating the output folder if it does not exist
os.makedirs('Marina/dfs/deco', exist_ok=True)
df.to_json('Marina/dfs/deco/Intervention_Decoding_og_df.json', orient='records')

504  Files that are in the df but not in the recording folder (not processed):  ['103_12_guitare2.wav', '112_12_olive1.wav', '112_12_pamplemousse2.wav', '112_12_arrosoir1.wav', '103_12_radis1.wav', '112_12_radis1.wav', '102_12_aubergine2.wav', '112_12_guitare1.wav', '102_12_cymbales1.wav', '104_12_courgette1.wav', '102_12_tomate2.wav', '101_9_flute2.wav', '102_12_cuillere1.wav', '103_12_tournevis2.wav', '112_12_ananas1.wav', '102_12_celeri2.wav', '104_12_poireau2.wav', '104_12_pruneau2.wav', '112_12_potiron2.wav', '112_12_carillon2.wav', '112_12_banjo2.wav', '112_12_abricot2.wav', '103_12_cerise2.wav', '103_12_agrafeuse2.wav', '104_12_celeri2.wav', '103_12_haricot1.wav', '101_9_myrtille2.wav', '103_12_scie2.wav', '102_12_piano2.wav', '104_12_lyre1.wav', '102_12_poireau1.wav', '112_12_fourchette2.wav', '101_9_batterie2.wav', '103_12_myrtille1.wav', '112_12_celeri1.wav', '102_12_orange2.wav', '103_12_roquette1.wav', '102_12_haricot1.wav', '104_12_mangue2.wav', '101_9_clarinette2.wav', '1

In [68]:
# Sanity check: print, for every config_id, the distinct reference_text values it contains
config_ids = df['config_id']
reference_texts = df['reference_text']
for config_id in sorted(config_ids.unique()):
    texts_for_config = reference_texts[config_ids == config_id].unique()
    print(config_id, texts_for_config)

Fruits ['abricot' 'ananas' 'cerise' 'figue' 'fraise' 'framboise' 'kaki' 'mangue'
 'mirabelle' 'myrtille' 'orange' 'pamplemousse' 'pruneau' 'tomate']
Instruments  ['banjo' 'batterie' 'carillon' 'clarinette' 'cor' 'cymbales' 'flute'
 'guitare' 'lyre' 'metronome' 'orgue' 'piano' 'saxophone' 'violoncelle']
Légumes ['artichaut' 'asperge' 'aubergine' 'brocoli' 'celeri' 'courgette'
 'epinard' 'haricot' 'olive' 'patate' 'poireau' 'potiron' 'radis'
 'roquette']
Outils ['agrafeuse' 'arrosoir' 'chalumeau' 'ciseaux' 'compas' 'cuillere'
 'equerre' 'escabeau' 'fourchette' 'perceuse' 'scalpel' 'scie' 'tournevis'
 'truelle']


In [69]:
# Create the output folder if needed; exist_ok=True avoids the separate
# exists-then-create check and does not fail when the folder is already there.
os.makedirs('Marina/dfs/deco', exist_ok=True)

# Saved with pandas' default JSON orientation (no `orient` argument is passed).
df.to_json('Marina/dfs/deco/Intervention_Decoding_df.json')
print("Total number of rows in df: ", len(df))

Total number of rows in df  before removal:  4366
Total number of rows in df  before removal:  4366


In [70]:
# Type of the accuracy value in the first row. A single .loc[row, column]
# lookup reads the scalar straight from the column; df.loc[0]['accuracy'] would
# first build a Series for the whole row, which can change the value's type.
print("Type of accuracy column of first row: ", type(df.loc[0, 'accuracy']))

Type of accuracy column of first row:  <class 'numpy.float64'>
