In [1]:
import os
import ast
import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor
import numpy as np
import pandas as pd
import soundfile as sf
from src.text_processing import *
from src.data_processing import *
from src.ui_tools import *
from src.audio_processing import *

# Add espeak's shared library directory
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib'

In [2]:
# Load the cleaned data
data_path = 'data/df_test_cleaned.csv'
tests_df = pd.read_csv(data_path)

# We only keep the rows where the testType is testPhoneme
phonemeTest_df = tests_df[tests_df['testType'] == 'testPhoneme']

# Apply conversion functions to testResults and evaluationResults columns
phonemeTest_df['testResults'] = phonemeTest_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
phonemeTest_df['evaluationResults'] = phonemeTest_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phonemeTest_df['testResults'] = phonemeTest_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phonemeTest_df['evaluationResults'] = phonemeTest_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))


In [3]:
save_recordings_as_wav(phonemeTest_df, output_dir='wav_files/testPhoneme', target_sample_rate=16000, channels=1)

In [4]:
# The groundtruth labels for the deletion phoneme test
deletion = 'vert rame lou ane reine dine lo rou li rose'

# The groundtruth labels for the fusion phoneme test
fusion = 'fa cha bou ten boi cho gué quin jin gren'

In [None]:
# If the transcription files already exist, we do nothing
deletion_file = 'testPhoneme_deletion_transcriptions.csv'
fusion_file = 'testPhoneme_fusion_transcriptions.csv'
files_exist = (os.path.isfile(deletion_file) or os.path.isfile(fusion_file))

if not(files_exist):
    # We save the groundtruth labels to a csv file
    save_phonetic_transcription_to_csv(deletion, test_type='testPhoneme', folder='transcriptions', file_name=deletion_file)
    save_phonetic_transcription_to_csv(fusion, test_type='testPhoneme', folder='transcriptions', file_name=fusion_file)

Phonetic transcription saved to transcriptions/testPhoneme_deletion_transcriptions.csv
Phonetic transcription saved to transcriptions/testPhoneme_fusion_transcriptions.csv


In [6]:
# Initialize the model and processor
MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
model = AutoModelForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

In [None]:
# Folder containing the audio files
audio_folder = 'wav_files/testPhoneme/'
output_csv = 'transcriptions/testPhoneme_children.csv'

# List all the .wav files in the folder
audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.wav')]

# Check if the CSV file already exists to write the header only once
file_exists = os.path.isfile(output_csv)

# Open the CSV in append mode
with open(output_csv, mode='a', encoding='utf-8', newline='') as f:
    # CSV writer setup
    import csv
    writer = csv.writer(f)
    
    # Write header if the file doesn't exist yet
    if not file_exists:
        writer.writerow(['File Name', 'Phonetic Transcription'])
    
    # Process each file one by one
    for audio_file in audio_files:
        try:
            # Load the audio file
            audio_path = os.path.join(audio_folder, audio_file)
            audio, sr = sf.read(audio_path)

            # Ensure audio is in the correct format (mono and float32)
            audio = np.array(audio, dtype=np.float32)
            
            # Preprocess the audio and prepare the inputs for the model
            inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")

            # Get the model's predictions
            with torch.no_grad():
                logits = model(**inputs).logits

            # Decode the predictions to get the phonetic transcription
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]

            # Write the current result as a new line in the CSV
            writer.writerow([audio_file, transcription])

            # Optional: print progress
            print(f"Processed and saved transcription for: {audio_file}")

        except Exception as e:
            print(f"Error processing {audio_file}: {e}")

Processed and saved transcription for: testPhoneme_A80E6D01-D6C8-4E73-BCA0-77F29AF6DDB2.wav
Processed and saved transcription for: testPhoneme_233DA148-5EF9-4177-B538-54AC75D8AF55.wav
Processed and saved transcription for: testPhoneme_B8BB10DA-9B28-4538-BF07-35BA94AE5FE4.wav
Processed and saved transcription for: testPhoneme_EAC0BB7F-8195-4697-8BF1-1B979D198195.wav
Processed and saved transcription for: testPhoneme_31C2AB2C-D941-42C6-A0B3-862AEB09290D.wav
Processed and saved transcription for: testPhoneme_1F1DE99C-28B1-494E-AE86-9BC17D80753A.wav
Processed and saved transcription for: testPhoneme_1790F47F-D746-49C7-BD87-EADAF043A5A6.wav
Processed and saved transcription for: testPhoneme_2095B05B-E764-436C-8B53-48A734740FF6.wav
Processed and saved transcription for: testPhoneme_708E3AD2-8746-4218-86B2-54668105CA29.wav
Processed and saved transcription for: testPhoneme_97AF3CF5-7048-4725-A3CA-6F6676140F15.wav
Processed and saved transcription for: testPhoneme_2A4B2BB2-62FA-4AA6-9BD8-A672B