In [1]:
import os
import csv
import numpy as np
import soundfile as sf
import torch
from transformers import AutoModelForCTC, Wav2Vec2Processor
from src.data_processing import *

os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib'

In [2]:
# Initialize the model and processor
MODEL_ID = "Cnam-LMSSC/wav2vec2-french-phonemizer"
model = AutoModelForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

In [3]:
# The tests id of the 10 selected readingTestFluencE tests
tests_id = [
    '2BB671AA-2F6A-4346-8B76-F0C89C236390',
    '3B545E56-D802-4380-9993-21C11066B12E',
    '5C1C826F-E778-48C3-9170-6BF943175984',
    '046E4FEB-E284-48D5-922E-616DA7651F02',
    '75A80925-F8CF-463D-AFED-5CC399848CC2',
    '102DCD09-43EA-434D-A590-0FA5C7C7C1B3',
    '098522E8-2203-425E-85E5-5809D5B0B523',
    '79055215-1979-42D3-9B26-B9C6DD935D83',
    'ABD81BE7-7629-4816-8241-7ECBF32DFFFA',
]

In [4]:
# Load the cleaned data
data_path = 'data/df_test_cleaned.csv'
tests_df = pd.read_csv(data_path)

# We only keep the rows where the testType is readingTestFluencE
readingTestFluencE_df = tests_df[tests_df['testType'] == 'readingTestFluencE']

# Apply conversion functions to testResults and evaluationResults columns
readingTestFluencE_df['testResults'] = readingTestFluencE_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
readingTestFluencE_df['evaluationResults'] = readingTestFluencE_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  readingTestFluencE_df['testResults'] = readingTestFluencE_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  readingTestFluencE_df['evaluationResults'] = readingTestFluencE_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))


In [5]:
# We iterate over the tests_id and we create the top-3 phoneme transcriptions
for test_id in tests_id:
    # We extract the audio file
    audio_file = f"sample_readingTestFluencE/readingTestFluencE_{test_id}.wav"
    audio, _ = sf.read(audio_file)

    # Preprocess the audio and prepare the inputs for the model
    inputs = processor(np.array(audio), sampling_rate=16_000., return_tensors="pt")

    # Get the model's predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the top-3 most probable phonemes for each timestep
    topk_probs, topk_indices = torch.topk(logits, k=3, dim=-1)

    # Decode the top-3 predictions for each timestep
    filtered_transcriptions = []
    last_phoneme_set = None  # Store the last added phoneme set to avoid duplicates

    for i in range(topk_indices.shape[1]):  # Iterate over time steps
        phonemes = processor.tokenizer.convert_ids_to_tokens(topk_indices[0, i].tolist())

        # Skip if the first prediction is '[PAD]' or the first two predictions are '|' and '[PAD]'
        if (phonemes[0] == "[PAD]") or (phonemes[0] == "|" and phonemes[1] == "[PAD]"):
            continue  # Ignore this timestamp

        # Create a tuple of phonemes (excluding '[PAD]' from second and third positions)
        phoneme_tuple = (phonemes[0], phonemes[1], phonemes[2])

        # Avoid adding consecutive duplicate phoneme sets
        if phoneme_tuple != last_phoneme_set:
            filtered_transcriptions.append([i] + list(phoneme_tuple))
            last_phoneme_set = phoneme_tuple  # Update last seen set

    # Define output CSV file name
    csv_filename = os.path.splitext(audio_file)[0] + "_phonemes.csv"

    # Save results to CSV
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Timestep", "Top_1", "Top_2", "Top_3"])  # CSV header
        writer.writerows(filtered_transcriptions)

    print(f"Filtered top-3 phoneme transcriptions saved to {csv_filename}")

Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_2BB671AA-2F6A-4346-8B76-F0C89C236390_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_3B545E56-D802-4380-9993-21C11066B12E_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_5C1C826F-E778-48C3-9170-6BF943175984_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_046E4FEB-E284-48D5-922E-616DA7651F02_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_75A80925-F8CF-463D-AFED-5CC399848CC2_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_102DCD09-43EA-434D-A590-0FA5C7C7C1B3_phonemes.csv
Filtered top-3 phoneme transcriptions saved to sample_readingTestFluencE/readingTestFluencE_098522E8-2203-425E-85E5-5809D5B0B523_phonemes.csv
Filter

In [6]:
def load_predictions(csv_filename):
    """
    Reads the CSV file and returns a list of sets, where each set contains the three possible phonemes per timestamp.
    """
    phoneme_options = []
    
    with open(csv_filename, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        
        for row in reader:
            _, top1, top2, top3 = row
            phoneme_set = {top1}
            if top2 != "[PAD]":
                phoneme_set.add(top2)
            if top3 != "[PAD]":
                phoneme_set.add(top3)
            
            phoneme_options.append(phoneme_set)
    
    return phoneme_options

def evaluate_sentence(phoneme_options, target_sentence, buffer_size=15):
    """
    Evaluates whether each word in the target sentence was correctly pronounced using a rolling buffer.
    - Adds phoneme options to a buffer one at a time.
    - Checks if the word can be formed by selecting at most one phoneme per timestamp (while keeping order).
    - If a match is found, removes only the tuples in the buffer **that were used** for the current word.
    - If the buffer is full and the word is not matched, moves to the next word without clearing the buffer.
    Returns a word-by-word evaluation.
    """
    def can_reconstruct_word(buffer, target_word):
        """
        Checks if the target_word can be formed using at most one phoneme per timestamp (while maintaining order).
        Returns True and a list of indices used if the word is reconstructed.
        Otherwise, returns False and an empty list.
        """
        target_phonemes = list(target_word)  # Convert word to phoneme list
        target_index = 0  # Tracks position in target word
        used_indices = []  # Stores the buffer indices used to match the word

        # print(f"Buffer: {buffer}")

        for i, phoneme_set in enumerate(buffer):
            if target_index < len(target_phonemes) and target_phonemes[target_index] in phoneme_set:
                used_indices.append(i)
                target_index += 1  # Move to next phoneme

            if target_index == len(target_phonemes):  # If all phonemes were found in order
                return True, used_indices  # Return True and the indices used

        return False, []  # Could not reconstruct the word

    words = target_sentence.split(" ")  # Split sentence into words
    buffer = []  # Rolling buffer for phoneme predictions
    results = []

    for word in words:
        # print("==" * 20)
        # print(f"Target word: {word}")

        for _ in range(buffer_size):  # Ensure we don't go beyond buffer size
            found, used_indices = can_reconstruct_word(buffer, word)

            if found:  # If the word is successfully reconstructed
                results.append((word, "correct"))
                
                # Remove only the used elements while keeping order
                buffer = [buffer[i] for i in range(len(buffer)) if i not in used_indices]
                break  # Move to next word
            else:
                if phoneme_options:
                    buffer.append(phoneme_options.pop(0))

        else:  # If no match is found after buffer fills up, move on
            results.append((word, "missed"))

    return results

In [None]:

ground_truth = 'sɛ listwaʁ də məsjø pəti ki vi dɑ̃z yn vjɛj mɛzɔ̃ sitye o kœʁ dœ̃ vjø vilaʒ la mɛzɔ̃ ɛt ɑ̃tuʁe dœ̃ ʒaʁdɛ̃ avɛk yn baʁjɛʁ il i a de kɔ̃kɔ̃bʁ de ʃu fʁize tut sɔʁt də leɡymz o fɔ̃ dy ʒaʁdɛ̃ lə pɔʁtijɔ̃ ʁɛst tuʒuʁ fɛʁme puʁ kə ʃjɛ̃ a pys nə seʃap pa ʃjɛ̃ a pysz ɛm sə kuʃe pʁɛ də la pubɛl a lɔ̃bʁ dœ̃n ɔʁɑ̃ʒe kuvɛʁ də fʁyi delisjø ʃjɛ̃ a pysz ɛ ɡuʁmɑ̃ il kʁɔk tu sə ki lyi pas su la dɑ̃ dez ɔʁɑ̃ʒ puʁi ki tɔ̃b syʁ lə sɔl de flœʁ fanez œ̃ mɔʁso də byvaʁ œ̃ ʒuʁ məsjø pəti desid də mɛtʁ ʃjɛ̃ a pys dɑ̃z yn niʃ ʃjɛ̃ a pys nɛm paz ɛtʁ ɑ̃fɛʁme il pʁefɛʁ sɑ̃dɔʁmiʁ ɑ̃ ʁəɡaʁdɑ̃ lez etwal dɑ̃ lə sjɛl tut le nyiz il abwa kɑ̃ məsjø pəti va sə kuʃe məsjø pəti desid də dɔʁmiʁ dɑ̃ lə ɡʁənje də sa ʒɔli mɛzɔ̃ puʁ pʁɑ̃dʁ œ̃ pø də ʁəpoz il nə tʁuv ply lə sɔmɛj yn nyi dɛ̃sɔmni ɔp il sot dy li e uvʁ la ɡʁɑ̃d mal ki sə tʁuv dəvɑ̃ lyi dɑ̃z œ̃ kwɛ̃ sɔ̃bʁ dy ɡʁənje e la syʁpʁiz tut sa vi kil pɑ̃sɛ sɑ̃z istwaʁ lyi ʁəvjɛ̃t ɑ̃ memwaʁ il sɔʁ le muʃwaʁ bʁɔde paʁ sa ɡʁɑ̃mɛʁ se pətit dɑ̃ də lɛ sɔ̃ po də ʃɑ̃bʁ ebʁeʃe yn tɛt də pwasɔ̃ seʃe œ̃ sak plɛ̃ də bijz yn mɔ̃tʁ ki fɛ tik tak tik tak sɔ̃ kaʁnɛ də nɔtz œ̃ bu də lasɛ sɔ̃ vjø tʁɑ̃zistɔʁ a pil sɛ fu kɔm tu se suvniʁ sə buskyl dɑ̃ sa tɛt e il nə pø ʁətniʁ se laʁm demɔsjɔ̃ sa vi nɛ pa sɑ̃z istwaʁ il sə suvjɛ̃t ɛɡzaktəmɑ̃ də la vwa dy pʁezɑ̃tatœʁ meteo lə tɑ̃ va sameljɔʁe dəmɛ̃ ɑ̃ deby də matine syʁ nɔtʁ ʁeʒjɔ̃ sjɛl ʃaʁʒe lapʁɛmidi il sə ʁapɛl le vjɛj pyblisitez aɛma e la salte sɑ̃ va ɔ̃n a tuʒuʁ bəzwɛ̃ də pəti pwa ʃe swa le pʁəmjɛʁ lymjɛʁ dy ʒuʁ penɛtʁ paʁ la pətit fənɛtʁ dy ɡʁənje il ɛt o kœʁ də se suvniʁ kɑ̃ sɔ̃ ʁevɛj sɔndʁɪŋ dʁɪŋ dʁɪŋ'
ground_truth = ground_truth.split(" ")

# We iterate over the tests_id and we create the top-3 phoneme transcriptions
for test_id in tests_id:
    print("==" * 20)
    print(f"Test ID: {test_id}")

    test_row = readingTestFluencE_df[readingTestFluencE_df['id'] == test_id]
    evaluation_result = test_row['evaluationResults'].apply(
        lambda x: x['wordsState'] if 'wordsState' in x else None).dropna().tolist()

    # We extract the ground truth for each test
    read_words = [[d for d in row if list(d.values())[0] != "NonRead"] for row in evaluation_result]
    reference_text = ' '.join([list(d.keys())[0] for row in read_words for d in row])
    reference_words = reference_text.split()  
    target_sentence = " ".join(ground_truth[:len(reference_text.split())]) 

    
    csv_filename = f"sample_readingTestFluencE/readingTestFluencE_{test_id}_phonemes.csv"

    # Load the phoneme predictions
    phoneme_options = load_predictions(csv_filename)

    # Evaluate each word
    word_results = evaluate_sentence(phoneme_options, target_sentence, buffer_size=25)

    # Print results
    #print("Word-by-word evaluation:")
    # for word, status in word_results:
    #     status_symbol = "✅" if status == "correct" else "❌"
        # print(f"{status_symbol} {word} → {status}")

    # Summary of correctness
    correct_words = sum(1 for _, status in word_results if status == "correct")
    total_words = len(word_results)
    print(f"\nFinal Score: {correct_words}/{total_words} words read correctly.")


Test ID: 2BB671AA-2F6A-4346-8B76-F0C89C236390
Word-by-word evaluation:

Final Score: 46/57 words read correctly.
Test ID: 3B545E56-D802-4380-9993-21C11066B12E
Word-by-word evaluation:

Final Score: 54/67 words read correctly.
Test ID: 5C1C826F-E778-48C3-9170-6BF943175984
Word-by-word evaluation:

Final Score: 110/130 words read correctly.
Test ID: 046E4FEB-E284-48D5-922E-616DA7651F02
Word-by-word evaluation:

Final Score: 97/110 words read correctly.
Test ID: 75A80925-F8CF-463D-AFED-5CC399848CC2
Word-by-word evaluation:

Final Score: 71/86 words read correctly.
Test ID: 102DCD09-43EA-434D-A590-0FA5C7C7C1B3
Word-by-word evaluation:

Final Score: 83/94 words read correctly.
Test ID: 098522E8-2203-425E-85E5-5809D5B0B523
Word-by-word evaluation:

Final Score: 46/61 words read correctly.
Test ID: 79055215-1979-42D3-9B26-B9C6DD935D83
Word-by-word evaluation:

Final Score: 74/85 words read correctly.
Test ID: ABD81BE7-7629-4816-8241-7ECBF32DFFFA
Word-by-word evaluation:

Final Score: 51/61 wo