In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import azure.cognitiveservices.speech as speechsdk
import time
import json
import difflib
import string
from src.text_processing import *
from src.data_processing import *
from src.ui_tools import *
from src.audio_processing import *

# Add espeak's shared library directory
os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib'

In [2]:
# Load the cleaned data
data_path = 'data/df_test_cleaned.csv'
tests_df = pd.read_csv(data_path)

# We only keep the rows where the testType is readingTestFluencE
readingTestFluencE_df = tests_df[tests_df['testType'] == 'readingTestFluencE']

# Apply conversion functions to testResults and evaluationResults columns
readingTestFluencE_df['testResults'] = readingTestFluencE_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
readingTestFluencE_df['evaluationResults'] = readingTestFluencE_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  readingTestFluencE_df['testResults'] = readingTestFluencE_df['testResults'].apply(lambda x: convert_str_to_dct_eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  readingTestFluencE_df['evaluationResults'] = readingTestFluencE_df['evaluationResults'].apply(lambda x: convert_str_to_dct_eval(x))


In [None]:
# We extract the row with id = 75A80925-F8CF-463D-AFED-5CC399848CC2
test_id = '75A80925-F8CF-463D-AFED-5CC399848CC2'
test_row = readingTestFluencE_df[readingTestFluencE_df['id'] == test_id]

evaluation_result = test_row['evaluationResults'].apply(
    lambda x: x['wordsState'] if 'wordsState' in x else None).dropna().tolist()

# We discard all the element that are 'NonRead'
read_words = [[d for d in row if list(d.values())[0] != "NonRead"] for row in evaluation_result]

# We create a string with the words that were read
reference_text = ' '.join([list(d.keys())[0] for row in read_words for d in row])

audio_file = f"sample_readingTestFluencE/readingTestFluencE_{test_id}.wav"

In [10]:
print(reference_text)

C'est l'histoire de Monsieur Petit qui vit dans une vieille maison située au coeur d'un vieux village. La maison est entourée d'un jardin avec une barrière ;il y a des concombres, des choux frisés, toutes sortes de légumes. Au fond du jardin, le portillon reste toujours fermé pour que Chien à Puces ne s'échappe pas. Chien à Puces aime se coucher près de la poubelle, à l'ombre d'un oranger couvert de fruits délicieux. Chien à Puces est gourmand, il croque tout ce qui lui passe sous


In [3]:
load_dotenv()  # Load environment variables from .env file

speech_key = os.getenv("AZURE_SPEECH_KEY")
service_region = os.getenv("AZURE_REGION")

In [None]:
def assess_pronunciation_continuous(audio_file: str, reference_text: str):
    """Performs continuous pronunciation assessment for a long audio file."""
    
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        reference_text=reference_text,
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language="en-US", audio_config=audio_config)
    pronunciation_config.apply_to(speech_recognizer)

    done = False
    recognized_words = []
    prosody_scores = []
    fluency_scores = []
    durations = []

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Stops the recognition process"""
        nonlocal done
        done = True

    def recognized(evt: speechsdk.SpeechRecognitionEventArgs):
        """Handles recognized speech"""
        pronunciation_result = speechsdk.PronunciationAssessmentResult(evt.result)
        recognized_words.extend(pronunciation_result.words)
        fluency_scores.append(pronunciation_result.fluency_score)
        if pronunciation_result.prosody_score is not None:
            prosody_scores.append(pronunciation_result.prosody_score)
        json_result = evt.result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult)
        jo = json.loads(json_result)
        nb = jo["NBest"][0]
        durations.append(sum([int(w["Duration"]) for w in nb["Words"]]))

    # Attach event handlers
    speech_recognizer.recognized.connect(recognized)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(0.5)
    
    speech_recognizer.stop_continuous_recognition()

    # Process results
    reference_words = [w.strip(string.punctuation) for w in reference_text.lower().split()]
    diff = difflib.SequenceMatcher(None, reference_words, [x.word.lower() for x in recognized_words])
    final_words = []
    
    for tag, i1, i2, j1, j2 in diff.get_opcodes():
        if tag in ['insert', 'replace']:
            for word in recognized_words[j1:j2]:
                if word.error_type == 'None':
                    word._error_type = 'Insertion'
                final_words.append(word)
        if tag in ['delete', 'replace']:
            for word_text in reference_words[i1:i2]:
                word = speechsdk.PronunciationAssessmentWordResult({
                    'Word': word_text,
                    'PronunciationAssessment': {'ErrorType': 'Omission'}
                })
                final_words.append(word)
        if tag == 'equal':
            final_words += recognized_words[j1:j2]

    # Calculate final scores
    accuracy_scores = [word.accuracy_score for word in final_words if word.error_type != 'Insertion']
    accuracy_score = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
    prosody_score = sum(prosody_scores) / len(prosody_scores) if prosody_scores else 0
    fluency_score = sum([x * y for (x, y) in zip(fluency_scores, durations)]) / sum(durations) if durations else 0
    completeness_score = len([w for w in recognized_words if w.error_type == "None"]) / len(reference_words) * 100
    completeness_score = min(completeness_score, 100)

    # Print results
    print(f'Accuracy: {accuracy_score:.2f}, Prosody: {prosody_score:.2f}, Fluency: {fluency_score:.2f}, Completeness: {completeness_score:.2f}')
    for idx, word in enumerate(final_words):
        print(f'{idx + 1}: {word.word} - Accuracy: {word.accuracy_score}, Error Type: {word.error_type}')

In [None]:
assess_pronunciation_continuous(audio_file=audio_file, reference_text=reference_text)

Accuracy: 9.18, Prosody: 0.00, Fluency: 41.68, Completeness: 3.49
1: se - Accuracy: 88.0, Error Type: Insertion
2: c'est - Accuracy: 0, Error Type: Omission
3: l'histoire - Accuracy: 50.0, Error Type: Mispronunciation
4: de - Accuracy: 80.0, Error Type: None
5: monsieur - Accuracy: 42.0, Error Type: Mispronunciation
6: petit - Accuracy: 52.0, Error Type: Mispronunciation
7: qui - Accuracy: 8.0, Error Type: Mispronunciation
8: vit - Accuracy: 11.0, Error Type: Mispronunciation
9: dans - Accuracy: 12.0, Error Type: Mispronunciation
10: portillon - Accuracy: 11.0, Error Type: Mispronunciation
11: une - Accuracy: 0, Error Type: Omission
12: vieille - Accuracy: 0, Error Type: Omission
13: maison - Accuracy: 0, Error Type: Omission
14: située - Accuracy: 0, Error Type: Omission
15: au - Accuracy: 0, Error Type: Omission
16: coeur - Accuracy: 0, Error Type: Omission
17: d'un - Accuracy: 0, Error Type: Omission
18: vieux - Accuracy: 0, Error Type: Omission
19: village - Accuracy: 0, Error Type

## AZURE model with the reference text

In [None]:
tests_id = [
    '2BB671AA-2F6A-4346-8B76-F0C89C236390',
    '3B545E56-D802-4380-9993-21C11066B12E',
    '5C1C826F-E778-48C3-9170-6BF943175984',
    '046E4FEB-E284-48D5-922E-616DA7651F02',
    '75A80925-F8CF-463D-AFED-5CC399848CC2',
    '102DCD09-43EA-434D-A590-0FA5C7C7C1B3',
    '098522E8-2203-425E-85E5-5809D5B0B523',
    '79055215-1979-42D3-9B26-B9C6DD935D83',
    'ABD81BE7-7629-4816-8241-7ECBF32DFFFA',
]

# We iterate over the tests_id and we evaluate each one
for test_id in tests_id:
    test_row = readingTestFluencE_df[readingTestFluencE_df['id'] == test_id]

    evaluation_result = test_row['evaluationResults'].apply(
        lambda x: x['wordsState'] if 'wordsState' in x else None).dropna().tolist()

    # We discard all the element that are 'NonRead'
    read_words = [[d for d in row if list(d.values())[0] != "NonRead"] for row in evaluation_result]
    reference_text = ' '.join([list(d.keys())[0] for row in read_words for d in row])
    audio_file = f"sample_readingTestFluencE/readingTestFluencE_{test_id}.wav"

    print('=' * 60)
    assess_pronunciation_continuous(audio_file=audio_file, reference_text=reference_text)

Accuracy: 27.31, Prosody: 0.00, Fluency: 54.15, Completeness: 19.30
1: située - Accuracy: 0.0, Error Type: Mispronunciation
2: c'est - Accuracy: 0, Error Type: Omission
3: l'histoire - Accuracy: 49.0, Error Type: Mispronunciation
4: de - Accuracy: 94.0, Error Type: None
5: monsieur - Accuracy: 55.0, Error Type: Mispronunciation
6: petit - Accuracy: 60.0, Error Type: None
7: qui - Accuracy: 18.0, Error Type: Mispronunciation
8: vit - Accuracy: 21.0, Error Type: Mispronunciation
9: dans - Accuracy: 18.0, Error Type: Mispronunciation
10: une - Accuracy: 16.0, Error Type: Mispronunciation
11: vieille - Accuracy: 18.0, Error Type: Mispronunciation
12: maison - Accuracy: 52.0, Error Type: Mispronunciation
13: chien - Accuracy: 62.0, Error Type: Insertion
14: située - Accuracy: 0, Error Type: Omission
15: au - Accuracy: 76.0, Error Type: None
16: chien - Accuracy: 28.0, Error Type: Mispronunciation
17: coeur - Accuracy: 0, Error Type: Omission
18: d'un - Accuracy: 0, Error Type: Omission
19: 

Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x125e31b90, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.


## AZURE model without the reference text

In [10]:
def assess_pronunciation_continuous(audio_file: str):
    """Performs continuous pronunciation assessment for a long audio file without requiring a reference text."""
    
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language="en-US", audio_config=audio_config)

    done = False
    recognized_words = []
    prosody_scores = []
    fluency_scores = []
    durations = []

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Stops the recognition process"""
        nonlocal done
        done = True

    def recognized(evt: speechsdk.SpeechRecognitionEventArgs):
        """Handles recognized speech"""
        pronunciation_result = speechsdk.PronunciationAssessmentResult(evt.result)
        recognized_words.extend(pronunciation_result.words)
        fluency_scores.append(pronunciation_result.fluency_score)
        if pronunciation_result.prosody_score is not None:
            prosody_scores.append(pronunciation_result.prosody_score)
        json_result = evt.result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult)
        jo = json.loads(json_result)
        nb = jo["NBest"][0]
        durations.append(sum([int(w["Duration"]) for w in nb["Words"]]))

    # Attach event handlers
    speech_recognizer.recognized.connect(recognized)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(0.5)
    
    speech_recognizer.stop_continuous_recognition()

    # Calculate final scores
    accuracy_scores = [word.accuracy_score for word in recognized_words if word.error_type != 'Insertion']
    accuracy_score = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
    prosody_score = sum(prosody_scores) / len(prosody_scores) if prosody_scores else 0
    fluency_score = sum([x * y for (x, y) in zip(fluency_scores, durations)]) / sum(durations) if durations else 0

    # Print results
    print(f'Accuracy: {accuracy_score:.2f}, Prosody: {prosody_score:.2f}, Fluency: {fluency_score:.2f}')
    for idx, word in enumerate(recognized_words):
        print(f'{idx + 1}: {word.word} - Accuracy: {word.accuracy_score}, Error Type: {word.error_type}')

In [11]:
tests_id = [
    '2BB671AA-2F6A-4346-8B76-F0C89C236390',
    '3B545E56-D802-4380-9993-21C11066B12E',
    '5C1C826F-E778-48C3-9170-6BF943175984',
    '046E4FEB-E284-48D5-922E-616DA7651F02',
    '75A80925-F8CF-463D-AFED-5CC399848CC2',
    '102DCD09-43EA-434D-A590-0FA5C7C7C1B3',
    '098522E8-2203-425E-85E5-5809D5B0B523',
    '79055215-1979-42D3-9B26-B9C6DD935D83',
    'ABD81BE7-7629-4816-8241-7ECBF32DFFFA',
]

# We iterate over the tests_id and we evaluate each one
for test_id in tests_id:
    audio_file = f"sample_readingTestFluencE/readingTestFluencE_{test_id}.wav"

    print('=' * 60)
    assess_pronunciation_continuous(audio_file=audio_file)

Accuracy: 0.00, Prosody: 0.00, Fluency: 0.00
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.


KeyboardInterrupt: 

Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x1479eadb0, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x1479ef970, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x106909a40, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_comp