# Get an idea of the data

In [1]:
import pandas as pd
import librosa
import librosa.display
import torch
import concurrent.futures
from transformers import (
    Wav2Vec2Processor, 
    Wav2Vec2ForCTC, 
    Wav2Vec2ProcessorWithLM, 
)


In [2]:
def wav2vec2(audio, sr, 
            processor, 
            model, 
            pool = None, 
            num_processes= None,
            beam_width= None,
            beam_prune_logp = None,
            token_min_logp = None,
            hotwords = None,
            hotword_weight = None,
            alpha = None,
            beta = None,
            unk_score_offset = None,
            lm_score_boundary = None,
            output_word_offsets= False,
            output_char_offsets= False,
            phoneme = False,
            start_offset = None,
            end_offset = None,):
    
    
    start_offset = start_offset if start_offset else 0
    end_offset = end_offset if end_offset else len(audio)/sr
    audio = audio[int( start_offset * sr ):int( end_offset * sr )]
        
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    inputs = inputs.to('cuda')

    with torch.no_grad():
        logits = model(**inputs).logits.cpu()
    
    if phoneme:
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription_dict = processor.decode(predicted_ids[0], output_char_offsets = output_char_offsets)
        transcription_dict = {"text": transcription_dict['text'].lower(), 
                              "char_offsets": transcription_dict['char_offsets'] if output_char_offsets else None }

    else:
        transcription = processor.batch_decode(logits.numpy(), output_word_offsets=output_word_offsets, 
                                            pool=pool, num_processes=num_processes, 
                                            beam_width=beam_width, beam_prune_logp=beam_prune_logp,
                                            token_min_logp=token_min_logp, hotwords=hotwords,
                                            hotword_weight=hotword_weight, alpha=alpha, beta=beta,
                                            unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary)
                                            
        # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate
        time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
        for dict in transcription.word_offsets[0]:
            dict["start_offset"] = dict["start_offset"] * time_offset
            dict["end_offset"] = dict["end_offset"] * time_offset

        transcription_dict = {"text": transcription.text[0],
                            "word_offsets": transcription.word_offsets[0] 
                            }

    return transcription_dict, logits

In [3]:
def segment_speech(word_offsets, target_duration, min_duration, overlap=False):
    segments = []
    current_segment = []
    current_duration = 0.0

    if word_offsets[-1]['end_offset'] - word_offsets[0]['start_offset'] < target_duration:
        return [{'start_offset': word_offsets[0]['start_offset'], 'end_offset': word_offsets[-1]['end_offset']}]
    
    for word in word_offsets:
        word_duration = word['end_offset'] - word['start_offset']
        if current_duration + word_duration > target_duration:
            current_segment.append(word)
            segments.append(current_segment)
            current_segment = [word] if overlap else []
            current_duration = word_duration if overlap else 0.0
        else:
            current_segment.append(word)
            current_duration += word_duration

    if current_segment:
        if current_duration < min_duration and segments:
            last_segment = segments.pop()
            merged_segment = last_segment + current_segment
            split_index = len(merged_segment) // 2
            segments.append(merged_segment[:split_index])
            segments.append(merged_segment[split_index:])
        else:
            segments.append(current_segment)

    segment_intervals = []
    for segment in segments:
        start_offset = segment[0]['start_offset']
        end_offset = segment[-1]['end_offset']
        segment_intervals.append({'start_offset': start_offset, 'end_offset': end_offset})

    return segment_intervals


# Convert tensors to lists for JSON serialization
def convert_tensors_to_lists(obj):
    if isinstance(obj, torch.Tensor):
        return obj.tolist()
    elif isinstance(obj, list):
        return [convert_tensors_to_lists(i) for i in obj]
    else:
        return obj

def convert_lists_to_tensors(obj):
    if isinstance(obj, list):
        try:
            # Try to convert the list to a tensor
            return torch.tensor(obj)
        except (ValueError, TypeError):
            # If the list cannot be converted to a tensor, convert its elements instead
            return [convert_lists_to_tensors(i) for i in obj]
    else:
        return obj


# Columns to be converted from tensors to lists and vice versa
tensor_columns = [
    'logits_asr',
    'logits_phoneme',
]

In [4]:
def run_models(audio, target_duration_segment, min_duration_segment, overlap, reference_text, processor_phoneme, model_phoneme, processor_asr, model_asr, segmenter = True):

    transcriptions_segments = []
    char_offsets_segments = []
    logits_segments = []
    audio_length = librosa.get_duration(y = audio, sr = 16000)

    if audio_length < target_duration_segment + min_duration_segment:
        segmenter = False

    if segmenter:
        transcription_asr_segmenter, logits_asr = wav2vec2(audio, 16000, processor=processor_asr, model=model_asr, 
                                                hotwords= reference_text.split(), output_word_offsets=True, 
                                                alpha=0.0, beta=0.0)
        word_offsets = transcription_asr_segmenter['word_offsets']     
        for entry in segment_speech(word_offsets, target_duration_segment, min_duration_segment, overlap):
            onset = max(0, entry['start_offset'] - 1)
            offset = min(audio_length, entry['end_offset'] + 1)
            transcription_clip_all, logits_clip_all = wav2vec2(audio, 16000, processor=processor_phoneme, model=model_phoneme, 
                                                            start_offset=onset, end_offset=offset, 
                                                            phoneme=True, output_char_offsets=True)

            transcriptions_segments.append(transcription_clip_all['text'])
            char_offsets_segments.append(transcription_clip_all['char_offsets'])
            logits_segments.append(logits_clip_all)
    else:
        transcription_clip_all, logits_clip_all = wav2vec2(audio, 16000, processor=processor_phoneme, model=model_phoneme, 
                                                            phoneme=True, output_char_offsets=True)

        transcriptions_segments.append(transcription_clip_all['text'])
        char_offsets_segments.append(transcription_clip_all['char_offsets'])
        logits_segments.append(logits_clip_all)


    return transcriptions_segments, logits_segments, char_offsets_segments



In [5]:
def split_logit_matrix(logits_segment, char_offsets_segment):
    # get offsets of all ' ' characters in the char_offsets_segments dict
    space_offsets = [(item['start_offset'], item['end_offset']) for item in char_offsets_segment if item['char'] == ' ']
    if not space_offsets:
        space_offsets = [(0, len(char_offsets_segment))]

    # cut the logits and the prediction at the first space
    sub_logits_segments = []
    prev_offset = 0
    for offset in space_offsets:
        # change size of logits_segment0 to the size of the first segment
        sub_logits_segment = logits_segment[:, prev_offset:offset[1]]
        prev_offset = offset[0]
        sub_logits_segments.append(sub_logits_segment)
            
    return sub_logits_segments


In [6]:
def multiple_predictions(sub_logits_segment, processor_phoneme, threshold_diff, 
                         max_pred_per_word, k):
    # Get top-k values and indices
    topk_values, topk_indices = torch.topk(sub_logits_segment, k=k, dim=-1)
    top1_indices = topk_indices[0, :, 0].tolist()

    # Initialize with the top-1 prediction
    predictions = [tuple(top1_indices)]

    # Iterate over each alternative prediction for each timeframe
    for j in range(1, k):
        for i in range(len(top1_indices)):  

            top1_idx = topk_indices[0][i][0].item()
            top1_value = topk_values[0][i][0].item()
            alt_idx = topk_indices[0][i][j].item()
            alt_value = topk_values[0][i][j].item()

            # If the top1 prediction is a pad token and the alternative is a space token or vice versa, skip (no need to swap)
            if not ((top1_idx == pad_token_id and alt_idx == space_token_id) 
                    or (top1_idx == space_token_id and alt_idx == pad_token_id)):
                
                diff = top1_value - alt_value
                if diff < threshold_diff:
                    new_predictions = []
                    for pred in predictions:
                        if len(predictions) >= max_pred_per_word + 1:
                            break
                        new_prediction = list(pred)
                        new_prediction[i] = topk_indices[0][i][j]
                        new_predictions.append(tuple(new_prediction))
                    # Add new predictions and remove duplicates
                    predictions.extend(new_predictions)
                    predictions = list(set(predictions))

            # Stop if we have reached the max prediction count
            if len(predictions) > max_pred_per_word:
                break

    predictions = [torch.tensor(prediction) for prediction in predictions]

    # decode the predictions and add them to the all_decoded_predictions list
    decoded_predictions = processor_phoneme.batch_decode(predictions)
    # remove duplicates
    decoded_predictions = list(set(decoded_predictions))
    return decoded_predictions

In [7]:
def decode_phonetic_prediction_concat_error(all_decoded_predictions, target_phonemes, error_list, correct, results_decode):
    # Binary classification of the reference text
    for j in range(1, len(all_decoded_predictions) -1):
        previous_pred_list = all_decoded_predictions[j-1]
        current_pred_list = all_decoded_predictions[j]
        next_pred_list = all_decoded_predictions[j+1]
        decoded_prediction_concat = []
        for previous_pred in previous_pred_list:
            for current_pred in current_pred_list:
                for next_pred in next_pred_list:
                    concat_pred = previous_pred + current_pred + next_pred
                    decoded_prediction_concat.append(concat_pred)

        for i, targets_words in enumerate(target_phonemes):
            for concat_pred in decoded_prediction_concat:
                if any(error in concat_pred for error in error_list[i]):
                    correct[i] = 0
                    break
                for target_word in targets_words:
                    if not correct[i] and target_word in concat_pred:
                        correct[i] = 1
                        results_decode[i] = [concat_pred]
                        break

    return correct, all_decoded_predictions, results_decode            

In [8]:
def process_targets(target_excel_path, sheet_name = None):

    if sheet_name != None:
        df_targets = pd.read_excel(target_excel_path, sheet_name=sheet_name)
    else:
        df_targets = pd.read_excel(target_excel_path)

    # for each element in Target_IPA column, if format is '[element1] or [elements2]' change it to [element1, element2]
    df_targets['Target_IPA'] = df_targets['Target_IPA'].apply(lambda x: x.split(" ") if isinstance(x, str) else [])
    df_targets['Asr_IPA_Other'] = df_targets['Asr_IPA_Other'].apply(lambda x: x.split(" ") if isinstance(x, str) else [])
    df_targets['Errors_Letter_End'] = df_targets['Errors_Letter_End'].apply(lambda x: x.split(" ") if isinstance(x, str) else [])
    df_targets['Errors_Letter_Begining'] = df_targets['Errors_Letter_Begining'].apply(lambda x: x.split(" ") if isinstance(x, str) else [])
    # Create a new column for the errors words (empty list)
    df_targets['Errors_Words'] = None

    # add Asr_IPA_Other to Target_IPA
    df_targets['Target_IPA'] = df_targets.apply(lambda x: x['Target_IPA'] + x['Asr_IPA_Other'], axis=1)

    # for row, take all words in target_IPA list of that row and add each item (if not empty) of error_letter list to it, then add the result to the errors words column
    df_targets['Errors_Letter_End'] = df_targets.apply(
        lambda x: [word + letter for word in x['Target_IPA'] for letter in x['Errors_Letter_End'] if letter], axis=1)
    df_targets['Errors_Letter_Begining'] = df_targets.apply(
        lambda x: [letter + word for word in x['Target_IPA'] for letter in x['Errors_Letter_Begining'] if letter], axis=1)
    # for each word in the list in the Target_IPA column, add word\u0303' to the errors column list
    df_targets['Errors_Tilde'] = df_targets.apply(lambda x: [word + '\u0303' for word in x['Target_IPA']], axis=1)
    # add all errors to the errors words column
    df_targets['Errors_Words'] = df_targets.apply(
        lambda x:   x['Errors_Letter_End'] + x['Errors_Letter_Begining'] + x['Errors_Tilde'], axis=1)

    # remove all empty strings from the list
    df_targets['Errors_Words'] = df_targets['Errors_Words'].apply(lambda x: [i for i in x if i])
    df_targets['Target_IPA'] = df_targets['Target_IPA'].apply(lambda x: [i for i in x if i])

    return df_targets


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_asr_segmenter = Wav2Vec2ForCTC.from_pretrained("Dandan0K/Intervention-xls-FR-no-LM").to(device)
processor_asr_segmenter = Wav2Vec2ProcessorWithLM.from_pretrained("Dandan0K/Intervention-xls-FR-no-LM")

model_asr_phonemizer = Wav2Vec2ForCTC.from_pretrained("Cnam-LMSSC/wav2vec2-french-phonemizer").to(device)
processor_asr_phonemizer = Wav2Vec2Processor.from_pretrained("Cnam-LMSSC/wav2vec2-french-phonemizer")

# To be changed if you change the phoneme model
token_dict = {"1": 1, "a": 2, "b": 3, "d": 4, "e": 5, "f": 6, "h": 7, "i": 8, "j": 9, "k": 10, 
              "l": 11, "m": 12, "n": 13, "o": 14, "p": 15, "r": 16, "s": 17, "t": 18, "u": 19, "v": 20, 
              "w": 21, "x": 22, "y": 23, "z": 24, "ç": 25, "ð": 26, "ø": 27, "ŋ": 28, "œ": 29, "ɐ": 30, 
              "ɑ": 31, "ɒ": 32, "ɔ": 33, "ə": 34, "ɛ": 35, "ɜ": 36, "ɡ": 37, "ɣ": 38, "ɨ": 39, "ɪ": 40, 
              "ɬ": 41, "ɲ": 42, "ɹ": 43, "ɾ": 44, "ʁ": 45, "ʃ": 46, "ʊ": 47, "ʌ": 48, "ʍ": 49, "ʒ": 50, 
              "ʔ": 51, "ʲ": 52, "ː": 53, "̃": 54, "β": 55, "θ": 56, "|": 0, "[UNK]": 57, "[PAD]": 58}

df_targets = process_targets('Targets_IPA.xlsx', sheet_name='Decoding_FR')

k = len(token_dict)
max_pred_per_word = 20
threshold_diff = 2
min_duration_speech = 2
optimal_duration_speech = 3

def get_score(audio_file, target_duration_segment, min_duration_segment, reference_text, processor_phoneme, model_phoneme, processor_asr, model_asr, df_targets, overlap=False, segmenter=True):
    
    reference_words = reference_text.split(" ")
    target_phonemes = []
    error_list = []
    correct = [0] * len(reference_words)
    results_decode = [[]] * len(reference_words)

    for word in reference_words:
        target_phonemes.append(df_targets[df_targets['Target'] == word]['Target_IPA'].values[0])
        error_list.append(df_targets[df_targets['Target'] == word]['Errors_Words'].values[0])

    # Segment the audio file into speech segments and return the logits matrix and character offsets for each segment
    _, logits_segments, char_offsets_segments = run_models(audio_file, target_duration_segment, min_duration_segment, overlap, reference_text, processor_phoneme, model_phoneme, processor_asr, model_asr, segmenter)
    # Process the logits matrix for each segment
    for logits_segment, char_offsets_segment in zip(logits_segments, char_offsets_segments): ### process each segment in parallel

        # Split the logits at the first space character and return the sub-logits matrix, target phonemes, and error list
        sub_logits_segments = split_logit_matrix(logits_segment, char_offsets_segment)
        
        # Decode the most probable phonetic predictions for each segment
        all_decoded_predictions = []
        for sub_logits_segment in sub_logits_segments: ### process each subsegment of each segment in parallel
            decoded_predictions = multiple_predictions(sub_logits_segment, processor_phoneme, threshold_diff, max_pred_per_word, k)
            all_decoded_predictions.append(decoded_predictions)
        
        # Binary classification of the reference text
        correct, _, results_decode = decode_phonetic_prediction_concat_error(all_decoded_predictions, target_phonemes, error_list, correct, results_decode) ### process each segment in parallel
    
    return correct, results_decode




Some weights of the model checkpoint at Dandan0K/Intervention-xls-FR-no-LM were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Dandan0K/Intervention-xls-FR-no-LM and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at Cnam-LMSSC/wav2vec2-french-phonemizer were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Cnam-LMSSC/wav2vec2-french-phonemizer and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probab

In [10]:
df = pd.read_json("dfs/deco/Intervention_df_cleaned_deco.json")
i = 1
filepath = df['filepath'][i]
audio, sr = librosa.load(filepath, sr=16000)
reference_text = df['reference_text'][i]
print(df['accuracy'][i])
print(df['filename'][i])
pad_token_id = 58
space_token_id = 0
get_score(audio, optimal_duration_speech, min_duration_speech, reference_text, processor_phoneme, model_phoneme, processor_asr, model_asr, df_targets_decoding, overlap=False, segmenter=True)

0 2 0 0 2 0 0 0 0 2 0 2
3101_edugame2023_9a4621529c23405c8d2681287fed34a4_eae17d2661f944718181014af9a5c1c8.wav


NameError: name 'processor_phoneme' is not defined

In [16]:
import requests
import base64
import numpy as np
import scipy.io.wavfile as wav
import argparse
import sys
#import sounddevice as sd
import numpy as np
import threading
import queue
def audio_to_base64(audio_data, fs=16000):
    temp_file = "temp_audio.wav"
    wav.write(temp_file, fs, audio_data.astype(np.int16))
    with open(temp_file, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode('utf-8')
    
def process_audio(reference_text, audio_data=None, audio_file=None):
    if audio_data is None and audio_file is None:
        raise ValueError("Either audio_data or audio_file must be provided")
    if audio_file:
        with open(audio_file, "rb") as file:
            audio_base64 = base64.b64encode(file.read()).decode('utf-8')
    else:
        audio_base64 = audio_to_base64(audio_data)

    url = "http://localhost:8070/asr_pipeline"
    payload = {
        "reference_text": reference_text,
        "audio": audio_base64
    }
    response = requests.post(url, json=payload)
    if response.status_code == 201:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

audio_data, sr = librosa.load(filepath, sr=16000)
reference_text = df['reference_text'][i]
process_audio(reference_text, audio_data=audio_data)

Error: 500
{"error":"An unexpected error occurred"}



In [738]:
pip install sounddevice


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
