In [1]:
import requests,json,base64
import pandas as pd
import os
from segment_service import align_texts
import time
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, Wav2Vec2Processor

# The paths

In [2]:
# Addresses of the segment service: a local instance and a remote one.
service_address_local = "http://localhost:8070/segment"
service_address_server = "https://19af-130-60-24-13.ngrok-free.app/segment"   ### Fixed address of the remote service; Nianlong set it up on a server.

# Input locations, given relative to the notebook's working directory.
recordings_folder_path = r"../Intervention_T1_data/recordings/deco_trials/FR_deco/FR_deco"
decoding_trial_path = r"../Intervention_T1_data/ground_truth_transcribed/Decoding/deco_test_ground_truth_FR_T1_first_coder_.csv"
phon_del_trial_path = r"../Intervention_T1_data/ground_truth_transcribed/PhonDel/phonDel_test_ground_truth_ALL_T1_first_coder.csv"
# NOTE(review): path_filters is not referenced in the visible cells — confirm whether it is still needed.
path_filters = None

# Get the ASR Transcription and Alignment for all files

In [14]:
def fill_df(recordings_filepaths, df, service_address, json_path, asr_method, asr_response_col, process_time_col):
    """Transcribe recordings through the segment service and store the results in ``df``.

    For every path in ``recordings_filepaths`` whose basename appears in
    ``df["filename"]``, the segment service is called with the row's reference
    text and language. The response, the elapsed wall-clock time and the
    alignment of the human transcription against the reference text are written
    back into the matching row(s), and ``df`` is dumped to ``json_path`` after
    each processed file.

    Args:
        recordings_filepaths: Iterable of audio file paths.
        df: DataFrame with columns "filename", "reference_text", "language"
            and "words_human_transcription". It is modified in place.
        service_address: URL passed to ``call_segment_service``.
        json_path: Destination of the JSON dump (``orient='records'``).
        asr_method: Method name passed to ``call_segment_service``.
        asr_response_col: Column receiving the service response.
        process_time_col: Column receiving the call duration in seconds.

    Returns:
        The same ``df`` object.

    Note:
        ``call_segment_service`` is looked up when this function runs, so it
        must already be defined in the kernel at that point.
    """
    for recording_path in recordings_filepaths:
        recording_name = os.path.basename(recording_path)

        # Recordings that have no row in the frame are ignored.
        if recording_name not in df["filename"].values:
            continue

        row_mask = df["filename"] == recording_name
        matching_rows = df.loc[row_mask]
        reference_text = list(matching_rows["reference_text"])[0]
        language = list(matching_rows["language"])[0]

        # Time only the service call itself.
        print("Processing file: ", recording_name)
        call_started = time.time()
        response_asr = call_segment_service(service_address, recording_path, reference_text, language, asr_method)
        call_duration = time.time() - call_started
        print("ASR response: ", response_asr)

        human_transcription = list(matching_rows["words_human_transcription"])[0]
        human_alignment = {"alignment": align_texts(reference_text, human_transcription)}

        # Values are wrapped in one-element lists so each is stored as a single cell.
        df.loc[row_mask, "human_alignment"] = [human_alignment]
        df.loc[row_mask, asr_response_col] = [response_asr]
        df.loc[row_mask, process_time_col] = [call_duration]

        # Dump after every file so partial results are kept.
        df.to_json(json_path, orient='records')

    return df

In [15]:
def _with_alignment(transcription_dict, text, reference_text):
    """Attach the alignment of ``text`` against ``reference_text`` to a transcription.

    Returns ``transcription_dict`` unchanged when ``reference_text`` is falsy,
    otherwise ``{"transcription": transcription_dict, "alignment": ...}``.
    """
    if not reference_text:
        return transcription_dict
    return {
        "transcription": transcription_dict,
        "alignment": align_texts(reference_text, text),
    }


def wav2vec2(audio, sr,
            processor,
            model,
            reference_text = None,
            pool = None,
            num_processes= None,
            beam_width= None,
            beam_prune_logp = None,
            token_min_logp = None,
            hotwords = None,
            hotword_weight = None,
            alpha = None,
            beta = None,
            unk_score_offset = None,
            lm_score_boundary = None,
            output_word_offsets= False,
            output_char_offsets= False,
            n_best = 1,
            with_lm = False,
            phoneme = False,
            noise_reduction = False,
            start_offset = None,
            end_offset = None,
            slow_down = False):
    """Transcribe one audio signal with a wav2vec2 CTC model.

    Args:
        audio: 1-D sample array.
        sr: Sampling rate of ``audio`` in Hz.
        processor: Object used to build the model inputs and to decode the
            logits (``decode`` for greedy decoding, ``batch_decode`` when
            ``with_lm`` is true).
        model: Callable model whose output exposes ``.logits``.
        reference_text: If truthy, every hypothesis is aligned against it with
            ``align_texts``.
        pool, num_processes, beam_width, beam_prune_logp, token_min_logp,
        hotwords, hotword_weight, alpha, beta, unk_score_offset,
        lm_score_boundary: Forwarded unchanged to ``processor.batch_decode``
            (only used when ``with_lm`` is true and ``phoneme`` is false).
        output_word_offsets: Request word offsets from ``batch_decode``; they
            are converted to seconds and rounded to two decimals.
        output_char_offsets: Request character offsets in the ``phoneme``
            branch; they are returned as produced by the processor.
        n_best: Number of hypotheses requested from ``batch_decode``.
        with_lm: Decode with ``processor.batch_decode`` instead of greedy argmax.
        phoneme: Greedy decoding that also reports ``char_offsets``.
        noise_reduction: Apply ``nr.reduce_noise`` before inference.
        start_offset, end_offset: Optional crop window in seconds. Falsy
            values (``None`` or ``0``) mean start / end of the signal.
        slow_down: Stretch the signal to 1.1x its duration before inference.

    Returns:
        ``(transcriptions, logits)``. ``transcriptions`` is a list with one
        entry per hypothesis. Each entry is the transcription dict itself, or
        ``{"transcription": ..., "alignment": ...}`` when ``reference_text``
        is given. ``logits`` is the model output moved to CPU.
    """
    start_offset = start_offset if start_offset else 0
    end_offset = end_offset if end_offset else len(audio) / sr

    audio = audio[int(start_offset * sr):int(end_offset * sr)]

    if slow_down:
        # librosa speeds the signal up for rate > 1 and slows it down for
        # rate < 1, so slowing down by a factor of 1.1 needs the reciprocal.
        audio = librosa.effects.time_stretch(audio, rate=1 / 1.1)

    if noise_reduction:
        # NOTE(review): `nr` is not imported in the visible cells (presumably
        # `import noisereduce as nr`) — confirm it is in scope before enabling.
        audio = nr.reduce_noise(audio, sr, prop_decrease=1)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits.cpu()

    if phoneme:
        predicted_ids = torch.argmax(logits, dim=-1)
        decoded = processor.decode(predicted_ids[0], output_char_offsets=output_char_offsets)
        # The decode call yields a plain string unless offsets were requested;
        # only the offset-carrying output supports key access.
        text = (decoded if isinstance(decoded, str) else decoded["text"]).lower()
        # Character offsets are passed through unchanged (not converted to seconds).
        transcription_dict = {
            "text": text,
            "char_offsets": decoded["char_offsets"] if output_char_offsets else None,
        }
        return [_with_alignment(transcription_dict, text, reference_text)], logits

    if not with_lm:
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.decode(predicted_ids[0]).lower()
        return [_with_alignment({"text": text}, text, reference_text)], logits

    decoded = processor.batch_decode(logits.numpy(), output_word_offsets=output_word_offsets,
                                     n_best=n_best, pool=pool, num_processes=num_processes,
                                     beam_width=beam_width, beam_prune_logp=beam_prune_logp,
                                     token_min_logp=token_min_logp, hotwords=hotwords,
                                     hotword_weight=hotword_weight, alpha=alpha, beta=beta,
                                     unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary)

    texts = decoded.text
    logit_scores = decoded.logit_score
    lm_scores = decoded.lm_score
    word_offsets = decoded.word_offsets

    # With n_best == 1 each field holds one value per batch item; wrap it so
    # both cases can be indexed as field[0][hypothesis_index].
    if n_best == 1:
        texts = [texts]
        logit_scores = [logit_scores]
        lm_scores = [lm_scores]
        word_offsets = [word_offsets] if output_word_offsets else None

    if output_word_offsets:
        # Seconds per logit frame: downsampling ratio divided by the sampling rate.
        seconds_per_frame = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
        for hypothesis_offsets in word_offsets[0]:
            for word_offset in hypothesis_offsets:
                word_offset["start_offset"] = round(word_offset["start_offset"] * seconds_per_frame, 2)
                word_offset["end_offset"] = round(word_offset["end_offset"] * seconds_per_frame, 2)

    transcriptions = []
    for i, text in enumerate(texts[0]):
        transcription_dict = {
            "text": text,
            "logit_score": logit_scores[0][i],
            "lm_score": lm_scores[0][i],
            "word_offsets": word_offsets[0][i] if output_word_offsets else None,
        }
        transcriptions.append(_with_alignment(transcription_dict, text, reference_text))

    return transcriptions, logits

In [16]:
def fill_df_wav2vec2(df, json_path, asr_response_col, process_time_col, model_ID_list, 
            pool = None, num_processes= None, beam_width= None, beam_prune_logp = None,
            token_min_logp = None, hotwords = False, hotword_weight = None,
            alpha = None, beta = None, unk_score_offset = None, lm_score_boundary = None,
            output_word_offsets= False, n_best = 1, with_lm = False):
    """Transcribe every row of ``df`` with local wav2vec2 models and store the results.

    Args:
        df: DataFrame with columns "filename", "reference_text", "language",
            "filepath" and "words_human_transcription". It is modified in place.
        json_path: Destination of the JSON dump written after every processed row.
        asr_response_col: Column receiving the JSON-encoded list of hypotheses.
        process_time_col: Column receiving the processing time in seconds
            (audio loading plus inference).
        model_ID_list: Model identifiers passed to ``from_pretrained``. The
            first entry is used for language "fr", the second (if present)
            for "it".
        pool, num_processes, beam_width, beam_prune_logp, token_min_logp,
        hotword_weight, alpha, beta, unk_score_offset, lm_score_boundary,
        output_word_offsets, n_best, with_lm: Forwarded to ``wav2vec2``.
        hotwords: If truthy, the space-separated words of each row's reference
            text are passed to ``wav2vec2`` as hotwords.

    Returns:
        The same ``df`` object.

    Rows whose language has no loaded model are reported and skipped, so they
    neither crash the run nor get decoded with another language's model.
    """
    models = [Wav2Vec2ForCTC.from_pretrained(model_id) for model_id in model_ID_list]
    if with_lm:
        processors = [Wav2Vec2ProcessorWithLM.from_pretrained(model_id) for model_id in model_ID_list]
    else:
        processors = [Wav2Vec2Processor.from_pretrained(model_id) for model_id in model_ID_list]

    # The position in model_ID_list decides the language; zip stops at the
    # shorter input, so a single-model list only serves "fr".
    engines_by_language = dict(zip(("fr", "it"), zip(processors, models)))

    for _, row in df.iterrows():
        filename = row["filename"]
        reference_text = row["reference_text"]
        language = row["language"]
        filepath = row["filepath"]

        engine = engines_by_language.get(language)
        if engine is None:
            print(f"Skipping file: {filename} (no model loaded for language {language!r})")
            continue
        processor, model = engine

        print("Processing file: ", filename)
        time_start = time.time()
        hotwords_list = reference_text.split(' ') if hotwords else None
        audio, sr = librosa.load(filepath, sr=16000)
        response_asr, _logits = wav2vec2(audio=audio, sr=sr, processor=processor, model=model,
                                         reference_text=reference_text,
                                         pool=pool, num_processes=num_processes,
                                         beam_width=beam_width, beam_prune_logp=beam_prune_logp,
                                         token_min_logp=token_min_logp,
                                         hotwords=hotwords_list, hotword_weight=hotword_weight,
                                         alpha=alpha, beta=beta,
                                         unk_score_offset=unk_score_offset,
                                         lm_score_boundary=lm_score_boundary,
                                         output_word_offsets=output_word_offsets,
                                         n_best=n_best, with_lm=with_lm)
        time_end = time.time()
        print("ASR response: ", response_asr)

        # The list of hypotheses is stored as a JSON string.
        response_asr = json.dumps(response_asr)

        human_transcription = row["words_human_transcription"]

        # Values are wrapped in one-element lists so each is stored as a single cell.
        row_mask = df["filename"] == filename
        df.loc[row_mask, "human_alignment"] = [{"alignment": align_texts(reference_text, human_transcription)}]
        df.loc[row_mask, asr_response_col] = [response_asr]
        df.loc[row_mask, process_time_col] = [time_end - time_start]

        # Dump after every row so partial results are kept.
        df.to_json(json_path, orient='records')
    return df

In [17]:
# Main dataframe used by the experiment cells.
df = pd.read_json(r'dfs/deco/Intervention_df_cleaned_deco.json')

# The five numbered dataframes, loaded in order and also exposed individually.
df_list = [
    pd.read_json(f"dfs/deco/Intervention_df_{frame_no}_deco.json")
    for frame_no in range(1, 6)
]
df_1, df_2, df_3, df_4, df_5 = df_list

n_best = 1
########################################

# WhisperX, wav2vec2 XLS and VOX models as they are
whisperX_json_path = "df_results/intervention/decoding_whisperX.json"
wav2vec2_vox_json_path = "df_results/intervention/decoding_wav2vec2_vox.json"
wav2vec2_xls_json_path = "df_results/intervention/decoding_wav2vec2_xls.json"

# wav2vec2 XLS models with no LM (greedy vs beam search)
wav2vec2_xls_json_path_no_LM = "df_results/intervention/decoding_wav2vec2_xls_no_LM.json"
wav2vec2_xls_json_path_no_LM_beam = "df_results/intervention/decoding_wav2vec2_xls_no_LM_beam.json"

# wav2vec2 XLS models with LM (Ref only vs Human Transcription)
lm_Ref_wav2vec2_xls_json_path = "df_results/intervention/decoding_wav2vec2_xls_lm_Ref.json"
hotwords_wav2vec2_xls_json_path = "df_results/intervention/decoding_wav2vec2_xls_Hotwords.json"
hotwords_lm_xls_json_path = "df_results/intervention/decoding_wav2vec2_xls_lm_Hotwords.json"

# Complete set of LM models and their result files, paired by position.
# Paths use forward slashes like the rest of this cell so they resolve on
# every platform.
wav2vec2_LM_models_all = [
    "Dandan0K/Intervention-xls-FR-Hum-no-df1", "Dandan0K/Intervention-xls-FR-Hum-no-df1-vowels",
    "Dandan0K/Intervention-xls-FR-Hum-no-df2", "Dandan0K/Intervention-xls-FR-Hum-no-df2-vowels",
    "Dandan0K/Intervention-xls-FR-Hum-no-df3", "Dandan0K/Intervention-xls-FR-Hum-no-df3-vowels",
    "Dandan0K/Intervention-xls-FR-Hum-no-df4", "Dandan0K/Intervention-xls-FR-Hum-no-df4-vowels",
    "Dandan0K/Intervention-xls-FR-Hum-no-df5", "Dandan0K/Intervention-xls-FR-Hum-no-df5-vowels",
]
wav2vec2_LM_json_paths_all = [
    "df_results/intervention/decoding_wav2vec2_df_1_xls.json",
    "df_results/intervention/decoding_wav2vec2_df_1_xls_vowels.json",  # comma was missing here, merging two paths
    "df_results/intervention/decoding_wav2vec2_df_2_xls.json",
    "df_results/intervention/decoding_wav2vec2_df_2_xls_vowels.json",
    "df_results/intervention/decoding_wav2vec2_df_3_xls.json",
    "df_results/intervention/decoding_wav2vec2_df_3_xls_vowels.json",
    "df_results/intervention/decoding_wav2vec2_df_4_xls.json",
    "df_results/intervention/decoding_wav2vec2_df_4_xls_vowels.json",
    "df_results/intervention/decoding_wav2vec2_df_5_xls.json",
    "df_results/intervention/decoding_wav2vec2_df_5_xls_vowels.json",
]

# Only the df1 pair is active at the moment; widen the slice to run more models.
wav2vec2_LM_models = wav2vec2_LM_models_all[:2]
wav2vec2_LM_json_paths = wav2vec2_LM_json_paths_all[:2]

########################################
# wav2vec2 VOX and XLS models with different ratios of male/female speakers in the fine-tuning data.
# The two lists below have the same length and order, so entry i of the paths
# is the result file for entry i of the models.
# NOTE(review): the fourth path of each group is labelled "M2F2" while the
# model at the same position is "male-2_female-8" — this looks like a typo for
# "M2F8". Confirm before renaming, since result files may already exist under
# the current name.
wav2vec2_gender_json_paths = [
    # vox models
    "df_results/intervention/gender/decoding_wav2vec2_vox_M10F0.json",
    "df_results/intervention/gender/decoding_wav2vec2_vox_M8F2.json",
    "df_results/intervention/gender/decoding_wav2vec2_vox_M5F5.json",
    "df_results/intervention/gender/decoding_wav2vec2_vox_M2F2.json",
    "df_results/intervention/gender/decoding_wav2vec2_vox_M0F10.json",
    # xls models
    "df_results/intervention/gender/decoding_wav2vec2_xls_M10F0.json",
    "df_results/intervention/gender/decoding_wav2vec2_xls_M8F2.json",
    "df_results/intervention/gender/decoding_wav2vec2_xls_M5F5.json",
    "df_results/intervention/gender/decoding_wav2vec2_xls_M2F2.json",
    "df_results/intervention/gender/decoding_wav2vec2_xls_M0F10.json"
]
wav2vec2_gender_models = [
    # vox models
    "jonatasgrosman/exp_w2v2r_fr_vp-100k_gender_male-10_female-0_s714",
    "jonatasgrosman/exp_w2v2r_fr_vp-100k_gender_male-8_female-2_s911",
    "jonatasgrosman/exp_w2v2r_fr_vp-100k_gender_male-5_female-5_s722",
    "jonatasgrosman/exp_w2v2r_fr_vp-100k_gender_male-2_female-8_s473",
    "jonatasgrosman/exp_w2v2r_fr_vp-100k_gender_male-0_female-10_s934",
    # xls models
    "jonatasgrosman/exp_w2v2r_fr_xls-r_gender_male-10_female-0_s825",
    "jonatasgrosman/exp_w2v2r_fr_xls-r_gender_male-8_female-2_s755",
    "jonatasgrosman/exp_w2v2r_fr_xls-r_gender_male-5_female-5_s916",
    "jonatasgrosman/exp_w2v2r_fr_xls-r_gender_male-2_female-8_s886",
    "jonatasgrosman/exp_w2v2r_fr_xls-r_gender_male-0_female-10_s895"
]

In [19]:
"""
df_asr = df
df_wav2vec2_xls_no_LM =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = wav2vec2_xls_json_path_no_LM,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-no-LM"],
                        )  
print("Done with wav2vec2_xls_no_LM")

df_asr = df
df_wav2vec2_xls_no_LM_beam=  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = wav2vec2_xls_json_path_no_LM_beam,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-no-LM"],
                            n_best = n_best,
                            with_lm = True,
                        )  
print("Done with df_wav2vec2_xls_no_LM_beam")



for index, (wav2vec2_model, json_path) in enumerate(zip(wav2vec2_LM_models, wav2vec2_LM_json_paths)):
    print(f"Processing model: {wav2vec2_model}")
    print(f"JSON path: {json_path}")
    df_wav2vec2 = fill_df_wav2vec2(
                            df=df_list[index // 2],  # Assuming each model corresponds to every two dataframes
                            json_path=json_path,
                            asr_response_col='asr_response',
                            process_time_col='process_time',
                            model_ID_list=[wav2vec2_model],
                            n_best=n_best,
                            with_lm=True
                        )
    print(f"Done with model: {wav2vec2_model}")

    
df_asr = df
df_whisperx =  fill_df(
                       df = df_asr, 
                       service_address = service_address_server, 
                       json_path = whisperX_json_path, 
                       asr_method = 'whisperx', 
                       asr_response_col = 'asr_response',
                       process_time_col = 'process_time')
print("Done with WhisperX") 

df_asr = df
df_wav2vec2_vox =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = wav2vec2_vox_json_path,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["jonatasgrosman/exp_w2v2t_fr_vp-100k_s973", 
                                           "jonatasgrosman/exp_w2v2t_it_vp-100k_s449"],
                            n_best = n_best,
                        )  
print("Done with wav2vec2_vox")


df_asr = df
df_wav2vec2_xls =  fill_df_wav2vec2(
                        df = df_asr, 
                        json_path = wav2vec2_xls_json_path, 
                        asr_response_col = 'asr_response',
                        process_time_col = 'process_time',
                        model_ID_list = ["jonatasgrosman/wav2vec2-xls-r-1b-french", 
                                        "jonatasgrosman/wav2vec2-xls-r-1b-italian"],
                        n_best = n_best,
                        with_lm = True)
print("Done with wav2vec2_xls")

for wav2vec2_model, json_path in zip(wav2vec2_gender_models, wav2vec2_gender_json_paths):
    df_asr = df
    print("Processing model: ", wav2vec2_model)
    print(" json path: ", json_path)
    df_wav2vec2 =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = json_path,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = [wav2vec2_model],
                        )  
    print("Done with model: ", wav2vec2_model)
"""

'\ndf_asr = df\ndf_whisperx =  fill_df(\n                       df = df_asr, \n                       service_address = service_address_server, \n                       json_path = whisperX_json_path, \n                       asr_method = \'whisperx\', \n                       asr_response_col = \'asr_response\',\n                       process_time_col = \'process_time\')\nprint("Done with WhisperX") \n\ndf_asr = df\ndf_wav2vec2_vox =  fill_df_wav2vec2(\n                            df = df_asr,\n                            json_path = wav2vec2_vox_json_path,\n                            asr_response_col = \'asr_response\',\n                            process_time_col = \'process_time\',\n                            model_ID_list = ["jonatasgrosman/exp_w2v2t_fr_vp-100k_s973", \n                                           "jonatasgrosman/exp_w2v2t_it_vp-100k_s449"],\n                            n_best = n_best,\n                        )  \nprint("Done with wav2vec2_vox")\n\n\ndf_asr = 

In [None]:
# Each run works on its own copy of `df`: fill_df_wav2vec2 writes its results
# into the frame it receives, so sharing one object would make all three
# result variables point at the same frame holding only the last run's output.

# XLS model without LM, greedy decoding.
df_asr = df.copy()
df_wav2vec2_xls_no_LM =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = wav2vec2_xls_json_path_no_LM,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-no-LM"],
                        )
print("Done with wav2vec2_xls_no_LM")

# XLS "Ref" model decoded with hotwords taken from each reference text.
df_asr = df.copy()
df_wav2vec2_xls_hotwords_lm =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = hotwords_lm_xls_json_path,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-Ref"],
                            n_best = n_best,
                            with_lm = True,
                            hotwords= True
                        )
print("Done with df_wav2vec2_xls_hotwords_lm")

# XLS "no-LM" model decoded through the with_lm path with hotwords.
df_asr = df.copy()
df_wav2vec2_xls_hotwords =  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = hotwords_wav2vec2_xls_json_path,
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-no-LM"],
                            n_best = n_best,
                            with_lm = True,
                            hotwords= True
                        )
print("Done with df_wav2vec2_xls_hotwords")

# Change alpha and beta values, change unk_score 


In [25]:
# Run the XLS "Ref" model (LM decoding) on the frame loaded from the
# "removed" decoding file.
df_asr = pd.read_json('dfs/deco/Intervention_Decoding_removed_df.json')
print(len(df_asr))
df_wav2vec2_Ref_rem=  fill_df_wav2vec2(
                            df = df_asr,
                            json_path = "df_results/intervention/decoding_wav2vec2_xls_lm_Ref_del.json",
                            asr_response_col = 'asr_response',
                            process_time_col = 'process_time',
                            model_ID_list = ["Dandan0K/Intervention-xls-FR-Ref"],
                            n_best = n_best,
                            with_lm = True,
                        )
# The message previously named a different run (copy-paste leftover).
print("Done with df_wav2vec2_Ref_rem")

83


Some weights of the model checkpoint at Dandan0K/Intervention-xls-FR-Ref were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Dandan0K/Intervention-xls-FR-Ref and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN t

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Only 218 unigrams passed as vocabulary. Is this small or artificial data?


Processing file:  3103_edugame2023_0828b44e3f534a3eb509622760a0e1d2_08daac6429824018974fa150e351a7c5.wav
ASR response:  [{'transcription': {'text': 'prumeu chambre demande si-tu citis le crangrand freumevelau lieux hôtel crodon chanson', 'logit_score': -65.54761182875558, 'lm_score': -178.8191719352602, 'word_offsets': None}, 'alignment': [('haie', None, 'deletion'), ('brume', 'prumeu', 'substitution'), ('chambre', 'chambre', 'match'), ('demande', 'demande', 'match'), (None, 'si', 'insertion'), (None, 'tu', 'insertion'), (None, 'citis', 'insertion'), ('style', 'le', 'substitution'), ('crâne', 'crangrand', 'substitution'), ('fureur', 'freumevelau', 'substitution'), ('manivelle', 'lieux', 'substitution'), ('hôtel', 'hôtel', 'match'), ('cordon', 'crodon', 'substitution'), ('chanson', 'chanson', 'match'), ('pommier', None, 'deletion')]}]
Processing file:  3110_edugame2023_98b26f17034f480a97b33384488bb831_575fcc0e5d2b421089138c4870a1387b.wav


  df.loc[df["filename"] == filename, process_time_col] = [time_end - time_start]


ASR response:  [{'transcription': {'text': 'piège femme secret finit mulserf-journa précision dix duillusion débarquement', 'logit_score': -38.12207549858078, 'lm_score': -78.54638199274268, 'word_offsets': None}, 'alignment': [('piège', 'piège', 'match'), ('femme', 'femme', 'match'), ('secret', 'secret', 'match'), ('finit', 'finit', 'match'), ('mille', None, 'deletion'), ('cerf', 'mulserf', 'substitution'), ('jardin', 'journa', 'substitution'), ('précision', 'précision', 'match'), ('dix', 'dix', 'match'), ('lieux', None, 'deletion'), ('million', 'duillusion', 'substitution'), ('débarquement', 'débarquement', 'match')]}]
Processing file:  3119_edugame2023_5e6440930fc44670921d8a8b496bd499_1fac4df4c02140ada716303b8a4e2c12.wav
ASR response:  [{'transcription': {'text': 'joue roi-ran toit quantsoit tout doncen soissonte', 'logit_score': -29.331292382009728, 'lm_score': -112.70962852486622, 'word_offsets': None}, 'alignment': [('rof', None, 'deletion'), ('rari', None, 'deletion'), ('tuit', 

In [None]:
# Scratch check: load one recording at 16 kHz and display the sample array.
sample_recording_path = '../Intervention_T1_data/recordings/deco_trials/FR_deco/FR_deco/3101_edugame2023_9a4621529c23405c8d2681287fed34a4_b0b4cf5f585b4888a333f7b73d48e62e.wav'
audio, sr = librosa.load(sample_recording_path, sr=16000)
audio

array([ 1.9794676e-05,  3.3159624e-05,  2.9026451e-05, ...,
       -1.2594093e-04, -5.1501807e-04, -6.2957359e-04], dtype=float32)

### Brouillon

Notes:

- I need to know the complexity level of the reference texts so that I can evaluate performance with respect to this criterion
- Some audio files are not processed by WhisperX, others not by IBM Watson, and others by neither --> the ASR returns an empty response
- The warning shown when loading wav2vec2 models disappears when running from huggingsound_env instead of asr


##### Caller function (Todo: Update)

In [None]:
def call_segment_service( service_address, 
                          audio_file_path,
                          reference_text,
                          language,
                          method,
                          model_ID_list = None,
                          with_LM = False ):
    """
    Calls the segment service with the provided parameters.

    The audio file is read, base64-encoded and sent together with the other
    arguments as a JSON body in a POST request.

    Args:
      service_address (str): The address of the segment service.
      audio_file_path (str): The path to the audio file.
      reference_text (str): The reference text for the audio.
      language (str): The language of the audio and reference text.
      method (str): The method to be used by the segment service.
      model_ID_list (list): The list of model IDs to be used by the segment service.
      with_LM (bool): Whether to use the language model or not.

    Returns:
      dict: The JSON response from the segment service.

    Raises:
      OSError: If the audio file cannot be opened or read.
    """
    # Read through a context manager so the file handle is always closed.
    with open(audio_file_path, 'rb') as audio_file:
        audio_file_base64_string = base64.b64encode(audio_file.read()).decode('ASCII')

    payload = {
        "audio_file_base64_string": audio_file_base64_string,
        "reference_text": reference_text,
        "language": language,
        "method": method,
        "model_ID_list": model_ID_list,
        "with_LM": with_LM
    }
    response = requests.post( service_address,
                              data = json.dumps(payload),
                              headers = {"Content-Type": "application/json"}
                            )
    return response.json()

''' EXEMPLE OF USAGE
call_segment_service( 
            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.
            audio_file_path = audio_path,  ### path to the audio file
            reference_text = reference_text,
            language = language,   ### "it" for "italian", "fr" for "french", "de" for "german"
            method = "ibm-watson"  ### The name of the model, can be either "whisperx" or "ibm-watson"
        )
'''

' EXEMPLE OF USAGE\ncall_segment_service( \n            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.\n            audio_file_path = audio_path,  ### path to the audio file\n            reference_text = reference_text,\n            language = language,   ### "it" for "italian", "fr" for "french", "de" for "german"\n            method = "ibm-watson"  ### The name of the model, can be either "whisperx" or "ibm-watson"\n        )\n'

In [None]:
"""
call_segment_service( 
            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.
            audio_file_path = recordings_filepaths[1],  ### path to the audio file
            reference_text = 'a b c',
            language = 'fr',   ### "it" for "italian", "fr" for "french", "de" for "german"
            method = "whisperx"  ### The name of the model, can be either "whisperx" or "ibm-watson"
  
        )

call_segment_service( 
            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.
            audio_file_path = recordings_filepaths[4],  ### path to the audio file
            reference_text = 'a b c',
            language = 'fr',   ### "it" for "italian", "fr" for "french", "de" for "german"
            method = "wav2vec2",  ### The name of the model, can be either "whisperx" or "ibm-watson"
            model_ID_list = ["jonatasgrosman/wav2vec2-xls-r-1b-french", 
                            "jonatasgrosman/wav2vec2-xls-r-1b-italian"],
            with_LM=False
        )

call_segment_service( 
            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.
            audio_file_path = recordings_filepaths[4],  ### path to the audio file
            reference_text = 'a b c',
            language = 'fr',   ### "it" for "italian", "fr" for "french", "de" for "german"
            method = "wav2vec2",  ### The name of the model, can be either "whisperx" or "ibm-watson"
            model_ID_list = ["jonatasgrosman/wav2vec2-xls-r-1b-french", 
                            "jonatasgrosman/wav2vec2-xls-r-1b-italian"],
            with_LM=True
        )
"""

'\ncall_segment_service( \n            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.\n            audio_file_path = recordings_filepaths[1],  ### path to the audio file\n            reference_text = \'a b c\',\n            language = \'fr\',   ### "it" for "italian", "fr" for "french", "de" for "german"\n            method = "whisperx"  ### The name of the model, can be either "whisperx" or "ibm-watson"\n  \n        )\n\ncall_segment_service( \n            service_address = "http://localhost:8070/segment",  ### This is the service address, default on 8070 port.\n            audio_file_path = recordings_filepaths[4],  ### path to the audio file\n            reference_text = \'a b c\',\n            language = \'fr\',   ### "it" for "italian", "fr" for "french", "de" for "german"\n            method = "wav2vec2",  ### The name of the model, can be either "whisperx" or "ibm-watson"\n            model_ID_list = ["jonatasgrosman/wa