# Get an idea of the data

In [223]:
import pandas as pd
import torch
from datasets import Dataset,  load_metric, Audio
from transformers import  Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import TrainingArguments,  Trainer
import wandb
import IPython.display as ipd
import numpy as np
import random
import librosa


### Paths of the recording folder and the two dataframes we are working with

In [224]:
recordings_folder_path = r"../Intervention_T1_data/recordings/deco_trials/FR_deco/FR_deco"
df = pd.read_json("df_results/intervention/decoding_wav2vec2_xls_lm_Ref.json")


# 20% of the dataset (60% for training, 20% for validation, 20% for testing)
# split is speaker-independent and correct number of words is evenly distributed across the splits
df1 = pd.read_json("dfs/Intervention_df_1.json")

### Some information regarding the data: number of files and total audio duration per config_id ( = list of (pseudo)words )

In [225]:
print('----Total number of files per config_id in the whole dataset----')
print(df.groupby('config_id').size())


----Total number of files per config_id in the whole dataset----
config_id
config_A_complex_1    51
config_A_complex_2    52
config_A_easy_1       51
config_A_easy_2       51
config_A_pseudo_1     47
config_A_pseudo_2     47
config_B_complex_1    57
config_B_complex_2    57
config_B_easy_1       57
config_B_easy_2       57
config_B_pseudo_1     55
config_B_pseudo_2     55
config_C_complex_1    57
config_C_complex_2    56
config_C_easy_1       57
config_C_easy_2       57
config_C_pseudo_1     53
config_C_pseudo_2     53
dtype: int64


In [226]:
print('----Total number of files per config_id for 1/5 of the dataset----')
print(df1.groupby('config_id').size())

----Total number of files per config_id for 1/5 of the dataset----
config_id
config_A_complex_1     8
config_A_complex_2     9
config_A_easy_1       10
config_A_easy_2       10
config_A_pseudo_1      9
config_A_pseudo_2      9
config_B_complex_1    11
config_B_complex_2    11
config_B_easy_1       11
config_B_easy_2       11
config_B_pseudo_1     10
config_B_pseudo_2     11
config_C_complex_1    10
config_C_complex_2     9
config_C_easy_1       11
config_C_easy_2       10
config_C_pseudo_1     11
config_C_pseudo_2     10
dtype: int64


In [227]:
print('----Total audio duration per config_id for the whole dataset----')
print(df.groupby('config_id')['audio_length'].sum())

----Total audio duration per config_id for the whole dataset----
config_id
config_A_complex_1    1103.916698
config_A_complex_2    1470.180571
config_A_easy_1       1132.421079
config_A_easy_2       1181.552762
config_A_pseudo_1     1055.631238
config_A_pseudo_2     1311.914667
config_B_complex_1    1610.688254
config_B_complex_2    1733.065143
config_B_easy_1       1534.857905
config_B_easy_2       1568.918984
config_B_pseudo_1     1291.475556
config_B_pseudo_2     1630.198349
config_C_complex_1    1437.849651
config_C_complex_2    1732.377524
config_C_easy_1       1357.399619
config_C_easy_2       1282.460952
config_C_pseudo_1     1261.804698
config_C_pseudo_2     1535.687492
Name: audio_length, dtype: float64


In [228]:
print('----Total audio duration per config_id for 1/5 of the dataset----')
print(df1.groupby('config_id')['audio_length'].sum())

----Total audio duration per config_id for 1/5 of the dataset----
config_id
config_A_complex_1    175.701333
config_A_complex_2    273.396825
config_A_easy_1       227.887746
config_A_easy_2       225.350095
config_A_pseudo_1     198.453841
config_A_pseudo_2     252.580571
config_B_complex_1    271.818413
config_B_complex_2    298.133333
config_B_easy_1       260.843429
config_B_easy_2       267.996063
config_B_pseudo_1     182.414476
config_B_pseudo_2     303.804444
config_C_complex_1    230.680381
config_C_complex_2    262.193778
config_C_easy_1       216.523175
config_C_easy_2       217.417143
config_C_pseudo_1     272.531302
config_C_pseudo_2     301.691937
Name: audio_length, dtype: float64


In [229]:
print(df.columns)


Index(['participant_id', 'session_id', 'language', 'session_form', 'phase',
       'config_id', 'filename', 'reference_text', 'human_transcription',
       'accuracy', 'notes', 'comments', 'human_alignment', 'asr_response',
       'asr_comparison', 'audio_length', 'process_time',
       'words_human_transcription'],
      dtype='object')


#### Important columns:
- reference_text: contains the list of (pseudo)word that was read by the child
- words_human_transcription: contains the transcription without the annotations and after being normalized (see cell below)
- human_alignment: contains the alignment between the clinician and asr transcription
- filename: unique identifier

#### Transcription Normalization

Get transcriptions without the annotations and write them in the words_human_transcription column. These will be fed to the ASR during training so they need to contain what was said without any comment or mark.
- Remove the sounding out annotation '.' in the transcription : pa.pa --> papa
- Remove the insertion annotation'{...}' in the transcription : pa{pu}pa --> papupa
- Remove the comments {inaudible}, {comments}, {adult_support}, {adult_task}, {no audio}, {pause}, {cut recording}


The transcription are not consistent within rows and a given word can be written differently. We need to normalize the spelling of words that were said correctly to get consistent labels for the finetuning task and to be able to evaluate accurately the performance of the ASR. To do this we create a replacement dictionnary (below) that maps a certain spelling to the normalized spelling. 

In [230]:
# small extract of the replacement dictionary applied to the transcriptions
replacements = {
    "parosse": "paroce",
    "re": "reux",
    "reu": "reux",
    "bante": "bente",
    "englage": "anglage",
    "anglaje": "anglage",
    "trèb": "trèbe",
    "piro": "pireau",
    "lizoi": "lisoie",
    "milvené": "milvenet",
    "conjo": "conjeau",
    "éteur": "héteur",
    "eteur": "héteur",
    "et": "haie",
    "é": "haie",
    "aie": "haie",
    "ai": " haie",    
    "hotel": "hôtel",
    "otel": "hôtel",    
    "otèl": "hôtel",
    "pomier": "pommier",
    "pomié": "pommier",
    "pommié": "pommier",
}

## Get a look at the data

In [231]:
# Convert to Audio Datasetfor all the dataframes in the lists

# Replace column filename of the dataframe by an audio column with the path to the recordings folder
df['audio'] = recordings_folder_path + "/" + df['filename']
dataset = Dataset.from_pandas(df).cast_column("audio", Audio())
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) 

In [232]:
def wav2vec2(audio_path, 
            processor, 
            model, 
            reference_text = None, 
            pool = None, 
            num_processes= None,
            beam_width= None,
            beam_prune_logp = None,
            token_min_logp = None,
            hotwords = None,
            hotword_weight = None,
            alpha = None,
            beta = None,
            unk_score_offset = None,
            lm_score_boundary = None,
            output_word_offsets= False,
            n_best = 1,
            with_lm = False,):
    
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)

    #inputs = inputs.to('cuda')

    with torch.no_grad():
        logits = model(**inputs).logits.cpu()

    if not with_lm:
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription_text = processor.decode(predicted_ids[0]).lower()
        transcription_dict = {"text": transcription_text}
        alignment_result =  None
        result = {
            "transcription": transcription_dict,
            "alignment": alignment_result
        } if reference_text else transcription_dict
        transcriptions = [result]
        #TODO: topk_values, topk_indices = torch.topk(logits, k=n_best, dim=-1)
    else:
        transcription = processor.batch_decode(logits.numpy(), output_word_offsets=output_word_offsets, 
                                               n_best =n_best)

        # if only one transcription is returned, convert it to a list for consistency
        if n_best == 1 or not with_lm:
            transcription.text = [transcription.text]
            transcription.logit_score = [transcription.logit_score] if with_lm else None
            transcription.lm_score = [transcription.lm_score] if with_lm else None
            transcription.word_offsets = [transcription.word_offsets] if output_word_offsets else None

        # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate
        if output_word_offsets == True:
            time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
            for i in range(len(transcription.word_offsets[0])): 
                for dict in transcription.word_offsets[0][i]:
                    dict["start_offset"] = round(dict["start_offset"] * time_offset, 2)
                    dict["end_offset"] = round(dict["end_offset"] * time_offset, 2)

        
        # change output format to dictionary
        transcriptions = []
        
        for i in range(len(transcription.text[0])):
            transcription_dict = {"text": transcription.text[0][i],
                                    "logit_score": transcription.logit_score[0][i],
                                    "lm_score": transcription.lm_score[0][i] if with_lm else None,
                                    "word_offsets": transcription.word_offsets[0][i] if output_word_offsets else None
                                }
            alignment_result = None

            result = {
                "transcription": transcription_dict,
                "alignment": alignment_result
            } if reference_text else transcription_dict
            
            transcriptions.append(result)

    return transcriptions, logits

In [233]:
# model at https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-french/tree/main 
# with language model containing only the reference words (what they are SUPPOSED to read)



model = Wav2Vec2ForCTC.from_pretrained('Cnam-LMSSC/wav2vec2-french-phonemizer')
processor = Wav2Vec2Processor.from_pretrained('Cnam-LMSSC/wav2vec2-french-phonemizer')

model = Wav2Vec2ForCTC.from_pretrained('Dandan0K/xls_1b_decoding_fr_decoding_test_iter')
processor = Wav2Vec2Processor.from_pretrained('Dandan0K/xls_1b_decoding_fr_decoding_test_iter')


Some weights of the model checkpoint at Cnam-LMSSC/wav2vec2-french-phonemizer were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Cnam-LMSSC/wav2vec2-french-phonemizer and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probab

config.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

In [234]:
rand_int = random.randint(0, len(dataset)-1)
print("Random Integer:", rand_int)
print("Reference Text:", dataset[rand_int]["reference_text"])
print("Transcript:", dataset[rand_int]["words_human_transcription"])
print("Annotated:", dataset[rand_int]["human_transcription"])
print("Accuracy:", dataset[rand_int]["accuracy"])
print("Input Array Ahape:", dataset[rand_int]["audio"]["array"].shape)
print("Sampling Rate:", dataset[rand_int]["audio"]["sampling_rate"])
filename = dataset[rand_int]["audio"]["path"]
ipd.Audio(data=dataset[rand_int]["audio"]["array"], autoplay=True, rate=16000)


Random Integer: 863
Reference Text: rof rari tuit cutice zeux bumon bante perné loif bonfage jeul trème
Transcript: rof rarrari fui cutice zeu dumon dente perné jjoif bonfa bonfage jeul jeul jeul trème
Annotated: rof {r.rar.}rari fui cutisse zeu du.mon dente pér.né {j.}joif bon.fa bonfage jeul jeul jeul trème
Accuracy: 2 1 0 2 2 0 2 1 1 2 2 2
Input Array Ahape: (453161,)
Sampling Rate: 16000


In [235]:
filepath = dataset[rand_int]["audio"]["path"]
filename = filepath.split("/")[-1]
reference_text = dataset[rand_int]["reference_text"]

# find the row in the dataframe with the same filename
response = df[df['filename'] == filename]['asr_response'].values[0]
response = eval(response.replace('null', 'None')) # contains top 3 responses
response[0] # top 1 response

{'transcription': {'text': 'rof rari fidciz-jmon- dontsperne-joaf bonfage jeul trème',
  'logit_score': -80.97405585026841,
  'lm_score': -119.38878566787353,
  'word_offsets': None},
 'alignment': [['rof', 'rof', 'match'],
  ['rari', 'rari', 'match'],
  ['tuit', None, 'deletion'],
  ['cutice', None, 'deletion'],
  ['zeux', None, 'deletion'],
  ['bumon', 'fidciz', 'substitution'],
  ['bante', 'jmon', 'substitution'],
  ['perné', 'dontsperne', 'substitution'],
  ['loif', 'joaf', 'substitution'],
  ['bonfage', 'bonfage', 'match'],
  ['jeul', 'jeul', 'match'],
  ['trème', 'trème', 'match']]}

In [236]:
# get the transcription from the model (without alignment and word offsets)
transcription, logits = wav2vec2(filepath, processor=processor, model=model, reference_text= reference_text) 
#transcription = wav2vec2(filepath, processor=processor, model=model, reference_text= reference_text, with_lm=True, n_best=1) 

transcription, logits


([{'transcription': {'text': 'rof rarrari fui cutice zeu dumon dente perné jjoif bonfa bonfage jeul jeul jeul trème'},
   'alignment': None}],
 tensor([[[ 0.0202, -1.7140, -2.0183,  ..., -0.7620,  0.2985, -0.7207],
          [-2.3730, -1.0241, -1.2727,  ..., -0.2516,  0.1246, -0.3987],
          [13.2701, -5.3995, -5.5242,  ..., -3.1928, -2.4611, -3.3778],
          ...,
          [14.5085, -6.6453, -6.6698,  ..., -3.8190, -2.7451, -4.0098],
          [14.0889, -6.5835, -6.4120,  ..., -3.6410, -2.6628, -4.0710],
          [-0.3543, -2.3185, -2.3499,  ..., -0.4469, -0.3250, -1.0011]]]))

In [237]:
logits.shape

torch.Size([1, 1415, 47])

In [238]:
print(logits.shape)
pred_id = torch.argmax(logits, dim=-1)[0].tolist()
# show the values of pred_id different from 0 (with index)
print([(i, pred_id[i]) for i in range(len(pred_id)) if pred_id[i] != 33])



torch.Size([1, 1415, 47])
[(0, 4), (1, 4), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 0), (16, 0), (17, 0), (18, 0), (19, 0), (20, 0), (21, 0), (22, 0), (23, 0), (24, 0), (25, 0), (26, 0), (27, 0), (28, 0), (29, 0), (30, 0), (31, 0), (32, 0), (33, 0), (34, 0), (35, 0), (36, 0), (37, 0), (38, 0), (39, 0), (40, 0), (41, 0), (42, 0), (43, 0), (44, 0), (45, 0), (46, 0), (47, 0), (48, 0), (49, 0), (50, 0), (51, 0), (52, 0), (53, 0), (54, 0), (55, 0), (56, 0), (57, 0), (58, 0), (59, 0), (60, 0), (61, 0), (62, 0), (63, 0), (64, 0), (65, 0), (66, 0), (67, 0), (68, 0), (69, 0), (70, 0), (71, 0), (72, 0), (73, 0), (74, 0), (75, 0), (76, 0), (77, 0), (78, 0), (79, 0), (80, 0), (81, 0), (82, 0), (83, 0), (84, 0), (85, 0), (86, 0), (87, 0), (88, 0), (89, 0), (90, 0), (91, 0), (92, 0), (93, 0), (94, 0), (95, 0), (96, 0), (97, 0), (98, 0), (99, 0), (100, 0), (101, 0), (102, 0), (103, 0), (104, 0), (105, 0), (106, 0), (107, 0), (10

In [239]:
audio, sr = librosa.load(filepath, sr=16000)
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
logits = model(input_values=inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['rof rarrari fui cutice zeu dumon dente perné jjoif bonfa bonfage jeul jeul jeul trème']

In [240]:
print(inputs['input_values'][0])
print(inputs['attention_mask'][0])
print(logits[0])

tensor([ 0.0006,  0.0008,  0.0007,  ...,  0.0489, -0.0524, -0.0528])
tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32)
tensor([[ 0.0202, -1.7140, -2.0183,  ..., -0.7620,  0.2985, -0.7207],
        [-2.3730, -1.0241, -1.2727,  ..., -0.2516,  0.1246, -0.3987],
        [13.2701, -5.3995, -5.5242,  ..., -3.1928, -2.4611, -3.3778],
        ...,
        [14.5085, -6.6453, -6.6698,  ..., -3.8190, -2.7451, -4.0098],
        [14.0889, -6.5835, -6.4120,  ..., -3.6410, -2.6628, -4.0710],
        [-0.3543, -2.3185, -2.3499,  ..., -0.4469, -0.3250, -1.0011]],
       grad_fn=<SelectBackward0>)


In [241]:
processor.batch_decode(pred_id)

['',
 '',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',


Questions when finetuning
- Downstream task ? ASR first to get word offsets then classification
- Which metric ? Wer or Cer
- Since child speech signal is different from adult speech signal,should I freeze the feature encoder layer (as done in the paper and in the tutorial for finetuning the model)
- If yes, and since little to no asr data featuring french children, should I finetune the model on child speech in other languages first with unfrozen feature encoder layer and then finetune that model on my data (frozen encoder layer) ?

# Why not encode transcriptions in IPA and train w2v2 phoneme ?

In [257]:
#!pip install text2ipa

from text2ipa import get_IPAs
bulk = ['rrofe', 'cerf','ruit', 'milvenet', 'lisoie', 'clef', 'conjeau', 'héteur', 'haie', 'hôtel', 'pommier', 'anglage', 'bente', 'paroce', 'pireau', 'trèbe', 'aout', 'août']
language = 'fr'
# Convert a list of text to English UK IPA
IPAs = get_IPAs(bulk,language)
print(IPAs)

['ʀʀɔf', 'sɛʀf', 'ʀɥi', 'milvənɛ', 'lizwa', 'klɛf', 'kɔ̃ʒo', 'etœʀ', 'ə', 'otəl', 'pɔmjəʀ', 'ɑ̃ɡlaʒ', 'bɑ̃tə', 'paʀɔz', 'piʀo', 'tʀə̀b', 'au', 'au']
