In [3]:
import pandas as pd
import numpy as np
import re
import os
from transformers import AutoProcessor
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2ProcessorWithLM
from huggingface_hub import Repository,HfApi, snapshot_download
from datasets import load_dataset
from itertools import islice
import random
from itertools import islice
from datasets import load_dataset
import shutil


In [4]:
# Creates the text files containing the words to be added to the LM for each language
df = pd.read_json(r'dfs/deco/Intervention_df_cleaned_deco.json')
df_1 = pd.read_json(r'dfs/deco/Intervention_df_1_deco.json')
df_2 = pd.read_json(r'dfs/deco/Intervention_df_2_deco.json')
df_3 = pd.read_json(r'dfs/deco/Intervention_df_3_deco.json')
df_4 = pd.read_json(r'dfs/deco/Intervention_df_4_deco.json')
df_5 = pd.read_json(r'dfs/deco/Intervention_df_5_deco.json')
dataframes = [df_1, df_2, df_3, df_4, df_5]

## Create the text files for the LM

In [5]:
# LM containing the words from the reference_text column
with open("LM/intervention/Fr_Ref.txt", "w") as file:
  ref_transcriptions = df[df["language"] == "fr"]["reference_text"]
  file.write("\n".join(ref_transcriptions))

# Remove duplicates words from the LM/intervention/Fr_Ref.txt text files to generate possible mispronounciations
with open("LM/intervention/Fr_Ref.txt", "r") as input_file:
    unique_ref_transcriptions = set(ref_transcriptions)
    with open("LM/intervention/Fr_Ref_unique.txt", "w") as output_file:
        output_file.write("\n".join(unique_ref_transcriptions))

# LM containing the words from the human_transcription column (so with the potential mispronounciations)
for i, df_to_exclude in enumerate(dataframes, start=1):
    df_no_excluded = pd.concat([df for j, df in enumerate(dataframes) if j != i-1], ignore_index=True)
    with open(f"LM/intervention/Fr_Hum_no_df{i}.txt", "w") as file:
        french_transcriptions = df_no_excluded[df_no_excluded["language"] == "fr"]["words_human_transcription"]
        file.write("\n".join(french_transcriptions))
        file.write("\n".join(ref_transcriptions))


In [6]:
# generate the mispronounciations
fr_dict_voyelles = {
    'a': ['ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'ai': ['a', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "ia"],
    'aie': ['a', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "ia", "ae", "ei", "iea", "eai", "eia", "iae", "aei"],
    'au': ['a', 'e', 'é', 'è', 'i', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "ua"],
    'eau': ['a', 'e', 'é', 'è', 'i', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "uae", "aeu", "aue", "uea", "eua", "ea", "ae", "ua", "ue"],
    'e': ['a', 'ai', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'é': ['a', 'ai', 'e', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'è': ['a', 'ai', 'e', 'é', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'i': ['a', 'ai', 'e', 'é', 'è', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'ie': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "ei"],
    'o': ['a', 'ai', 'e', 'é', 'è', 'i', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'oi': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "io"],
    'ou': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'u', 'eu', 'on', 'an', 'en', "in", "uo"],
    'u': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'eu', 'on', 'an', 'en', "in"],
    'y': ['a', 'ai', 'e', 'é', 'è', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'eu': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'on', 'an', 'en', "in", "ue"],
    'on': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'an', 'en', "in", "no", 'one'],
    'an': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', "in", "na", "ane"],
    'en': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', "in", "ne"],
    'in': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', "en", "ni"],
    'oeu': ['a', 'ai', 'e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'on', 'an', 'en', "in", "euo", "oue", "eou", "ueo", "uoe", "oe", "eo", "uo", "ue"],
    'ê': ['a','e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'ô': ['a','e', 'é', 'è', 'i','oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in"],
    'eui': ['a','e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "uei", "eiu", "ieu","iue", "eui", "ue", "ei", "ui", "iu"],
    'aoû': ['a','e', 'é', 'è', 'i', 'o', 'oi', 'u', 'eu', 'on', 'an', 'en', "in", "ua", "oua", "auo", "aou", "oa", "ua", "uo"],
    'ion': ['a','e', 'é', 'è', 'i', 'o', 'oi', 'ou', 'u', 'eu', 'on', 'an', 'en', "in", "noi", "oin", "nio", "ino", "ion", "no", "oi", "ni", "io"],
}

def generate_mispronunciations(word, dict_voyelles, index=0, mispronunciations=None):
    #print(f"Here we go")
    if mispronunciations is None:
        mispronunciations = set()

    if index >= len(word):
        return mispronunciations
    
    # Iterate over the word and find the earliest and biggest vowel
    earliest_vowel_index = len(word)  # Initialize with a value greater than any possible index
    longest_vowel_length = 0
    vowel_found = False

    #print(f"Word: {word[index:]} at index {index}")
    for i, c in enumerate(word[index:], start=index):
        if vowel_found:
            break
        for key in sorted(dict_voyelles.keys(), key=len, reverse=True):
            if key.startswith(c):
                if word[i:i + len(key)] == key:
                    if not vowel_found:
                        earliest_vowel_index = i
                        longest_vowel_length = len(key)
                        vowel_found = True
                        for value in dict_voyelles[key]:
                            new_word = word[:earliest_vowel_index] + value + word[earliest_vowel_index + longest_vowel_length:]
                            mispronunciations.add(new_word)
                        break
        
    # If no vowel is found, return the original word
    if earliest_vowel_index == len(word):
        return mispronunciations
    
    generate_mispronunciations(word, dict_voyelles, earliest_vowel_index + longest_vowel_length, mispronunciations)
    return mispronunciations
    
    
# Example usage
word = "poire"
result = generate_mispronunciations(word, dict_voyelles=fr_dict_voyelles)
print(result)


{'panre', 'poiroi', 'poiré', 'poiren', 'poiru', 'poira', 'penre', 'pure', 'père', 'poiro', 'poirou', 'piore', 'pere', 'poireu', 'ponre', 'poirai', 'poure', 'poiri', 'pare', 'poirè', 'pire', 'pinre', 'pore', 'peure', 'poiran', 'paire', 'poiron', 'pére', 'poirin'}


In [7]:
with open("LM/intervention/Fr_Ref_unique.txt", "r") as file:
    sentences = file.readlines()
all_mispronunciations = {}
sentences_misspronounced = []
for sentence in sentences:
    words = sentence.split()
    for word in words:
        mispronunciations = generate_mispronunciations(word, fr_dict_voyelles)
        all_mispronunciations[word] = mispronunciations
        for mispronunciation in mispronunciations:
            # write the whole sentence with word replaced by the mispronunciation
            sentences_misspronounced.append(sentence.replace(word, mispronunciation))
# write the sentences with mispronunciations to a file
with open("LM/intervention/Fr_Ref_mispronounced.txt", "w") as file:
    file.write("".join(sentences_misspronounced))

for i in range(1, 6):
    with open(f"LM/intervention/Fr_Hum_no_df{i}_vowels.txt", "w") as output_file:
        with open(f"LM/intervention/Fr_Hum_no_df{i}.txt", "r") as input_file:
            for line in input_file:
                output_file.write(line)
        with open("LM/intervention/Fr_Ref_mispronounced.txt", "r") as mispronounced_file:
            for line in mispronounced_file:
                output_file.write(line)

In [8]:
# number of words in the "LM/intervention/Fr_Hum_no_df1.txt"
with open("LM/intervention/Fr_Hum_no_df1.txt", "r") as file:
    words = file.read().split()
    print(len(words))
    # number of lines
    print(len(open("LM/intervention/Fr_Hum_no_df1.txt").readlines(  )))

with open("LM/intervention/Fr_Ref_mispronounced.txt", "r") as file:
    words = file.read().split()
    print(len(words))
    # number of lines
    print(len(open("LM/intervention/Fr_Ref_mispronounced.txt").readlines(  )))
    
with open("LM/intervention/Fr_Hum_no_df1_vowels.txt", "r") as file:
    words = file.read().split()
    print(len(words))
    # number of lines
    print(len(open("LM/intervention/Fr_Hum_no_df1_vowels.txt").readlines(  )))
    

20290
1667
88450
6885
108739
8551


## Create the LM with KenLM

In [9]:
# kenlm/build/bin/lmplz -o number_n_of_n_gram <"text_file_path" > "LM_file_path"
# kenlm/build/bin/lmplz -o 2 <"ASR_Thesis/It_Ref.txt" > "ASR_Thesis/2gram_It_Ref.arpa"

def create_LM(text_file_path, LM_file_path, n, discount_fallback=False):
    # remove the empty lines in the text file
    with open(text_file_path, "r") as file:
        lines = file.readlines()
    with open(text_file_path, "w") as file:
        for line in lines:
            if line.strip():
                file.write(line)
    command = f"../../kenlm/build/bin/lmplz -o {n} <{text_file_path}> {LM_file_path}"
    if discount_fallback:
        command += " --discount_fallback"
    os.system(command)

    with open(LM_file_path, "r") as file:
        lines = file.readlines()

    has_added_eos = False
    for i, line in enumerate(lines):
        if not has_added_eos and "ngram 1=" in line:
            # Increment the count of unigrams
            count = line.strip().split("=")[-1]
            lines[i] = line.replace(f"{count}", f"{int(count) + 1}")
        elif not has_added_eos and "<s>" in line:
            # Write the line and add </s> token
            lines.insert(i + 1, line.replace("<s>", "</s>"))
            has_added_eos = True

    with open(LM_file_path, "w") as file:
        file.writelines(lines)


# create_LM for all the files in the LM/intervention directory
for file in os.listdir("LM/intervention"):
    if file.endswith(".txt"):
        text_file_path = os.path.join("LM/intervention", file)
        LM_file_path = text_file_path.replace(".txt", ".arpa")
        create_LM(text_file_path, LM_file_path, 2, discount_fallback=False)


=== 1/5 Counting and sorting n-grams ===
Reading /home/hnp_vr/ASR_Dana/Master-Thesis-Pipeline/LM/intervention/Fr_Hum_no_df4.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 20278 types 2507
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:30084 2:26788528128
Statistics:
1 2507 D1=0.805085 D2=1.12841 D3+=0.633245
2 5044 D1=0.89506 D2=1.01403 D3+=1.59076
Memory estimate for binary LM:
type     kB
probing 152 assuming -p 1.5
probing 162 assuming -r models -p 1.5
trie     85 without quantization
trie     72 assuming -q 8 -b 8 quantization 
trie     85 assuming -a 22 array pointer compression
trie     72 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:30084 2:80704
=== 4/5 Calculating and writing order-interpolat

In [10]:
# create_LM for ref file with discount_fallback
text_file_path = os.path.join("LM/intervention/Fr_Ref.txt")
LM_file_path = text_file_path.replace(".txt", ".arpa")
create_LM(text_file_path, LM_file_path, 3, discount_fallback=True)

=== 1/5 Counting and sorting n-grams ===
Reading /home/hnp_vr/ASR_Dana/Master-Thesis-Pipeline/LM/intervention/Fr_Ref.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 11160 types 219
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:2628 2:9317757952 3:17470797824
Substituting fallback discounts for order 0: D1=0.5 D2=1 D3+=1.5
Substituting fallback discounts for order 1: D1=0.5 D2=1 D3+=1.5
Substituting fallback discounts for order 2: D1=0.5 D2=1 D3+=1.5
Statistics:
1 219 D1=0.5 D2=1 D3+=1.5
2 234 D1=0.5 D2=1 D3+=1.5
3 216 D1=0.5 D2=1 D3+=1.5
Memory estimate for binary LM:
type        B
probing 15208 assuming -p 1.5
probing 17492 assuming -r models -p 1.5
trie     8691 without quantization
trie     9766 assuming -q 8 -b 8 quantization 
trie     8674 assuming -a 22 array pointer compression
trie     9

## Add LM to a Wav2Vec2 Model

In [11]:
private_token = "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"

model_ids = ["jonatasgrosman/wav2vec2-xls-r-1b-french"] #"jonatasgrosman/wav2vec2-xls-r-1b-italian", 

local_dirs = [ "custom_w2v2/xls-r-1/FR_no_LM", "custom_w2v2/xls-r-1/Fr_Ref","custom_w2v2/xls-r-1/Fr_Hum_no_df1", "custom_w2v2/xls-r-1/Fr_Hum_no_df1_vowels",
             "custom_w2v2/xls-r-1/Fr_Hum_no_df2", "custom_w2v2/xls-r-1/Fr_Hum_no_df2_vowels", "custom_w2v2/xls-r-1/Fr_Hum_no_df3", "custom_w2v2/xls-r-1/Fr_Hum_no_df3_vowels",
                "custom_w2v2/xls-r-1/Fr_Hum_no_df4", "custom_w2v2/xls-r-1/Fr_Hum_no_df4_vowels", "custom_w2v2/xls-r-1/Fr_Hum_no_df5", "custom_w2v2/xls-r-1/Fr_Hum_no_df5_vowels"]

lm_model_paths = [None,"LM/intervention/Fr_Ref.arpa",  "LM/intervention/Fr_Hum_no_df1.arpa", "LM/intervention/Fr_Hum_no_df1_vowels.arpa",
                 "LM/intervention/Fr_Hum_no_df2.arpa", "LM/intervention/Fr_Hum_no_df2_vowels.arpa", "LM/intervention/Fr_Hum_no_df3.arpa", "LM/intervention/Fr_Hum_no_df3_vowels.arpa",
                 "LM/intervention/Fr_Hum_no_df4.arpa", "LM/intervention/Fr_Hum_no_df4_vowels.arpa", "LM/intervention/Fr_Hum_no_df5.arpa", "LM/intervention/Fr_Hum_no_df5_vowels.arpa"]

repo_ids = [ "Dandan0K/Intervention-xls-FR-no-LM", "Dandan0K/Intervention-xls-FR-Ref", "Dandan0K/Intervention-xls-FR-Hum-no-df1", "Dandan0K/Intervention-xls-FR-Hum-no-df1-vowels", 
           "Dandan0K/Intervention-xls-FR-Hum-no-df2", "Dandan0K/Intervention-xls-FR-Hum-no-df2-vowels", "Dandan0K/Intervention-xls-FR-Hum-no-df3", "Dandan0K/Intervention-xls-FR-Hum-no-df3-vowels",
           "Dandan0K/Intervention-xls-FR-Hum-no-df4", "Dandan0K/Intervention-xls-FR-Hum-no-df4-vowels", "Dandan0K/Intervention-xls-FR-Hum-no-df5", "Dandan0K/Intervention-xls-FR-Hum-no-df5-vowels"]

local_dirs = [ "custom_w2v2/xls-r-1/FR_no_LM", "custom_w2v2/xls-r-1/Fr_Ref","custom_w2v2/xls-r-1/Fr_Hum_no_df1", "custom_w2v2/xls-r-1/Fr_Hum_no_df1_vowels"]

lm_model_paths = [None,"LM/intervention/Fr_Ref.arpa",  "LM/intervention/Fr_Hum_no_df1.arpa", "LM/intervention/Fr_Hum_no_df1_vowels.arpa"]

repo_ids = [ "Dandan0K/Intervention-xls-FR-Hum-no-df1", "Dandan0K/Intervention-xls-FR-Hum-no-df1-vowels", "Dandan0K/Intervention-xls-FR-Hum-no-df2", "Dandan0K/Intervention-xls-FR-Hum-no-df2-vowels"]


for model_id in model_ids:
    print(f"-------------Processing model: {model_id}")
    for local_dir, lm_model_path, repo_id in zip(local_dirs, lm_model_paths, repo_ids):
        print(f"-------------------Processing local_dir: {local_dir}, lm_model_path: {lm_model_path}, repo_id: {repo_id}")
    
        api = HfApi()
    
        api.delete_repo(repo_id= repo_id, token = private_token)
        api.create_repo(repo_id= repo_id, private=False, token = private_token)

        processor = AutoProcessor.from_pretrained(model_id)

        vocab_dict = processor.tokenizer.get_vocab()
        sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

        decoder = build_ctcdecoder(
            labels=list(sorted_vocab_dict.keys()),
            kenlm_model_path = lm_model_path,
        )

        processor_with_lm = Wav2Vec2ProcessorWithLM(
            feature_extractor=processor.feature_extractor,
            tokenizer=processor.tokenizer,
            decoder=decoder
        )

        #repo = Repository(local_dir=local_dir, clone_from=model_id)
        snapshot_download(repo_id=model_id, local_dir=local_dir, token = private_token, repo_type="model")

        # remove the language model folder if it exists
        if os.path.exists(f"{local_dir}/language_model"):
            print(f"Removing {local_dir}/language_model")
            os.system(f"rm -r {local_dir}/language_model")

        # Save the model with the new LM
        processor_with_lm.save_pretrained(local_dir)

        # replace the .arpa file in the language_model folder by the binary file
        os.system(f"../../kenlm/build/bin/build_binary {lm_model_path} {local_dir}/language_model/lm.binary")
        # looks like: kenlm/build/bin/build_binary /home/hnp_vr/ASR_Thesis/customWav2vec2/vox/Ref_french/language_model/2gram_Fr_Ref.arpa /home/hnp_vr/ASR_Thesis/customWav2vec2/vox/Ref_french/language_model/2gram_Fr_Ref.bin

        # remove the .arpa file from the language_model folder
        os.system(f"rm {local_dir}/language_model/*.arpa")



        api.upload_folder(
            folder_path= local_dir,
            repo_id= repo_id,
            repo_type="model",
            token = private_token
        )

        print(f"----------------------------------Model {model_id} with LM {lm_model_path} saved to repo {repo_id}")

-------------Processing model: jonatasgrosman/wav2vec2-xls-r-1b-french
-------------------Processing local_dir: custom_w2v2/xls-r-1/FR_no_LM, lm_model_path: None, repo_id: Dandan0K/Intervention-xls-FR-Hum-no-df1




Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?


Fetching 24 files:   0%|          | 0/24 [00:00<?, ?it/s]

full_eval.sh:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.57k [00:00<?, ?B/s]

alphabet.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

eval.py:   0%|          | 0.00/6.20k [00:00<?, ?B/s]

(…)voice_8_0_fr_test_predictions_greedy.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

(…)mon_voice_8_0_fr_test_targets_greedy.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

(…)2_dev_data_fr_validation_predictions.txt:   0%|          | 0.00/132k [00:00<?, ?B/s]

(…)ion_common_voice_8_0_fr_test_targets.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

attrs.json:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

(…)common_voice_8_0_fr_test_predictions.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

lm.binary:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

(…)ata_fr_validation_predictions_greedy.txt:   0%|          | 0.00/134k [00:00<?, ?B/s]

unigrams.txt:   0%|          | 0.00/9.81M [00:00<?, ?B/s]

(…)ty-v2_dev_data_fr_validation_targets.txt:   0%|          | 0.00/133k [00:00<?, ?B/s]

(…)ommon_voice_8_0_fr_test_eval_results.txt:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

(…)oice_8_0_fr_test_eval_results_greedy.txt:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

(…)ta_fr_validation_eval_results_greedy.txt:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

(…)_dev_data_fr_validation_eval_results.txt:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Removing custom_w2v2/xls-r-1/FR_no_LM/language_model


/home/hnp_vr/kenlm/util/file.cc:76 in int util::OpenReadOrThrow(const char*) threw ErrnoException because `-1 == (ret = open(name, 00))'.
No such file or directory while opening None
ERROR
rm: cannot remove 'custom_w2v2/xls-r-1/FR_no_LM/language_model/*.arpa': No such file or directory


pytorch_model.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

----------------------------------Model jonatasgrosman/wav2vec2-xls-r-1b-french with LM None saved to repo Dandan0K/Intervention-xls-FR-Hum-no-df1
-------------------Processing local_dir: custom_w2v2/xls-r-1/Fr_Ref, lm_model_path: LM/intervention/Fr_Ref.arpa, repo_id: Dandan0K/Intervention-xls-FR-Hum-no-df1-vowels


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Loading the LM will be faster if you build a binary file.
Reading /home/hnp_vr/ASR_Dana/Master-Thesis-Pipeline/LM/intervention/Fr_Ref.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Only 219 unigrams passed as vocabulary. Is this small or artificial data?


Fetching 24 files:   0%|          | 0/24 [00:00<?, ?it/s]

attrs.json:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

alphabet.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

full_eval.sh:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

eval.py:   0%|          | 0.00/6.20k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

lm.binary:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

(…)common_voice_8_0_fr_test_predictions.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.57k [00:00<?, ?B/s]

(…)ion_common_voice_8_0_fr_test_targets.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

(…)voice_8_0_fr_test_predictions_greedy.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

(…)mon_voice_8_0_fr_test_targets_greedy.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

(…)2_dev_data_fr_validation_predictions.txt:   0%|          | 0.00/132k [00:00<?, ?B/s]

unigrams.txt:   0%|          | 0.00/9.81M [00:00<?, ?B/s]

(…)ata_fr_validation_predictions_greedy.txt:   0%|          | 0.00/134k [00:00<?, ?B/s]

(…)ty-v2_dev_data_fr_validation_targets.txt:   0%|          | 0.00/133k [00:00<?, ?B/s]

(…)ommon_voice_8_0_fr_test_eval_results.txt:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

(…)oice_8_0_fr_test_eval_results_greedy.txt:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

(…)_dev_data_fr_validation_eval_results.txt:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

(…)ta_fr_validation_eval_results_greedy.txt:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Removing custom_w2v2/xls-r-1/Fr_Ref/language_model


Reading LM/intervention/Fr_Ref.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


pytorch_model.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

KeyboardInterrupt: 