# Run inference on the fine-tuned model 

Test the ability of your fine-tuned model to make predictions on the test set


In [11]:
%%capture 
!pip install datasets==2.4.0
!pip install transformers==4.18 
!pip install huggingface_hub==0.5.1 
!pip install torchaudio==0.11  
!pip install librosa 
!pip install jiwer   
!git config --global credential.helper store 
!apt install git-lfs

In [None]:
!huggingface-cli login  # login to huggingface to get the auth_token

In [12]:
%%capture  
import pandas as pd
import re
import torch
import json
from IPython.display import display, HTML
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2CTCTokenizer
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import AutoModelForCTC, Wav2Vec2Processor
from datasets.utils.version import Version
from datasets import load_dataset, load_metric, Audio
import os
import numpy as np
import sys
import warnings
import argparse
from torch import Tensor

In [15]:
# --- Experiment settings (edit these to evaluate another model) ---
lang = "it"          # language code
corpus = 6.1         # number of the corpus (Common Voice release) for the language
test_pct = 20        # percentage of the test set to evaluate on

model_dir = "wav2vec2-large-xls-r-300m-it-100"   # directory of the fine-tuned model
n_checkpoint = 23700                             # last checkpoint inside model_dir
tokenizer_name = "tokenizer_it"                  # directory of the tokenizer for the language

## Load :
- the fine-tuned model
- the tokenizer 
- the processor

In [None]:
"""import the model, processor, tokenizer"""
# The model and the processor are both read from the same checkpoint folder.
checkpoint_dir = f"./{model_dir}/checkpoint-{n_checkpoint}/"

print("loading saved model")
saved_model = AutoModelForCTC.from_pretrained(checkpoint_dir, local_files_only=True)
saved_model.to("cuda")  # run the model on the GPU

print("loading tokenizer")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(f"./{tokenizer_name}/", local_files_only=True)

print("loading processor")
processor = Wav2Vec2Processor.from_pretrained(checkpoint_dir, local_files_only=True)

## Load the test set


In [None]:
# Metadata columns dropped before preprocessing (same list for every release).
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]

if corpus == 6.1:
    # Release 6.1 is loaded through the "common_voice" dataset id.
    data_test = load_dataset("common_voice", lang, split=f"test[:{test_pct}%]")
else:
    # Other releases come from the mozilla-foundation datasets and need an auth token.
    # The id must be exactly "<namespace>/<name>": a trailing "/" keeps it from
    # being resolved as a Hub dataset.
    data_test = load_dataset(
        f"mozilla-foundation/common_voice_{corpus}_0",
        lang,
        split=f"test[:{test_pct}%]",
        use_auth_token=True,
    )

data_test = data_test.remove_columns(unused_columns)

## Data pre-processing 
Pre-process the data (transcriptions and speech signals) in the same way as was done before fine-tuning the model.

For the **transcriptions**: 
- remove punctuation 
- lowercase
- substitute characters (if needed)

In [None]:
print("preprocess data") 
# Character class listing every symbol that is deleted from the transcriptions.
# Arabic gets an extended class (more punctuation and bracket variants, plus a
# few Arabic marks at the end); every other language uses the shorter class.
# NOTE(review): these are not raw string literals, so escapes such as "\," only
# work because Python leaves unknown escape sequences unchanged.
# NOTE(review): the Arabic class appears to contain a literal space near its end,
# which would also delete the spaces between words — confirm this is intended.
if lang == "ar":
    chars_to_remove_regex = '[\—\,\?\.\!\-\;\:\"\“\%\�\°\(\)\–\…\¿\¡\,\""\‘\”\჻\~\՞\؟\،\,\॥\«\»\„\,\“\”\「\」\‘\’\《\》\[\]\{\}\=\`\_\+\<\>\‹\›\©\®\→\。\、\﹂\﹁\～\﹏\，\【\】\‥\〽\『\』\〝\⟨\⟩\〜\♪\؛\/\\\−\^\'\ʻ\ˆ\´\ʾ\‧\〟\'ً \'ٌ\'ُ\'ِ\'ّ\'ْ]'
else:
    chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\°\(\)\–\…\\\[\]\«\»\\\/\^\<\>\~\_\-\¿\¡\—]'


In [None]:
def remove_special_characters(batch):
    """Strip unwanted symbols from a transcription and lowercase it.

    Every character matched by the module-level ``chars_to_remove_regex`` is
    deleted from ``batch["sentence"]``; the result is lowercased and written
    back to the same key. The (mutated) batch is returned.
    """
    stripped = re.sub(chars_to_remove_regex, '', batch["sentence"])
    batch["sentence"] = stripped.lower()
    return batch
# Clean the "sentence" field of every example in the test set.
data_test = data_test.map(remove_special_characters)

def replace_hatted_characters(batch):
    """Replace the typographic apostrophe (’) with a plain one (') in the sentence.

    ``batch["sentence"]`` is rewritten in place and the batch is returned.
    """
    sentence = batch["sentence"]
    batch["sentence"] = sentence.replace("’", "'")
    return batch

# Normalise the apostrophes in every transcription.
data_test= data_test.map(replace_hatted_characters)
# Cast the audio column to an Audio feature with a 16 kHz sampling rate.
data_test = data_test.cast_column("audio", Audio(sampling_rate=16_000))

## Prepare the data for the model
- transform input data into batches 
- filter the data by setting a length threshold (if you do not want to filter, comment out that line)

In [None]:
"""Prepare Dataset"""
print("prepare dataset")


def prepare_dataset(batch):
    """Convert one example into model inputs and label ids.

    Reads ``batch["audio"]`` (its "array" and "sampling_rate") and
    ``batch["sentence"]``, and adds three keys to the batch:

    - "input_values": first item of the processor output for the waveform
    - "input_length": length of "input_values"
    - "labels": ids produced by the processor in target mode for the sentence
    """
    sample = batch["audio"]
    features = processor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_values"] = features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Inside this context the processor encodes text instead of audio.
    with processor.as_target_processor():
        encoded = processor(batch["sentence"])
        batch["labels"] = encoded.input_ids

    return batch

# Build the model inputs; the original columns are dropped so that only the
# keys added by prepare_dataset remain.
common_voice_test = data_test.map(
    prepare_dataset,
    remove_columns=data_test.column_names,
    keep_in_memory=True,
)
# Keep only the utterances with fewer than 5.0 * 16000 samples (5 s at 16 kHz).
common_voice_test = common_voice_test.filter(
    lambda n_samples: n_samples < 5.0 * 16000,
    input_columns=["input_length"],
)

If you filtered the input data by length, remember to also filter the original dataset, whose transcriptions will be used as ground truth.

In [None]:
print("loading reference transcriptions")
# Apply to the references the same length filter that was applied to
# common_voice_test (fewer than 5.0 * 16000 samples), so that transcription[i]
# is the ground truth of the i-th model input.
transcription = [el for el in data_test if len(el["audio"]["array"]) < 5.0 * 16000]

## Evaluation 
- input the batches into the model 

In [None]:
print('evaluation')
predictions = []
# Inference only: without no_grad() every forward pass keeps an autograd graph
# alive, which wastes GPU memory for no benefit.
with torch.no_grad():
    for el in common_voice_test["input_values"]:
        input_dict = processor(el, return_tensors="pt", padding=True)
        # Send the inputs to whichever device the model is on.
        logits = saved_model(input_dict.input_values.to(saved_model.device)).logits
        # Greedy decoding: pick the most likely token id for every frame,
        # then let the processor turn the ids into text.
        pred_ids = torch.argmax(logits[0], dim=-1)
        predictions.append(processor.decode(pred_ids))

<a name="s2"></a> 
## Calculate CER and WER
- load the metrics CER and WER


In [None]:
# Word error rate and character error rate metrics, loaded by name.
# Note that `wer` is the metric object, not the wer_DSI helper defined further down.
wer = load_metric("wer")
cer = load_metric("cer")


- pass the predictions to the metrics: 
  - **CER**: compares the **characters of the predicted sentences** with the **characters of the reference sentences**. 
  - **WER**: compares the **words of the predicted sentences** with the **words of the reference sentences**. 
- the predicted sentences and the original transcriptions are combined into a dataframe and saved to a CSV file (if the data is too big, comment out this part) 

In [None]:
list_sent = []
list_ref = []

# predictions[i] must belong to the same utterance as transcription[i].
assert len(predictions) == len(transcription), "predictions and references are not aligned"

for i, sentence_ in enumerate(predictions):
    reference_ = transcription[i]["sentence"]
    print(i, "Sentence: ", sentence_)
    print("Reference: ", reference_)
    list_sent.append(sentence_)
    list_ref.append(reference_)
    # wer_DSI(reference_, sentence_, debug=True)  ## uncomment to print Deletions, Substitutions, Insertions (wer_DSI must be defined first)

# Both metrics take one string per utterance: a list of predicted sentences and
# a list of reference sentences of the same length. Wrapping the lists in
# another list makes the metric score the text representation of the whole
# list, and joining everything into one string makes the edit distance
# quadratic in the size of the corpus.
result_cer = cer.compute(predictions=list_sent, references=list_ref)
print("CER", result_cer)

result_wer = wer.compute(predictions=list_sent, references=list_ref)
print("WER: ", result_wer)

# Save predictions and references side by side for later inspection.
d = {"predictions": list_sent, "references": list_ref}
df = pd.DataFrame(d)
df.to_csv(f"./INFERENCE_{lang}-{model_dir}.csv")

#### Function to print Deletions, Substitutions, and Insertions 

To use this function from [Pyzone](https://pyzone.dev/word-error-rate-in-python/), you can uncomment the line in the code above (run the cell below first, so that the function is defined), or you can use it on the sentences saved in the CSV file.

In [None]:
def wer_DSI(ref, hyp, debug=False):
    """Word error rate between a reference and a hypothesis, with an error breakdown.

    Both sentences are split on whitespace and aligned with a Levenshtein
    dynamic programme in which substitutions, insertions and deletions all
    cost 1. The alignment is then traced back to count each operation.

    Args:
        ref: reference (ground-truth) sentence.
        hyp: hypothesis (predicted) sentence.
        debug: when True, print the word-by-word alignment and the counts.

    Returns:
        dict with the keys 'WER' (errors divided by the number of reference
        words, rounded to 3 decimals), 'Cor', 'Sub', 'Ins' and 'Del'.
        When the reference contains no words, 'WER' is 0.0 if the hypothesis
        is empty too and float('inf') otherwise (no ZeroDivisionError).
    """
    r = ref.split()
    h = hyp.split()
    n_ref, n_hyp = len(r), len(h)

    OP_OK, OP_SUB, OP_INS, OP_DEL = 0, 1, 2, 3
    DEL_PENALTY = INS_PENALTY = SUB_PENALTY = 1

    # costs[i][j]: cheapest way to turn the first i reference words into the
    # first j hypothesis words; backtrace[i][j]: the last operation used there.
    costs = [[0] * (n_hyp + 1) for _ in range(n_ref + 1)]
    backtrace = [[OP_OK] * (n_hyp + 1) for _ in range(n_ref + 1)]

    # First column: reach an empty hypothesis by deleting every reference word.
    for i in range(1, n_ref + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: reach the hypothesis from an empty reference by inserting.
    for j in range(1, n_hyp + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if r[i - 1] == h[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue

            substitutionCost = costs[i - 1][j - 1] + SUB_PENALTY
            insertionCost = costs[i][j - 1] + INS_PENALTY
            deletionCost = costs[i - 1][j] + DEL_PENALTY

            best = min(substitutionCost, insertionCost, deletionCost)
            costs[i][j] = best
            # On ties, substitution wins over insertion, which wins over deletion.
            if best == substitutionCost:
                backtrace[i][j] = OP_SUB
            elif best == insertionCost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk back from the bottom-right corner along the chosen operations.
    i, j = n_ref, n_hyp
    numSub = numDel = numIns = numCor = 0

    if debug:
        print("OP\tREF\tHYP")
        lines = []
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            numCor += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("OK\t" + r[i] + "\t" + h[j])
        elif op == OP_SUB:
            numSub += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("SUB\t" + r[i] + "\t" + h[j])
        elif op == OP_INS:
            numIns += 1
            j -= 1
            if debug:
                lines.append("INS\t" + "****" + "\t" + h[j])
        elif op == OP_DEL:
            numDel += 1
            i -= 1
            if debug:
                lines.append("DEL\t" + r[i] + "\t" + "****")
    if debug:
        # The alignment was collected back to front.
        for line in reversed(lines):
            print(line)
        print("Ncor " + str(numCor))
        print("Nsub " + str(numSub))
        print("Ndel " + str(numDel))
        print("Nins " + str(numIns))

    errors = numSub + numDel + numIns
    if n_ref:
        wer_result = round(errors / float(n_ref), 3)
    elif errors == 0:
        wer_result = 0.0
    else:
        # Only insertions against an empty reference: the rate is unbounded.
        wer_result = float("inf")
    return {'WER': wer_result, 'Cor': numCor, 'Sub': numSub, 'Ins': numIns, 'Del': numDel}

In [None]:
## load the sentences from the saved csv file

your_path = f"./INFERENCE_{lang}-{model_dir}.csv"
# keep_default_na=False keeps an empty prediction as "" instead of NaN (a float,
# which has no .split()) and stops sentences such as "NA" being read as missing.
inference_file = pd.read_csv(your_path, sep=",", keep_default_na=False)

# wer_DSI is the helper that accepts `debug`; the name `wer` is bound to the
# metric object returned by load_metric("wer") and cannot be called like this.
for reference_, prediction_ in zip(inference_file["references"], inference_file["predictions"]):
    x = wer_DSI(reference_, prediction_, debug=True)

# Run inference with a Language Model: 

Without a language model the output of the fine-tuned model is just the concatenation of predicted characters, and non-existing words could be predicted. To improve the predictions and rescore the error rates, a 5-gram language model can be applied to decode the output of the fine-tuned model. 

Following this [tutorial by Von Platen (2022)](https://huggingface.co/blog/wav2vec2-with-ngram) you can train a 5-gram model on texts of your target language. 

To apply the n-gram language model, follow these steps **after the data have been processed**.



### Create a vocabulary from the tokenizer


In [None]:
print("create the vocab from tokenizer")
vocab_dict = processor.tokenizer.get_vocab()

# Lower-cased tokens, inserted in ascending order of their ids.
sorted_vocab_dict = {}
for token, token_id in sorted(vocab_dict.items(), key=lambda pair: pair[1]):
    sorted_vocab_dict[token.lower()] = token_id

print(vocab_dict)
print(sorted_vocab_dict)

### Create a decoder for the language model 

In [None]:
print("create a decoder for the language model")
from pyctcdecode import build_ctcdecoder

# The decoder labels are the vocabulary tokens in the order of sorted_vocab_dict.
lm_labels = list(sorted_vocab_dict.keys())
decoder = build_ctcdecoder(labels=lm_labels, kenlm_model_path="./5gram_correct.arpa")

### Create the "new" processor 

In [None]:
print("create the processor with the language model")
from transformers import Wav2Vec2ProcessorWithLM
# Reuse the feature extractor and the tokenizer of the processor loaded from the
# checkpoint, and attach the decoder built on the n-gram model.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

### Run the Evaluation 
This time we do not apply argmax to the logits,
and the processor is the **processor_with_lm** created above.


In [None]:
print("""evaluation""")

predictions = []
# Inference only: no_grad() avoids keeping an autograd graph for every utterance.
with torch.no_grad():
    for el in common_voice_test["input_values"]:
        input_dict = processor_with_lm(el, return_tensors="pt", padding=True)
        logits = saved_model(input_dict.input_values.to(saved_model.device)).logits
        # The language-model decoder works on the full logits (no argmax).
        # The decoded text gets its own name: `transcription` holds the
        # reference sentences that the CER/WER cell reads, so it must not be
        # overwritten here.
        decoded_text = processor_with_lm.batch_decode(logits.cpu().numpy()).text
        predictions.append(decoded_text[0])


Afterwards, compute the CER and WER as done [above](#s2).
