In [None]:
### CODE CELL FOR RUNNING MMS (1B-ALL OR 1B-FL102)

from transformers import pipeline, Wav2Vec2ForCTC, AutoProcessor
import json
import os
import time
import torch
import datetime
import gc


# NOTE: the timer starts before the model is loaded, so the reported total
# includes model/adapter loading as well as transcription.
start = time.time()
# Options used: "facebook/mms-1b-all" (pretrained on 1162 languages)
# or "facebook/mms-1b-fl102" (pretrained on 102 languages)
model_id = "facebook/mms-1b-all"

# Loading the processor (tokenizer + feature extractor) and the base model
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

# Selecting the target language "nld" for both the tokenizer and the model adapter
processor.tokenizer.set_target_lang("nld")
model.load_adapter("nld")

transcriber = pipeline("automatic-speech-recognition", model=model, device=0, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor,
                        chunk_length_s=10, stride_length_s=(4, 2)) # as recommended here: https://huggingface.co/blog/asr-chunking

recordings_path = '/evaluation/vl/jasmin/comp_p/5_sil/'
out_path = '/evaluation/mms-1b-all/jasmin_vl/p5/'

# Make sure the output folder exists; otherwise the first open() below fails
# only after a full recording has already been transcribed.
os.makedirs(out_path, exist_ok=True)

# List the input folder once (instead of once per iteration), keep regular files
# only, and sort so the processing order is reproducible between runs.
recordings = sorted(
    name for name in os.listdir(recordings_path)
    if os.path.isfile(os.path.join(recordings_path, name))
)
total = len(recordings)

# Keeping track of progress (how many files processed/total); stays 0 if the folder is empty
i = 0
for i, recording in enumerate(recordings, start=1):
    words = transcriber(os.path.join(recordings_path, recording), return_timestamps="word")

    # Converting to JSON format of Whisper
    words_to_add = [
        {
            "text": word["text"],
            "start": word["timestamp"][0],
            "end": word["timestamp"][1],
            "confidence": 1  # since no confidence/prob is provided, a default of 1 is used
        }
        for word in words["chunks"]
    ]

    # Only a list of words is outputted by MMS which is why we add "placeholder" to text
    # (and it is not relevant anyway as we will convert this output to CTM format
    # which only looks at words and their timestamps)
    result = {"segments": [{
        "text": "placeholder",
        "words": words_to_add
    }]}

    # splitext handles extensions of any length (.wav, .flac, ...), unlike slicing
    # a fixed number of characters off the file name.
    stem = os.path.splitext(recording)[0]
    with open(os.path.join(out_path, stem + '.json'), 'w', encoding='utf-8') as f:
        json.dump(result, f, indent = 2, ensure_ascii = False)

    print(f"{i}/{total}")
    print(stem + ' has been transcribed')
    # Peak GPU memory is only printed per file; just the total time is logged into a file
    print(f"gpu used {torch.cuda.max_memory_allocated(device='cuda:0') / float(1e9)} GB memory")
    torch.cuda.reset_peak_memory_stats(device='cuda:0')


end = time.time()
time_s = end - start
print('Time spent evaluating: ' + str(datetime.timedelta(seconds=time_s)))
with open(os.path.join(out_path, 'a_time_spent.txt'), 'w') as f:
    f.write(str(datetime.timedelta(seconds=time_s)))

# free GPU memory: the pipeline and the `model` variable both reference the same
# network, so both names must be dropped before anything can be released. Run the
# garbage collector first, then hand the now-unused cached blocks back to the driver.
del transcriber, model
gc.collect()
torch.cuda.empty_cache()

In [None]:
### CODE CELL FOR RUNNING XLS-R
# Only difference is in loading the model

from transformers import pipeline
import json
import os
import time
import torch
import datetime
import gc


# NOTE: the timer starts before the model is loaded, so the reported total
# includes model loading as well as transcription.
start = time.time()
transcriber = pipeline("automatic-speech-recognition", model='jonatasgrosman/wav2vec2-xls-r-1b-dutch', device=0,
                        chunk_length_s=10, stride_length_s=(4, 2)) # as recommended here: https://huggingface.co/blog/asr-chunking
print(f"loading XLS-R mem usage: {torch.cuda.max_memory_allocated(device='cuda:0') / float(1e9)} GB memory")
# Reset the peak counter so the per-file numbers below do not include model loading
torch.cuda.reset_peak_memory_stats(device='cuda:0')

recordings_path = '/evaluation/vl/jasmin/comp_p/5_sil/'
out_path = '/evaluation/xlsr/jasmin_vl/p5/'

# Make sure the output folder exists; otherwise the first open() below fails
# only after a full recording has already been transcribed.
os.makedirs(out_path, exist_ok=True)

# List the input folder once (instead of once per iteration), keep regular files
# only, and sort so the processing order is reproducible between runs.
recordings = sorted(
    name for name in os.listdir(recordings_path)
    if os.path.isfile(os.path.join(recordings_path, name))
)
total = len(recordings)

# Keeping track of progress (how many files processed/total); stays 0 if the folder is empty
i = 0
for i, recording in enumerate(recordings, start=1):
    words = transcriber(os.path.join(recordings_path, recording), return_timestamps="word")

    # Converting to the Whisper-style JSON layout: one segment holding the word list.
    # No confidence is taken from the pipeline output, so a default of 1 is stored.
    words_to_add = [
        {
            "text": word["text"],
            "start": word["timestamp"][0],
            "end": word["timestamp"][1],
            "confidence": 1
        }
        for word in words["chunks"]
    ]
    result = {"segments":[{
        "text": "placeholder",
        "words": words_to_add
    }]}

    # splitext handles extensions of any length (.wav, .flac, ...), unlike slicing
    # a fixed number of characters off the file name.
    stem = os.path.splitext(recording)[0]
    with open(os.path.join(out_path, stem + '.json'), 'w', encoding='utf-8') as f:
        json.dump(result, f, indent = 2, ensure_ascii = False)

    print(f"{i}/{total}")
    print(stem + ' has been transcribed')
    print(f"gpu used {torch.cuda.max_memory_allocated(device='cuda:0') / float(1e9)} GB memory")
    torch.cuda.reset_peak_memory_stats(device='cuda:0')


end = time.time()
time_s = end - start
print('Time spent evaluating: ' + str(datetime.timedelta(seconds=time_s)))
with open(os.path.join(out_path, 'a_time_spent.txt'), 'w') as f:
    f.write(str(datetime.timedelta(seconds=time_s)))

# free GPU memory: drop the pipeline, run the garbage collector so the model is
# actually destroyed, then hand the now-unused cached blocks back to the driver.
del transcriber
gc.collect()
torch.cuda.empty_cache()