In [None]:
## MAKE SURE TO INSTALL THE DEPENDENCIES BELOW (RECOMMENDED TO DO IN A VIRTUAL ENV)
from faster_whisper import WhisperModel
import json
import os
import time
import torch
import datetime
import gc


# Batch-transcribes every audio file in AUDIO_DIR with faster-whisper and writes
# one JSON file per audio file (same base name) into OUTPUT_DIR, followed by a
# small text file with the total wall-clock time of the run.

# !!! Change these two paths: the dir where the audio files are stored and the
# dir where you want to store the results !!!
AUDIO_DIR = '/home/jovyan/evaluation/jasmin_comp_p/5_sil/'
OUTPUT_DIR = '/home/jovyan/evaluation/faster-whisper/group5_p/vad2/'
# Device whose peak-memory counter is printed (and reset) after every file
GPU_DEVICE = 'cuda:0'

# Create the output dir up front, so a missing dir is not discovered only after
# the first file has already been transcribed
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Starting the log of time it takes to evaluate (this includes loading the model)
start = time.time()
# Loading the faster-whisper model
#
# To use the CPU instead of the GPU:
# - change "cuda" to "cpu"
# - remove `device_index=0`
# - change `compute_type` from "float16" to "float32" or "int8"
#
# You can also use fine-tuned models, but for that check the doc of "faster-whisper"
model = WhisperModel('large-v2', device="cuda", device_index=0, compute_type="float16")

# Sorted so that the processing order is the same on every run
for file in sorted(os.listdir(AUDIO_DIR)):
    audio_path = os.path.join(AUDIO_DIR, file)
    # `os.listdir` also returns sub-directories (e.g. `.ipynb_checkpoints`);
    # those cannot be transcribed, so skip them
    if not os.path.isfile(audio_path):
        continue

    # the beam_size, best_of, and temperature settings correspond to the ones used by the original
    # implementation of Whisper by OpenAI
    segments, info = model.transcribe(
        audio_path,
        vad_filter=True,
        beam_size=5,
        best_of=5,
        temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
        language="nl",
        word_timestamps=True,
    )

    # Reformatting the output to match the one seen in "whisper-timestamped" implementation
    segments_to_add = []
    for segment in segments:
        words_to_add = [
            {
                # There was an issue with spaces being inserted at the beginning of words
                # To mitigate, `.lstrip()` is used
                "text": word.word.lstrip(),
                "start": word.start,
                "end": word.end,
                "confidence": word.probability,
            }
            # `or []` keeps the loop safe should a segment carry no word list
            for word in (segment.words or [])
        ]
        segments_to_add.append({
            "id": segment.id,
            "seek": segment.seek,
            "start": segment.start,
            "end": segment.end,
            "text": segment.text,
            "tokens": segment.tokens,
            "temperature": segment.temperature,
            "avg_logprob": segment.avg_logprob,
            "compression_ratio": segment.compression_ratio,
            "no_speech_prob": segment.no_speech_prob,
            "words": words_to_add,
        })
    result = {"segments": segments_to_add}

    # Dump the results in a JSON file for postprocessing.
    # `splitext` handles extensions of any length (".wav", ".flac", ...),
    # where slicing off a fixed number of characters would not.
    stem = os.path.splitext(file)[0]
    with open(os.path.join(OUTPUT_DIR, stem + '.json'), 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(stem + ' has been transcribed')
    # NOTE(review): this counter only covers memory allocated through PyTorch;
    # presumably faster-whisper allocates its GPU memory outside of PyTorch, in
    # which case the value printed here under-reports the real usage -- confirm.
    print(f"gpu used {torch.cuda.max_memory_allocated(device=GPU_DEVICE) / float(1e9)} GB memory")
    # Reset the peak so the next file reports its own maximum
    torch.cuda.reset_peak_memory_stats(device=GPU_DEVICE)

# End logging as evaluation finished
end = time.time()
time_s = end - start
time_spent = str(datetime.timedelta(seconds=time_s))
# Print out the time, as well as save in a txt file in case the cell output bugs out
# (that happened to me so better be safe than sorry)
print('Time spent evaluating: ' + time_spent)

with open(os.path.join(OUTPUT_DIR, 'a_time_spent.txt'), 'w', encoding='utf-8') as f:
    f.write(time_spent)

# Freeing up memory (restarting the kernel is more effective and consistent)
del model
gc.collect()