In [3]:
# Audio to text

import os
import os
from tqdm import tqdm
import whisper
import subprocess
import string

#os.environ["PATH"] += os.pathsep + r"C:\ffmpeg\bin"

def check_ffmpeg():
    try:
        cmd = ["ffmpeg", "-version"]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            print("ffmpeg is accessible")
        else:
            print("ffmpeg is not accessible")
    except FileNotFoundError:
        print("ffmpeg not found")

check_ffmpeg()

model = whisper.load_model("base")

def transcribe_audio(file_path):
    audio = whisper.load_audio(file_path)
    audio = whisper.pad_or_trim(audio)
    
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    
    _, probs = model.detect_language(mel)
    
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    transcription = result.text.translate(str.maketrans('', '', string.punctuation)).lower()

    return transcription

flac_file_path = r'audioResults/Tacotron2'

if os.path.isfile(flac_file_path):
    transcription = transcribe_audio(flac_file_path)
    print(f"Final Transcription: {transcription}")
else:
    print("File not found. Please check the file path.")


wav_directory = r'audioResults/Tacotron2'

transcriptions = []
for filename in tqdm(os.listdir(wav_directory), desc="Processing files"):
    if filename.endswith(".flac"):
        file_path = os.path.join(wav_directory, filename)
        transcription = transcribe_audio(file_path)
        transcriptions.append(transcription)



ffmpeg is accessible
File not found. Please check the file path.


Processing files: 100%|████████████████████████████████████████████████████| 335/335 [03:01<00:00,  1.84it/s]


In [6]:
# Save to txt

output_file_path = r'audioResults/transcriptions_Tacotron2.txt'
with open(output_file_path, 'w') as f:
    for transcription in transcriptions:
        f.write(transcription + '\n')

In [13]:
# txt to List

output_file_path = r'audioResults/transcriptions_VitsModel.txt'
with open(output_file_path, 'r') as f:
    transcriptions_from_file = [line.strip() for line in f]
output_file_path = r'audioResults/transcriptions_Tacotron2.txt'
with open(output_file_path, 'r') as f:
    transcriptions_from_file2 = [line.strip() for line in f]   

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2381: invalid continuation byte

In [8]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="clean.json")

textInput = [] 
audioPathInput = [] 

for data in dataset['train']['training_data']:
    textInput.extend(data['label'])
    audioPathInput.extend(data['name'])

printInputArrays = False

if printInputArrays:
    for i in range(len(textInput)):
        print(textInput[i])

    for i in range(len(audioPathInput)):
        print(audioPathInput[i])

In [11]:
# Calculate the WER between two texts

# Objective Evaluation
# Word Error Rate (WER):

# Transcribe the generated audio back to text using an automatic speech recognition (ASR) system.
# Compare the transcribed text with the original text to calculate the WER. The model with the lower WER is better.

from jiwer import wer

if len(textInput)==len(transcriptions_from_file):
    original_texts = textInput
    transcribed_text_model1 = transcriptions_from_file
    transcribed_text_model2 = transcriptions_from_file2

    wer_list_model1 = [wer(original, transcribed) for original, transcribed in zip(original_texts, transcribed_text_model1)] if transcribed_text_model1 else []
    wer_list_model2 = [wer(original, transcribed) for original, transcribed in zip(original_texts, transcribed_text_model2)] if transcribed_text_model2 else []

    average_wer_model1 = sum(wer_list_model1) / len(wer_list_model1) if wer_list_model1 else float('inf')
    average_wer_model2 = sum(wer_list_model2) / len(wer_list_model2) if wer_list_model2 else float('inf')

    print(f"Average WER Model 1: {average_wer_model1}")
    print(f"Average WER Model 2: {average_wer_model2}")
else:
    print("Input arrays are not of the same length.")
    print(f"textInput: {len(textInput)} transcriptions_from_file: {len(transcriptions_from_file)}")


Average WER Model 1: 1.2331827687700017
Average WER Model 2: 1.2331827687700017


In [None]:
# Perceptual Evaluation of Speech Quality (PESQ):

# Use PESQ to measure the quality of the speech signals. PESQ compares the generated audio to a reference audio signal and gives a quality score.
# Libraries like pypesq can be used for this purpose.

In [13]:
from pydub import AudioSegment
from scipy.io import wavfile
import os

def convert_flac_to_wav(flac_file_path, wav_file_path):
    audio = AudioSegment.from_file(flac_file_path, format="flac")
    audio.export(wav_file_path, format="wav")

# File paths
ref_flac_path = r'audioResults/VitsModel/07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac'
#gen_flac_path_model1 = r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac'
#gen_flac_path_model2 = r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00001.flac'


# File paths
ref_wav_path = r'output.wav'
#gen_wav_path_model1 = r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.wav'
#gen_wav_path_model2 = r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00001.wav'


convert_flac_to_wav(ref_flac_path, ref_wav_path)
#convert_flac_to_wav(gen_flac_path_model1, gen_wav_path_model1)
#convert_flac_to_wav(gen_flac_path_model2, gen_wav_path_model2)

# Read WAV files
rate, ref_audio = wavfile.read(ref_wav_path)
#rate, gen_audio_model1 = wavfile.read(gen_wav_path_model1)
#rate, gen_audio_model2 = wavfile.read(gen_wav_path_model2)

# Compute PESQ scores
pesq_score_model1 = pesq(rate, ref_audio, gen_audio_model1, 'wb')
#pesq_score_model2 = pesq(rate, ref_audio, gen_audio_model2, 'wb')

print(f"PESQ Score Model 1: {pesq_score_model1}")
#print(f"PESQ Score Model 2: {pesq_score_model2}")

# Clean up temporary WAV files if needed
os.remove(ref_wav_path)
os.remove(gen_wav_path_model1)
os.remove(gen_wav_path_model2)


CouldntDecodeError: Decoding failed. ffmpeg returned error code: 183

Output from ffmpeg/avlib:

ffmpeg version n6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13.2.1 (GCC) 20230801
  configuration: --prefix=/usr --disable-debug --disable-static --disable-stripping --enable-amf --enable-avisynth --enable-cuda-llvm --enable-lto --enable-fontconfig --enable-frei0r --enable-gmp --enable-gnutls --enable-gpl --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libdav1d --enable-libdrm --enable-libfreetype --enable-libfribidi --enable-libgsm --enable-libharfbuzz --enable-libiec61883 --enable-libjack --enable-libjxl --enable-libmodplug --enable-libmp3lame --enable-libopencore_amrnb --enable-libopencore_amrwb --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libplacebo --enable-libpulse --enable-librav1e --enable-librsvg --enable-librubberband --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libv4l2 --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpl --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxcb --enable-libxml2 --enable-libxvid --enable-libzimg --enable-nvdec --enable-nvenc --enable-opencl --enable-opengl --enable-shared --enable-vapoursynth --enable-version3 --enable-vulkan
  libavutil      58. 29.100 / 58. 29.100
  libavcodec     60. 31.102 / 60. 31.102
  libavformat    60. 16.100 / 60. 16.100
  libavdevice    60.  3.100 / 60.  3.100
  libavfilter     9. 12.100 /  9. 12.100
  libswscale      7.  5.100 /  7.  5.100
  libswresample   4. 12.100 /  4. 12.100
  libpostproc    57.  3.100 / 57.  3.100
[flac @ 0x5ab7436c3600] Could not find codec parameters for stream 0 (Audio: flac, 0 channels): unspecified sample format
Consider increasing the value for the 'analyzeduration' (0) and 'probesize' (5000000) options
Input #0, flac, from 'audioResults/VitsModel/07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac':
  Duration: N/A, bitrate: N/A
  Stream #0:0: Audio: flac, 0 channels
Stream mapping:
  Stream #0:0 -> #0:0 (flac (native) -> pcm_s32le (native))
Press [q] to stop, [?] for help
Cannot determine format of input stream 0:0 after EOF
Error marking filters as finished
Error while filtering: Invalid data found when processing input
[out#0/wav @ 0x5ab7436e7240] Nothing was written into output file, because at least one of its streams received no packets.
size=       0kB time=N/A bitrate=N/A speed=N/A    
Conversion failed!


In [7]:
from pypesq import pesq
from scipy.io import wavfile

rate, ref_audio = wavfile.read(r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\data\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac')
rate, gen_audio_model1 = wavfile.read(r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.flac')
rate, gen_audio_model2 = wavfile.read(r'C:\CodeProjects\University\3.2_VU\deep learning\gmmGroup\audioResults\VitsModel\07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00001.flac')

pesq_score_model1 = pesq(rate, ref_audio, gen_audio_model1, 'wb')
pesq_score_model2 = pesq(rate, ref_audio, gen_audio_model2, 'wb')

print(f"PESQ Score Model 1: {pesq_score_model1}")
print(f"PESQ Score Model 2: {pesq_score_model2}")


ModuleNotFoundError: No module named 'pypesq'