## Voice Cloning System (VC) with TTS

### Data downloading and preprocessing

In [8]:
import sys
import os
import glob
import librosa
import numpy as np
from TTS.api import TTS
import soundfile as sf
import jiwer
from pydub import AudioSegment
import speech_recognition as sr
import whisper

In [9]:
# Step 1: Data Preparation
def download_and_prepare_timit(timit_path='TIMIT'):
    audio_paths_train= glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.WAV'), recursive=True)
    #print(audio_paths_train)
    audio_paths_test= glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.WAV'), recursive=True)
    transcription_paths_train = glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.TXT'), recursive=True)
    transcription_paths_test = glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.TXT'), recursive=True)

    audio_files_train = []
    transcriptions_train = []
    
    for audio_path in audio_paths_train:
        transcription_path_train = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_train):
            with open(transcription_path_train, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_train.append(audio_path)
                transcriptions_train.append(transcription)
    
    audio_files_test = []
    transcriptions_test = []
    
    for audio_path in audio_paths_test:
        transcription_path_test = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_test):
            with open(transcription_path_test, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_test.append(audio_path)
                transcriptions_test.append(transcription)
                
    return audio_files_train, transcriptions_train,audio_files_test, transcriptions_test 

# Usage
timit_path = 'TIMIT\\data'
audio_files_train, transcriptions_train,audio_files_test, transcriptions_test = download_and_prepare_timit(timit_path)
print(len(transcriptions_test))

1680


In [3]:
# Step 2: Feature Extraction
def extract_mel_spectrogram(audio_path, sr=22050, n_mels=80):
    y, sr = librosa.load(audio_path, sr=sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

mel_train = [extract_mel_spectrogram(f) for f in audio_files_train]
mel_test = [extract_mel_spectrogram(f) for f in audio_files_test]

### Model loading

In [10]:

# Initialize the TTS model
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True, gpu=False)

# Function to convert text to speech using Tacotron 2 and HiFi-GAN
def text_to_speech(text, output_file):
    # Convert text to speech using Tacotron 2 and HiFi-GAN
    wav = tts.tts(text)
    
    # Save the audio to a file
    sf.write(output_file, wav, 22050, "PCM_16")
# Initialize Whisper ASR model
asr_model = whisper.load_model("base")


 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P

100%|███████████████████████████████████████| 139M/139M [00:08<00:00, 17.4MiB/s]


### Generate and evaluate audio

In [11]:
# Function to transcribe audio to text using Whisper ASR system
def asr_transcribe(audio_file):
    result = asr_model.transcribe(audio_file)
    transcription = result['text']
    return transcription

# Function to calculate WER between reference and hypothesis texts
def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

# List to store WER results
wer_results = []

for i, text in enumerate(transcriptions_test):
    output_file = f"audio_files/output_{i}.wav"
    text_to_speech(text, output_file)
    
    # Use an ASR system to get the hypothesis transcription from the generated audio
    hypothesis = asr_transcribe(output_file)
    wer = calculate_wer(text, hypothesis)
    wer_results.append(wer)
    print(f"Text: {text}, Hypothesis: {hypothesis}, WER: {wer}")

# Calculate and print average WER over the test set
average_wer = sum(wer_results) / len(wer_results)
print(f"Average WER over the test set: {average_wer}")

 > Text splitted to sentences.
['She had your dark suit in greasy wash water all year.']
 > Processing time: 2.1109464168548584
 > Real-time factor: 0.48868604581355646




Text: She had your dark suit in greasy wash water all year., Hypothesis:  She had your dark suit in greasy washwater all year., WER: 0.18181818181818182
 > Text splitted to sentences.
["Don't ask me to carry an oily rag like that."]
 > Processing time: 3.529628038406372
 > Real-time factor: 0.9210885514919109
Text: Don't ask me to carry an oily rag like that., Hypothesis:  Don't ask me to carry an oily rag like that., WER: 0.0
 > Text splitted to sentences.
['His captain was thin and haggard and his beautiful boots were worn and shabby.']
 > Processing time: 4.5654566287994385
 > Real-time factor: 0.8140470845599982
Text: His captain was thin and haggard and his beautiful boots were worn and shabby., Hypothesis:  His captain was thin and haggard, and his beautiful boots were worn and shabby., WER: 0.07142857142857142
 > Text splitted to sentences.
['The reasons for this dive seemed foolish now.']
 > Processing time: 1.7186710834503174
 > Real-time factor: 0.5103519902779505
Text: The r

In [13]:
wer_test = np.array([wer_results])
mean_wer = np.mean(wer_test)
std_wer = np.std(wer_test)
print(f"Mean WER on the test set: {mean_wer}")
print(f"Standard deviation WER on the test set: {std_wer}")


Mean WER on the test set: 0.3562705674732986
Standard deviation WER on the test set: 2.184923190249096


## Fake Audio Detection

In [5]:
import os
import pandas as pd
import torchaudio
from TTS.api import TTS
import soundfile as sf
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score

### Preparing dataset

In [8]:
# Assuming you have manually downloaded and extracted the CommonVoice dataset
data_dir = "CommonVoice"

# Load the metadata
metadata = pd.read_csv(os.path.join(data_dir, "validated.tsv"), sep='\t')

# Select a subset of data (e.g., first 100 samples)
metadata = metadata.head(500)

# Create directories for real and fake audio
os.makedirs("data/real", exist_ok=True)
os.makedirs("data/fake", exist_ok=True)

# Function to load audio file and handle errors
def load_audio_file(audio_file_path):
    try:
        waveform, sample_rate = torchaudio.load(audio_file_path)
        return waveform, sample_rate
    except Exception as e:
        print(f"Error loading {audio_file_path}: {e}")
        return None, None

# Save real audio samples
for i, row in metadata.iterrows():
    audio_file_path = os.path.join(data_dir, 'clips', row['path'])
    waveform, sample_rate = load_audio_file(audio_file_path)
    if waveform is not None:
        output_file = f"data/real/{i}.wav"
        torchaudio.save(output_file, waveform, sample_rate)


Error loading CommonVoice\clips\9051169da60fcf5e22c3927734b4dcfc794a6e833f98d7f767a9ec3843e6047d12a91b4f518a4eb3af18241cb125ac13769d60f19ff564a9859ac069579ba133: Error opening 'CommonVoice\\clips\\9051169da60fcf5e22c3927734b4dcfc794a6e833f98d7f767a9ec3843e6047d12a91b4f518a4eb3af18241cb125ac13769d60f19ff564a9859ac069579ba133': System error.
Error loading CommonVoice\clips\1dcef00e46910f330e7d0a7e19f8a648a0b28aa90745c1678487a9bff45e0c0131a6c21f735e0c6a974c6d799700ce36da5b91209dae41f85b369866e0c2544d: Error opening 'CommonVoice\\clips\\1dcef00e46910f330e7d0a7e19f8a648a0b28aa90745c1678487a9bff45e0c0131a6c21f735e0c6a974c6d799700ce36da5b91209dae41f85b369866e0c2544d': System error.
Error loading CommonVoice\clips\6222bd8d7937700129e475df13460a09c23f4fcb393397decf5cabcaa0fcc7a9472f70954462b162a649e3337a537aa4d0154316e6a048fe079d1762974b6c7f: Error opening 'CommonVoice\\clips\\6222bd8d7937700129e475df13460a09c23f4fcb393397decf5cabcaa0fcc7a9472f70954462b162a649e3337a537aa4d0154316e6a048fe079d176