## Voice Cloning System (VC) with TTS

### Data downloading and preprocessing

In [8]:
import sys
import os
import glob
import librosa
import numpy as np
from TTS.api import TTS
import soundfile as sf
import jiwer
from pydub import AudioSegment
import speech_recognition as sr
import whisper

In [9]:
# Step 1: Data Preparation
def download_and_prepare_timit(timit_path='TIMIT'):
    audio_paths_train= glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.WAV'), recursive=True)
    #print(audio_paths_train)
    audio_paths_test= glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.WAV'), recursive=True)
    transcription_paths_train = glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.TXT'), recursive=True)
    transcription_paths_test = glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.TXT'), recursive=True)

    audio_files_train = []
    transcriptions_train = []
    
    for audio_path in audio_paths_train:
        transcription_path_train = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_train):
            with open(transcription_path_train, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_train.append(audio_path)
                transcriptions_train.append(transcription)
    
    audio_files_test = []
    transcriptions_test = []
    
    for audio_path in audio_paths_test:
        transcription_path_test = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_test):
            with open(transcription_path_test, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_test.append(audio_path)
                transcriptions_test.append(transcription)
                
    return audio_files_train, transcriptions_train,audio_files_test, transcriptions_test 

# Usage
timit_path = 'TIMIT\\data'
audio_files_train, transcriptions_train,audio_files_test, transcriptions_test = download_and_prepare_timit(timit_path)
print(len(transcriptions_test))

1680


In [3]:
# Step 2: Feature Extraction
def extract_mel_spectrogram(audio_path, sr=22050, n_mels=80):
    y, sr = librosa.load(audio_path, sr=sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

mel_train = [extract_mel_spectrogram(f) for f in audio_files_train]
mel_test = [extract_mel_spectrogram(f) for f in audio_files_test]

### Model loading

In [10]:

# Initialize the TTS model
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True, gpu=False)

# Function to convert text to speech using Tacotron 2 and HiFi-GAN
def text_to_speech(text, output_file):
    # Convert text to speech using Tacotron 2 and HiFi-GAN
    wav = tts.tts(text)
    
    # Save the audio to a file
    sf.write(output_file, wav, 22050, "PCM_16")
# Initialize Whisper ASR model
asr_model = whisper.load_model("base")


 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P

100%|███████████████████████████████████████| 139M/139M [00:08<00:00, 17.4MiB/s]


### Generate and evaluate audio

In [11]:
# Function to transcribe audio to text using Whisper ASR system
def asr_transcribe(audio_file):
    result = asr_model.transcribe(audio_file)
    transcription = result['text']
    return transcription

# Function to calculate WER between reference and hypothesis texts
def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

# List to store WER results
wer_results = []

for i, text in enumerate(transcriptions_test):
    output_file = f"audio_files/output_{i}.wav"
    text_to_speech(text, output_file)
    
    # Use an ASR system to get the hypothesis transcription from the generated audio
    hypothesis = asr_transcribe(output_file)
    wer = calculate_wer(text, hypothesis)
    wer_results.append(wer)
    print(f"Text: {text}, Hypothesis: {hypothesis}, WER: {wer}")

# Calculate and print average WER over the test set
average_wer = sum(wer_results) / len(wer_results)
print(f"Average WER over the test set: {average_wer}")

 > Text splitted to sentences.
['She had your dark suit in greasy wash water all year.']
 > Processing time: 2.1109464168548584
 > Real-time factor: 0.48868604581355646




Text: She had your dark suit in greasy wash water all year., Hypothesis:  She had your dark suit in greasy washwater all year., WER: 0.18181818181818182
 > Text splitted to sentences.
["Don't ask me to carry an oily rag like that."]
 > Processing time: 3.529628038406372
 > Real-time factor: 0.9210885514919109
Text: Don't ask me to carry an oily rag like that., Hypothesis:  Don't ask me to carry an oily rag like that., WER: 0.0
 > Text splitted to sentences.
['His captain was thin and haggard and his beautiful boots were worn and shabby.']
 > Processing time: 4.5654566287994385
 > Real-time factor: 0.8140470845599982
Text: His captain was thin and haggard and his beautiful boots were worn and shabby., Hypothesis:  His captain was thin and haggard, and his beautiful boots were worn and shabby., WER: 0.07142857142857142
 > Text splitted to sentences.
['The reasons for this dive seemed foolish now.']
 > Processing time: 1.7186710834503174
 > Real-time factor: 0.5103519902779505
Text: The r

In [13]:
wer_test = np.array([wer_results])
mean_wer = np.mean(wer_test)
std_wer = np.std(wer_test)
print(f"Mean WER on the test set: {mean_wer}")
print(f"Standard deviation WER on the test set: {std_wer}")


Mean WER on the test set: 0.3562705674732986
Standard deviation WER on the test set: 2.184923190249096


OpenAI Whisper model is used for transforming the audio file generated back into text and then evaluate the Word Error Rate (WER) respect to the original text. WER is evaluated as average on the test set of TIMIT dataset and it is equivalent to 0.34 with standard deviation of 2.18. We evaluate this as an acceptable WER then decide to use Tacotron2 to generate the fake samples for the Fake Audio Detector.

## Fake Audio Detection

In [19]:
import os
import pandas as pd
import torchaudio
from TTS.api import TTS
import soundfile as sf
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

### Preparing dataset

In [11]:
# Assuming you have manually downloaded and extracted the CommonVoice dataset
data_dir = "CommonVoiceSegment"

# Load the metadata
metadata = pd.read_csv(os.path.join(data_dir, "validated.tsv"), sep='\t')

# Select a subset of data (e.g., first 100 samples)
#metadata = metadata.head(500)

# Create directories for real and fake audio
os.makedirs("data/real", exist_ok=True)
os.makedirs("data/fake", exist_ok=True)

# Function to load audio file and handle errors
def load_audio_file(audio_file_path):
    try:
        waveform, sample_rate = torchaudio.load(audio_file_path)
        return waveform, sample_rate
    except Exception as e:
        print(f"Error loading {audio_file_path}: {e}")
        return None, None

# Save real audio samples
for i, row in metadata.iterrows():
    audio_file_path = os.path.join(data_dir, 'clips', row['path'])
    waveform, sample_rate = load_audio_file(audio_file_path)
    if waveform is not None:
        output_file = f"data/real/{i}.wav"
        torchaudio.save(output_file, waveform, sample_rate)


### Generate audio samples

In [None]:
# Load TTS model
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True, gpu=False)

# Generate synthetic audio samples
for i, row in metadata.iterrows():
    text = row["sentence"]
    wav = tts.tts(text)
    output_file = f"data/fake/{i}.wav"
    sf.write(output_file, wav, 22050, "PCM_16")



### Load data

In [17]:
# Feature extraction
def extract_features(audio_path, fixed_length=250):
    try:
        y, sr = librosa.load(audio_path, sr=22050)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        if mel_spec_db.shape[1] > fixed_length:
            mel_spec_db = mel_spec_db[:, :fixed_length]
        else:
            pad_width = fixed_length - mel_spec_db.shape[1]
            mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, pad_width)), mode='constant')
        return mel_spec_db
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

real_features = []
fake_features = []

for filename in os.listdir("data/real"):
    features = extract_features(os.path.join("data/real", filename))
    if features is not None:
        real_features.append(features)

for filename in os.listdir("data/fake"):
    features = extract_features(os.path.join("data/fake", filename))
    if features is not None:
        fake_features.append(features)

real_features = np.array(real_features)
fake_features = np.array(fake_features)
real_labels = np.ones(len(real_features))
fake_labels = np.zeros(len(fake_features))

### Define and training CNN

In [28]:
# Model training
features = np.concatenate((real_features, fake_features))
labels = np.concatenate((real_labels, fake_labels))
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256 * 5 * 15, 128)  
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
       # print("Shape after conv1 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv2(x)))
       # print("Shape after conv2 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv3(x)))
       # print("Shape after conv3 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv4(x)))
       # print("Shape after conv4 and pool: ", x.shape)
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

X_train = torch.tensor(X_train).unsqueeze(1).float()
y_train = torch.tensor(y_train).unsqueeze(1).float()
X_test = torch.tensor(X_test).unsqueeze(1).float()
y_test = torch.tensor(y_test).unsqueeze(1).float()

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

model = CNN()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")



Epoch 1, Loss: 0.06950058951777331
Epoch 2, Loss: 0.010973274546265262
Epoch 3, Loss: 0.005626889621367988
Epoch 4, Loss: 0.00899262345249455
Epoch 5, Loss: 0.002772123068230221
Epoch 6, Loss: 0.002343895590717095
Epoch 7, Loss: 0.3193255738924934
Epoch 8, Loss: 0.7161841804953023
Epoch 9, Loss: 0.22864557489217516
Epoch 10, Loss: 0.16091198097210088


### Evaluate the model

In [29]:
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    predictions = (outputs > 0.5).float()
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")


Precision: 0.9996498599439776
Recall: 0.9920083391243919
F1 Score: 0.9958144401813742
Confusion Matrix:
[[2875    1]
 [  23 2855]]
Number of true positive: 2855, number of false negative:23, number of true negatives: 2875, number of false positive 1.


In [30]:
torch.save(model, 'fake_audio_detector.pt')

## Conclusions

The final trained model was able to achieve 99% precision, recall and F1 score on the test set. In detail only one fake sample is misclassified as real and 23 real samples are classified as fake. The performance are exceptional for this particular case but bigger and complex dataset could require a more complex model. Also the method to create fake audio files could be improved with the fine-tuning of the pretrained Tacotron2 model or different model could be considered.