## Voice Cloning System (VC) with TTS

### Data downloading and preprocessing

In [110]:
import sys
import os
import glob
import librosa
import numpy as np
from TTS.api import TTS
import soundfile as sf
import jiwer
from pydub import AudioSegment
import speech_recognition as sr
import whisper
import torch
import torchaudio
import torchaudio.transforms as transforms
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from TTS.tts.models.tacotron2 import Tacotron2
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from TTS.utils.audio import AudioProcessor
from statistics import mean

In [127]:
# Step 1: Data Preparation
def download_and_prepare_timit(timit_path='TIMIT'):
    audio_paths_train= glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.WAV'), recursive=True)
    #print(audio_paths_train)
    audio_paths_test1= glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.WAV'), recursive=True)
    transcription_paths_train = glob.glob(os.path.join(timit_path, 'TRAIN/DR*/**/*.TXT'), recursive=True)
    transcription_paths_test = glob.glob(os.path.join(timit_path, 'TEST/DR*/**/*.TXT'), recursive=True)
    i=0
    audio_paths_test=[]
    while i<len(audio_paths_test1):
        audio_paths_test.append(audio_paths_test1[i])
        i=i+2
    audio_files_train = []
    transcriptions_train = []
    
    for audio_path in audio_paths_train:
        transcription_path_train = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_train):
            with open(transcription_path_train, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_train.append(audio_path)
                transcriptions_train.append(transcription)
    
    audio_files_test = []
    transcriptions_test = []
    
    for audio_path in audio_paths_test:
        transcription_path_test = audio_path.replace(".WAV", ".TXT")
        if os.path.exists(transcription_path_test):
            with open(transcription_path_test, 'r') as f:
                transcription = f.readlines()[0].strip().split(' ', 2)[-1]
                audio_files_test.append(audio_path)
                transcriptions_test.append(transcription)
    speaker=[]
    for a in audio_paths_test:
        speaker.append(a.split('\\')[4])
    return audio_files_train, transcriptions_train,audio_files_test, transcriptions_test,speaker,audio_paths_test
# Usage
timit_path = 'TIMIT\\data'
audio_files_train, transcriptions_train,audio_files_test, transcriptions_test,speakers,audio_paths_test = download_and_prepare_timit(timit_path)


### Model loading for generating samples with target speaker

In [89]:

# Initialize the TTS model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

# Function to synthesize speech with target speaker embedding
def text_to_speech(text, speaker_wav, output_file):
    tts.tts_to_file(text=text, file_path=output_file,speaker_wav=speaker_wav,language="en",split_sentences=True)
    # Save the audio to a file
   # sf.write(output_file, wav, 22050, "PCM_16")
asr_model = whisper.load_model("base")


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


### Generate and evaluate audio

In [128]:
# Function to transcribe audio to text using Whisper ASR system
def asr_transcribe(audio_file):
    result = asr_model.transcribe(audio_file)
    transcription = result['text']
    return transcription

# Function to calculate WER between reference and hypothesis texts
def calculate_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

# List to store WER results
wer_results = []
generated_audio_paths = []
for i, text in enumerate(transcriptions_test):
    output_file = f"audio_files/output_{i}.wav"
    text_to_speech(text, audio_paths_test[i], output_file)  # Generate audio with target speaker
    generated_audio_paths.append(output_file)
    
    # Use an ASR system to get the hypothesis transcription from the generated audio
    hypothesis = asr_transcribe(output_file)
    wer = calculate_wer(text, hypothesis)
    wer_results.append(wer)
    print(f"Text: {text}, Hypothesis: {hypothesis}, WER: {wer}")

# Calculate and print average WER over the test set
#average_wer = sum(wer_results) / len(wer_results)
#print(f"Average WER over the test set: {average_wer}")


 > Text splitted to sentences.
['She had your dark suit in greasy wash water all year.']
 > Processing time: 16.46333360671997
 > Real-time factor: 3.7507904821889504




Text: She had your dark suit in greasy wash water all year., Hypothesis:  She had your dark suit in greasy washwater all year., WER: 0.18181818181818182
 > Text splitted to sentences.
["Don't ask me to carry an oily rag like that."]
 > Processing time: 7.443759441375732
 > Real-time factor: 2.202798149054312
Text: Don't ask me to carry an oily rag like that., Hypothesis:  Don't ask me a carrying oily rag like that., WER: 0.3
 > Text splitted to sentences.
['His captain was thin and haggard and his beautiful boots were worn and shabby.']
 > Processing time: 11.020946025848389
 > Real-time factor: 2.276073916060589
Text: His captain was thin and haggard and his beautiful boots were worn and shabby., Hypothesis:  His captain was thin and heggard and his beautiful boots were worn and shabby., WER: 0.07142857142857142
 > Text splitted to sentences.
['The reasons for this dive seemed foolish now.']
 > Processing time: 9.346579551696777
 > Real-time factor: 2.205231115336778
Text: The reasons

In [130]:
wer_test = np.array([wer_results])
mean_wer = np.mean(wer_test)
std_wer = np.std(wer_test)
print(f"Mean WER on the test set: {mean_wer}")
print(f"Standard deviation WER on the test set: {std_wer}")
dict_speaker = {key: [] for key in speakers}
for i,wer in enumerate(wer_results):
    for key in dict_speaker:
        if key in audio_paths_test[i]:
            dict_speaker[key].append(wer)
            break
mean_dict = {key: mean(values) for key, values in dict_speaker.items()}
print(f"Mean WER on the test set for each speaker: {mean_dict}")
            

Mean WER on the test set: 0.1358089809094619
Standard deviation WER on the test set: 0.18810655616154268
Mean WER on the test set for each speaker: {'FAKS0': 0.11306277056277056, 'FDAC1': 0.12477272727272727, 'FELC0': 0.1867965367965368, 'FJEM0': 0.1434065934065934, 'MDAB0': 0.21137626262626263, 'MJSW0': 0.18171717171717172, 'MREB0': 0.15199134199134198, 'MRJO0': 0.14794372294372293, 'MSJS1': 0.17925324675324675, 'MSTK0': 0.11392857142857143, 'MWBT0': 0.08655844155844156, 'FCMR0': 0.1886904761904762, 'FDRD1': 0.10234848484848484, 'FJAS0': 0.1607142857142857, 'FJRE0': 0.13762626262626262, 'FJWB0': 0.1740151515151515, 'FPAS0': 0.11040404040404041, 'FRAM1': 0.08318181818181819, 'FSLB1': 0.11396936396936397, 'MABW0': 0.1532936507936508, 'MBJK0': 0.2125, 'MCCS0': 0.12314935064935065, 'MCEM0': 0.26785214785214784, 'MDBB0': 0.09804945054945055, 'MDLD0': 0.13333333333333333, 'MGWT0': 0.18803030303030302, 'MJAR0': 0.18916638916638917, 'MMDB1': 0.15903263403263404, 'MMDM2': 0.17326839826839827, 

OpenAI Whisper model is used for transforming the audio file generated back into text and then evaluate the Word Error Rate (WER) respect to the original text. WER is evaluated as average on the test set of TIMIT dataset and it is equivalent to 0.13 with standard deviation of 0.19. It is also printed the WER for each speaker looking comparable to the mean one. We evaluate this as an acceptable WER then decide to use Xtts v2 to generate the fake samples for the Fake Audio Detector.

## Fake Audio Detection

In [100]:
import os
import pandas as pd
import torchaudio
from TTS.api import TTS
import soundfile as sf
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

### Preparing dataset

In [101]:
# Assuming you have manually downloaded and extracted the CommonVoice dataset
data_dir = "CommonVoiceSegment"

# Load the metadata
metadata = pd.read_csv(os.path.join(data_dir, "validated.tsv"), sep='\t')

# Select a subset of data (e.g., first 100 samples)
#metadata = metadata.head(500)

# Create directories for real and fake audio
os.makedirs("data/real", exist_ok=True)
os.makedirs("data/fake", exist_ok=True)

# Function to load audio file and handle errors
def load_audio_file(audio_file_path):
    try:
        waveform, sample_rate = torchaudio.load(audio_file_path)
        return waveform, sample_rate
    except Exception as e:
        print(f"Error loading {audio_file_path}: {e}")
        return None, None

# Save real audio samples
for i, row in metadata.iterrows():
    audio_file_path = os.path.join(data_dir, 'clips', row['path'])
    waveform, sample_rate = load_audio_file(audio_file_path)
    if waveform is not None:
        output_file = f"data/real/{i}.wav"
        torchaudio.save(output_file, waveform, sample_rate)


### Generate audio samples

In [102]:
# Load TTS model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)


# Generate synthetic audio samples
for i, row in metadata.iterrows():
    text = row["sentence"]
    output_file = f"data/fake/{i}.wav"
    speaker_wav=f"data/real/{i}.wav"
    tts.tts_to_file(text=text, file_path=output_file,speaker_wav=speaker_wav,language="en",split_sentences=True)
    #sf.write(output_file, wav, 22050, "PCM_16")



 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
 > Text splitted to sentences.
['Coles Branch is underlaid by the Deep River Basin.']
 > Processing time: 9.017763137817383
 > Real-time factor: 2.449754548453495
 > Text splitted to sentences.
['It serves to support many other works.']
 > Processing time: 13.800077199935913
 > Real-time factor: 2.276814484755379
 > Text splitted to sentences.
['He was the chief clerk of the Chicago and Northwestern Railroad.']
 > Processing time: 17.286463022232056
 > Real-time factor: 2.409030928558353
 > Text splitted to sentences.
['Other collaborations on the project include Jon Randall and Billy Montana.']
 > Processing time: 19.817986965179443
 > Real-time factor: 2.3938700400024473
 > Text splitted to sentences.
['Finally he returns home to rebuild and rehabilitate his sister.']
 > Processing time: 11.427964687347412
 > Real-time factor: 2.262485825994922
 > Text splitted to sentences.
['It has also bee

### Load data

In [103]:
# Feature extraction
def extract_features(audio_path, fixed_length=250):
    try:
        y, sr = librosa.load(audio_path, sr=22050)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        if mel_spec_db.shape[1] > fixed_length:
            mel_spec_db = mel_spec_db[:, :fixed_length]
        else:
            pad_width = fixed_length - mel_spec_db.shape[1]
            mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, pad_width)), mode='constant')
        return mel_spec_db
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

real_features = []
fake_features = []

for filename in os.listdir("data/real"):
    features = extract_features(os.path.join("data/real", filename))
    if features is not None:
        real_features.append(features)

for filename in os.listdir("data/fake"):
    features = extract_features(os.path.join("data/fake", filename))
    if features is not None:
        fake_features.append(features)

real_features = np.array(real_features)
fake_features = np.array(fake_features)
real_labels = np.ones(len(real_features))
fake_labels = np.zeros(len(fake_features))

### Define and training CNN

In [104]:
# Model training
features = np.concatenate((real_features, fake_features))
labels = np.concatenate((real_labels, fake_labels))
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256 * 5 * 15, 128)  
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
       # print("Shape after conv1 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv2(x)))
       # print("Shape after conv2 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv3(x)))
       # print("Shape after conv3 and pool: ", x.shape)
        x = self.pool(self.relu(self.conv4(x)))
       # print("Shape after conv4 and pool: ", x.shape)
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

X_train = torch.tensor(X_train).unsqueeze(1).float()
y_train = torch.tensor(y_train).unsqueeze(1).float()
X_test = torch.tensor(X_test).unsqueeze(1).float()
y_test = torch.tensor(y_test).unsqueeze(1).float()

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

model = CNN()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")



Epoch 1, Loss: 0.11148391035382886
Epoch 2, Loss: 0.023468624346445567
Epoch 3, Loss: 0.015906729991496064
Epoch 4, Loss: 0.018662487516443473
Epoch 5, Loss: 0.014786070435736117
Epoch 6, Loss: 0.007748228799301759
Epoch 7, Loss: 0.014908527668057371
Epoch 8, Loss: 0.005709118464136014
Epoch 9, Loss: 0.07037089089111973
Epoch 10, Loss: 0.013793523620695788


### Evaluate the model

In [105]:
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    predictions = (outputs > 0.5).float()
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f"Number of true positive: {tp}, number of false negative:{fn}, number of true negatives: {tn}, number of false positive {fp}.")


Precision: 0.9958448753462604
Recall: 0.9993050729673384
F1 Score: 0.997571973638571
Confusion Matrix:
[[2864   12]
 [   2 2876]]
Number of true positive: 2876, number of false negative:2, number of true negatives: 2864, number of false positive 12.


In [106]:
torch.save(model, 'fake_audio_detector.pt')

## Conclusions

The final trained model was able to achieve 99% precision, recall and F1 score on the test set. In detail only two fake sample are misclassified as real and 12 real samples are classified as fake. The performance are exceptional for this particular case but bigger and complex dataset could require a more complex model. Also the method to create fake audio files could be improved with the fine-tuning of the pretrained xtts v2 model or different model could be considered.