In [1]:
import json
from TTS.api import TTS
from tqdm import tqdm
import os
import torch
import logging
from pydub import AudioSegment
import re
import sys
import numpy as np



In [2]:
from TTS.utils.manage import ModelManager

# Initialize the ModelManager
model_manager = ModelManager()

# List available models
available_models = model_manager.list_models()
print(available_models)
en_tacotron2_models = [model for model in available_models if "en" in model and "tacotron2" in model]
print(en_tacotron2_models)


 Name format: type/language/dataset/model
 1: tts_models/multilingual/multi-dataset/xtts_v2
 2: tts_models/multilingual/multi-dataset/xtts_v1.1
 3: tts_models/multilingual/multi-dataset/your_tts
 4: tts_models/multilingual/multi-dataset/bark
 5: tts_models/bg/cv/vits
 6: tts_models/cs/cv/vits
 7: tts_models/da/cv/vits
 8: tts_models/et/cv/vits
 9: tts_models/ga/cv/vits
 10: tts_models/en/ek1/tacotron2 [already downloaded]
 11: tts_models/en/ljspeech/tacotron2-DDC [already downloaded]
 12: tts_models/en/ljspeech/tacotron2-DDC_ph
 13: tts_models/en/ljspeech/glow-tts
 14: tts_models/en/ljspeech/speedy-speech
 15: tts_models/en/ljspeech/tacotron2-DCA [already downloaded]
 16: tts_models/en/ljspeech/vits
 17: tts_models/en/ljspeech/vits--neon
 18: tts_models/en/ljspeech/fast_pitch
 19: tts_models/en/ljspeech/overflow
 20: tts_models/en/ljspeech/neural_hmm
 21: tts_models/en/vctk/vits
 22: tts_models/en/vctk/fast_pitch
 23: tts_models/en/sam/tacotron-DDC
 24: tts_models/en/blizzard2013/capa

In [3]:
class SuppressTTSOutput:
    def __enter__(self):
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr

In [4]:
import torch
x = torch.rand(5, 3)
print(x)
print(torch.cuda.is_available())

tensor([[0.5071, 0.4838, 0.1513],
        [0.6296, 0.7884, 0.4236],
        [0.2962, 0.5414, 0.2672],
        [0.6511, 0.0221, 0.2305],
        [0.5950, 0.9620, 0.5338]])
True


In [5]:
# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [6]:
# Load JSON file
file_path = "../datasets/ccpe-main/data.json"  # Update this path if necessary
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

In [7]:
# Initialize TTS models for different speakers
user_tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DCA", progress_bar=True).to(device)
agent_tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True).to(device)

 > tts_models/en/ljspeech/tacotron2-DCA is already downloaded.
 > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:C:\Users\mackm\AppData\Local\tts\tts_models--en--ljspeech--tacotron2-DCA\scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model's reduction rate `r` is set to: 2
 > Vocoder Model: multiband_melgan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:C:\Users\mackm\AppData\Local\tts\vocoder_models--en--ljspeech--multiband-melgan\scale_stats.npy
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Generator Model: multiband_melgan_generator
 > Discriminator Model: melgan_multisca

In [8]:
# Create output directories
audio_output_dir = "../datasets/ccpe-main/generated_audio"
text_output_dir = "../datasets/ccpe-main/generated_text"
os.makedirs(audio_output_dir, exist_ok=True)
os.makedirs(text_output_dir, exist_ok=True)

In [9]:
# Count total utterances by "USER" for tqdm
total_utterances = sum(
    len([u for u in conversation.get("utterances", []) if u["speaker"] == "USER"])
    for conversation in data
)

In [10]:
processed_conversations = []
# get the processed audio files
for file in os.listdir(audio_output_dir):
    if file.endswith(".wav"):
        conversation_id = file.split(".")[0]
        processed_conversations.append(conversation_id)

print(f"Found {len(processed_conversations)} processed conversations.")


Found 501 processed conversations.


In [11]:
# Function to generate 2 seconds of silence
def generate_silence(duration_ms=2000):
    return AudioSegment.silent(duration=duration_ms)

failed_conversations = []

import traceback

# Process each conversation and generate combined audio and text files
with tqdm(total=len(data), desc="Processing Conversations") as pbar:
    for conversation in data:
        conversation_id = conversation.get("conversationId", "unknown_id")

        # Skip if already processed
        if conversation_id in processed_conversations:
            pbar.update(1)
            continue

        try:
            utterances = conversation.get("utterances", [])

            # Prepare combined text and speaker-separated audio sequence
            combined_audio_text = ""
            combined_text_with_labels = ""
            speaker_audio_sequence = []

            for utterance in utterances:
                speaker = utterance["speaker"]
                text = utterance["text"]
                combined_text_with_labels += f"{speaker}: {text}\n"

                # Apply text cleaning (remove non-alphanumeric characters and extra spaces)
                text = re.sub(r"[^a-zA-Z0-9.,!? ]", "", text)
                text = re.sub(' +', ' ', text)

                combined_audio_text += f"{text}\n"

                # Add cleaned text to the audio sequence
                speaker_audio_sequence.append({"text": text, "type": "utterance", "speaker": speaker})
                speaker_audio_sequence.append({"type": "silence"})  # Add 2 seconds of silence

            # Save the text conversation with speaker labels
            text_file_path = os.path.join(text_output_dir, f"{conversation_id}.txt")
            with open(text_file_path, "w") as text_file:
                text_file.write(combined_text_with_labels.strip())

            # Generate audio for the sequence with silence
            audio_output = AudioSegment.silent(duration=0)
            for item in speaker_audio_sequence:
                
                if item["type"] == "utterance":
                    
                    # Change TTS voice based on speaker
                    if item["speaker"] == "USER":
                        with SuppressTTSOutput():
                            audio_segment = user_tts_model.tts_to_file(item["text"])
                            audio_segment= AudioSegment.from_file(audio_segment)
                    else:
                        with SuppressTTSOutput():
                            audio_segment = agent_tts_model.tts_to_file(item["text"])
                            audio_segment= AudioSegment.from_file(audio_segment)
                elif item["type"] == "silence":
                    audio_segment = generate_silence()  # Generate 2 seconds of silence
                # print(type(audio_segment))
                audio_output += audio_segment

            audio_file_path = os.path.join(audio_output_dir, f"{conversation_id}.wav")
            audio_output.export(audio_file_path, format="wav")

            # Add conversation ID to processed list
            processed_conversations.append(conversation_id)

        except Exception as e:
            print(f"Error processing conversation ID {conversation_id}: {e}")
            traceback.print_exc()
            failed_conversations.append(conversation_id)
    
        pbar.update(1)  # Update progress bar
# Print out failed conversation IDs at the end
if failed_conversations:
    print("\nFailed Conversations:")
    for failed_id in failed_conversations:
        print(failed_id)

Processing Conversations:   0%|          | 0/502 [00:00<?, ?it/s]Traceback (most recent call last):
  File "C:\Users\mackm\Documents\School\OBS-FALL-24\CS 534 - Machine Learning\final project\LLA-VAP\venv\Lib\site-packages\TTS\tts\utils\text\characters.py", line 300, in char_to_id
    return self._char_to_id[char]
           ~~~~~~~~~~~~~~~~^^^^^^
KeyError: '͡'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mackm\Documents\School\OBS-FALL-24\CS 534 - Machine Learning\final project\LLA-VAP\venv\Lib\site-packages\TTS\tts\utils\text\tokenizer.py", line 70, in encode
    idx = self.characters.char_to_id(char)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mackm\Documents\School\OBS-FALL-24\CS 534 - Machine Learning\final project\LLA-VAP\venv\Lib\site-packages\TTS\tts\utils\text\characters.py", line 302, in char_to_id
    raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
KeyError: " 

Error processing conversation ID CCPE-75731: 'charmap' codec can't encode character '\u025b' in position 1: character maps to <undefined>

Failed Conversations:
CCPE-75731



