In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install tortoise-tts scipy

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
!pip install deepspeed

Collecting deepspeed
  Downloading deepspeed-0.15.4.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (from deepspeed)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Collecting nvidia-ml-py (from deepspeed)
  Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)
Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia

In [3]:
import os
import numpy as np
import torch
from tortoise.api import TextToSpeech
from scipy.io.wavfile import write
import json
from google.colab import drive  # Google 드라이브 마운트

# Batch text-to-speech job: reads a JSON file of scripts, synthesizes every
# non-empty line with Tortoise TTS using pre-computed speaker latents, and
# stores one WAV file per line on Google Drive. Lines whose WAV file already
# exists are skipped, so the cell can be re-run to resume an interrupted job.

# Mount Google Drive so the generated audio outlives the Colab session.
drive.mount('/content/drive')

# Root folder on Google Drive that receives all generated audio.
output_root_folder = '/content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel'
os.makedirs(output_root_folder, exist_ok=True)

# Tunable settings.
# Tortoise's reference examples save its output at 24 kHz. Writing the samples
# with a 22050 Hz header (as this cell used to) makes playback slower and
# lower-pitched. Files produced by earlier runs still carry the old header and
# are skipped by the resume check below; delete them to regenerate.
SAMPLE_RATE = 24000
# Only levels inside this inclusive range are synthesized.
MIN_LEVEL, MAX_LEVEL = 6, 10

# Pick the GPU when one is available and warn if it is not an A100.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU 활성화: {gpu_name}")
    if "A100" not in gpu_name:
        print("⚠️ GPU가 A100이 아닙니다. 런타임 유형 설정에서 'A100 GPU'로 변경하세요.")
else:
    print("GPU를 사용할 수 없습니다. CPU로 실행됩니다.")

# Load the saved speaker embedding and convert every stored array to a tensor
# on the selected device. The archive is opened in a `with` block so its file
# handle is closed as soon as the arrays have been copied out.
# NOTE(review): the arrays are used in the order they are stored in the .npz;
# presumably that matches the order Tortoise expects — confirm against the
# code that saved the file.
embedding_path = "conditioning_latents_Mark.npz"  # adjust to your own path
with np.load(embedding_path) as loaded_latents:
    conditioning_latents = [torch.tensor(loaded_latents[key]).to(device) for key in loaded_latents]

# Initialise the Tortoise TTS model (DeepSpeed, KV cache and half precision enabled).
tts = TextToSpeech(use_deepspeed=True, kv_cache=True, half=True, device=device)

# Path of the JSON file that holds the scripts.
json_path = "novel.json"  # adjust to your own path

# Read the JSON file.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Walk categories -> levels -> scripts -> lines and synthesize each line.
for category in data["categories"]:
    category_name = category["category_name"]
    for level in category["levels"]:
        level_number = level["level"]

        # Skip every level outside the configured range.
        if not MIN_LEVEL <= level_number <= MAX_LEVEL:
            continue

        for script in level["scripts"]:
            title = script["title"]
            contents = script["contents"]

            # A "/" inside the title would turn the WAV file name into a path
            # through a directory that does not exist, so replace it. Titles
            # without "/" keep exactly the same paths as before.
            safe_title = title.replace("/", "_")

            # Create the category/level/title folder on Google Drive.
            output_folder = os.path.join(
                output_root_folder, category_name, f"level_{level_number}", safe_title
            )
            os.makedirs(output_folder, exist_ok=True)

            # Synthesize the script one line at a time (line numbers start at 1).
            for i, text in enumerate(contents, 1):
                if not text.strip():
                    continue

                # Destination of this line's audio.
                output_path = os.path.join(output_folder, f"{safe_title}_line_{i}.wav")

                # Resume support: skip lines that were already synthesized.
                if os.path.exists(output_path):
                    print(f"이미 생성된 파일: {output_path}. 건너뜁니다.")
                    continue

                # Synthesize the text in the speaker's voice (high_quality preset).
                pcm_audio = tts.tts_with_preset(
                    text, conditioning_latents=conditioning_latents, preset="high_quality"
                )

                # Move the result to host memory and flatten it to a 1-D mono
                # signal before handing it to SciPy.
                # NOTE(review): assumes a single clip is returned, possibly with
                # extra singleton axes — confirm.
                if torch.is_tensor(pcm_audio):
                    pcm_audio = pcm_audio.detach().cpu()
                waveform = np.asarray(pcm_audio, dtype=np.float32).reshape(-1)

                # Write to a temporary name first and rename afterwards, so an
                # interrupted run never leaves a truncated file that the resume
                # check above would mistake for a finished one.
                partial_path = output_path + ".part"
                write(partial_path, SAMPLE_RATE, waveform)
                os.replace(partial_path, output_path)
                print(f"'{text}' 문장이 '{output_path}'로 저장되었습니다.")

print(f"레벨 {MIN_LEVEL}~{MAX_LEVEL}에 대한 텍스트 파일의 음성 변환이 완료되었습니다.")


Mounted at /content/drive
GPU 활성화: NVIDIA A100-SXM4-40GB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2v

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

autoregressive.pth:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

  self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)


[2024-11-17 23:15:29,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-11-17 23:15:32,606] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.15.4, git-hash=unknown, git-branch=unknown
[2024-11-17 23:15:32,609] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2024-11-17 23:15:32,706] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float16, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncT

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu121/transformer_inference...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/transformer_inference/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module transformer_inference...


Time to load transformer_inference op: 45.76247024536133 seconds


diffusion_decoder.pth:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

  self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))


clvp2.pth:   0%|          | 0.00/976M [00:00<?, ?B/s]

  self.clvp.load_state_dict(torch.load(get_model_path('clvp2.pth', models_dir)))
  WeightNorm.apply(module, name, dim)


vocoder.pth:   0%|          | 0.00/391M [00:00<?, ?B/s]

  self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])


이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_1.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_2.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_3.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_4.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_5.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/Colab_Notebooks/tortoise_tts_high_quality_novel/novel/level_6/The Firekeeper’s Quest/The Firekeeper’s Quest_line_6.wav. 건너뜁니다.
이미 생성된 파일: /content/drive/MyDrive/