In [40]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import librosa
from jamo import hangul_to_jamo
import librosa.feature

In [20]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [21]:
def text_to_sequence(text):
    """Turn text into a flat list of symbols, decomposing Hangul syllables.

    Every character inside the precomposed Hangul syllable range
    ('가' through '힣') is passed to ``hangul_to_jamo`` and the pieces it
    yields are appended in order. Any other character is appended unchanged.

    Args:
        text: Iterable of single characters (normally a ``str``).

    Returns:
        list: The symbols in reading order.
    """
    symbols = []
    for ch in text:
        is_syllable = '가' <= ch <= '힣'
        if not is_syllable:
            # Punctuation, digits, Latin letters, spaces, ... pass through as-is.
            symbols.append(ch)
            continue
        symbols += list(hangul_to_jamo(ch))
    return symbols


In [22]:
# Example: collect every transcript in the dataset.
all_texts = []
# Placeholder value ("put the file here"): replace with the real metadata file path.
metadata_path = '여기에 파일'

# Each metadata line is split on '|'; field 0 is the audio path and field 1 the
# transcript. Lines with fewer than two fields are skipped.
with open(metadata_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('|')
        if len(parts) >= 2:
            path, text = parts[0], parts[1]
            all_texts.append(text)

# Gather the distinct symbols that occur anywhere in the transcripts.
unique_tokens = set()
for text in all_texts:
    sequence = text_to_sequence(text)
    unique_tokens.update(sequence)

# Token -> index mapping.
# Index 0 is reserved for '<pad>' because collate_fn fills padded positions of
# the token-id batch with zeros; giving 0 to a real symbol would make padding
# indistinguishable from that symbol. '<unk>' takes index 1 and the sorted
# symbols follow, so the mapping stays deterministic for a given dataset.
vocab = {'<pad>': 0, '<unk>': 1}
for token in sorted(unique_tokens):
    vocab.setdefault(token, len(vocab))


FileNotFoundError: [Errno 2] No such file or directory: '여기에 파일'

In [33]:
def get_mask_from_lengths(lengths, max_len=None):
    """Build a boolean mask that is True on the valid positions of each row.

    Args:
        lengths: 1-D tensor holding one length per batch item.
        max_len: Number of mask columns. When omitted, the largest value in
            ``lengths`` is used.

    Returns:
        Tensor of shape (batch_size, max_len) created on ``lengths.device``;
        entry ``[b, t]`` is True exactly when ``t < lengths[b]``.
    """
    if max_len is None:
        max_len = lengths.max().item()
    positions = torch.arange(0, max_len, device=lengths.device)
    # Broadcasting (1, max_len) against (batch_size, 1) yields (batch_size, max_len).
    return positions[None, :] < lengths[:, None]

In [24]:
def load_wav(path, sr=22050):
    """Read an audio file through ``librosa.load`` and return only the samples.

    Args:
        path: Location of the audio file, forwarded unchanged to librosa.
        sr: Sampling rate requested from ``librosa.load``.

    Returns:
        The waveform returned by ``librosa.load``; the sampling rate that
        comes back with it is discarded.
    """
    samples, _rate = librosa.load(path, sr=sr)
    return samples

def wav_to_mel(wav, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80, fmin=0, fmax=8000):
    """Convert a waveform into a natural-log mel power spectrogram.

    Args:
        wav: Audio samples accepted by ``librosa.stft``.
        sr: Sampling rate used to build the mel filter bank.
        n_fft: FFT size.
        hop_length: Hop between successive STFT frames, in samples.
        win_length: Analysis window length, in samples.
        n_mels: Number of mel bands.
        fmin: Lowest filter-bank frequency.
        fmax: Highest filter-bank frequency.

    Returns:
        ``np.log`` of the mel spectrogram after flooring it at 1e-5, so the
        smallest possible value is ``log(1e-5)``.
        Presumably shaped (n_mels, frames) -- TODO confirm against librosa.
    """
    # Short-time Fourier transform, then squared magnitude (power).
    spectrum = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    power = np.abs(spectrum) ** 2

    # Project the power spectrum onto the mel filter bank.
    mel_options = dict(sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel = librosa.feature.melspectrogram(S=power, **mel_options)

    # Floor before the logarithm so silent bins do not produce -inf.
    floored = np.clip(mel, a_min=1e-5, a_max=None)
    return np.log(floored)

def normalize_mel(mel_spectrogram, min_level=-4.0, max_level=4.0):
    """Clip a log-mel spectrogram to a fixed window and rescale it to [0, 1].

    The defaults reproduce the original hard-coded behaviour (clip to
    [-4, 4], then map linearly onto [0, 1]). The window is now configurable
    because ``wav_to_mel`` floors its output at ``log(1e-5)`` (about -11.5),
    so with the default window everything below -4 collapses to 0.

    Args:
        mel_spectrogram: Array of log-mel values. It is not modified.
        min_level: Values at or below this map to 0.
        max_level: Values at or above this map to 1.

    Returns:
        A new array with the same shape and every value in [0, 1].

    Raises:
        ValueError: If ``max_level`` is not strictly greater than ``min_level``.
    """
    if not max_level > min_level:
        raise ValueError("max_level must be greater than min_level")
    clipped = np.clip(mel_spectrogram, a_min=min_level, a_max=max_level)
    return (clipped - min_level) / (max_level - min_level)


In [25]:
class TacotronDataset(Dataset):
    """Dataset of (token ids, mel spectrogram) pairs described by a metadata file.

    The metadata file is read once at construction time. Audio is loaded and
    converted to a mel spectrogram each time an item is requested.
    """

    def __init__(self, metadata_path, vocab):
        """Store the vocabulary and read the (path, text) pairs.

        Args:
            metadata_path: Path of a UTF-8 text file with '|'-separated fields.
            vocab: Mapping from symbol to integer id; must contain '<unk>'
                for symbols that are missing from the mapping.
        """
        self.vocab = vocab
        self.data = self.load_metadata(metadata_path)

    def load_metadata(self, metadata_path):
        """Return a list of (audio path, text) tuples read from the file.

        Each line is stripped and split on '|'. The first two fields are kept
        and lines with fewer than two fields are ignored.
        """
        with open(metadata_path, 'r', encoding='utf-8') as f:
            rows = (line.strip().split('|') for line in f)
            return [(fields[0], fields[1]) for fields in rows if len(fields) >= 2]

    def __len__(self):
        """Number of usable metadata entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, n_tokens, mel, n_frames)`` for entry ``idx``.

        ``token_ids`` is a 1-D LongTensor. ``mel`` is a FloatTensor whose
        first axis is time, because the normalised spectrogram is transposed
        before it is returned.
        """
        path, text = self.data[idx]

        # Text side: symbols -> ids, unknown symbols fall back to '<unk>'.
        token_ids = torch.LongTensor(
            [self.vocab.get(sym, self.vocab['<unk>']) for sym in text_to_sequence(text)]
        )

        # Audio side: waveform -> log-mel -> [0, 1] -> (time, mel bands).
        features = normalize_mel(wav_to_mel(load_wav(path)))
        mel = torch.FloatTensor(features).transpose(0, 1)

        return token_ids, token_ids.size(0), mel, mel.size(0)


In [26]:
def collate_fn(batch):
    """Zero-pad a list of dataset items into batch tensors.

    Args:
        batch: Sequence of ``(token_ids, n_tokens, mel, n_frames)`` tuples,
            where ``token_ids`` is 1-D and ``mel`` is (frames, mel bands).

    Returns:
        tuple:
            - LongTensor (batch, max_tokens) of token ids, padded with 0.
            - LongTensor (batch,) of token counts.
            - Tensor (batch, max_frames, mel bands) of mels, padded with 0.
            - LongTensor (batch,) of frame counts.
    """
    sequences, input_lengths, mels, mel_lengths = zip(*batch)
    batch_size = len(sequences)

    # Token ids: allocate the zero-filled target once, then copy each row in.
    padded_sequences = torch.zeros(batch_size, max(input_lengths), dtype=torch.long)
    for row, seq in enumerate(sequences):
        n_tokens = seq.size(0)
        padded_sequences[row, :n_tokens] = seq

    # Mel spectrograms: same approach; the band count is taken from the first item.
    padded_mels = torch.zeros(len(mels), max(mel_lengths), mels[0].size(1))
    for row, mel in enumerate(mels):
        n_frames = mel.size(0)
        padded_mels[row, :n_frames, :] = mel

    return (
        padded_sequences,
        torch.LongTensor(input_lengths),
        padded_mels,
        torch.LongTensor(mel_lengths),
    )


In [27]:
class Encoder(nn.Module):
    """Three Conv1d/BatchNorm/ReLU/Dropout stages followed by a single LSTM.

    Args:
        embedding_dim: Channel count of the embedded input sequence.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        conv_channels = 512

        # Build the stages in a loop. Layers are created in the same order as a
        # hand-written Sequential, so parameter names (prenet.0 ... prenet.11)
        # are unchanged.
        stages = []
        in_channels = embedding_dim
        for _ in range(3):
            stages += [
                nn.Conv1d(in_channels, conv_channels, kernel_size=5, padding=2),
                nn.BatchNorm1d(conv_channels),
                nn.ReLU(),
                nn.Dropout(0.5),
            ]
            in_channels = conv_channels
        self.prenet = nn.Sequential(*stages)
        self.lstm = nn.LSTM(conv_channels, 512, num_layers=1, batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch_size, seq_len, embedding_dim).

        Returns:
            LSTM outputs of shape (batch_size, seq_len, 512).
        """
        # Conv1d wants channels on axis 1, the batch-first LSTM wants them last.
        conv_out = self.prenet(x.transpose(1, 2))          # (batch_size, conv_channels, seq_len)
        outputs, _ = self.lstm(conv_out.transpose(1, 2))   # (batch_size, seq_len, 512)
        return outputs


In [28]:
class PreNet(nn.Module):
    """Two Linear -> ReLU -> Dropout(0.5) stages, each 256 units wide.

    Args:
        in_dim: Size of the last axis of the input.
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden = 256
        stages = []
        for fan_in in (in_dim, hidden):
            stages += [nn.Linear(fan_in, hidden), nn.ReLU(), nn.Dropout(0.5)]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Apply both stages to ``x``; the last axis becomes 256 wide."""
        return self.layers(x)


In [29]:
class Attention(nn.Module):
    """Additive attention: ``v^T tanh(W_q q + W_m m)`` followed by a softmax.

    Args:
        attention_rnn_dim: Size of the query vector.
        encoder_dim: Size of each memory vector.
    """

    def __init__(self, attention_rnn_dim, encoder_dim):
        super(Attention, self).__init__()
        self.query_layer = nn.Linear(attention_rnn_dim, 128)
        self.memory_layer = nn.Linear(encoder_dim, 128)
        self.v = nn.Linear(128, 1)

    def forward(self, attention_hidden_state, memory, mask):
        """Compute the attention context for one decoder step.

        Args:
            attention_hidden_state: (batch_size, attention_rnn_dim) query.
            memory: (batch_size, seq_len, encoder_dim) encoder outputs.
            mask: (batch_size, seq_len) tensor that is True (non-zero) at the
                PADDED positions to be ignored, or ``None`` to attend to
                every position. This is the convention of the callers in this
                file, which pass ``~get_mask_from_lengths(...)``.

        Returns:
            tuple: ``(attention_context, attention_weights)`` with shapes
            (batch_size, encoder_dim) and (batch_size, seq_len). The weights
            of masked positions are exactly 0.
        """
        processed_query = self.query_layer(attention_hidden_state).unsqueeze(1)  # (batch_size, 1, 128)
        processed_memory = self.memory_layer(memory)  # (batch_size, seq_len, 128)
        energies = self.v(torch.tanh(processed_query + processed_memory)).squeeze(-1)  # (batch_size, seq_len)

        if mask is not None:
            # Exclude the padded positions. Filling where ``mask == 0`` would
            # exclude the valid positions instead: a row without padding
            # becomes all -inf and the softmax returns NaN. The out-of-place
            # fill also avoids mutating through ``.data`` behind autograd.
            energies = energies.masked_fill(mask.bool(), -float('inf'))

        attention_weights = F.softmax(energies, dim=1)  # (batch_size, seq_len)
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory).squeeze(1)  # (batch_size, encoder_dim)
        return attention_context, attention_weights


In [30]:
class Decoder(nn.Module):
    """Teacher-forced recurrent decoder built from two LSTM cells and attention.

    Each step runs: prenet -> attention LSTM cell -> attention over the
    encoder memory -> decoder LSTM cell -> linear projections to an
    80-dimensional mel frame and a 1-dimensional stop value.
    """

    def __init__(self):
        super().__init__()
        self.prenet = PreNet(80)
        self.attention_rnn = nn.LSTMCell(256 + 512, 1024)
        self.attention_layer = Attention(1024, 512)
        self.decoder_rnn = nn.LSTMCell(1024 + 512, 1024)
        self.linear_projection = nn.Linear(1024 + 512, 80)
        self.stop_projection = nn.Linear(1024 + 512, 1)

    def forward(self, memory, decoder_inputs, memory_lengths):
        """Run the decoder over every frame of ``decoder_inputs``.

        Args:
            memory: (batch_size, seq_len, 512) encoder outputs.
            decoder_inputs: (batch_size, max_time, 80) frames fed step by step.
            memory_lengths: (batch_size,) valid length of each memory row.

        Returns:
            tuple: ``(mel_outputs, stop_outputs)`` with shapes
            (batch_size, max_time, 80) and (batch_size, max_time, 1).
        """
        batch_size = memory.size(0)
        max_time = decoder_inputs.size(1)
        device = memory.device

        # Inverted length mask: True at positions at or beyond each row's length.
        mask = ~get_mask_from_lengths(memory_lengths, max_len=memory.size(1)).to(device)

        def zero_state(width):
            # All recurrent state and the first context vector start at zero.
            return torch.zeros(batch_size, width, device=device)

        attention_hidden, attention_cell = zero_state(1024), zero_state(1024)
        decoder_hidden, decoder_cell = zero_state(1024), zero_state(1024)
        attention_context = zero_state(512)

        mel_frames = []
        stop_values = []
        for t in range(max_time):
            prenet_output = self.prenet(decoder_inputs[:, t, :])  # (batch_size, 256)

            # Attention RNN consumes the prenet output and the previous context.
            attention_hidden, attention_cell = self.attention_rnn(
                torch.cat((prenet_output, attention_context), dim=-1),
                (attention_hidden, attention_cell),
            )

            # Refresh the context from the encoder memory.
            attention_context, _ = self.attention_layer(attention_hidden, memory, mask)

            # Decoder RNN consumes the attention state and the new context.
            decoder_hidden, decoder_cell = self.decoder_rnn(
                torch.cat((attention_hidden, attention_context), dim=-1),
                (decoder_hidden, decoder_cell),
            )

            # Both projections read the decoder state joined with the context.
            projection_input = torch.cat((decoder_hidden, attention_context), dim=-1)
            mel_frames.append(self.linear_projection(projection_input))
            stop_values.append(self.stop_projection(projection_input))

        mel_outputs = torch.stack(mel_frames, dim=1)    # (batch_size, max_time, 80)
        stop_outputs = torch.stack(stop_values, dim=1)  # (batch_size, max_time, 1)
        return mel_outputs, stop_outputs


In [31]:
class PostNet(nn.Module):
    """Five Conv1d/BatchNorm blocks mapping 80 -> 512 -> 512 -> 512 -> 512 -> 80.

    The first four blocks use Tanh; the last uses Identity. Every block ends
    with Dropout(0.5).
    """

    def __init__(self):
        super().__init__()
        widths = [80, 512, 512, 512, 512, 80]
        final_index = len(widths) - 2
        blocks = []
        for i, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            blocks.append(
                nn.Sequential(
                    nn.Conv1d(c_in, c_out, kernel_size=5, padding=2),
                    nn.BatchNorm1d(c_out),
                    nn.Identity() if i == final_index else nn.Tanh(),
                    nn.Dropout(0.5),
                )
            )
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply all blocks to ``x`` of shape (batch_size, max_time, 80).

        Returns:
            Tensor of shape (batch_size, max_time, 80).
        """
        hidden = x.transpose(1, 2)  # (batch_size, 80, max_time) for Conv1d
        for block in self.layers:
            hidden = block(hidden)
        return hidden.transpose(1, 2)  # back to (batch_size, max_time, 80)


In [32]:
class Tacotron2(nn.Module):
    """Embedding -> Encoder -> Decoder -> PostNet text-to-mel model.

    Args:
        vocab_size: Number of rows in the symbol embedding table.
        embedding_dim: Width of each symbol embedding.
    """

    def __init__(self, vocab_size, embedding_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = Encoder(embedding_dim)
        self.decoder = Decoder()
        self.postnet = PostNet()

    def forward(self, text_inputs, mel_targets, input_lengths):
        """Teacher-forced forward pass.

        Args:
            text_inputs: Token ids, embedded to (batch_size, seq_len, embedding_dim).
            mel_targets: (batch_size, mel_length, num_mels) ground-truth frames.
            input_lengths: Valid token count of each row, forwarded to the decoder.

        Returns:
            tuple: ``(mel_outputs, mel_outputs_postnet, stop_outputs)`` where
            ``mel_outputs_postnet`` is ``mel_outputs`` plus the PostNet output.
        """
        memory = self.encoder(self.embedding(text_inputs))

        # The decoder sees the targets delayed by one frame.
        teacher_frames = self._get_decoder_inputs(mel_targets)
        mel_outputs, stop_outputs = self.decoder(memory, teacher_frames, input_lengths)

        residual = self.postnet(mel_outputs)
        return mel_outputs, mel_outputs + residual, stop_outputs

    def _get_decoder_inputs(self, mel_targets):
        """Shift the targets right by one frame, inserting an all-zero first frame.

        The last target frame is dropped, so the result has the same shape as
        ``mel_targets``: (batch_size, mel_length, num_mels).
        """
        go_frame = torch.zeros(
            mel_targets.size(0), 1, mel_targets.size(2), device=mel_targets.device
        )
        shifted = mel_targets[:, :-1, :]
        return torch.cat((go_frame, shifted), dim=1)


In [34]:
def Tacotron2Loss(mel_outputs, mel_outputs_postnet, mel_targets, stop_outputs, stop_targets, mel_mask):
    """Masked training loss: two mel MSE terms plus a weighted stop-token BCE.

    Args:
        mel_outputs: (batch, time, n_mels) decoder predictions.
        mel_outputs_postnet: (batch, time, n_mels) predictions after the PostNet.
        mel_targets: (batch, time, n_mels) ground-truth frames.
        stop_outputs: (batch, time, 1) stop logits.
        stop_targets: (batch, time) stop labels as floats.
        mel_mask: Validity mask, True on real (non-padded) frames. Both
            (batch, time) and (batch, time, 1) are accepted.

    Returns:
        Scalar tensor ``mel_mse + postnet_mse + 0.1 * stop_bce``, where each
        term is averaged over the valid frames only.
    """
    # Reduce the mask to (batch, time). Indexing a (batch, time, n_mels)
    # tensor with a (batch, time, 1) boolean mask raises IndexError because
    # the last dimensions differ; a 2-D mask selects whole frames instead.
    frame_mask = mel_mask
    if frame_mask.dim() == 3 and frame_mask.size(-1) == 1:
        frame_mask = frame_mask.squeeze(-1)
    frame_mask = frame_mask.bool()

    valid_targets = mel_targets[frame_mask]  # (n_valid_frames, n_mels)
    mel_loss = F.mse_loss(mel_outputs[frame_mask], valid_targets)
    mel_postnet_loss = F.mse_loss(mel_outputs_postnet[frame_mask], valid_targets)

    # Per-frame BCE on the logits, averaged over the valid frames.
    per_frame_bce = F.binary_cross_entropy_with_logits(
        stop_outputs.squeeze(-1), stop_targets, reduction='none'
    )
    stop_token_loss = per_frame_bce[frame_mask].mean()

    # Total loss
    total_loss = mel_loss + mel_postnet_loss + 0.1 * stop_token_loss
    return total_loss

In [35]:
# Build the network with one embedding row per vocabulary entry and move it
# to the selected device.
model = Tacotron2(vocab_size=len(vocab))
model = model.to(device)

# Adam over all model parameters.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-6,
)

NameError: name 'vocab' is not defined

In [36]:
# Build the dataset and a shuffling loader that pads each batch with collate_fn.
dataset = TacotronDataset(metadata_path, vocab)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

num_epochs = 100
log_interval = 10

for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(data_loader):
        # Move the four batch tensors to the training device.
        text_inputs, input_lengths, mel_targets, mel_lengths = (t.to(device) for t in batch)

        # Frame validity mask, shaped (batch, max_frames, 1).
        mel_mask = get_mask_from_lengths(mel_lengths, max_len=mel_targets.size(1)).unsqueeze(-1).to(device)

        # Stop-token targets: 0 everywhere, 1 from the last real frame onward.
        stop_targets = torch.zeros(mel_targets.size(0), mel_targets.size(1)).to(device)
        for j, length in enumerate(mel_lengths):
            stop_targets[j, length - 1:] = 1.0

        # Clear stale gradients, then run the teacher-forced forward pass.
        optimizer.zero_grad()
        mel_outputs, mel_outputs_postnet, stop_outputs = model(text_inputs, mel_targets, input_lengths)

        # Loss
        loss = Tacotron2Loss(mel_outputs, mel_outputs_postnet, mel_targets, stop_outputs, stop_targets, mel_mask)

        # Backward pass, gradient-norm clipping at 1.0, parameter update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Periodic progress log.
        if i % log_interval == 0:
            print(f'Epoch: {epoch}, Step: {i}, Loss: {loss.item()}')


NameError: name 'vocab' is not defined

In [37]:
def synthesize(model, text, vocab, max_decoder_steps=1000):
    """Generate a mel spectrogram for ``text`` by free-running the decoder.

    The model is switched to eval mode and run without gradients. Decoding
    starts from an all-zero frame, feeds each predicted frame back as the
    next input, and stops once the sigmoid of the stop output exceeds 0.5 or
    after ``max_decoder_steps`` steps.

    Args:
        model: Model exposing ``embedding``, ``encoder``, ``decoder`` and ``postnet``.
        text: Input text; symbols missing from ``vocab`` map to ``vocab['<unk>']``.
        vocab: Mapping from symbol to integer id.
        max_decoder_steps: Upper bound on the number of generated frames.

    Returns:
        numpy array of shape (time_steps, 80): the decoder frames plus the
        PostNet output, moved to the CPU.
    """
    model.eval()
    decoder = model.decoder
    with torch.no_grad():
        # Text preprocessing: symbols -> ids -> a batch of one.
        token_ids = [vocab.get(sym, vocab['<unk>']) for sym in text_to_sequence(text)]
        text_inputs = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        input_lengths = torch.LongTensor([len(token_ids)]).to(device)

        # Encoder
        encoder_outputs = model.encoder(model.embedding(text_inputs))

        # Decoder state: everything starts at zero, including the first input frame.
        batch_size = 1
        attention_hidden = torch.zeros(batch_size, 1024).to(device)
        attention_cell = torch.zeros(batch_size, 1024).to(device)
        decoder_hidden = torch.zeros(batch_size, 1024).to(device)
        decoder_cell = torch.zeros(batch_size, 1024).to(device)
        attention_context = torch.zeros(batch_size, 512).to(device)
        decoder_input = torch.zeros(batch_size, 80).to(device)

        # Inverted length mask: True at positions at or beyond the input length.
        mask = ~get_mask_from_lengths(input_lengths, max_len=encoder_outputs.size(1)).to(device)

        frames = []
        for _ in range(max_decoder_steps):
            prenet_output = decoder.prenet(decoder_input)

            attention_hidden, attention_cell = decoder.attention_rnn(
                torch.cat((prenet_output, attention_context), dim=-1),
                (attention_hidden, attention_cell),
            )
            attention_context, _ = decoder.attention_layer(attention_hidden, encoder_outputs, mask)

            decoder_hidden, decoder_cell = decoder.decoder_rnn(
                torch.cat((attention_hidden, attention_context), dim=-1),
                (decoder_hidden, decoder_cell),
            )

            projection_input = torch.cat((decoder_hidden, attention_context), dim=-1)
            mel_output = decoder.linear_projection(projection_input)
            stop_output = decoder.stop_projection(projection_input)
            frames.append(mel_output.unsqueeze(1))

            # Stop as soon as the model signals the end of the utterance.
            if torch.sigmoid(stop_output) > 0.5:
                break

            # Feed the prediction back in as the next input frame.
            decoder_input = mel_output

        mel_outputs = torch.cat(frames, dim=1)  # (batch_size, time_steps, 80)
        refined = mel_outputs + model.postnet(mel_outputs)
        return refined.squeeze(0).cpu().numpy()


In [38]:
# Example: speech synthesis with WaveGlow.
# Load the WaveGlow model through torch.hub (a pretrained model is required),
# move it to the selected device and switch it to eval mode.
waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')
waveglow = waveglow.to(device)
waveglow.eval()

def mel_to_audio(mel, frames_first=True):
    """Convert one mel spectrogram to a waveform with the global WaveGlow model.

    Args:
        mel: 2-D numpy array holding one mel spectrogram.
        frames_first: True (default) when ``mel`` is laid out as
            (frames, n_mels), which is what ``synthesize`` returns. Pass
            False when it is already (n_mels, frames).

    Returns:
        numpy array: the first (only) item of the batch returned by
        ``waveglow.infer``.
    """
    spect = torch.from_numpy(mel)
    if frames_first:
        # The vocoder's first layer convolves over axis 1 as channels, so it
        # needs (batch, n_mels, frames). Without this transpose the
        # (frames, 80) output of synthesize is fed in as (1, frames, 80).
        spect = spect.transpose(0, 1)
    spect = spect.unsqueeze(0).contiguous().to(device)

    # NOTE(review): the acoustic model in this notebook is trained on
    # normalize_mel output (values in [0, 1]); the pretrained vocoder
    # presumably expects un-normalised log-mels -- confirm, and invert the
    # normalisation before this call if so.
    with torch.no_grad():
        audio = waveglow.infer(spect)
    return audio.cpu().numpy()[0]


Downloading: "https://github.com/nvidia/DeepLearningExamples/zipball/torchhub" to C:\Users\Seungju/.cache\torch\hub\torchhub.zip
Downloading checkpoint from https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427


KeyboardInterrupt: 

In [39]:
import soundfile as sf

# Text to speak.
text = "안녕하세요. 타코트론2 모델을 사용하여 음성을 합성합니다."

# Text -> mel spectrogram -> waveform.
mel_output = synthesize(model, text, vocab)
audio = mel_to_audio(mel_output)

# Save the audio at 22050 Hz.
sf.write('output.wav', audio, 22050)


NameError: name 'model' is not defined