In [1]:
import sys
#from Tacotron import Tacotron
from text import text_to_sequence, symbols
import torch
from torch.autograd import Variable
import numpy as np

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torch.nn.functional as F

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

In [21]:


class HighwayNet(nn.Module):
    """
        raw_input: (batch_size, Channel, Length)
        input: (# of batch, seq_length, 128(input feature))
        h * t + x * (1. - t)
        output: (# of batch, seq_length, 128(output feature))
    """
    def __init__(self):
        super(HighwayNet, self).__init__()
        
        self.H = nn.Linear(128, 128)
        self.T = nn.Linear(128, 128)
        
    def forward(self, x):
        h = F.relu(self.H(x))
        t = torch.sigmoid(self.T(x))
        
        output = h * t + x * (1. - t)
        return output



class EncoderCBHG(nn.Module):
    """
        raw_input: prenet output
        input: (batch_size, channels, seq_length)
        Conv1D bank - Max Pooling - Conv1D projection - Conv1D Layer
        output: (seq_length, batch_size, 2 * hidden_size)
    """
    def __init__(self, K=16):
        super(EncoderCBHG, self).__init__()
        #-----------------Conv1Dbank-------------------#
        self.conv1dBank = nn.ModuleList(
            [nn.Conv1d(128, 128, k, stride=1, padding=k//2)
            for k in range(1, K+1)]
        )
        #-----------------Max pooling------------------#
        self.maxPool = nn.MaxPool1d(2, stride=1, padding=1)
        #---------------Conv1Dprojection---------------#
        self.conv1dProjs = nn.ModuleList(
            [nn.Conv1d(128 * K, 128, 3, stride=1, padding=1)
            ,nn.Conv1d(128, 128, 3, stride=1, padding=1)]
        )
        #-----------------Highway Net------------------#
        self.highwayNet = nn.ModuleList(
            [HighwayNet() for _ in range(4)]
        )
        #--------------Bidirectional GRU---------------#
        self.GRU = nn.GRU(128, 128, bidirectional=True, batch_first=True)
        #-------------Batch normalization--------------#
        self.bn = nn.BatchNorm1d(128)
        
    def forward(self, x):
        """
            raw_input: (# of batch, seq_length, 128(output feature))
            input: (batch_size, channels, seq_length)
            Conv1D bank - Max Pooling - Conv1D projection - Conv1D Layer
            output: (seq_length, batch_size, 2 * hidden_size)
        """
        x = x.transpose(1, 2)
        #-----------------Conv1Dbank-------------------#
        stacked = []
        for conv1d in self.conv1dBank:
            stacked.append(self.bn(conv1d(x)))
        stacked = torch.cat(stacked, dim=1)
        #-----------------Max pooling------------------#
        y = self.maxPool(stacked)
        #---------------Conv1Dprojection---------------#
        y = self.bn(self.relu(self.conv1dProjs[0](y)))
        y = self.bn(self.conv1dProjs[1](y))
        #-------------residual connection--------------#
        y = y + x
        #----------------Highway Net-------------------#
        y = y.T(1, 2)
        for layer in self.highwayNet:
            y = self.relu(layer(y))
        #--------------Bidirectional GRU---------------#
        y = y.T(0, 1)
        y, _ = self.GRU(y)
        
        return y


class Prenet(nn.Module):
    """
        raw_input: encoder input
        input: (# of batch, seq_length, 128(output feature))
        FC(Dense) - ReLU - Dropout - FC - ReLU - Dropout
        output: (# of batch, seq_length, 128(output feature))

    """
    def __init__(self):
        super(Prenet, self).__init__()
        self.layer = nn.ModuleList(
            [nn.Linear(256, 256)
            ,nn.Linear(256, 128)]
        )
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):

        for layer in self.layer:
            x = self.dropout(F.relu(layer(x)))
        
        return x
        

class Encoder(nn.Module):
    """
        prenet - CBHG
    """
    def __init__(self):
        super(Encoder, self).__init__()
        
        self.prenet = Prenet()
        self.cbhg = EncoderCBHG()
    
    def forward(self, x):
        x = self.cbhg(self.prenet(x))
        
        return x


In [22]:
class HighwayNet(nn.Module):
    """
    h * t + x * (1. - t)
    """
    def __init__(self):
        super(HighwayNet, self).__init__()
        
        self.H = nn.Linear(128, 128)
        self.T = nn.Linear(128, 128)
        
    def forward(self, x):
        h = F.relu(self.H(x))
        t = torch.sigmoid(self.T(x))
        
        output = h * t + x * (1. - t)
        return output

class DecoderCBHG(nn.Module):
    """
    Conv1D bank - Max Pooling - Conv1D projection - Conv1D Layer
    """
    def __init__(self, K=8):
        super(DecoderCBHG, self).__init__()
        #conv1d: (batch_size, Channel, Length)
        #-----------------Conv1Dbank-------------------#
        self.conv1dBank = nn.ModuleList(
            [nn.Conv1d(128, 128, k, stride=1, padding=k//2)
            for k in range(1, K+1)]
        )
        #-----------------Max pooling------------------#
        self.maxPool = nn.MaxPool1d(2, stride=1, padding=1)
        #---------------Conv1Dprojection---------------#
        self.conv1dProjs = nn.ModuleList(
            [nn.Conv1d(128 * K, 256, 3, stride=1, padding=1)
            ,nn.Conv1d(256, 80, 3, stride=1, padding=1)]
        )
        #-----------------Highway Net------------------#
        self.highwayNet = nn.ModuleList(
            [HighwayNet() for _ in range(4)]
        )
        #--------------Bidirectional GRU---------------#
        self.GRU = nn.GRU(128, 128, bidirectional=True, batch_first=True)
        #-------------Batch normalization--------------#
        self.bn = nn.BatchNorm1d(128)
        
    def forward(self, x):
    
        x = x.transpose(1, 2) # Shape: (batch_size, channels, seq_length)
        #-----------------Conv1Dbank-------------------#
        stacked = []
        for conv1d in conv1dBank:
            stacked.append(self.bn(conv1d(x)))
        stacked = torch.cat(stacked, dim=1)
        #shape: 
        #-----------------Max pooling------------------#
        y = self.maxPool(stacked)
        #---------------Conv1Dprojection---------------#
        y = self.bn(self.relu(self.conv1dProjs[0](y)))
        y = self.bn(self.conv1dProjs[1](y))
        #-------------residual connection--------------#
        y = y + x
        #----------------Highway Net-------------------#
        for layer in self.highwayNet:
            y = self.relu(layer(y))
        #--------------Bidirectional GRU---------------#
        y, _ = self.GRU(y)
        
        return y

             
class AttentionWrapper(nn.Module):
    def __init__(self, rnn, use_attention):
        super(AttentionWrapper, self).__init__()
        self.rnn_cell = rnn
        self.attention = use_attention
        self.projection_for_decoderRNN = nn.Linear(512, 256, bias=False)
    def forward(self, memory, decoder_input, cell_hidden):
        """
        memory = (batch_size, encoder_T, dim)
        decoder_input = (batch_size, dim)
        cell_hidden (previous time step cell state) = (batch, dim)
        """
        batch_size = memory.size(0)
        #cell_input = torch.cat((decoder_input, prev_attention), -1) -- why do we have to concat?
        cell_input = decoder_input
        query = self.rnn_cell(cell_input, cell_hidden)
        #feed into attention
        attention_weights = self.attention(query, memory)
        #make context vector
        attention_weights = F.softmax(attention_weights, dim=-1)
        context = torch.bmm(attention_weights.view(batch_size, 1, -1), memory).squeeze(1)
        out = self.projection_for_decoderRNN(torch.cat([context, query],dim=-1))
        return out, query, attention_weights


class BahdanauAttention(nn.Module):
    def __init__(self):
        super(BahdanauAttention, self).__init__()
        self.v = nn.Linear(256,1,bias=False)
        self.query_layer = nn.Linear(256,256,bias=False)
        self.tanh = nn.Tanh()
    def forward(self, query, memory):
        """
        query : (batch, 1 ,dim)
        """
        if query.dim() == 2:
            query = query.unsqueeze(1)
        attention_weight = self.v(self.tanh(self.query_layer(query) + memory))
        return attention_weight


class Decoder(nn.Module):
    def __init__(self, spect_dim, r=2):
        super(Decoder, self).__init__()
        self.spect_dim = spect_dim
        self.r = r
        self.prenet = Prenet()
        self.attention_RNN = AttentionWrapper(nn.GRUCell(input_size=256, hidden_size =256), BahdanauAttention())
        self.decoder_RNN = nn.ModuleList(
                            [nn.GRUCell(input_size=256,hidden_size=256) for _ in range(2)])
        self.spectro_layer = nn.Linear(256,spect_dim*r,bias=False)
        self.epsilon = 0.2
        self.maximum_step = 1000
        return
    
    def forward(self, memory, target=None):
        """
        if training time, input is given, else input is decoder outputs
        input : 
            memory (encoder_output) = (batch_size, encoder_T, char_dim)
            decoder_input = (batch_size, decoder_T, dim)
        output:
            
        """
        batch_size = memory.size(0)
        test = target is None
        decoder_T = 0
        
        #train data를 r 단위로 묶어준 후 T의 크기를 바꾸어준다.
        if not test:
            target = target.view(batch_size, target.size(1) // r, -1)
            decoder_T = target.size(1)
            target = target.transpose(0,1) #for parallelization
            
        #2단계 decoderRNN 값 저장할 array
        decoderRNN_output = [memory.zero_() for _ in range(len(decoder_RNN))] 
        
        #<GO> Frame
        current_input = torch.zero([batch_size, self.r*self.spect_dim])
        t = 0
        targets = []
        attention_weights = []
        
        while (True):
            t = t + 1
            #prenet
            #(B, spect_dim * r)
            prenet_output = self.prenet(current_input)
            
            #attention
            #(B, 256)
            attention_output, cell_hidden, attention_weight = self.attention(memory, prenet_output, cell_hidden)
            
            #decoder
            #(B, spect_dim * r)
            for idx in range(2):
                decoderRNN_output[idx] = self.decoder_RNN[idx](attention_output, decoder_output[idx])
                decoderRNN_output[idx] += attention_output
                attention_output = decoder_output[idx]
            
            #projection
            targetchar =self.spectro_layer(attention_output)
            targets += [targetchar]
            attention_weights += [attention_weight]
            
            #check if this target is the end
            if test:
                if t > 1 and (targetchar<=self.epsilon).all(): break
                if t > self.maximum_step: 
                    print("ERROR : Not converge")
                    break
            else:
                if t >= decoder_T:
                    break
                    
            #change current input
            if test:
                current_input = targets[-1]
            else:
                current_input = target[t-1]
        
        attention_weights = torch.stack(attention_weights).transpose(0,1)
        outputs = torch.stack(outputs).transpose(0,1).contiguous()
        return outputs, attention_weights


In [26]:
class PostProcessing(nn.Module):
    """
    make post processing data
    input : (B, decoder.T, spect_dim)
    """
    def __init__(self, spect_dim):
        super(PostProcessing, self).__init__()
        self.postcbhg = DecoderCBHG(K=8)
        self.linear = nn.Linear(spect_dim * 2, 1025)
    def forward(self, batch_size, data):
        """
            make data shape (B, -1, 80)
        """
        data = data.view(batch_size, -1, 80)
        output = self.postcbhg(data)
        output = self.linear(output)
        
        return output


class Tacotron(nn.Module):
    def __init__(self, vocab_num, input_dim=256, spect_dim=80):
        super(Tacotron, self).__init__()
        self.input_dim = input_dim
        self.spect_dim = spect_dim
        self.embedding = nn.Embedding(vocab_num, input_dim) #embedding dimension
        self.embedding.weight.data.normal_(0,0.3)
        self.Encoder = Encoder()
        self.Decoder = Decoder(spect_dim, r=2) #write input_dimension
        self.Postprocessing = PostProcessing(spect_dim)
    def forward(self, inputs, spect_targets=None, r= 5):
        """
        make total model!
        input : (B, encoder.T, in_dim)
        
        """
        batch_size = inputs.size(0)
        memory = self.embedding(inputs)
        
        #encoding
        #(B, encoder.T, input_dim)
        memory = self.Encoder(memory)
        
        #decoding
        #(B, encoder.T, mel_dim * r)
        decoder_output = self.decoder(memory, spect_targets)
        
        #postprocessing
        #(B, decoder.T, 1025)
        decoder_output = decoder_output.view(B, -1, self.spect_dim)
        wav_output = self.PostProcessing(batch_size, decoder_output)
        
        return decoder_output, wav_output

In [27]:

def _pad(seq, max_len):
    return np.pad(seq, (0, max_len - len(seq)),
                  mode='constant', constant_values=0)


def test_taco():
    B, T_out, D_out = 2, 400, 80
    r = 5
    T_encoder = T_out // r

    texts = ["Thank you very much.", "Hello"]
    seqs = [np.array(text_to_sequence(
        t, ["english_cleaners"]), dtype=np.int) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    x = torch.LongTensor(seqs)
    y = torch.rand(B, T_out, D_out)
    x = Variable(x)
    y = Variable(y)

    model = Tacotron(vocab_num=len(symbols))

    print("Encoder input shape: ", x.size())
    print("Decoder input shape: ", y.size())
    a, b, c = model(x, spect_targets=y)
    print("Mel shape:", a.size())
    print("Linear shape:", b.size())
    print("Attention shape:", c.size())

    assert c.size() == (B, T_encoder, max_len)

    # Test greddy decoding
    a, b, c = model(x)

In [28]:
test_taco()

Encoder input shape:  torch.Size([2, 21])
Decoder input shape:  torch.Size([2, 400, 80])


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 1. Got 21 and 22 in dimension 2 at c:\programdata\miniconda3\conda-bld\pytorch_1533090623466\work\aten\src\th\generic/THTensorMath.cpp:3616