In [1]:
import sys
sys.path.insert(0, '/tf/utils/')

In [2]:
from data_generators import NoisyTargetGenerator
from sound import Sound
import numpy as np

from utils import calculate_stft_magnitude_and_phase

In [3]:
# Notebook-wide configuration.
# Size passed to Sound() below; presumably the number of audio samples per block — TODO confirm.
base_shape_size = 8192
# NOTE(review): `ws` and `ol` look like STFT window size / overlap, but they are not
# referenced by any cell visible in this notebook — confirm they are still needed.
ws = 255
ol = 128
# Matches the (128, 64, 2) shape used by the model Input layers further down;
# not referenced directly by the visible cells.
input_shape = (128, 64, 2)

In [4]:
# Build the dataset object from the speech and noise directories.
# Later cells read `train_X`, `val_X` and `noise_sounds` from it.
# NOTE(review): paths are absolute container paths; consider a configurable DATA_DIR.
sound_base = Sound('/tf/Dados/Vozes/', '/tf/Dados/Ruido/', base_shape_size)

Loading Speech Files: 100%|██████████| 8179/8179 [00:05<00:00, 1471.99it/s]
Loading Noise Files: 100%|██████████| 8137/8137 [00:05<00:00, 1401.75it/s]
  clean_sounds = [sound for sound in clean_sounds if sound != self.TOO_SHORT_ERROR]
  noise_sounds = [sound for sound in noise_sounds if sound != self.TOO_SHORT_ERROR]


In [5]:
class TransformerGenerator(NoisyTargetGenerator):
    """Batch generator that produces step-by-step (autoregressive) training samples.

    For every speech/noise pair it yields, per time frame ``i``, the encoder
    input (noisy phasors), the decoder input (clean phasors with only the first
    ``i`` frames filled in) and the target (clean frame ``i``).
    """

    def __init__(self, sound_files, noise_files, block_size=2, normalize_phi=True):
        super().__init__(sound_files, noise_files, block_size=block_size, normalize_phi=normalize_phi)

    def generate_batch_transformer(self, clean_phasors):
        """
        Expand one clean phasor array into per-frame decoder inputs and targets.

        :param clean_phasors: Numpy array of shape (n_freq, n_frames, n_channels);
            in this notebook (128, 64, 2).
        :return: Tuple ``(x_samples, y_samples)`` of float64 arrays where
            ``x_samples`` has shape (n_frames, n_freq, n_frames, n_channels) and
            ``x_samples[i]`` holds frames ``0..i-1`` of ``clean_phasors`` with all
            later frames zeroed, and ``y_samples`` has shape
            (n_frames, n_freq, 1, n_channels) with ``y_samples[i, :, 0, :]`` equal
            to frame ``i`` of ``clean_phasors``.
        """
        clean_phasors = np.asarray(clean_phasors)
        n_freq, n_frames, n_channels = clean_phasors.shape

        # Sample i sees only the frames that precede frame i; the rest stays zero.
        x_samples = np.zeros((n_frames, n_freq, n_frames, n_channels))
        for i in range(n_frames):
            x_samples[i, :, :i, :] = clean_phasors[:, :i, :]

        # Target for sample i is frame i itself, kept as a width-1 "image".
        y_samples = np.zeros((n_frames, n_freq, 1, n_channels))
        y_samples[:, :, 0, :] = np.transpose(clean_phasors, (1, 0, 2))

        return x_samples, y_samples

    def generate_sample_completo(self, batch_size=32, include_clean=False):
        """
        Endless generator of ``([xn, xc], y)`` training batches.

        :param batch_size: Number of speech/noise pairs drawn per batch. Each valid
            pair contributes ``n_frames`` samples (twice that with ``include_clean``).
        :param include_clean: When True, every pair is added a second time using the
            clean phasors as the encoder input.
        :return: Yields ``[xn, xc], y`` where ``xn`` stacks the encoder inputs,
            ``xc`` the partially filled decoder inputs and ``y`` the target frames,
            all aligned on the first axis.
        """
        while True:
            # Draw a batch of speech and noise blocks.
            sound_batch, noise_batch = self.pick_random_blocks(batch_size)

            xn_train = []
            xc_train = []
            y_train = []

            # Mix noise into each sound and build the per-frame samples.
            for sound, noise in zip(sound_batch, noise_batch):
                sound_escalado, noisy_sound = self.normalize_and_add_noise(sound, noise)

                # Pairs rejected by normalize_and_add_noise are skipped.
                if sound_escalado is None or noisy_sound is None:
                    continue

                A, phi, _, _ = calculate_stft_magnitude_and_phase(sound_escalado)
                A_noisy, phi_noisy, _, _ = calculate_stft_magnitude_and_phase(noisy_sound)

                F = self.assemble_phasors(A, phi)
                F_noisy = self.assemble_phasors(A_noisy, phi_noisy)

                # Decoder inputs/targets depend only on the clean phasors, so they
                # are computed once and reused for the optional clean-input copy.
                xc_samples, y_samples = self.generate_batch_transformer(F)
                n_steps = len(xc_samples)

                encoder_inputs = [F_noisy]
                if include_clean:
                    encoder_inputs.append(F)

                for encoder_input in encoder_inputs:
                    # The same encoder input is paired with every decoding step.
                    repeated = np.repeat(np.asarray(encoder_input)[np.newaxis], n_steps, axis=0)
                    xn_train.append(repeated)
                    xc_train.append(xc_samples)
                    y_train.append(y_samples)

            # Every pair was rejected: draw again instead of yielding a
            # zero-sample batch with mismatched ranks.
            if not xn_train:
                continue

            yield [np.concatenate(xn_train), np.concatenate(xc_train)], np.concatenate(y_train)

In [6]:
# One generator per split: training and validation use different speech sets
# (train_X / val_X) but draw from the same pool of noise recordings.
data_generator_train = TransformerGenerator(sound_base.train_X, sound_base.noise_sounds)
data_generator_val = TransformerGenerator(sound_base.val_X, sound_base.noise_sounds)

In [7]:
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization
from tensorflow.keras import Input, Model
import tensorflow as tf

class MultiHeadAttention(Layer):
    """Scaled dot-product multi-head attention.

    Queries, keys and values are projected to ``d_model`` features, split into
    ``num_heads`` heads of ``d_model // num_heads`` features each, attended
    independently and merged back through a final dense projection.

    :param d_model: Feature size of the projections and of the output.
    :param num_heads: Number of attention heads; must divide ``d_model``.
    :param kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0, "d_model must be divisible by num_heads"

        self.depth = d_model // self.num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)

        self.dense = Dense(d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """
        Apply attention.

        :param v: Values, (batch, seq_k, features).
        :param k: Keys, (batch, seq_k, features).
        :param q: Queries, (batch, seq_q, features).
        :param mask: Optional tensor broadcastable to the attention logits
            (batch, num_heads, seq_q, seq_k). Positions where it is 1 get -1e9
            added to their logit, which effectively removes them after softmax.
        :return: Tensor of shape (batch, seq_q, d_model).
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention_logits = tf.matmul(q, k, transpose_b=True)
        # Scale by sqrt(depth); cast to the logits dtype so the division is
        # valid whatever compute dtype the layer runs in.
        scaled_attention_logits /= tf.math.sqrt(tf.cast(self.depth, scaled_attention_logits.dtype))

        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        output = tf.matmul(attention_weights, v)

        # Merge the heads back: (batch, heads, seq, depth) -> (batch, seq, d_model).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (batch_size, -1, self.d_model))

        return self.dense(output)

# Encoder and decoder blocks built on top of the MultiHeadAttention layer above.
def encoder_layer(d_model, num_heads, dff, rate=0.1, name="encoder_layer"):
    """
    Build one transformer encoder block as a Keras functional model.

    The block is self-attention followed by a two-layer feed-forward network,
    each wrapped with dropout, a residual connection and layer normalization.

    :param d_model: Feature size of the sequence entering and leaving the block.
    :param num_heads: Number of attention heads.
    :param dff: Hidden size of the feed-forward network.
    :param rate: Dropout rate.
    :param name: Name of the returned model. Pass a distinct name for each block
        when stacking several in one model, since Keras requires unique layer names.
    :return: Model taking ``[inputs, padding_mask]`` and returning a tensor of
        shape (batch, seq, d_model).
    """
    inputs = Input(shape=(None, d_model))
    padding_mask = Input(shape=(1, 1, None))

    # Self-attention: values, keys and queries all come from the same sequence.
    attention = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs, padding_mask)
    attention = Dropout(rate)(attention)
    attention = LayerNormalization(epsilon=1e-6)(inputs + attention)

    # Position-wise feed-forward network.
    outputs = Dense(dff, activation='relu')(attention)
    outputs = Dense(d_model)(outputs)
    outputs = Dropout(rate)(outputs)
    outputs = LayerNormalization(epsilon=1e-6)(attention + outputs)

    return Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

def decoder_layer(d_model, num_heads, dff, rate=0.1, name="decoder_layer"):
    """
    Build one transformer decoder block as a Keras functional model.

    The block is masked self-attention, attention over the encoder output and a
    two-layer feed-forward network, each wrapped with dropout, a residual
    connection and layer normalization.

    :param d_model: Feature size of the sequences entering and leaving the block.
    :param num_heads: Number of attention heads.
    :param dff: Hidden size of the feed-forward network.
    :param rate: Dropout rate.
    :param name: Name of the returned model. Pass a distinct name for each block
        when stacking several in one model, since Keras requires unique layer names.
    :return: Model taking ``[inputs, enc_outputs, look_ahead_mask, padding_mask]``
        and returning a tensor of shape (batch, seq, d_model).
    """
    inputs = Input(shape=(None, d_model))
    enc_outputs = Input(shape=(None, d_model))
    look_ahead_mask = Input(shape=(1, None, None))
    padding_mask = Input(shape=(1, 1, None))

    # Self-attention over the target sequence, restricted by the look-ahead mask.
    attention1 = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs, look_ahead_mask)
    attention1 = Dropout(rate)(attention1)
    attention1 = LayerNormalization(epsilon=1e-6)(inputs + attention1)

    # Cross-attention: queries come from the decoder, keys/values from the encoder.
    attention2 = MultiHeadAttention(d_model, num_heads)(enc_outputs, enc_outputs, attention1, padding_mask)
    attention2 = Dropout(rate)(attention2)
    attention2 = LayerNormalization(epsilon=1e-6)(attention1 + attention2)

    # Position-wise feed-forward network.
    outputs = Dense(dff, activation='relu')(attention2)
    outputs = Dense(d_model)(outputs)
    outputs = Dropout(rate)(outputs)
    outputs = LayerNormalization(epsilon=1e-6)(attention2 + outputs)

    return Model(inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask], outputs=outputs, name=name)

In [16]:
from tensorflow.keras.layers import Input, Dense, MaxPooling2D, Reshape
from tensorflow.keras.models import Model
import tensorflow as tf
def getTransformerLayers(input_seq, target_seq, num_layers=2, d_model=32, num_heads=2,
                         dff=512, dropout_rate=0.1):
    """
    Wire an unmasked encoder/decoder transformer between two sequence tensors.

    :param input_seq: Encoder input tensor of shape (batch, seq, d_model).
    :param target_seq: Decoder input tensor of shape (batch, seq, d_model).
    :param num_layers: Number of encoder blocks and of decoder blocks.
    :param d_model: Feature size of both sequences and of the output.
    :param num_heads: Number of attention heads per attention layer.
    :param dff: Hidden size of the feed-forward networks.
    :param dropout_rate: Dropout rate used inside every block.
    :return: Output tensor of shape (batch, target seq length, d_model).
    """

    def single_encoder_layer(d_model, num_heads, dff, rate):
        """One encoder block: self-attention + feed-forward, no mask."""
        inputs = Input(shape=(None, d_model))
        attention = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs, None)
        attention = Dropout(rate)(attention)
        attention = LayerNormalization(epsilon=1e-6)(inputs + attention)

        outputs = Dense(dff, activation='relu')(attention)
        outputs = Dense(d_model)(outputs)
        outputs = Dropout(rate)(outputs)
        outputs = LayerNormalization(epsilon=1e-6)(attention + outputs)

        return Model(inputs=inputs, outputs=outputs)

    def single_decoder_layer(d_model, num_heads, dff, rate):
        """One decoder block: self-attention, encoder attention, feed-forward, no mask."""
        inputs = Input(shape=(None, d_model))
        enc_outputs = Input(shape=(None, d_model))

        attention1 = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs, None)
        attention1 = Dropout(rate)(attention1)
        attention1 = LayerNormalization(epsilon=1e-6)(inputs + attention1)

        # Queries come from the decoder, keys/values from the encoder output.
        attention2 = MultiHeadAttention(d_model, num_heads)(enc_outputs, enc_outputs, attention1, None)
        attention2 = Dropout(rate)(attention2)
        attention2 = LayerNormalization(epsilon=1e-6)(attention1 + attention2)

        outputs = Dense(dff, activation='relu')(attention2)
        outputs = Dense(d_model)(outputs)
        outputs = Dropout(rate)(outputs)
        outputs = LayerNormalization(epsilon=1e-6)(attention2 + outputs)

        return Model(inputs=[inputs, enc_outputs], outputs=outputs)

    # Build all blocks first (encoders, then decoders), then connect them.
    encoder_blocks = [single_encoder_layer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
    decoder_blocks = [single_decoder_layer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]

    # Encoder stack.
    x = input_seq
    for encoder_block in encoder_blocks:
        x = encoder_block(x)
    encoder_output = x

    # Decoder stack; every block attends to the final encoder output.
    y = target_seq
    for decoder_block in decoder_blocks:
        y = decoder_block([y, encoder_output])
    decoder_output = y

    # Output projection.
    final_output = Dense(d_model, activation='linear')(decoder_output)

    return final_output
# Criando o modelo Transformer
# transformer = Model(inputs=[input_seq, target_seq], outputs=final_output)

# Resumo do modelo
# transformer.summary()

In [17]:
def double_conv_layer(x, filter_size=3, size=32, dropout=0., batch_norm=False):
    '''
    Residual double-convolution block with SAME padding and ReLU activations.

    Two (filter_size x filter_size) convolutions are applied in sequence and the
    result is added to a 1x1-convolution projection of the input.

    :param x: input tensor, channels last
    :param filter_size: side length of the square convolution kernel
    :param size: number of filters of every convolution in the block
    :param dropout: dropout rate applied after the second activation;
            dropout is skipped when the value is <= 0
    :param batch_norm: when truthy, batch normalization follows every
            convolution (including the shortcut)
    :return: output tensor with the spatial size of x and `size` channels
    '''
    axis = 3  # channel axis for batch normalization

    conv = tf.keras.layers.Conv2D(size, (filter_size, filter_size), padding='same')(x)
    if batch_norm:
        conv = tf.keras.layers.BatchNormalization(axis=axis)(conv)
    conv = tf.keras.layers.Activation('relu')(conv)

    conv = tf.keras.layers.Conv2D(size, (filter_size, filter_size), padding='same')(conv)
    if batch_norm:
        conv = tf.keras.layers.BatchNormalization(axis=axis)(conv)
    conv = tf.keras.layers.Activation('relu')(conv)

    if dropout > 0:
        conv = tf.keras.layers.Dropout(dropout)(conv)

    # The 1x1 convolution matches the channel count so the two paths can be added.
    shortcut = tf.keras.layers.Conv2D(size, kernel_size=(1, 1), padding='same')(x)
    if batch_norm:
        shortcut = tf.keras.layers.BatchNormalization(axis=axis)(shortcut)

    res_path = tf.keras.layers.add([shortcut, conv])
    return res_path

In [18]:
def embedding(inputs):
    """
    Convolutional down-sampling path that turns a spectrogram into an embedding.

    Four residual double-convolution stages with 32, 64, 128 and 256 filters are
    each followed by 2x2 max pooling; a fifth stage with 512 filters runs
    without pooling. Height and width are therefore divided by 16.

    :param inputs: 4-D channels-last tensor.
    :return: Tensor with 512 channels and 1/16 of the input height and width.
    """
    kernel = 3
    base_filters = 32

    features = inputs
    # Stages 1-4: residual double convolution, then halve both spatial dimensions.
    for width_multiplier in (1, 2, 4, 8):
        features = double_conv_layer(features, kernel, width_multiplier * base_filters)
        features = MaxPooling2D(pool_size=(2, 2))(features)

    # Stage 5: convolution only.
    return double_conv_layer(features, kernel, 16 * base_filters)

In [19]:
def desembeding(inputs):
    """
    Convolutional up-sampling path that turns an embedding back into one frame.

    Four stages of 2x2 up-sampling followed by a residual double convolution
    (256, 128, 64 and 32 filters) multiply height and width by 16. A final
    strided convolution with 2 filters and a reshape produce the output frame.

    :param inputs: 4-D channels-last tensor; the final reshape requires the
        strided convolution output to hold exactly 128 * 1 * 2 values per sample.
    :return: Tensor of shape (batch, 128, 1, 2).
    """
    kernel = 3
    base_filters = 32
    scale = (2, 2)

    features = inputs
    # Each stage doubles both spatial dimensions and then refines the result.
    for width_multiplier in (8, 4, 2, 1):
        features = tf.keras.layers.UpSampling2D(size=scale, data_format="channels_last")(features)
        features = double_conv_layer(features, kernel, width_multiplier * base_filters)

    # Collapse the feature map and reshape it to (128, 1, 2).
    collapsed = tf.keras.layers.Conv2D(2, kernel_size=(64, 1), strides=(64, 1))(features)
    return tf.keras.layers.Reshape((128, 1, 2))(collapsed)

In [20]:
# Model inputs: the noisy spectrogram (encoder side) and the partially
# generated clean spectrogram (decoder side).
noisy = Input((128, 64, 2))
gen = Input((128, 64, 2))

# Each input goes through its own convolutional embedding stack -> (8, 4, 512).
emb_noizy = embedding(noisy)
emb_gen = embedding(gen)

# Flatten the embeddings into sequences of 512 tokens with 32 features each.
emb_noizy_reshape = Reshape((512, 32))(emb_noizy)
emb_gen_reshape = Reshape((512, 32))(emb_gen)

trasformer = getTransformerLayers(emb_noizy_reshape, emb_gen_reshape)
emb_out = Reshape((8, 4, 512))(trasformer)

output = desembeding(emb_out)

# The second model input must be the `gen` Input itself. Passing the
# intermediate tensor `emb_gen` makes Keras treat the embedding output as the
# model input, so the decoder-side embedding stack is cut out of the model.
model = Model(inputs=[noisy, gen], outputs=output)

In [21]:
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 128, 64, 2)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_43 (Conv2D)             (None, 128, 64, 32)  608         ['input_16[0][0]']               
                                                                                                  
 activation_28 (Activation)     (None, 128, 64, 32)  0           ['conv2d_43[1][0]']              
                                                                                                  
 conv2d_44 (Conv2D)             (None, 128, 64, 32)  9248        ['activation_28[1][0]']   

In [22]:
model.compile(optimizer='adam', loss='msle')

In [23]:
batch_size = 2
steps_per_epoch = len(sound_base.train_X) // batch_size

print('Starting training')

for epoch in range(10):
    print(f"Epoch {epoch + 1}")

    # Draw a fresh validation batch for every epoch.
    val_stream = data_generator_val.generate_sample_completo(batch_size=batch_size)
    validation_batch = next(val_stream)
    [x1_val, x2_val], y_val = validation_batch

    # Train for a single epoch on a new training stream.
    train_stream = data_generator_train.generate_sample_completo(
        batch_size=batch_size,
        include_clean=False,
    )
    model.fit(
        train_stream,
        steps_per_epoch=steps_per_epoch,
        epochs=1,
        validation_data=([x1_val, x2_val], y_val),
    )

Starting training
Epoch 1
 1300/16114 [=>............................] - ETA: 2:18:34 - loss: 0.1680

KeyboardInterrupt: 