In [1]:
import sys
# Put the utilities directory first on the module search path.
# NOTE(review): absolute, environment-specific path; presumably where the
# project-local modules imported in the next cell live — confirm.
sys.path.insert(0, '/tf/utils/')

In [35]:
from data_generators import NoisyTargetGenerator
from sound import Sound
import numpy as np
from artigos.Transformer import getTransformerLayers
from utils import calculate_stft_magnitude_and_phase

In [30]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Conv2D, Reshape, Input
from tensorflow.keras import Model

In [15]:
class PositionalEncoding(Layer):
    """Adds a fixed (non-trainable) sinusoidal positional encoding to its input.

    The encoding table is computed once at construction time and, in `call`,
    sliced to the length of axis 1 of the input before being added to it.

    Args:
        num_positions: Number of positions (rows) to precompute.
        d_model: Embedding dimension (columns of the table).
        **kwargs: Forwarded to `Layer.__init__`.
    """

    def __init__(self, num_positions, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.d_model = d_model
        self.num_positions = num_positions
        self.pos_encoding = self.positional_encoding(num_positions, d_model)

    def positional_encoding(self, num_positions, d_model):
        """Build the `(num_positions, d_model)` sinusoidal table as a float32 tensor.

        Each column pair `(2k, 2k + 1)` shares the angular rate
        `1 / 10000 ** (2k / d_model)`; even columns hold the sine and odd
        columns the cosine. Computed with slicing, so an odd `d_model` is
        handled without indexing past the last column.
        """
        positions = np.arange(num_positions, dtype=np.float64)[:, np.newaxis]
        columns = np.arange(d_model)[np.newaxis, :]
        # `columns // 2` is the pair index k, so both members of a pair get
        # the same frequency.
        angle_rates = 1.0 / np.power(10000.0, (2 * (columns // 2)) / float(d_model))
        angles = positions * angle_rates

        pos_enc = np.zeros((num_positions, d_model))
        pos_enc[:, 0::2] = np.sin(angles[:, 0::2])
        pos_enc[:, 1::2] = np.cos(angles[:, 1::2])
        return tf.cast(pos_enc, dtype=tf.float32)

    def call(self, inputs):
        """Return `inputs` plus the first `tf.shape(inputs)[1]` rows of the table."""
        # The sliced table has no batch axis; the addition broadcasts it.
        return inputs + self.pos_encoding[:tf.shape(inputs)[1]]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super(PositionalEncoding, self).get_config()
        config.update({
            'num_positions': self.num_positions,
            'd_model': self.d_model,
        })
        return config

def image_to_patches_with_positional_encoding(image, segment_size, num_channels):
    """Split an image tensor into patch embeddings and add positional encoding.

    A Conv2D whose kernel size and stride both equal `segment_size` maps each
    non-overlapping `segment_size x segment_size` patch to a vector of length
    `segment_size * segment_size * num_channels`. The patch grid is flattened
    into a sequence and passed through `PositionalEncoding`.

    Args:
        image: 4-D Keras tensor; the convolution output's axes 1 and 2 must
            have statically known sizes.
        segment_size: Side length of each square patch.
        num_channels: Channel count used to size the embedding.

    Returns:
        Tensor of shape `(batch, num_patches, d_model)`.

    Raises:
        ValueError: If the patch grid size is not statically known.
    """
    d_model = segment_size * segment_size * num_channels  # Embedding dimension

    # Layer that cuts the image into segments ("words") and embeds each one.
    segment_layer = Conv2D(filters=d_model, kernel_size=(segment_size, segment_size), strides=(segment_size, segment_size), padding='valid')
    patches = segment_layer(image)

    # Read the patch grid from the convolution output itself instead of
    # relying on notebook-level globals defined in another cell.
    grid_height, grid_width = patches.shape[1], patches.shape[2]
    if grid_height is None or grid_width is None:
        raise ValueError(
            'image height and width must be statically known to size the positional encoding'
        )
    num_positions = int(grid_height) * int(grid_width)

    # Flatten the patch grid into a sequence; the -1 is the sequence length
    # (Reshape never touches the batch axis).
    patches = Reshape((-1, d_model))(patches)

    # Add the positional encoding to every patch embedding.
    positional_encoding_layer = PositionalEncoding(num_positions, d_model)
    encoded_patches = positional_encoding_layer(patches)

    return encoded_patches

In [27]:
class ImageReconstructionLayer(Layer):
    """Reassembles a sequence of flattened patches into an image and squeezes its width to 1.

    Args:
        original_height: Height of the reconstructed image.
        original_width: Width of the reconstructed image.
        segment_size: Side length of each square patch.
        num_channels: Number of channels of the reconstructed image.
        **kwargs: Forwarded to `Layer.__init__`.
    """

    def __init__(self, original_height, original_width, segment_size, num_channels, **kwargs):
        super(ImageReconstructionLayer, self).__init__(**kwargs)
        self.original_height = original_height
        self.original_width = original_width
        self.segment_size = segment_size
        self.num_channels = num_channels

    def call(self, encoded_patches):
        """Rebuild the image from patches and resize it to `(original_height, 1)`.

        Args:
            encoded_patches: Tensor whose per-sample element count equals
                `original_height * original_width * num_channels`, laid out
                as a row-major patch grid of flattened
                `(segment_size, segment_size, num_channels)` patches.

        Returns:
            Tensor of shape `(batch, original_height, 1, num_channels)`.
        """
        num_segments_height = self.original_height // self.segment_size
        num_segments_width = self.original_width // self.segment_size

        # (batch, patch_row, patch_col, row_in_patch, col_in_patch, channel)
        patch_grid = tf.reshape(
            encoded_patches,
            (-1, num_segments_height, num_segments_width,
             self.segment_size, self.segment_size, self.num_channels),
        )

        # Bring each patch's rows next to its patch-row index (and likewise
        # for columns) so the merge below places every pixel at
        # (patch_row * segment_size + row_in_patch,
        #  patch_col * segment_size + col_in_patch).
        # Without this step, rows of different patches get interleaved.
        patch_grid = tf.transpose(patch_grid, perm=[0, 1, 3, 2, 4, 5])
        reshaped_image = tf.reshape(
            patch_grid,
            (-1, self.original_height, self.original_width, self.num_channels),
        )

        # Collapse the width to 1 while keeping height and channels.
        resized_image = tf.image.resize(reshaped_image, [self.original_height, 1])

        return resized_image

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super(ImageReconstructionLayer, self).get_config()
        config.update({
            'original_height': self.original_height,
            'original_width': self.original_width,
            'segment_size': self.segment_size,
            'num_channels': self.num_channels,
        })
        return config

In [16]:
class TransformerGenerator(NoisyTargetGenerator):
    """Batch generator producing encoder/decoder inputs and next-frame targets.

    For every (sound, noise) pair, the phasor image fed to the encoder is
    paired with progressively revealed clean phasors for the decoder, and the
    target is the clean frame that comes next.
    """

    def __init__(self, sound_files, noise_files, block_size=2, normalize_phi=True):
        super().__init__(sound_files, noise_files, block_size=block_size, normalize_phi=normalize_phi)

    def generate_batch_transformer(self, clean_phasors):
        """Build the decoder inputs and targets for one clean phasor image.

        For each frame index `i`, the decoder input holds the first `i` frames
        of `clean_phasors` (all remaining frames are zero) and the target is
        frame `i` itself.

        :param clean_phasors: Array of shape (n_freq, n_frames, n_channels),
            e.g. (128, 64, 2).
        :return: Tuple `(decoder_inputs, targets)` where `decoder_inputs` has
            shape (n_frames, n_freq, n_frames, n_channels) and `targets` has
            shape (n_frames, n_freq, 1, n_channels); for a (128, 64, 2) input
            these are (64, 128, 64, 2) and (64, 128, 1, 2).
        """
        clean = np.asarray(clean_phasors)
        n_freq, n_frames, n_channels = clean.shape

        # Step 0 sees nothing; step i sees frames [0, i).
        decoder_inputs = np.zeros((n_frames, n_freq, n_frames, n_channels))
        for step in range(1, n_frames):
            decoder_inputs[step, :, :step, :] = clean[:, :step, :]

        # Target for step i is frame i, kept as a width-1 image.
        targets = np.zeros((n_frames, n_freq, 1, n_channels))
        targets[:, :, 0, :] = np.transpose(clean, (1, 0, 2))

        return decoder_inputs, targets

    def generate_sample_completo(self, batch_size=32, include_clean=False):
        """Yield training batches indefinitely.

        Each iteration draws `batch_size` (sound, noise) pairs, mixes them,
        computes the STFT phasors of the clean and noisy signals and expands
        each pair into one example per frame.

        :param batch_size: Number of (sound, noise) pairs drawn per batch.
        :param include_clean: When True, every pair also contributes a second
            set of examples whose encoder input is the clean phasor image.
        :return: Generator yielding `([encoder_inputs, decoder_inputs], targets)`.
        """
        while True:
            # Load a batch of voice and noise blocks.
            sound_batch, noise_batch = self.pick_random_blocks(batch_size)

            encoder_inputs = []
            decoder_inputs = []
            targets = []

            for sound, noise in zip(sound_batch, noise_batch):
                sound_escalado, noisy_sound = self.normalize_and_add_noise(sound, noise)

                # Pairs that could not be mixed are skipped.
                if sound_escalado is None or noisy_sound is None:
                    continue

                A, phi, _, _ = calculate_stft_magnitude_and_phase(sound_escalado)
                A_noisy, phi_noisy, _, _ = calculate_stft_magnitude_and_phase(noisy_sound)

                F = self.assemble_phasors(A, phi)
                F_noisy = self.assemble_phasors(A_noisy, phi_noisy)

                # Decoder inputs and targets depend only on the clean phasors,
                # so they are computed once and reused for both encoder sources.
                decoder_steps, step_targets = self.generate_batch_transformer(F)
                n_steps = len(decoder_steps)

                encoder_sources = [F_noisy, F] if include_clean else [F_noisy]
                for source in encoder_sources:
                    # The same encoder image is paired with every decoder step.
                    encoder_inputs.extend([source] * n_steps)
                    decoder_inputs.append(decoder_steps)
                    targets.append(step_targets)

            # Every pair was skipped: draw again instead of yielding an empty batch.
            if not targets:
                continue

            xn_train = np.array(encoder_inputs)
            xc_train = np.concatenate(decoder_inputs, axis=0)
            y_train = np.concatenate(targets, axis=0)

            yield [xn_train, xc_train], y_train

In [6]:
# Passed to Sound(...) as its third argument in the next cell.
base_shape_size = 8192
# NOTE(review): ws / ol are not referenced in the visible cells; presumably
# the STFT window size and overlap — confirm.
ws = 255
ol = 128
# Same (height, width, channels) values as the model inputs built further below.
input_shape = (128, 64, 2)

In [7]:
# Load the speech and noise files; later cells read train_X, val_X and
# noise_sounds from this object.
sound_base = Sound('/tf/Dados/Vozes/', '/tf/Dados/Ruido/', base_shape_size)

Loading Speech Files: 100%|██████████| 8179/8179 [00:05<00:00, 1369.52it/s]
Loading Noise Files: 100%|██████████| 8137/8137 [00:06<00:00, 1341.34it/s]
  clean_sounds = [sound for sound in clean_sounds if sound != self.TOO_SHORT_ERROR]
  noise_sounds = [sound for sound in noise_sounds if sound != self.TOO_SHORT_ERROR]


In [8]:
# Training and validation generators use different speech splits but the same noise pool.
data_generator_train = TransformerGenerator(sound_base.train_X, sound_base.noise_sounds)
data_generator_val = TransformerGenerator(sound_base.val_X, sound_base.noise_sounds)

In [31]:
# Set the segment (patch) size and the number of channels
segment_size = 16
num_channels = 2  # Number of channels in the image

# Compute the embedding dimensions
original_height = 128
original_width = 64
num_segments_height = original_height // segment_size
num_segments_width = original_width // segment_size
d_model = segment_size * segment_size * num_channels  # Embedding dimension

# Two inputs of identical shape: one for the encoder branch, one for the decoder branch
input_image_encoder = Input(shape=(original_height, original_width, num_channels))
input_image_decoder = Input(shape=(original_height, original_width, num_channels))

# Embedding step: each call builds its own Conv2D, so the two branches do not share weights
encoded_patches_encoder = image_to_patches_with_positional_encoding(input_image_encoder, segment_size, num_channels)
encoded_patches_decoder = image_to_patches_with_positional_encoding(input_image_decoder, segment_size, num_channels)

# Pass the embeddings through the Transformer layers
transformer_output = getTransformerLayers(encoded_patches_encoder, encoded_patches_decoder, d_model=d_model)

# Reconstruct the output image
reconstruction_layer = ImageReconstructionLayer(original_height, original_width, segment_size, num_channels)
output_image = reconstruction_layer(transformer_output)
# Pin the output to (original_height, 1, num_channels) per sample
output_image = Reshape((original_height, 1, num_channels))(output_image)

# Build the model
transformer_model = Model(inputs=[input_image_encoder, input_image_decoder], outputs=output_image)

In [32]:
# Print the layer-by-layer summary of the model built above.
transformer_model.summary()

Model: "model_44"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_81 (InputLayer)          [(None, 128, 64, 2)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_20 (Conv2D)             (None, 8, 4, 512)    262656      ['input_81[0][0]']               
                                                                                                  
 input_82 (InputLayer)          [(None, 128, 64, 2)  0           []                               
                                ]                                                                 
                                                                                           

In [37]:
# Adam optimizer with mean squared logarithmic error.
# NOTE(review): MSLE compares log(x + 1) terms; if the phasor targets can be
# negative, confirm that this loss is appropriate.
transformer_model.compile(optimizer='adam', loss='msle')

In [38]:
# batch_size counts (sound, noise) pairs drawn per step, not model examples:
# the generator expands each pair into several examples.
batch_size = 2
steps_per_epoch = len(sound_base.train_X) // batch_size

print('Starting training')

for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    
    # Draw a fresh validation batch for every epoch
    validation_batch = next(data_generator_val.generate_sample_completo(batch_size=batch_size))
    [x1_val, x2_val], y_val = validation_batch
    
    # One Keras epoch per outer iteration; a new training generator is created each time.
    transformer_model.fit(data_generator_train.generate_sample_completo(batch_size=batch_size, include_clean=False),
                     steps_per_epoch=steps_per_epoch,
                     epochs=1,
                     validation_data=([x1_val, x2_val], y_val),
                    )

Starting training
Epoch 1

KeyboardInterrupt: 