In [1]:
import sys
sys.path.insert(0, '/tf/utils/')

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import matplotlib.pyplot as plt

In [65]:
from keras.models import Model, load_model
from keras.layers import multiply, Lambda, add, Activation, Dropout, Conv2DTranspose, ReLU, ZeroPadding2D, BatchNormalization, Input, Conv2D, Conv2DTranspose, Flatten, Dense, LeakyReLU, MaxPooling2D, UpSampling2D, Concatenate, concatenate, Bidirectional, LSTM, TimeDistributed, Reshape
from keras.optimizers import Adam
from tensorflow.keras.backend import sigmoid
import numpy as np
import tensorflow.keras.backend as K
import tensorflow as tf
from pesq import pesq
from utils import calculate_snr, itakura_distortion, somar_sinais, add_white_gaussian_noise, performance
import librosa
from tqdm import tqdm

from sound import Sound

from IPython.display import Audio
import time
from IPython import display

from scipy.ndimage import zoom

In [4]:
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [6]:
# Passed to Sound as its third argument (presumably the number of samples per clip — TODO confirm against Sound).
base_shape_size = 8192
# STFT frame length, used as librosa's n_fft. The odd value 255 yields 1 + 255 // 2 = 128 frequency bins.
ws = 255
# Used as the STFT hop_length (in samples) by the helpers in this notebook, despite the "overlap"-style name.
ol = 128

In [7]:
# Build the dataset wrapper from the speech ("Vozes") and noise ("Ruido") directories.
# NOTE(review): both paths are relative to the notebook's working directory, and Sound's
# internals are not visible here; later cells read train_X, val_X and noise_sounds from it.
sound_base = Sound('../../../Dados/Vozes/', '../../../Dados/Ruido/', base_shape_size)

Loading Speech Files: 100%|██████████| 8179/8179 [00:07<00:00, 1133.40it/s]
Loading Noise Files: 100%|██████████| 8137/8137 [00:08<00:00, 1001.30it/s]
  clean_sounds = [sound for sound in clean_sounds if sound != self.TOO_SHORT_ERROR]
  noise_sounds = [sound for sound in noise_sounds if sound != self.TOO_SHORT_ERROR]


In [8]:
def calculate_stft_magnitude_and_phase(signal, sampling_rate=8000, window_size=ws, overlap=ol):
    """Run an STFT on ``signal`` and return its magnitude, phase angle and axes.

    Args:
        signal: Audio samples handed to ``librosa.stft``.
        sampling_rate: Sampling rate in Hz; only used to label the frequency
            and time axes.
        window_size: Forwarded to ``librosa.stft`` as ``n_fft``.
        overlap: Forwarded to ``librosa.stft`` as ``hop_length``.

    Returns:
        A tuple ``(magnitude, phi, f, t)`` where ``magnitude`` and ``phi`` are
        the magnitude and phase angle (radians, from ``np.angle``) of the STFT,
        ``f`` holds the bin centre frequencies and ``t`` the frame times.
    """
    spectrum = librosa.stft(signal, n_fft=window_size, hop_length=overlap)

    # Axis labels: one time stamp per STFT column, one frequency per row.
    frame_indices = np.arange(spectrum.shape[1])
    t = librosa.frames_to_time(frame_indices, sr=sampling_rate, hop_length=overlap)
    f = librosa.fft_frequencies(sr=sampling_rate, n_fft=window_size)

    # magphase returns the magnitude and a unit-modulus phasor; keep only its angle.
    magnitude, unit_phasor = librosa.magphase(spectrum)
    phi = np.angle(unit_phasor)

    return magnitude, phi, f, t

def reconstruct_signal_from_stft(magnitude, phi, sampling_rate=8000, window_size=ws, overlap=ol):
    """Rebuild a time-domain signal from an STFT magnitude and phase angle.

    Args:
        magnitude: STFT magnitude, frequency bins on the second-to-last axis.
        phi: Phase angle in radians, same shape as ``magnitude``.
        sampling_rate: Unused; kept so the signature mirrors
            ``calculate_stft_magnitude_and_phase``.
        window_size: Frame length (``n_fft``) that produced the spectrogram.
        overlap: Hop length in samples, forwarded as ``hop_length``.

    Returns:
        The reconstructed signal as returned by ``librosa.istft``.
    """
    complex_spec = magnitude * np.exp(1j * phi)

    # librosa.istft infers n_fft as 2 * (bins - 1), which is always even. With an
    # odd frame length (e.g. 255 -> 128 bins) that guess is off by one (254), so
    # the synthesis window would not match the analysis window. Pass the real
    # frame length whenever it is consistent with the number of bins; otherwise
    # fall back to librosa's inference, as before.
    n_bins = complex_spec.shape[-2]
    n_fft = window_size if window_size // 2 + 1 == n_bins else None

    signal = librosa.istft(complex_spec, n_fft=n_fft, hop_length=overlap)

    return signal

In [70]:
class DataGenerator:
    """Endless source of (STFT feature, PESQ score) batches.

    Each selected speech clip is rescaled, mixed with a noise clip and scored
    with PESQ. The noisy clip and its clean counterpart are both emitted as
    training examples: the noisy one with its measured score, the clean one
    with the fixed score ``CLEAN_SCORE``.

    Both ``sound_files`` and ``noise_files`` must expose ``.shape`` and support
    row slicing (they are used as 2-D arrays of shape ``(n_clips, n_samples)``).
    """

    # Clips are drawn in contiguous runs of this many rows.
    BLOCK_SIZE = 8
    # Target assigned to every clean (noise-free) example.
    CLEAN_SCORE = 4.64

    def __init__(self, sound_files, noise_files):
        self.sound_files = sound_files
        self.noise_files = noise_files

    def _draw_blocks(self, files, num_blocks, batch_size):
        """Pick ``num_blocks`` random, non-repeating blocks of rows from ``files``."""
        block = self.BLOCK_SIZE
        starts = np.random.choice(files.shape[0] // block, size=num_blocks, replace=False) * block
        batch = np.array([files[i:i + block] for i in starts]).reshape(-1, files.shape[-1])
        # Defensive cap kept from the original: never hand back more rows than requested.
        return batch[:batch_size]

    @staticmethod
    def _to_feature(magnitude, phase):
        """Stack magnitude and phase (mapped from [-pi, pi] to [0, 1]) on a last axis."""
        return np.stack([magnitude, phase / (2 * np.pi) + 0.5], axis=-1)

    def _build_pair(self, sound, noise):
        """Return ``(noisy_feature, clean_feature, pesq_score)`` or ``None`` to skip the clip."""
        lowest = np.min(sound)
        highest = np.max(sound)
        if highest == lowest:
            # A constant clip cannot be rescaled (0 / 0) and carries no speech.
            return None

        # Rescale the clip linearly into [-0.4, 0.4].
        target_min, target_max = -0.4, 0.4
        scaled = (sound - lowest) / (highest - lowest) * (target_max - target_min) + target_min

        sound_power = np.mean(np.abs(scaled) ** 2.0)
        noise_power = np.mean(np.abs(noise) ** 2.0)

        if not sound_power > 0.:
            return None

        if noise_power > 0.:
            # Length-1 integer array drawn from [0, 20), handed to somar_sinais as-is.
            mix_level = np.random.randint(0, 20, size=1)
            noisy = somar_sinais(scaled, noise, mix_level)
        else:
            noisy = scaled

        # Always add a little white noise (level drawn from [20, 30)), then clip to [-1, 1].
        noisy = add_white_gaussian_noise(noisy, np.random.randint(20, 30, size=1))
        noisy = np.clip(noisy, -1.0, 1.0)

        # NOTE(review): the PESQ reference is the raw clip, while the degraded signal
        # is derived from the rescaled one — confirm this is intended.
        score = pesq(8000, sound, noisy, 'nb')

        clean_mag, clean_phase, _, _ = calculate_stft_magnitude_and_phase(scaled)
        noisy_mag, noisy_phase, _, _ = calculate_stft_magnitude_and_phase(noisy)

        return (
            self._to_feature(noisy_mag, noisy_phase),
            self._to_feature(clean_mag, clean_phase),
            score,
        )

    def generate_sample_completo(self, batch_size=32):
        """Yield ``(x_train, y_train)`` batches forever.

        Args:
            batch_size: Number of speech/noise clip pairs drawn per batch; must
                be a multiple of ``BLOCK_SIZE``.

        Yields:
            ``x_train``: array of stacked features, two per usable clip
            (noisy first, then clean). ``y_train``: column vector of matching
            scores. A batch holds at most ``2 * batch_size`` examples; clips
            that cannot be processed are skipped.

        Raises:
            ValueError: On the first ``next()`` if ``batch_size`` is not a
                multiple of ``BLOCK_SIZE``.
        """
        if batch_size % self.BLOCK_SIZE != 0:
            raise ValueError("O tamanho do lote deve ser um múltiplo de 8")

        num_blocks = batch_size // self.BLOCK_SIZE

        while True:
            # Speech is drawn before noise so the random stream is consumed in a fixed order.
            sound_batch = self._draw_blocks(self.sound_files, num_blocks, batch_size)
            noise_batch = self._draw_blocks(self.noise_files, num_blocks, batch_size)

            x_train = []
            y_train = []

            for sound, noise in zip(sound_batch, noise_batch):
                try:
                    pair = self._build_pair(sound, noise)
                except Exception:
                    # Best effort: a clip that fails (e.g. PESQ finds no utterance) is
                    # dropped. Only Exception is caught, so KeyboardInterrupt and
                    # SystemExit still stop the generator.
                    continue

                if pair is None:
                    continue

                noisy_feature, clean_feature, score = pair
                x_train.append(noisy_feature)
                x_train.append(clean_feature)
                y_train.append(score)
                y_train.append(self.CLEAN_SCORE)

            x_train = np.array(x_train)
            y_train = np.array(y_train).reshape(-1, 1)

            yield x_train, y_train

In [71]:
# Training and validation generators read speech from different splits (train_X / val_X).
# NOTE(review): both share the same noise pool (sound_base.noise_sounds), so the
# validation noise is not held out from training.
data_generator_train = DataGenerator(sound_base.train_X, sound_base.noise_sounds)
data_generator_val = DataGenerator(sound_base.val_X, sound_base.noise_sounds)

In [72]:
# Pull a single batch with the default batch_size (32) as a smoke test of the generator.
x, y = next(data_generator_train.generate_sample_completo())

In [74]:
# Every usable clip contributes a noisy and a clean example, so 32 clips give up to 64 targets.
y.shape

(64, 1)

In [75]:
# Custom activation function
def custom_activation(x):
    """Scaled and shifted sigmoid whose output lies in the open interval (1.04, 4.64)."""
    scale, offset = 3.6, 1.04
    return scale * sigmoid(x) + offset

# Define the PESQNet model
def PESQNet(input_shape):
    """Build the (uncompiled) PESQNet regression model.

    Architecture: two Conv2D + max-pool stages, three parallel Conv2D branches
    with kernel sizes 5, 3 and 1 (each followed by a 2x2 max-pool and then
    concatenated on the channel axis), a Dense(128) projection of the channels,
    a bidirectional LSTM over the rows of the resulting feature map, and three
    Dense layers ending in ``custom_activation``.

    Args:
        input_shape: ``(height, width, channels)`` of one input example, without
            the batch axis. The notebook uses ``(128, 64, 2)``.

    Returns:
        A Keras ``Model``. Because the LSTM returns sequences and nothing pools
        over them, the output has one score per LSTM timestep: shape
        ``(batch, height // 8, 1)``, i.e. ``(batch, 16, 1)`` for the input above.
    """
    input_layer = Input(shape=input_shape)

    conv1 = Conv2D(filters=32, kernel_size=(5, 5), activation='relu', padding='same')(input_layer)
    maxpool1 = MaxPooling2D((2, 2))(conv1)

    conv2 = Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding='same')(maxpool1)
    # Halve only the first spatial axis here.
    maxpool2 = MaxPooling2D((2, 1))(conv2)

    # Parallel encoder branches with different square kernel sizes.
    cnn_branches = []
    for w in [5, 3, 1]:
        conv = Conv2D(filters=32, kernel_size=(w, w), activation='relu', padding='same')(maxpool2)
        maxpool = MaxPooling2D((2, 2))(conv)
        cnn_branches.append(maxpool)

    # Concatenate all CNN branches along the channel axis.
    cnn_output = Concatenate()(cnn_branches)

    # Project the concatenated channels to a fixed size before the LSTM to keep
    # its parameter count down.
    dense_units = 128
    time_distributed_output = TimeDistributed(Dense(dense_units, activation='relu'))(cnn_output)

    # Flatten each row of the feature map into one LSTM timestep:
    # (batch, rows, cols, units) -> (batch, rows, cols * units).
    # The column count is read from the tensor instead of being hard-coded, so
    # input widths other than 64 are grouped correctly. It falls back to the
    # original value of 16 when the width is not statically known.
    n_cols = time_distributed_output.shape[2]
    n_cols = 16 if n_cols is None else int(n_cols)
    reshape_to_lstm = Reshape((-1, n_cols * dense_units))(time_distributed_output)

    # Recurrent part; return_sequences=True keeps one output vector per row.
    blstm = Bidirectional(LSTM(128, return_sequences=True))(reshape_to_lstm)

    # TODO: statistics over blocks are not implemented; the per-timestep LSTM
    # outputs go straight into the fully connected layers.

    # Fully connected head, applied independently to every timestep.
    fc1 = Dense(128, activation='relu')(blstm)
    fc2 = Dense(32, activation='relu')(fc1)
    output_layer = Dense(1, activation=custom_activation)(fc2)

    model = Model(inputs=input_layer, outputs=output_layer)

    return model

In [76]:
# input_shape is (frequency bins, time frames, channels). The two channels are the
# STFT magnitude and the normalised phase stacked by DataGenerator.
input_shape = (128, 64, 2)

# Build the PESQNet model
model = PESQNet(input_shape)

In [77]:
# Compile with the Adam optimizer and plain mean-squared error.
# TODO: an earlier note called for a dedicated PESQ loss; MSE is what is actually used.
model.compile(optimizer='adam', loss='mean_squared_error')

# Model summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 128, 64, 2)  0           []                               
                                ]                                                                 
                                                                                                  
 conv2d_30 (Conv2D)             (None, 128, 64, 32)  1632        ['input_7[0][0]']                
                                                                                                  
 max_pooling2d_30 (MaxPooling2D  (None, 64, 32, 32)  0           ['conv2d_30[0][0]']              
 )                                                                                                
                                                                                            

In [79]:
batch_size = 128
# One "epoch" draws roughly as many clips as the training split contains.
steps_per_epoch = len(sound_base.train_X) // batch_size

print('Starting training')

# Train one epoch per fit() call so a fresh validation batch can be drawn each time.
for epoch in range(10):
    print(f"Epoch {epoch + 1}")
    
    # Generate a new validation batch for each epoch.
    # NOTE(review): validation uses a single random batch, so the reported
    # validation loss is not comparable from one epoch to the next.
    validation_batch = next(data_generator_val.generate_sample_completo(batch_size=batch_size))
    x_val, y_val = validation_batch
    
    # A new training generator is created on every iteration.
    model.fit(data_generator_train.generate_sample_completo(batch_size=batch_size),
              steps_per_epoch=steps_per_epoch,
              epochs=1,
              validation_data=(x_val, y_val),
              # callbacks=[PlotLossesCallback()]
             )

Starting training
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10


In [81]:
import datetime
# Get the current datetime
current_datetime = datetime.datetime.now()

# Format the datetime as a string to use in the file name
datetime_str = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

# NOTE(review): the loss value and epoch count in the file name are hard-coded
# text, not values read from the training run above.
# The model uses custom_activation, so reloading it will presumably need
# custom_objects={'custom_activation': custom_activation} — confirm.
model.save('PESQNet-loss-0.0913-epochs-10-'+datetime_str+'.h5')