### Import pacchetti

In [1]:
# Notebook-wide switch for the Google Colab + Google Drive setup. Later cells
# test it before mounting the drive and pass it on to read_datapath() and
# read_latest_checkpoint().
gdrive = True

In [2]:
from psutil import virtual_memory
def check_gpu_and_ram():
    print('******************************************************************************')
    print('---------------------------------  GPU  --------------------------------------')
    print('******************************************************************************')
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
        print('and then re-execute this cell.')
    else:
        print(gpu_info)
    print('\n')
    print('******************************************************************************')

    print('---------------------------------  RAM  --------------------------------------')

    print('******************************************************************************')

    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
        print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
        print('re-execute this cell.')
    else:
        print('You are using a high-RAM runtime!')

In [3]:
import sys
import os
# Colab-only setup; skipped entirely when gdrive is False.
if gdrive:
    # google.colab is only importable inside a Colab runtime, hence the local import.
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Colab line magic selecting the TensorFlow 1.x family; it runs before the
    # `import tensorflow` statement of the following cell.
    %tensorflow_version 1.x
    # Add the project source folder on Drive to the import path
    # (presumably where dataset / training_ops / params / models live -- confirm).
    sys.path.append('/content/gdrive/My Drive/Tesi/src/')
    check_gpu_and_ram()

Mounted at /content/gdrive
TensorFlow 1.x selected.
******************************************************************************
---------------------------------  GPU  --------------------------------------
******************************************************************************
Sat Feb 13 10:25:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |      

In [4]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tensorflow as tf
import pickle
import time
from dataset import *
from training_ops import *
from params import *
from models import get_model

Using TensorFlow backend.


In [5]:
# Show which TensorFlow version was actually imported.
print(tf.__version__)

1.15.2


### Import dei dati e impostazione di alcuni parametri

In [6]:
# Run configuration: sample_training always carries the same truth value as
# the multispeaker flag.
multispeaker = False
sample_training = bool(multispeaker)

# Architecture / optimiser settings for the 'gionet2' configuration, then the
# data location and the log directory derived from them.
architecture_params, opt_params = read_params('gionet2')
data_path = read_datapath(multispeaker=multispeaker, gdrive=gdrive)
logdir = get_logdir(data_path, opt_params, architecture_params)

In [7]:
%%time
X_train, Y_train, X_val, Y_val = read_data(data_path, sample_training = sample_training)
n_train, n_validation = X_train.shape[0], X_val.shape[0]
print('number of training examples: {}'.format(n_train))
print('number of validation examples: {}'.format(n_validation))

number of training examples: 6656
number of validation examples: 768
CPU times: user 123 ms, sys: 376 ms, total: 498 ms
Wall time: 4.23 s


In [8]:
# Create the TF1 session, then either restore a previously saved model from
# `logdir` or build a fresh graph, and finally wrap the arrays in DataSet objects.
# NOTE(review): checkpoint_root is not used again in the visible cells.
checkpoint_root = os.path.join(logdir, 'model.ckpt')
# allow_growth lets TensorFlow claim GPU memory on demand; log_device_placement
# is what produces the "Device mapping" log in the cell output.
gpu_options = tf.GPUOptions(allow_growth=True)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True, log_device_placement=True))
# An existing log directory is taken to mean "a checkpoint is available".
# NOTE(review): an existing directory without a checkpoint still takes this
# branch -- confirm how read_latest_checkpoint behaves in that case.
if os.path.isdir(logdir):
    print('Model loading...')
    checkpoint = read_latest_checkpoint(logdir, gdrive)
    meta = checkpoint + '.meta'
    # Rebuild the saved graph from the .meta file, then load the variable values.
    saver = tf.train.import_meta_graph(meta)
    g = tf.get_default_graph()  # NOTE(review): `g` is not used in the visible cells
    saver.restore(sess, checkpoint)
    # Same order in which the creation branch adds them to the 'inputs' collection.
    X_lr, Y, alpha = tf.get_collection('inputs')[0:3]
    inputs = (X_lr, Y, alpha)
    predictions = tf.get_collection('preds')[0]
    # The objective is created again here instead of being read from a collection.
    loss = create_objective(Y, predictions)
    # NOTE(review): this binds the whole collection (a list), whereas the creation
    # branch binds the single value returned by create_updates -- confirm that
    # train() accepts both forms.
    train_op = tf.get_collection('train_op')
    # SECURITY: pickle.load can execute arbitrary code; only load metric files
    # written by this project.
    with open(os.path.join(logdir, 'train_metrics.pickle'), 'rb') as handle:
        train_metrics = pickle.load(handle)
    epochs_completed = get_epochs_completed(checkpoint)
# No log directory yet: build the graph from scratch.
else:
    print('Model creation...')
    checkpoint = logdir
    # NOTE(review): n_chan is never read; the placeholder shapes below hard-code 1.
    n_dim, n_chan = 8192, 1
    X_lr = tf.placeholder(tf.float32, shape=(None, n_dim, 1), name='X_lr')  # shape (None, n_dim, 1)
    Y = tf.placeholder(tf.float32, shape=(None, None, 1), name='Y')  # second dimension left unspecified
    alpha = tf.placeholder(tf.float32, shape=(), name='alpha')  # scalar weight multiplier
    # Register the placeholders so the loading branch can retrieve them by collection.
    tf.add_to_collection('inputs', X_lr)
    tf.add_to_collection('inputs', Y)
    tf.add_to_collection('inputs', alpha)
    inputs = (X_lr, Y, alpha)
    predictions = get_model(X_lr, architecture_params)
    tf.add_to_collection('preds', predictions)
    loss = create_objective(Y, predictions)
    params = get_params()
    optimizer = create_optimzier(opt_params)
    grads = create_gradients(loss, params, optimizer)
    with tf.name_scope('optimizer'):
        train_op = create_updates(params, grads, alpha, optimizer)
    # Initialise variables only after the whole graph, optimizer included, exists.
    init = tf.global_variables_initializer()
    sess.run(init)
    tf.add_to_collection('train_op', train_op)
    saver = tf.train.Saver()
    train_metrics = {}
    epochs_completed = 0
print("The total number of parameters in the model is: " + str(count_parameters()))
# Both datasets start from the restored epoch counter (0 for a new model).
train_data = DataSet(X_train, Y_train, epochs_completed = epochs_completed)
validation_data = DataSet(X_val, Y_val, epochs_completed = epochs_completed)
# Drop the notebook-level references to the raw arrays.
# NOTE: this makes the cell non-idempotent -- re-running it raises NameError
# unless the data-loading cell is executed again first.
del X_train
del Y_train
del X_val
del Y_val
print('Model is ready for training phase. The total number of epochs completed is {}'.format(epochs_completed))  

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0

Model loading...
INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/Tesi/processedData/speaker1/train&validation/model_ckpt/gionet2.lr0.000030dec.4.g3.b128/model_ckpt-91



The total number of parameters in the model is: 23485829
Model is ready for training phase. The total number of epochs completed is 91


### Fit del modello 

In [9]:
# Derive the epoch budget and the validation step count for this run.
batch_size = opt_params['batch_size']
if multispeaker:
    # Multispeaker runs are configured in optimisation steps; convert to epochs.
    n_steps = opt_params['tot_steps']
    steps_for_interruption = opt_params['steps_for_interruption']
    n_epochs = calc_epochs_from_n_steps(n_steps, batch_size, n_train)
    epochs_for_interruption = calc_epochs_from_n_steps(steps_for_interruption, batch_size, n_train)
else:
    # Number of epochs performed for this model in the multispeaker task.
    n_epochs = n_epochs_in_multispeaker_task(logdir)
    epochs_for_interruption = None
# Number of full batches in one pass over the validation set. Derived from
# batch_size (the size actually requested from validation_data.next_batch in
# the training loop) instead of a hard-coded 128, so the two cannot drift apart.
n_steps_for_1_validation_epoch = int(n_validation // batch_size)
print('n_epochs: {}'.format(n_epochs))
print('epochs_for_interruption: {}'.format(epochs_for_interruption))

n_epochs: 246
epochs_for_interruption: None


In [10]:
# Training loop. Runs mini-batch updates until n_epochs is reached or
# early_stopping() asks to stop. Every time the DataSet epoch counter advances,
# it evaluates on the validation set, records the metrics, and saves the model.
current_epoch = train_data._epochs_completed
# NOTE: this rebinds the notebook-level name `alpha` from the placeholder to the
# value fed into it; the placeholder itself stays reachable through `inputs`.
alpha = 1.0
losses_on_batch = []
snr_on_batch = []
epoch_start_time = time.time()
# With metrics restored from a previous run, check the stopping rule before training.
interrupt = (
    early_stopping(train_metrics, epochs_for_interruption, multispeaker = multispeaker)
    if len(train_metrics) > 0
    else False
)
# Fetch list for the SNR metric; looked up once because the collection does not
# change while the loop runs.
snr_fetches = tf.get_collection(key='snr_on_batch')
n_iters = 0  # NOTE(review): incremented once per completed epoch, not per batch
while train_data._epochs_completed < n_epochs and not interrupt:
    batch = train_data.next_batch(batch_size)
    # Used both to compute the training loss and to update the parameters.
    feed_dict = load_batch(inputs, batch, alpha)
    feed_dict_not_train = load_batch(inputs, batch, alpha, train=False)
    # Updates the parameters and returns the loss value.
    tr_objective = train(feed_dict, train_op, loss, sess)
    losses_on_batch.append(tr_objective)
    # SNR on the same batch, evaluated after the update with the train=False feed.
    snr_on_batch.append(sess.run(snr_fetches, feed_dict=feed_dict_not_train)[0])

    if train_data._epochs_completed > current_epoch:
        # The batch just consumed closed an epoch: aggregate the training metrics.
        l2_training = np.mean(losses_on_batch, dtype = np.float32)
        snr_training = np.mean(snr_on_batch, dtype = np.float32)

        # One pass over the validation set. Loss and SNR are fetched in a single
        # session call so each validation batch is evaluated once, not twice.
        l2_v = []
        snr_v = []
        for _ in range(n_steps_for_1_validation_epoch):
            v_batch = validation_data.next_batch(batch_size)
            feed_dict_validation = load_batch(inputs, v_batch, alpha, train=False)
            loss_value, snr_values = sess.run([loss, snr_fetches], feed_dict=feed_dict_validation)
            l2_v.append(loss_value)
            snr_v.append(snr_values[0])
        l2_validation = np.mean(l2_v, dtype = np.float32)
        snr_validation = np.mean(snr_v, dtype = np.float32)

        # Keyed by the epoch counter as it was before this epoch finished.
        train_metrics[current_epoch] = {'l2_training': l2_training,
                                        'snr_training': snr_training,
                                        'l2_validation': l2_validation,
                                        'snr_validation': snr_validation
                                       }
        tot_time_s = time.time() - epoch_start_time
        print_epoch_with_time(tot_time_s, train_data._epochs_completed)
        print_metrics(l2_training, snr_training, l2_validation, snr_validation)
        # Save the model and the dictionary with the metrics.
        save_model_phase(sess, logdir, train_data._epochs_completed, train_metrics, saver, multispeaker = multispeaker)
        save_architecture_opt_params(logdir, architecture_params, opt_params)
        interrupt = early_stopping(train_metrics, epochs_for_interruption, multispeaker = multispeaker)

        # Reset the per-epoch accumulators and the timer for the next epoch.
        losses_on_batch = []
        snr_on_batch = []
        current_epoch = train_data._epochs_completed
        epoch_start_time = time.time()
        n_iters += 1

------- EPOCH 92 RESULTS ------- (trained in 1.88 minutes. Current time: 11:28)
l2 (loss) on training is 0.0058752126060426235
snr on training is 16.577205657958984
l2 (loss) on validation is 0.005062172655016184
snr on validation is 17.519365310668945 

------- EPOCH 93 RESULTS ------- (trained in 1.53 minutes. Current time: 11:30)
l2 (loss) on training is 0.00588283920660615
snr on training is 16.56191062927246
l2 (loss) on validation is 0.005043835379183292
snr on validation is 17.54297637939453 

------- EPOCH 94 RESULTS ------- (trained in 1.55 minutes. Current time: 11:31)
l2 (loss) on training is 0.0058833155781030655
snr on training is 16.57038116455078
l2 (loss) on validation is 0.005060721654444933
snr on validation is 17.518857955932617 

------- EPOCH 95 RESULTS ------- (trained in 1.55 minutes. Current time: 11:33)
l2 (loss) on training is 0.005884953774511814
snr on training is 16.584732055664062
l2 (loss) on validation is 0.005073168780654669
snr on validation is 17.5020