# VoxCeleb trainer performance measures for pretrained ResNet34 on different types of audio signals

**Src**: https://github.com/clovaai/voxceleb_trainer

In [1]:
# Colab helper used to attach Google Drive to this runtime's filesystem.
from google.colab import drive

# Mount point under which the Drive contents become visible.
ROOT = '/content/drive'

# Mount Drive at ROOT; the recorded output shows this prompts for an authorization code.
drive.mount(ROOT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Show which GPU is attached to this runtime, with driver/CUDA versions and memory use.
!nvidia-smi

Sun Jul 12 21:09:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Work from the SpeakerRecognition project folder on Drive.
# NOTE(review): the project modules imported below (SpeakerNet, DatasetLoader, ...) are presumably resolved from here — confirm.
%cd '/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition'

/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition


In [4]:
# Install torchaudio into the runtime in quiet mode.
# NOTE(review): the version is unpinned — consider pinning it so the run stays reproducible.
!pip install -q torchaudio

[K     |████████████████████████████████| 3.2MB 2.8MB/s 
[?25h

In [5]:
import sys, time, os, argparse, socket, random
import numpy as np
import pdb
import torch
import glob
from tuneThreshold import tuneThresholdfromScore
from SpeakerNet import SpeakerNet
from DatasetLoader import DatasetLoader
from utils import add_noise
import librosa

# Default parameters
## Data loader
* parser.add_argument('--max_frames', type=int, default=200,  help='Input length to the network');
* parser.add_argument('--batch_size', type=int, default=200,  help='Batch size');
* parser.add_argument('--max_seg_per_spk', type=int, default=100, help='Maximum number of utterances per speaker per epoch');
* parser.add_argument('--nDataLoaderThread', type=int, default=5, help='Number of loader threads');

## Training details
* parser.add_argument('--test_interval', type=int, default=10, help='Test and save every [test_interval] epochs');
* parser.add_argument('--max_epoch',      type=int, default=500, help='Maximum number of epochs');
* parser.add_argument('--trainfunc', type=str, default="",    help='Loss function');
* parser.add_argument('--optimizer', type=str, default="adam", help='sgd or adam');

## Learning rates
* parser.add_argument('--lr', type=float, default=0.001,      help='Learning rate');
* parser.add_argument("--lr_decay", type=float, default=0.95, help='Learning rate decay every [test_interval] epochs');

## Loss functions
* parser.add_argument("--hard_prob", type=float, default=0.5, help='Hard negative mining probability, otherwise random, only for some loss functions');
* parser.add_argument("--hard_rank", type=int, default=10,    help='Hard negative mining rank in the batch, only for some loss functions');
* parser.add_argument('--margin', type=float,  default=1,     help='Loss margin, only for some loss functions');
* parser.add_argument('--scale', type=float,   default=15,    help='Loss scale, only for some loss functions');
* parser.add_argument('--nSpeakers', type=int, default=5994,  help='Number of speakers in the softmax layer for softmax-based losses, utterances per speaker per iteration for other losses');

## Load and save
* parser.add_argument('--initial_model',  type=str, default="", help='Initial model weights');
* parser.add_argument('--save_path',      type=str, default="./data/exp1", help='Path for model and logs');

## Training and test data
* parser.add_argument('--train_list', type=str, default="",   help='Train list');
* parser.add_argument('--test_list',  type=str, default="",   help='Evaluation list');
* parser.add_argument('--train_path', type=str, default="voxceleb2", help='Absolute path to the train set');
* parser.add_argument('--test_path',  type=str, default="voxceleb1", help='Absolute path to the test set');

## For test only
* parser.add_argument('--eval', dest='eval', action='store_true', help='Eval only')

## Model definition
* parser.add_argument('--model', type=str,        default="",     help='Name of model definition');
* parser.add_argument('--encoder_type', type=str, default="SAP",  help='Type of encoder');
* parser.add_argument('--nOut', type=int,         default=512,    help='Embedding size in the last FC layer');

# Tuned parameters from the repo author
* `--model "ResNetSE34L"`
* `--trainfunc "angleproto"`
* `--max_frames 300`
* `--save_path`
* `--test_list`
* `--test_path`
* `--initial_model "baseline_lite_ap.model"`

# Init pretrained model

In [6]:
# Run configuration, mirroring the command-line options of the original trainer.
# NOTE: `eval` deliberately keeps the name used by the rest of the notebook even
# though it shadows Python's builtin `eval`.

# Folder that holds the trial lists.
_SPEAKER_REC_DIR = '/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition/'

# --- mode, model definition and loss ---
eval = True
model = 'ResNetSE34L'
encoder_type = 'SAP'
nOut = 512
trainfunc = 'angleproto'
hard_prob = 0.5
hard_rank = 10
margin = 1
scale = 15
nSpeakers = 5994

# --- data loader ---
max_frames = 300
batch_size = 200
max_seg_per_spk = 100
nDataLoaderThread = 5

# --- optimisation ---
optimizer = 'adam'
lr = 0.001
max_epoch = 500

# --- checkpoints ---
save_path = '/content/drive/My Drive/Stage-Imaging/'
initial_model = 'baseline_lite_ap.model'

# --- evaluation data: the clean trial list plus one list per noise level ---
test_path = '/content/drive/My Drive/Datasets/VoxCeleb1_test/wav/'
test_list = _SPEAKER_REC_DIR + 'veri_test.txt'
test_list_25 = _SPEAKER_REC_DIR + 'veri_test_2.5.txt'
test_list_75 = _SPEAKER_REC_DIR + 'veri_test_7.5.txt'
test_list_125 = _SPEAKER_REC_DIR + 'veri_test_12.5.txt'
test_list_175 = _SPEAKER_REC_DIR + 'veri_test_17.5.txt'

# --- values left at the trainer's defaults ---
lr_decay = 0.95
test_interval = 10

In [7]:
# Build the SpeakerNet wrapper from the configuration values defined in the params cell.
s = SpeakerNet(
    model = model,
    max_frames = max_frames,
    batch_size = batch_size,
    max_seg_per_spk = max_seg_per_spk,
    nDataLoaderThread = nDataLoaderThread,
    encoder_type = encoder_type,
    nOut = nOut,
    test_interval = test_interval,
    max_epoch = max_epoch,
    optimizer = optimizer,
    lr = lr,
    lr_decay = lr_decay,
    hard_prob = hard_prob,
    hard_rank = hard_rank,
    margin = margin,
    scale = scale,
    nSpeakers = nSpeakers,
)

# Bookkeeping carried over from the trainer script: `it` is the iteration to
# resume from (the weight-loading cell may advance it).
sumloss = 0
prevloss = float("inf")
it = 1

Embedding size is 512, encoder SAP.
Initialised Pairwise Loss


**Evaluation measure EER**:
* Normal audio &rarr; **2.2322**
* Noisy audio SNR 2.5 &rarr; **12.7094**
* Noisy audio SNR 7.5 &rarr; **6.8452**
* Noisy audio SNR 12.5 &rarr; **4.1304**
* Noisy audio SNR 17.5 &rarr; **3.0170**
* Denoised audio with WaveNet &rarr;
* Denoised audio with Pix2Pix &rarr;

# Measures on test data

In [8]:
## Load model weights
# Prefer the last checkpoint (in sorted filename order) found under save_path;
# otherwise fall back to the initial model file when one is configured.
modelfiles = sorted(glob.glob('%s/model0*.model'%save_path))

if modelfiles:
    latest = modelfiles[-1]
    s.loadParameters(latest)
    print("Model %s loaded from previous state!"%latest)
    # The digits after the 5-character 'model' prefix of the file stem give the
    # saved iteration; resume from the one after it.
    stem = os.path.splitext(os.path.basename(latest))[0]
    it = int(stem[5:]) + 1
elif initial_model != "":
    s.loadParameters(initial_model)
    print("Model %s loaded!"%initial_model)

# Call updateLearningRate once for every test interval that lies before `it`.
for ii in range(0, it-1):
    if ii % test_interval != 0:
        continue
    clr = s.updateLearningRate(lr_decay)

Model baseline_lite_ap.model loaded!


## Normal audio

In [None]:
# Score the clean trial list with the loaded model and print the resulting EER.
# `eval` is this notebook's evaluation flag (it shadows the builtin of the same name).
if eval:
  sc, lab = s.evaluateFromListSave(test_list, print_interval=100, test_path=test_path)
  # NOTE(review): [1, 0.1] is passed through to tuneThresholdfromScore — presumably target false-alarm rates; confirm.
  result = tuneThresholdfromScore(sc, lab, [1, 0.1])
  print('EER %2.4f'%result[1])

Reading 4700: 1.31 Hz, embed size 512
Computing 37700: 1925.20 Hz

EER 2.2322


## Noisy audio SNR 2.5

In [None]:
# Re-enter the project folder so the relative paths used by the following cells
# ('../data/musan_*.scp', 'veri_test.txt') resolve.
%cd '/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition'

/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition


In [None]:
# Write a noisy copy of every clean VoxCeleb1 test utterance at a fixed SNR.
# Each clean <name>.wav gets a sibling <name>_<snr>.wav in the same folder,
# mixed with a randomly chosen MUSAN track (music, noise or speech).
main_path = '/content/drive/My Drive/Datasets/VoxCeleb1_test/wav'
# Exclusive upper bound for the random line index drawn from each musan_<type>.scp.
noise_choice = {'music':659, 'noise':929, 'speech':425}
sample_rate = 16000
snr = 2.5

# File-name markers of already generated noisy copies; those are never used as clean input.
noisy_markers = ["_2.5.", "_7.5.", "_12.5.", "_17.5."]

# Read each MUSAN list once and close it, instead of re-opening it for every utterance.
musan_lists = {}
for noise_type in noise_choice:
  with open('../data/musan_{}.scp'.format(noise_type)) as scp:
    musan_lists[noise_type] = scp.readlines()

print('Making noisy audio with SNR: {}'.format(snr))
count = 0  # start at zero so the final report is the exact number of files written
for folder in os.listdir(main_path):
  print('Making audio files for {}'.format(folder))
  for video in os.listdir(os.path.join(main_path, folder)):
    for file in os.listdir(os.path.join(main_path, folder, video)):
      if any(marker in file for marker in noisy_markers):
        continue
      noise_type = random.choice(['music', 'noise', 'speech'])
      noise_track = np.random.randint(0, noise_choice[noise_type])
      # Each list line holds two whitespace-separated fields; the second is the track path.
      _, noise_path = musan_lists[noise_type][noise_track].rstrip().split()
      noise_path = noise_path.replace('/export/corpora/', '/content/drive/My Drive/Datasets/')
      noise_audio, _ = librosa.load(noise_path, sr = sample_rate)
      clean_audio, _ = librosa.load(os.path.join(main_path, folder, video, file), sr = sample_rate)
      # splitext removes only the extension; str.strip('.wav') would strip any leading or
      # trailing '.', 'w', 'a' or 'v' characters and could mangle the base name.
      filename = os.path.splitext(file)[0]+'_'+str(snr)+'.wav'
      noisy_audio = add_noise(clean_audio, noise_audio, snr = snr)
      output_path = os.path.join(main_path, folder, video, filename)
      # NOTE(review): librosa.output was removed in librosa 0.8 — confirm the installed version.
      librosa.output.write_wav(output_path, noisy_audio, sr = sample_rate)
      count += 1

print('Made {} noisy audios'.format(count))

Making noisy audio with SNR: 2.5
Making audio files for id10270
Making audio files for id10280
Making audio files for id10279
Making audio files for id10308
Making audio files for id10276
Making audio files for id10275
Making audio files for id10309
Making audio files for id10274
Making audio files for id10307
Making audio files for id10273
Making audio files for id10272
Making audio files for id10306
Making audio files for id10305
Making audio files for id10284
Making audio files for id10304
Making audio files for id10303
Making audio files for id10278
Making audio files for id10271
Making audio files for id10302
Making audio files for id10301
Making audio files for id10300
Making audio files for id10298
Making audio files for id10297
Making audio files for id10296
Making audio files for id10295
Making audio files for id10294
Making audio files for id10292
Making audio files for id10290
Making audio files for id10287
Making audio files for id10286
Making audio files for id10285
Making

In [None]:
# Derive the SNR 2.5 trial list from the clean one. Every line of veri_test.txt
# has three whitespace-separated fields: the first is copied unchanged and the
# two file fields are pointed at their '_2.5.wav' copies.
# The input is opened first so a missing veri_test.txt fails before the output is
# truncated, and both handles are closed when the block ends.
with open('veri_test.txt') as src, open('veri_test_2.5.txt', 'w') as f:
  for line in src:
    files = line.split()
    if not files:
      continue  # tolerate blank lines instead of failing with IndexError
    files[1] = files[1].replace('.wav', '_2.5.wav')
    files[2] = files[2].replace('.wav', '_2.5.wav')
    f.write('{} {} {}\n'.format(files[0], files[1], files[2]))

print('Wrote txt file...')

In [None]:
# Score the SNR 2.5 trial list with the loaded model and print the resulting EER.
# `eval` is this notebook's evaluation flag (it shadows the builtin of the same name).
if eval:
  sc, lab = s.evaluateFromListSave(test_list_25, print_interval=100, test_path=test_path)
  # NOTE(review): [1, 0.1] is passed through to tuneThresholdfromScore — presumably target false-alarm rates; confirm.
  result = tuneThresholdfromScore(sc, lab, [1, 0.1])
  print('EER %2.4f'%result[1])

Reading 4700: 6.66 Hz, embed size 512
Computing 37700: 1906.67 Hz

EER 12.7094


## Noisy audio SNR 7.5

In [None]:
# Re-enter the project folder so the relative paths used by the following cells
# ('../data/musan_*.scp', 'veri_test.txt') resolve.
%cd '/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition'

/content/drive/My Drive/Stage-Imaging/Signal-denoising-in-the-wild/SpeakerRecognition


In [None]:
# Write a noisy copy of every clean VoxCeleb1 test utterance at a fixed SNR.
# Each clean <name>.wav gets a sibling <name>_<snr>.wav in the same folder,
# mixed with a randomly chosen MUSAN track (music, noise or speech).
main_path = '/content/drive/My Drive/Datasets/VoxCeleb1_test/wav'
# Exclusive upper bound for the random line index drawn from each musan_<type>.scp.
noise_choice = {'music':659, 'noise':929, 'speech':425}
sample_rate = 16000
snr = 7.5

# File-name markers of already generated noisy copies; those are never used as clean input.
noisy_markers = ["_2.5.", "_7.5.", "_12.5.", "_17.5."]

# Read each MUSAN list once and close it, instead of re-opening it for every utterance.
musan_lists = {}
for noise_type in noise_choice:
  with open('../data/musan_{}.scp'.format(noise_type)) as scp:
    musan_lists[noise_type] = scp.readlines()

print('Making noisy audio with SNR: {}'.format(snr))
count = 0  # start at zero so the final report is the exact number of files written
for folder in os.listdir(main_path):
  print('Making audio files for {}'.format(folder))
  for video in os.listdir(os.path.join(main_path, folder)):
    for file in os.listdir(os.path.join(main_path, folder, video)):
      if any(marker in file for marker in noisy_markers):
        continue
      noise_type = random.choice(['music', 'noise', 'speech'])
      noise_track = np.random.randint(0, noise_choice[noise_type])
      # Each list line holds two whitespace-separated fields; the second is the track path.
      _, noise_path = musan_lists[noise_type][noise_track].rstrip().split()
      noise_path = noise_path.replace('/export/corpora/', '/content/drive/My Drive/Datasets/')
      noise_audio, _ = librosa.load(noise_path, sr = sample_rate)
      clean_audio, _ = librosa.load(os.path.join(main_path, folder, video, file), sr = sample_rate)
      # splitext removes only the extension; str.strip('.wav') would strip any leading or
      # trailing '.', 'w', 'a' or 'v' characters and could mangle the base name.
      filename = os.path.splitext(file)[0]+'_'+str(snr)+'.wav'
      noisy_audio = add_noise(clean_audio, noise_audio, snr = snr)
      output_path = os.path.join(main_path, folder, video, filename)
      # NOTE(review): librosa.output was removed in librosa 0.8 — confirm the installed version.
      librosa.output.write_wav(output_path, noisy_audio, sr = sample_rate)
      count += 1

print('Made {} noisy audios'.format(count))

Making noisy audio with SNR: 7.5
Making audio files for id10280
Making audio files for id10279
Making audio files for id10308
Making audio files for id10276
Making audio files for id10275
Making audio files for id10309
Making audio files for id10274
Making audio files for id10307
Making audio files for id10273
Making audio files for id10272
Making audio files for id10306
Making audio files for id10305
Making audio files for id10284
Making audio files for id10304
Making audio files for id10303
Making audio files for id10278
Making audio files for id10271
Making audio files for id10302
Making audio files for id10301
Making audio files for id10300
Making audio files for id10298
Making audio files for id10297
Making audio files for id10296
Making audio files for id10295
Making audio files for id10294
Making audio files for id10292
Making audio files for id10290
Making audio files for id10287
Making audio files for id10286
Making audio files for id10285
Making audio files for id10283
Making

In [None]:
# Derive the SNR 7.5 trial list from the clean one. Every line of veri_test.txt
# has three whitespace-separated fields: the first is copied unchanged and the
# two file fields are pointed at their '_7.5.wav' copies.
# The input is opened first so a missing veri_test.txt fails before the output is
# truncated, and both handles are closed when the block ends.
with open('veri_test.txt') as src, open('veri_test_7.5.txt', 'w') as f:
  for line in src:
    files = line.split()
    if not files:
      continue  # tolerate blank lines instead of failing with IndexError
    files[1] = files[1].replace('.wav', '_7.5.wav')
    files[2] = files[2].replace('.wav', '_7.5.wav')
    f.write('{} {} {}\n'.format(files[0], files[1], files[2]))

print('Wrote txt file...')

Wrote txt file...


In [None]:
# Score the SNR 7.5 trial list with the loaded model and print the resulting EER.
# `eval` is this notebook's evaluation flag (it shadows the builtin of the same name).
if eval:
  sc, lab = s.evaluateFromListSave(test_list_75, print_interval=100, test_path=test_path)
  # NOTE(review): [1, 0.1] is passed through to tuneThresholdfromScore — presumably target false-alarm rates; confirm.
  result = tuneThresholdfromScore(sc, lab, [1, 0.1])
  print('EER %2.4f'%result[1])

Reading 4700: 3.24 Hz, embed size 512
Computing 37700: 1843.11 Hz

EER 6.8452


## Noisy audio SNR 12.5

In [None]:
# Write a noisy copy of every clean VoxCeleb1 test utterance at a fixed SNR.
# Each clean <name>.wav gets a sibling <name>_<snr>.wav in the same folder,
# mixed with a randomly chosen MUSAN track (music, noise or speech).
main_path = '/content/drive/My Drive/Datasets/VoxCeleb1_test/wav'
# Exclusive upper bound for the random line index drawn from each musan_<type>.scp.
noise_choice = {'music':659, 'noise':929, 'speech':425}
sample_rate = 16000
snr = 12.5

# File-name markers of already generated noisy copies; those are never used as clean input.
noisy_markers = ["_2.5.", "_7.5.", "_12.5.", "_17.5."]

# Read each MUSAN list once and close it, instead of re-opening it for every utterance.
musan_lists = {}
for noise_type in noise_choice:
  with open('../data/musan_{}.scp'.format(noise_type)) as scp:
    musan_lists[noise_type] = scp.readlines()

print('Making noisy audio with SNR: {}'.format(snr))
count = 0  # start at zero so the final report is the exact number of files written
for folder in os.listdir(main_path):
  print('Making audio files for {}'.format(folder))
  for video in os.listdir(os.path.join(main_path, folder)):
    for file in os.listdir(os.path.join(main_path, folder, video)):
      if any(marker in file for marker in noisy_markers):
        continue
      noise_type = random.choice(['music', 'noise', 'speech'])
      noise_track = np.random.randint(0, noise_choice[noise_type])
      # Each list line holds two whitespace-separated fields; the second is the track path.
      _, noise_path = musan_lists[noise_type][noise_track].rstrip().split()
      noise_path = noise_path.replace('/export/corpora/', '/content/drive/My Drive/Datasets/')
      noise_audio, _ = librosa.load(noise_path, sr = sample_rate)
      clean_audio, _ = librosa.load(os.path.join(main_path, folder, video, file), sr = sample_rate)
      # splitext removes only the extension; str.strip('.wav') would strip any leading or
      # trailing '.', 'w', 'a' or 'v' characters and could mangle the base name.
      filename = os.path.splitext(file)[0]+'_'+str(snr)+'.wav'
      noisy_audio = add_noise(clean_audio, noise_audio, snr = snr)
      output_path = os.path.join(main_path, folder, video, filename)
      # NOTE(review): librosa.output was removed in librosa 0.8 — confirm the installed version.
      librosa.output.write_wav(output_path, noisy_audio, sr = sample_rate)
      count += 1

print('Made {} noisy audios'.format(count))

Making noisy audio with SNR: 12.5
Making audio files for id10280
Making audio files for id10279
Making audio files for id10308
Making audio files for id10276
Making audio files for id10275
Making audio files for id10309
Making audio files for id10274
Making audio files for id10307
Making audio files for id10273
Making audio files for id10272
Making audio files for id10306
Making audio files for id10305
Making audio files for id10284
Making audio files for id10304
Making audio files for id10303
Making audio files for id10278
Making audio files for id10271
Making audio files for id10302
Making audio files for id10301
Making audio files for id10300
Making audio files for id10298
Making audio files for id10297
Making audio files for id10296
Making audio files for id10295
Making audio files for id10294
Making audio files for id10292
Making audio files for id10290
Making audio files for id10287
Making audio files for id10286
Making audio files for id10285
Making audio files for id10283
Makin

In [None]:
# Derive the SNR 12.5 trial list from the clean one. Every line of veri_test.txt
# has three whitespace-separated fields: the first is copied unchanged and the
# two file fields are pointed at their '_12.5.wav' copies.
# The input is opened first so a missing veri_test.txt fails before the output is
# truncated, and both handles are closed when the block ends.
with open('veri_test.txt') as src, open('veri_test_12.5.txt', 'w') as f:
  for line in src:
    files = line.split()
    if not files:
      continue  # tolerate blank lines instead of failing with IndexError
    files[1] = files[1].replace('.wav', '_12.5.wav')
    files[2] = files[2].replace('.wav', '_12.5.wav')
    f.write('{} {} {}\n'.format(files[0], files[1], files[2]))

print('Wrote txt file...')

Wrote txt file...


In [9]:
# Score the SNR 12.5 trial list with the loaded model and print the resulting EER.
# `eval` is this notebook's evaluation flag (it shadows the builtin of the same name).
if eval:
  sc, lab = s.evaluateFromListSave(test_list_125, print_interval=100, test_path=test_path)
  # NOTE(review): [1, 0.1] is passed through to tuneThresholdfromScore — presumably target false-alarm rates; confirm.
  result = tuneThresholdfromScore(sc, lab, [1, 0.1])
  print('EER %2.4f'%result[1])

Reading 4700: 2.03 Hz, embed size 512
Computing 37700: 3091.02 Hz

EER 4.1304


## Noisy audio SNR 17.5

In [None]:
# Write a noisy copy of every clean VoxCeleb1 test utterance at a fixed SNR.
# Each clean <name>.wav gets a sibling <name>_<snr>.wav in the same folder,
# mixed with a randomly chosen MUSAN track (music, noise or speech).
main_path = '/content/drive/My Drive/Datasets/VoxCeleb1_test/wav'
# Exclusive upper bound for the random line index drawn from each musan_<type>.scp.
noise_choice = {'music':659, 'noise':929, 'speech':425}
sample_rate = 16000
snr = 17.5

# File-name markers of already generated noisy copies; those are never used as clean input.
noisy_markers = ["_2.5.", "_7.5.", "_12.5.", "_17.5."]

# Read each MUSAN list once and close it, instead of re-opening it for every utterance.
musan_lists = {}
for noise_type in noise_choice:
  with open('../data/musan_{}.scp'.format(noise_type)) as scp:
    musan_lists[noise_type] = scp.readlines()

print('Making noisy audio with SNR: {}'.format(snr))
count = 0  # start at zero so the final report is the exact number of files written
for folder in os.listdir(main_path):
  print('Making audio files for {}'.format(folder))
  for video in os.listdir(os.path.join(main_path, folder)):
    for file in os.listdir(os.path.join(main_path, folder, video)):
      if any(marker in file for marker in noisy_markers):
        continue
      noise_type = random.choice(['music', 'noise', 'speech'])
      noise_track = np.random.randint(0, noise_choice[noise_type])
      # Each list line holds two whitespace-separated fields; the second is the track path.
      _, noise_path = musan_lists[noise_type][noise_track].rstrip().split()
      noise_path = noise_path.replace('/export/corpora/', '/content/drive/My Drive/Datasets/')
      noise_audio, _ = librosa.load(noise_path, sr = sample_rate)
      clean_audio, _ = librosa.load(os.path.join(main_path, folder, video, file), sr = sample_rate)
      # splitext removes only the extension; str.strip('.wav') would strip any leading or
      # trailing '.', 'w', 'a' or 'v' characters and could mangle the base name.
      filename = os.path.splitext(file)[0]+'_'+str(snr)+'.wav'
      noisy_audio = add_noise(clean_audio, noise_audio, snr = snr)
      output_path = os.path.join(main_path, folder, video, filename)
      # NOTE(review): librosa.output was removed in librosa 0.8 — confirm the installed version.
      librosa.output.write_wav(output_path, noisy_audio, sr = sample_rate)
      count += 1

print('Made {} noisy audios'.format(count))

Making noisy audio with SNR: 17.5
Making audio files for id10280
Making audio files for id10279
Making audio files for id10308
Making audio files for id10276
Making audio files for id10275
Making audio files for id10309
Making audio files for id10274
Making audio files for id10307
Making audio files for id10273
Making audio files for id10272
Making audio files for id10306
Making audio files for id10305
Making audio files for id10284
Making audio files for id10304
Making audio files for id10303
Making audio files for id10278
Making audio files for id10271
Making audio files for id10302
Making audio files for id10301
Making audio files for id10300
Making audio files for id10298
Making audio files for id10297
Making audio files for id10296
Making audio files for id10295
Making audio files for id10294
Making audio files for id10292
Making audio files for id10290
Making audio files for id10287
Making audio files for id10286
Making audio files for id10285
Making audio files for id10283
Makin

In [None]:
# Derive the SNR 17.5 trial list from the clean one. Every line of veri_test.txt
# has three whitespace-separated fields: the first is copied unchanged and the
# two file fields are pointed at their '_17.5.wav' copies.
# The input is opened first so a missing veri_test.txt fails before the output is
# truncated, and both handles are closed when the block ends.
with open('veri_test.txt') as src, open('veri_test_17.5.txt', 'w') as f:
  for line in src:
    files = line.split()
    if not files:
      continue  # tolerate blank lines instead of failing with IndexError
    files[1] = files[1].replace('.wav', '_17.5.wav')
    files[2] = files[2].replace('.wav', '_17.5.wav')
    f.write('{} {} {}\n'.format(files[0], files[1], files[2]))

print('Wrote txt file...')

Wrote txt file...


In [9]:
# Score the SNR 17.5 trial list with the loaded model and print the resulting EER.
# `eval` is this notebook's evaluation flag (it shadows the builtin of the same name).
if eval:
  sc, lab = s.evaluateFromListSave(test_list_175, print_interval=100, test_path=test_path)
  # NOTE(review): [1, 0.1] is passed through to tuneThresholdfromScore — presumably target false-alarm rates; confirm.
  result = tuneThresholdfromScore(sc, lab, [1, 0.1])
  print('EER %2.4f'%result[1])

Reading 4700: 1.33 Hz, embed size 512
Computing 37700: 1943.36 Hz

EER 3.0170


## Denoised audio