In [None]:
import importlib
from SpeakerNet import SpeakerNet
from util import *

import torch
import yaml

from torch.profiler import profile, record_function, ProfilerActivity
from validation import *

In [None]:
# Parse the experiment YAML and expose the entries of its PARAMS section as
# notebook-level constants.
with open('./configs/K_NeXt_TDNN.yaml') as file:
    config = yaml.safe_load(file)

params = config['PARAMS']

BATCH_SIZE = params['BATCH_SIZE']
BASE_LR = float(params['BASE_LR'])  # coerced to float explicitly
NUM_WORKER = params['NUM_WORKER']
CHANNEL_SIZE = params['CHANNEL_SIZE']
EMBEDDING_SIZE = params['EMBEDDING_SIZE']
MAX_FRAME = params['MAX_FRAME']
SAMPLING_RATE = params['SAMPLING_RATE']
MAX_EPOCH = params['MAX_EPOCH']
# The device is pinned to CPU here; the configured value is deliberately bypassed.
DEVICE = 'cpu'  # config['PARAMS']['DEVICE']
BASE_PATH = params['BASE_PATH']

In [None]:
def _load_symbol(module_path, symbol):
    """Import ``module_path`` and return the attribute called ``symbol`` from it."""
    return importlib.import_module(module_path).__getattribute__(symbol)


# Every component below is built from its own section of the YAML config.
# Where a section is splatted with ``*...values()``, the constructor receives
# the values positionally, in the dict's iteration order.
feature_extractor = _load_symbol('preprocessing.mel_transform', "feature_extractor")(
    *config['FEATURE_EXTRACTOR'].values()
).to(DEVICE)

spec_aug = _load_symbol('preprocessing.spec_aug', "spec_aug")(
    *config['SPEC_AUG'].values()
).to(DEVICE)

# The backbone is the only component configured through keyword arguments.
model_cfg = config['MODEL']
model = _load_symbol('models.NeXt_TDNN', "MainModel")(
    depths=model_cfg['depths'],
    dims=model_cfg['dims'],
    kernel_size=model_cfg['kernel_size'],
    block=model_cfg['block'],
).to(DEVICE)

aggregation = _load_symbol('aggregation.vap_bn_tanh_fc_bn', "Aggregation")(
    *config['AGGREGATION'].values()
).to(DEVICE)

# The loss module is not moved here; it is moved together with the wrapper below.
loss_function = _load_symbol("loss.aamsoftmax", "LossFunction")(
    *config['LOSS'].values()
)

# Bundle all stages into one network and place it on DEVICE.
speaker_net = SpeakerNet(
    feature_extractor=feature_extractor,
    spec_aug=spec_aug,
    model=model,
    aggregation=aggregation,
    loss_function=loss_function,
).to(DEVICE)

# The learning rate is the base rate multiplied by the batch size.
optimizer = _load_symbol("optimizer." + 'adamw', "Optimizer")(
    speaker_net.parameters(),
    lr=BASE_LR * BATCH_SIZE,
    weight_decay=0.01,
)

scheduler = _load_symbol("scheduler." + 'steplr', "Scheduler")(
    optimizer,
    step_size=10,
    gamma=0.8,
)

In [None]:
# Second argument evaluates to 48240.
# NOTE(review): presumably an input length in samples (300 frames at a hop of
# 160 plus 240 extra samples) for the helper's parameter/MAC report - confirm.
get_model_param_mmac(speaker_net, 160 * 300 + 240, DEVICE)

In [None]:
# Call the project's check_vram helper.
# NOTE(review): presumably reports GPU memory usage, going by its name - confirm;
# DEVICE is set to 'cpu' in this notebook.
from check_vram import check_vram

check_vram()

In [None]:
# Profile one forward pass of the network on a random 1 x 24320 input.
speaker_net.eval()

# Request CUDA activity only when a CUDA device exists, so a CPU-only host
# profiles without asking for an unavailable backend.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Build the input outside the profiled region so the trace covers the model only.
dummy_waveform = torch.randn(24320,).unsqueeze(0).to(DEVICE)

# no_grad keeps autograd bookkeeping out of the measurement: this is an
# inference profile, so graph construction would only inflate time and memory.
with torch.no_grad():
    with profile(
        activities=profiler_activities,
        with_stack=True,
        profile_memory=True,
        record_shapes=True,
    ) as prof:
        with record_function("model_inference"):
            speaker_net(dummy_waveform)

In [None]:
# Averaged profiler events, grouped by input shape and ordered by total CUDA time.
shape_grouped_stats = prof.key_averages(group_by_input_shape=True)
print(shape_grouped_stats.table(sort_by="cuda_time_total"))

In [None]:
# Averaged profiler events ordered by CUDA memory usage.
averaged_stats = prof.key_averages()
print(averaged_stats.table(sort_by="cuda_memory_usage"))

Original model

In [None]:
# original
# Load the checkpoint and copy its 'state_dict' entry into the network.
# map_location remaps the stored tensors onto DEVICE, so a checkpoint written
# from a GPU still loads when this notebook runs on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
dic = torch.load(
    './experiments/NeXt_TDNN_light_C192_B1_K65/NeXt_TDNN_light_C192_B1_K65.pt',
    map_location=DEVICE,
)
# strict=False tolerates missing/unexpected keys; the returned report of those
# keys is the cell's output, so check it to confirm the weights really matched.
speaker_net.load_state_dict(dic['state_dict'], strict=False)

In [None]:
# Run the validation routine on the currently loaded weights and print the
# four returned metrics, two per line.
cos_eer, euc_eer, cos_dcf, euc_dcf = validation(speaker_net, BASE_PATH, DEVICE)

for template, scores in (
    ('Cosine EER : {0}, Euclidean EER : {1}', (cos_eer, euc_eer)),
    ('Cosine MinDCF : {0}, Euclidean MinDCF : {1}', (cos_dcf, euc_dcf)),
):
    print(template.format(*scores))

Korean model

In [None]:
# knext
# Load the checkpoint and copy its 'model' entry into the network.
# map_location remaps the stored tensors onto DEVICE, so a checkpoint written
# from a GPU still loads when this notebook runs on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
dic = torch.load('./experiments/K_NeXt_TDNN/ckpt_5.pt', map_location=DEVICE)
# strict=False tolerates missing/unexpected keys; the returned report of those
# keys is the cell's output, so check it to confirm the weights really matched.
speaker_net.load_state_dict(dic['model'], strict=False)

In [None]:
# Run the validation routine on the currently loaded weights and print the
# four returned metrics, two per line.
cos_eer, euc_eer, cos_dcf, euc_dcf = validation(speaker_net, BASE_PATH, DEVICE)

for template, scores in (
    ('Cosine EER : {0}, Euclidean EER : {1}', (cos_eer, euc_eer)),
    ('Cosine MinDCF : {0}, Euclidean MinDCF : {1}', (cos_dcf, euc_dcf)),
):
    print(template.format(*scores))

In [None]:
import time
from inference import *

# Time a single forward pass on one audio file.
test_audio_file = 'KHOtest.wav'  # path of the clip to embed

speaker_net.eval()
# load_audio is not defined in this notebook; presumably it comes from the
# star import above - confirm.
test_audio = load_audio(test_audio_file, DEVICE)

# no_grad keeps autograd graph construction out of the measured latency, and
# perf_counter is the monotonic, high-resolution clock meant for intervals
# (time.time can jump when the wall clock is adjusted).
# NOTE: if DEVICE is ever switched to a GPU, synchronize before reading the
# clock, since CUDA kernels run asynchronously.
with torch.no_grad():
    start = time.perf_counter()
    test_emb = speaker_net(test_audio.unsqueeze(0))
    end = time.perf_counter()
print(end-start)