In [1]:
import importlib
from SpeakerNet import SpeakerNet
from util import *

import torch
import yaml

from torch.profiler import profile, record_function, ProfilerActivity
from validation import *

In [2]:
with open('./configs/K_NeXt_TDNN.yaml') as file:
#with open('toy.yaml') as file:
    config = yaml.safe_load(file)

BATCH_SIZE = config['PARAMS']['BATCH_SIZE']
BASE_LR = float(config['PARAMS']['BASE_LR'])
NUM_WORKER = config['PARAMS']['NUM_WORKER']
CHANNEL_SIZE = config['PARAMS']['CHANNEL_SIZE']
EMBEDDING_SIZE = config['PARAMS']['EMBEDDING_SIZE']
MAX_FRAME = config['PARAMS']['MAX_FRAME']
SAMPLING_RATE = config['PARAMS']['SAMPLING_RATE']
MAX_EPOCH = config['PARAMS']['MAX_EPOCH']
DEVICE = 'cpu' #config['PARAMS']['DEVICE']
BASE_PATH = config['PARAMS']['BASE_PATH']

In [3]:
feature_extractor = importlib.import_module('preprocessing.mel_transform').__getattribute__("feature_extractor")
feature_extractor = feature_extractor(*config['FEATURE_EXTRACTOR'].values()).to(DEVICE)

#fe = feature_extractor(x.to(DEVICE))
#print('feature extractor :', fe.shape)

spec_aug = importlib.import_module('preprocessing.spec_aug').__getattribute__("spec_aug")
spec_aug = spec_aug(*config['SPEC_AUG'].values()).to(DEVICE)

#sa = spec_aug(fe)
#print('spec aug :', sa.shape)

model_cfg = config['MODEL']
model = importlib.import_module('models.NeXt_TDNN').__getattribute__("MainModel")
model =  model(
    depths = model_cfg['depths'], 
    dims = model_cfg['dims'],
    kernel_size = model_cfg['kernel_size'],
    block = model_cfg['block']
).to(DEVICE)

#m = model(sa.to(DEVICE))
#print('model :', m.shape)

aggregation = importlib.import_module('aggregation.vap_bn_tanh_fc_bn').__getattribute__("Aggregation")
aggregation = aggregation(*config['AGGREGATION'].values()).to(DEVICE)

#a = aggregation(m).to(DEVICE)
#print('aggregation : ', a.shape)

loss_function = importlib.import_module("loss.aamsoftmax").__getattribute__("LossFunction")
loss_function = loss_function(*config['LOSS'].values())

speaker_net = SpeakerNet(feature_extractor = feature_extractor,
                       spec_aug = spec_aug, 
                       model = model,
                       aggregation=aggregation,
                       loss_function = loss_function).to(DEVICE)

optimizer = importlib.import_module("optimizer." + 'adamw').__getattribute__("Optimizer")
optimizer = optimizer(speaker_net.parameters(), lr= BASE_LR*BATCH_SIZE, weight_decay = 0.01,)    

scheduler = importlib.import_module("scheduler." + 'steplr').__getattribute__("Scheduler")
scheduler = scheduler(optimizer, step_size = 10, gamma = 0.8)

Initialised AAMSoftmax margin 0.300 scale 40.000
⚡ feature_extractor ⚡
Mel_Spectrogram(
  (pre_emphasis): PreEmphasis()
  (mel_spectrogram): MelSpectrogram(
    (spectrogram): Spectrogram()
    (mel_scale): MelScale()
  )
)
⚡ spec_aug ⚡
SpecAugment(
  (fm): FrequencyMasking()
  (tm): TimeMasking()
)
⚡ model ⚡
NeXtTDNN(
  (stem): ModuleList(
    (0): Sequential(
      (0): Conv1d(80, 192, kernel_size=(4,), stride=(1,))
      (1): LayerNorm()
    )
  )
  (stages): ModuleList(
    (0-2): 3 x Sequential(
      (0): TSConvNeXt_light(
        (dwconv): Conv1d(192, 192, kernel_size=(65,), stride=(1,), padding=(32,), groups=192)
        (norm): LayerNorm()
        (pwconv1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (grn): GRN()
        (pwconv2): Linear(in_features=768, out_features=192, bias=True)
        (drop_path): Identity()
      )
    )
  )
  (MFA): Sequential(
    (0): Conv1d(576, 576, kernel_size=(1,), stride=(1,))
    (1): L

In [4]:
get_model_param_mmac(speaker_net, int(160*300 + 240), DEVICE)

SpeakerNet(
  1.63 M, 84.888% Params, 420.52 MMac, 98.346% MACs, 
  (feature_extractor): Mel_Spectrogram(
    0, 0.000% Params, 0.0 Mac, 0.000% MACs, 
    (pre_emphasis): PreEmphasis(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
    (mel_spectrogram): MelSpectrogram(
      0, 0.000% Params, 0.0 Mac, 0.000% MACs, 
      (spectrogram): Spectrogram(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
      (mel_scale): MelScale(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
    )
  )
  (spec_aug): SpecAugment(
    0, 0.000% Params, 0.0 Mac, 0.000% MACs, 
    (fm): FrequencyMasking(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
    (tm): TimeMasking(0, 0.000% Params, 0.0 Mac, 0.000% MACs, )
  )
  (model): NeXtTDNN(
    1.32 M, 68.832% Params, 395.25 MMac, 92.439% MACs, 
    (stem): ModuleList(
      (0): Sequential(
        61.63 k, 3.215% Params, 18.43 MMac, 4.310% MACs, 
        (0): Conv1d(61.63 k, 3.215% Params, 18.43 MMac, 4.310% MACs, 80, 192, kernel_size=(4,), stride=(1,))
        (1): LayerNorm(0, 0.000% P

('427.59', '1.92', 418.415136, 1.627416, 155.27652, 1.917144)

In [4]:
from check_vram import check_vram

check_vram()

>> GPU:0 총 VRAM: 8188.00 MB
>> GPU:0 사용 중인 VRAM: 805.69 MB
>> GPU:0 남은 VRAM: 7382.31 MB


In [5]:
speaker_net.eval()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_stack=True,profile_memory=True, record_shapes=True) as prof:
    with record_function("model_inference"):
        speaker_net(torch.randn(24320,).unsqueeze(0).to(DEVICE))

In [6]:
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total"))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls                                                                      Input Shapes  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                 model_inference        19.65%       3.696ms       100.00%      18.809ms      18.809ms       3.210ms        17.11%      18.763ms

In [7]:
print(prof.key_averages().table(sort_by="cuda_memory_usage"))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                 model_inference        19.65%       3.696ms       100.00%      18.809ms      18.809ms       3.210ms        17.11%      18.763ms      18.763ms           0 b     -15.94 Mb             1  
                     aten::randn         0.24%      44.400us         0.80%     150.100us     150.100us      36.000us         0.19%     152.000us     152.000us      95.00 Kb           0 b 

Original

In [12]:
# original
dic = torch.load('./experiments/NeXt_TDNN_light_C192_B1_K65/NeXt_TDNN_light_C192_B1_K65.pt')
speaker_net.load_state_dict(dic['state_dict'], strict=False)

  dic = torch.load(r'C:\Users\jwjln\Desktop\SV\SpeakerVerification\experiments\NeXt_TDNN_light_C192_B1_K65\NeXt_TDNN_light_C192_B1_K65.pt')


_IncompatibleKeys(missing_keys=['feature_extractor.pre_emphasis.flipped_filter', 'feature_extractor.mel_spectrogram.spectrogram.window', 'feature_extractor.mel_spectrogram.mel_scale.fb', 'model.stages.0.0.dwconv.weight', 'model.stages.0.0.dwconv.bias', 'model.stages.0.0.norm.weight', 'model.stages.0.0.norm.bias', 'model.stages.0.0.pwconv1.weight', 'model.stages.0.0.pwconv1.bias', 'model.stages.0.0.grn.gamma', 'model.stages.0.0.grn.beta', 'model.stages.0.0.pwconv2.weight', 'model.stages.0.0.pwconv2.bias', 'model.stages.1.0.dwconv.weight', 'model.stages.1.0.dwconv.bias', 'model.stages.1.0.norm.weight', 'model.stages.1.0.norm.bias', 'model.stages.1.0.pwconv1.weight', 'model.stages.1.0.pwconv1.bias', 'model.stages.1.0.grn.gamma', 'model.stages.1.0.grn.beta', 'model.stages.1.0.pwconv2.weight', 'model.stages.1.0.pwconv2.bias', 'model.stages.2.0.dwconv.weight', 'model.stages.2.0.dwconv.bias', 'model.stages.2.0.norm.weight', 'model.stages.2.0.norm.bias', 'model.stages.2.0.pwconv1.weight', 'mod

In [13]:
cos_eer, euc_eer, cos_dcf, euc_dcf = validation(speaker_net, BASE_PATH, DEVICE)
print('Cosine EER : {0}, Euclidean EER : {1}'.format(cos_eer, euc_eer))
print('Cosine MinDCF : {0}, Euclidean MinDCF : {1}'.format(cos_dcf, euc_dcf))

Model Validation..


100%|██████████| 50700/50700 [03:47<00:00, 222.38it/s]

Cosine EER : 36.34319526627219, Euclidean EER : 36.34319526627219
Cosine MinDCF : 0.9282445759368835, Euclidean MinDCF : 0.9282445759368835





Korean model

In [10]:
# knext
dic = torch.load('./experiments/K_NeXt_TDNN/ckpt_5.pt')
speaker_net.load_state_dict(dic['model'], strict=False)

  dic = torch.load('./experiments/K_NeXt_TDNN/ckpt_5.pt')


<All keys matched successfully>

In [11]:
cos_eer, euc_eer, cos_dcf, euc_dcf = validation(speaker_net, BASE_PATH, DEVICE)
print('Cosine EER : {0}, Euclidean EER : {1}'.format(cos_eer, euc_eer))
print('Cosine MinDCF : {0}, Euclidean MinDCF : {1}'.format(cos_dcf, euc_dcf))

Model Validation..


100%|██████████| 50700/50700 [15:14<00:00, 55.42it/s]

Cosine EER : 11.44378698224852, Euclidean EER : 11.44378698224852
Cosine MinDCF : 0.4206311637080868, Euclidean MinDCF : 0.4206311637080868





In [22]:
import time
from eval import *

test_audio_file = 'KHOtest.wav' ####

speaker_net.eval()
test_audio = load_audio(test_audio_file, DEVICE)

start = time.time()
test_emb = speaker_net(test_audio.unsqueeze(0))
end = time.time()
print(end-start)

0.001005411148071289
