In [1]:
NUM_WORKERS = 2
BASE_TRAIN = '../input/gendersclassification-vdt-2022/dataset_v2/train'
BASE_TEST = '../input/gendersclassification-vdt-2022/public-test/public-test/wav'

In [6]:
import pandas as pd
def get_fn_lbs():
    lbs = []
    fns = []
    for name_dir in os.listdir(BASE_TRAIN):
        for i in os.listdir(os.path.join(BASE_TRAIN,name_dir)):
            if name_dir == 'male':
                for j in os.listdir(os.path.join(BASE_TRAIN,name_dir,i)):
                    dur = librosa.get_duration(filename=os.path.join(BASE_TRAIN,name_dir,i,j))
                    if dur >1:
                        lbs.append(1)
                        fns.append(os.path.join(BASE_TRAIN,name_dir,i,j))
            if name_dir == 'female':
                for j in os.listdir(os.path.join(BASE_TRAIN,name_dir,i)):
                    dur = librosa.get_duration(filename=os.path.join(BASE_TRAIN,name_dir,i,j))
                    if dur >1:
                        lbs.append(0)
                        fns.append(os.path.join(BASE_TRAIN,name_dir,i,j))
    return lbs,fns


def get_fn_test():
    lbs = []
    fns = []
    df = pd.read_csv("../input/gendersclassification-vdt-2022/public-test/public-test/test_files.txt", header=None,names=["gender", "filename"])
    for index, row in df.iterrows():
        if str(row['gender']) == 'M':
            dur = librosa.get_duration(filename=os.path.join('../input/gendersclassification-vdt-2022/public-test/public-test/wav',str(row['filename'])))
            if dur >1:
                lbs.append(1)
                fns.append(os.path.join('../input/gendersclassification-vdt-2022/public-test/public-test/wav',str(row['filename'])))
        elif str(row['gender']) == 'F':
            dur = librosa.get_duration(filename=os.path.join('../input/gendersclassification-vdt-2022/public-test/public-test/wav',str(row['filename'])))
            if dur >1:
                lbs.append(0)
                fns.append(os.path.join('../input/gendersclassification-vdt-2022/public-test/public-test/wav',str(row['filename'])))
    return lbs,fns
test_lbs,test_fns = get_fn_test()

def get_fn_submit():
    fns = []
    lbs = []
    for i in os.listdir('../input/gendersclassification-vdt-2022/private-test/private-test/wav'):
        fns.append(os.path.join('../input/gendersclassification-vdt-2022/private-test/private-test/wav',i))
        lbs.append(0)
    return lbs,fns


In [7]:
import random
import warnings
warnings.filterwarnings("ignore")
import librosa
from torch.utils import data



def load_audio(audio_path, feature_method='melspectrogram', mode='train', sr=8000, chunk_duration=2, augmentors=None):

    wav, sr_ret = librosa.load(audio_path, sr=sr)
    if mode == 'train':

        num_wav_samples = wav.shape[0]

        if num_wav_samples < sr:
            raise Exception(f'：{(num_wav_samples/sr):.2f}s')
        num_chunk_samples = int(chunk_duration * sr)
        if num_wav_samples > num_chunk_samples + 1:
            start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
            stop = start + num_chunk_samples
            wav = wav[start:stop]
            if random.random() > 0.5:
                wav[:random.randint(1, sr // 4)] = 0
                wav = wav[:-random.randint(1, sr // 4)]

        if augmentors is not None:
            for key, augmentor in augmentors.items():
                if key == 'specaug':continue
                wav = augmentor(wav)
    elif mode == 'eval':

        num_wav_samples = wav.shape[0]
        num_chunk_samples = int(chunk_duration * sr)
        if num_wav_samples > num_chunk_samples + 1:
            wav = wav[:num_chunk_samples]

    if feature_method == 'melspectrogram':
        features = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=400, n_mels=80, hop_length=160, win_length=400)
    elif feature_method == 'spectrogram':
        linear = librosa.stft(wav, n_fft=400, win_length=400, hop_length=160)
        features, _ = librosa.magphase(linear)
    else:
        raise Exception(f' not have{feature_method} ！')
    features = librosa.power_to_db(features, ref=1.0, amin=1e-10, top_db=None)
    if mode == 'train' and augmentors is not None:
        for key, augmentor in augmentors.items():
            if key == 'specaug':
                features = augmentor(features)
    mean = np.mean(features, 0, keepdims=True)
    std = np.std(features, 0, keepdims=True)
    features = (features - mean) / (std + 1e-5)
    return features


class CustomDataset(data.Dataset):
    def __init__(self, filenames,labels, feature_method='melspectrogram', mode='train', sr=8000, chunk_duration=2, augmentors=None):
        super(CustomDataset, self).__init__()

        self.feature_method = feature_method
        self.mode = mode
        self.sr = sr
        self.chunk_duration = chunk_duration
        self.augmentors = augmentors
        self.fns = filenames
        self.lbs = labels
    def __getitem__(self, idx):
            fname = self.fns[idx]
            label = self.lbs[idx]
            features = load_audio(fname, feature_method=self.feature_method, mode=self.mode, sr=self.sr,
                                  chunk_duration=self.chunk_duration, augmentors=self.augmentors)
            return features, np.array(int(label), dtype=np.int64)

    def __len__(self):
        return len(self.fns)

    @property
    def input_size(self):
        if self.feature_method == 'melspectrogram':
            return 80
        elif self.feature_method == 'spectrogram':
            return 201



def collate_fn(batch):
    batch = sorted(batch, key=lambda sample: sample[0].shape[1], reverse=True)
    freq_size = batch[0][0].shape[0]
    max_audio_length = batch[0][0].shape[1]
    batch_size = len(batch)

    inputs = np.zeros((batch_size, freq_size, max_audio_length), dtype='float32')
    input_lens = []
    labels = []
    for x in range(batch_size):
        sample = batch[x]
        tensor = sample[0]
        labels.append(sample[1])
        seq_length = tensor.shape[1]
        inputs[x, :, :seq_length] = tensor[:, :]
        input_lens.append(seq_length/max_audio_length)
    input_lens = np.array(input_lens, dtype='float32')
    labels = np.array(labels, dtype='int64')
    return torch.tensor(inputs), torch.tensor(labels), torch.tensor(input_lens)


In [8]:

import torch.nn as nn
from torch.nn import Parameter


class Res2Conv1dReluBn(nn.Module):
    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        self.convs = []
        self.bns = []
        for i in range(self.nums):
            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
            self.bns.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(self.convs)
        self.bns = nn.ModuleList(self.bns)

    def forward(self, x):
        out = []
        spx = torch.split(x, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            # Order: conv -> relu -> bn
            sp = self.convs[i](sp)
            sp = self.bns[i](F.relu(sp))
            out.append(sp)
        if self.scale != 1:
            out.append(spx[self.nums])
        out = torch.cat(out, dim=1)
        return out


class Conv1dReluBn(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        return self.bn(F.relu(self.conv(x)))


class SE_Connect(nn.Module):
    def __init__(self, channels, s=2):
        super().__init__()
        assert channels % s == 0, "{} % {} != 0".format(channels, s)
        self.linear1 = nn.Linear(channels, channels // s)
        self.linear2 = nn.Linear(channels // s, channels)

    def forward(self, x):
        out = x.mean(dim=2)
        out = F.relu(self.linear1(out))
        out = torch.sigmoid(self.linear2(out))
        out = x * out.unsqueeze(2)
        return out


def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
    return nn.Sequential(
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale),
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        SE_Connect(channels)
    )


class AttentiveStatsPool(nn.Module):
    def __init__(self, in_dim, bottleneck_dim):
        super().__init__()
        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        alpha = torch.tanh(self.linear1(x))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
        std = torch.sqrt(residuals.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)


class EcapaTdnn(nn.Module):
    def __init__(self, input_size=80, channels=512, embd_dim=192):
        super().__init__()
        self.layer1 = Conv1dReluBn(input_size, channels, kernel_size=5, padding=2, dilation=1)
        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

        cat_channels = channels * 3
        out_channels = cat_channels * 2
        self.emb_size = embd_dim
        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
        self.pooling = AttentiveStatsPool(cat_channels, 128)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.linear = nn.Linear(out_channels, embd_dim)
        self.bn2 = nn.BatchNorm1d(embd_dim)

    def forward(self, x):
        out1 = self.layer1(x)
        out2 = self.layer2(out1) + out1
        out3 = self.layer3(out1 + out2) + out1 + out2
        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3

        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn1(self.pooling(out))
        out = self.bn2(self.linear(out))
        return out


class Classification(nn.Module):
    def __init__(
            self,
            backbone,
            num_class=1,
            lin_blocks=0,
            lin_neurons=192,
            dropout=0.1, ):
        """

        Args:
            backbone (Paddle.nn.Layer class): the speaker identification backbone network model
            num_class (_type_): the speaker class num in the training dataset
            lin_blocks (int, optional): the linear layer transform between the embedding and the final linear layer. Defaults to 0.
            lin_neurons (int, optional): the output dimension of final linear layer. Defaults to 192.
            dropout (float, optional): the dropout factor on the embedding. Defaults to 0.1.
        """
        super(Classification, self).__init__()
        self.backbone = backbone
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None


        input_size = self.backbone.emb_size
        self.blocks = list()
        for i in range(lin_blocks):
            self.blocks.extend([
                nn.BatchNorm1d(input_size),
                nn.Linear(in_features=input_size, out_features=lin_neurons),
            ])
            input_size = lin_neurons

        # the final layer
        self.weight = Parameter(torch.FloatTensor(num_class, input_size), requires_grad=True)
        nn.init.xavier_normal_(self.weight, gain=1)

    def forward(self, x):
        """Do the speaker identification model forwrd,
           including the speaker embedding model and the classifier model network

        Args:
            x (paddle.Tensor): input audio feats,
                               shape=[batch, dimension, times]
            lengths (paddle.Tensor, optional): input audio length.
                                        shape=[batch, times]
                                        Defaults to None.

        Returns:
            paddle.Tensor: return the logits of the feats
        """
        # x.shape: (N, C, L)
        x = self.backbone(x)  # (N, emb_size)
        if self.dropout is not None:
            x = self.dropout(x)

        for fc in self.blocks:
            x = fc(x)

        logits = F.linear(F.normalize(x), F.normalize(self.weight, dim=-1))

        return logits


In [9]:
import math

import torch.nn as nn
import torch.nn.functional as F


class AdditiveAngularMargin(nn.Module):
    def __init__(self, margin=0.0, scale=1.0, easy_margin=False):
        """The Implementation of Additive Angular Margin (AAM) proposed
       in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition'''
       (https://arxiv.org/abs/1906.07317)

        Args:
            margin (float, optional): margin factor. Defaults to 0.0.
            scale (float, optional): scale factor. Defaults to 1.0.
            easy_margin (bool, optional): easy_margin flag. Defaults to False.
        """
        super(AdditiveAngularMargin, self).__init__()
        self.margin = margin
        self.scale = scale
        self.easy_margin = easy_margin

        self.cos_m = math.cos(self.margin)
        self.sin_m = math.sin(self.margin)
        self.th = math.cos(math.pi - self.margin)
        self.mm = math.sin(math.pi - self.margin) * self.margin

    def forward(self, outputs, targets):
        cosine = outputs.float()
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        outputs = (targets * phi) + ((1.0 - targets) * cosine)
        return self.scale * outputs


In [10]:
!pip install -q torchsummary

[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
import distutils.util

from tqdm import tqdm


def print_arguments(args):
    print("-----------  Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument("--" + argname,
                           default=default,
                           type=type,
                           help=help + '',
                           **kwargs)


def cal_accuracy_threshold(y_score, y_true):
    y_score = np.asarray(y_score)
    y_true = np.asarray(y_true)
    best_accuracy = 0
    best_threshold = 0
    for i in tqdm(range(0, 100)):
        threshold = i * 0.01
        y_test = (y_score >= threshold)
        acc = np.mean((y_test == y_true).astype(int))
        if acc > best_accuracy:
            best_accuracy = acc
            best_threshold = threshold

    return best_accuracy, best_threshold


def cal_accuracy(y_score, y_true, threshold=0.5):
    y_score = np.asarray(y_score)
    y_true = np.asarray(y_true)
    y_test = (y_score >= threshold)
    accuracy = np.mean((y_test == y_true).astype(int))
    return accuracy



def cosin_metric(x1, x2):
    return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))



In [12]:
!pip install -q visualdl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
markdown 3.3.7 requires importlib-metadata>=4.4; python_version < "3.10", but you have importlib-metadata 4.2.0 which is incompatible.
gym 0.23.1 requires importlib-metadata>=4.10.0; python_version < "3.10", but you have importlib-metadata 4.2.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [18]:
import argparse
import functools
import os
import time
from datetime import datetime, timedelta
import copy

import numpy as np
import torch
import yaml
from torch.utils.data import DataLoader
from torch.nn import DataParallel
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchsummary import summary
from visualdl import LogWriter
from sklearn.model_selection import train_test_split
# from modules.loss import AAMLoss
# from modules.ecapa_tdnn import EcapaTdnn, SpeakerIdetification
# from data_utils.reader import CustomDataset, collate_fn
# from data_utils.noise_perturb import NoisePerturbAugmentor
# from data_utils.speed_perturb import SpeedPerturbAugmentor
# from data_utils.volume_perturb import VolumePerturbAugmentor
# from data_utils.spec_augment import SpecAugmentor
# from utils.utility import add_arguments, print_arguments


parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('gpus',             str,    '0',                      )
add_arg('use_model',        str,    'ecapa_tdnn',             )
add_arg('batch_size',       int,    256,                      )
add_arg('num_workers',      int,    4,                        )
add_arg('num_epoch',        int,    100,                      )
add_arg('num_speakers',     int,    2,                        )
add_arg('learning_rate',    float,  1e-3,                     )
add_arg('save_model_dir',   str,    'models/',                )
add_arg('feature_method',   str,    'melspectrogram'         , choices=['melspectrogram', 'spectrogram'])
add_arg('augment_conf_path',str,    'configs/augment.yml',    )
add_arg('resume',           str,    None,                     )
add_arg('pretrained_model', str,    None,                     )
args, unknown = parser.parse_known_args()


@torch.no_grad()
def evaluate(model, eval_loader):
    model.eval()
    accuracies = []
    device = torch.device("cuda")
    lbs = []
    pred = []
    
    for batch_id, (audio, label, _) in enumerate(eval_loader):
        audio = audio.to(device)
        output = model(audio)
        # 计算准确率
        lbs+=label.cpu().tolist()
        output = output.data.cpu().numpy()
        output = np.argmax(output, axis=1)
        pred+=output.tolist()
        
        label = label.data.cpu().numpy()
        acc = np.mean((output == label).astype(int))
        accuracies.append(acc.item())
    model.train()
    return float(sum(accuracies) / len(accuracies)) , pred , lbs

def save_model(save_path, model, optimizer, epoch):
    os.makedirs(save_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(save_path, 'model.pth'))
    torch.save({'last_epoch': torch.tensor(epoch)}, os.path.join(save_path, 'model.state'))
    torch.save(optimizer.state_dict(), os.path.join(save_path, 'optimizer.pth'))





In [14]:
lbs,fns = get_fn_lbs()
print("Done")
train_fns, val_fns, train_lbs, val_lbs = train_test_split(fns, lbs, test_size=0.1, random_state=42,shuffle = True)
train_fns, test_fns, train_lbs, test_lbs = train_test_split(fns, lbs, test_size=0.1, random_state=42,shuffle = True)


Done


In [19]:
acc_save_model = 0
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
print_arguments(args)

device_ids = [int(i) for i in args.gpus.split(',')]

writer = LogWriter(logdir='log')

augmentors = None
#     if args.augment_conf_path is not None:
#         augmentors = {}
#         with open(args.augment_conf_path, encoding="utf-8") as fp:
#             configs = yaml.load(fp, Loader=yaml.FullLoader)
#         augmentors['noise'] = NoisePerturbAugmentor(**configs['noise'])
#         augmentors['speed'] = SpeedPerturbAugmentor(**configs['speed'])
#         augmentors['volume'] = VolumePerturbAugmentor(**configs['volume'])
#         augmentors['specaug'] = SpecAugmentor(**configs['specaug'])




train_dataset = CustomDataset(train_fns,train_lbs,
                              feature_method=args.feature_method,
                              mode='train',
                              sr=8000,
                              chunk_duration=2,
                              augmentors=augmentors)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=args.batch_size * len(device_ids),
                          collate_fn=collate_fn,
                          shuffle=True,
                          num_workers=args.num_workers)

eval_dataset = CustomDataset(val_fns,val_lbs,
                             feature_method=args.feature_method,
                             mode='eval',
                             sr=8000,
                             chunk_duration=2)
eval_loader = DataLoader(dataset=eval_dataset,
                         batch_size=args.batch_size,
                         collate_fn=collate_fn,
                         num_workers=args.num_workers)

device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")

if args.use_model == 'ecapa_tdnn':
    ecapa_tdnn = EcapaTdnn(input_size=train_dataset.input_size)
    model = Classification(backbone=ecapa_tdnn, num_class=args.num_speakers)
else:
    raise Exception(f'{args.use_model} 模型不存在！')

if len(args.gpus.split(',')) > 1:
    model = DataParallel(model, device_ids=device_ids, output_device=device_ids[0])

model.to(device)
if len(args.gpus.split(',')) > 1:
    summary(model.module, (train_dataset.input_size, 98), device='cuda')
else:
    summary(model, (train_dataset.input_size, 98), device='cuda')


last_epoch = 0

optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=5e-4)

scheduler = CosineAnnealingLR(optimizer, T_max=args.num_epoch)


if args.pretrained_model is not None:
    model_dict = model.state_dict()
    param_state_dict = torch.load(os.path.join(args.pretrained_model, 'model.pth'))
    for name, weight in model_dict.items():
        if name in param_state_dict.keys():
            if list(weight.shape) != list(param_state_dict[name].shape):
                print('{} not used, shape {} unmatched with {} in model.'.
                      format(name, list(param_state_dict[name].shape), list(weight.shape)))
                param_state_dict.pop(name, None)
        else:
            print('Lack weight: {}'.format(name))
    model.load_state_dict(param_state_dict, strict=False)



if args.resume is not None:
    model.load_state_dict(torch.load(os.path.join(args.resume, 'model.pth')))
    state = torch.load(os.path.join(args.resume, 'model.state'))
    last_epoch = state['last_epoch']
    optimizer_state = torch.load(os.path.join(args.resume, 'optimizer.pth'))
    optimizer.load_state_dict(optimizer_state)

criterion = nn.CrossEntropyLoss()
train_step = 0
test_step = 0
sum_batch = len(train_loader) * (args.num_epoch - last_epoch)
model = model.to(device)


-----------  Configuration Arguments -----------
augment_conf_path: configs/augment.yml
batch_size: 256
feature_method: melspectrogram
gpus: 0
learning_rate: 0.001
num_epoch: 100
num_speakers: 2
num_workers: 4
pretrained_model: None
resume: None
save_model_dir: models/
test_list_path: dataset/test_list.txt
train_list_path: dataset/train_list.txt
use_model: ecapa_tdnn
------------------------------------------------
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1              [-1, 512, 98]         204,800
       BatchNorm1d-2              [-1, 512, 98]           1,024
      Conv1dReluBn-3              [-1, 512, 98]               0
            Conv1d-4              [-1, 512, 98]         262,144
       BatchNorm1d-5              [-1, 512, 98]           1,024
      Conv1dReluBn-6              [-1, 512, 98]               0
            Conv1d-7               [-1, 64, 98]          12,288
    

In [20]:

for epoch in range(last_epoch, args.num_epoch):
    loss_sum = []
    accuracies = []
    start = time.time()
    for batch_id, (audio, label, _) in enumerate(train_loader):          
        audio = audio.to(device)
        label = label.to(device).long()
        output = model(audio)

        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        output = output.data.cpu().numpy()
        output = np.argmax(output, axis=1)
        label = label.data.cpu().numpy()
        acc = np.mean((output == label).astype(int))
        accuracies.append(acc.item())
        loss_sum.append(loss.item())

        if batch_id % 30 == 0:
            eta_sec = ((time.time() - start) * 1000) * (sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
            eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
            print(f'[{datetime.now()}] '
                  f'Train epoch [{epoch}/{args.num_epoch}], '
                  f'batch: [{batch_id}/{len(train_loader)}], '
                  f'loss: {(sum(loss_sum) / len(loss_sum)):.5f}, '
                  f'accuracy: {(sum(accuracies) / len(accuracies)):.5f}, '
                  f'lr: {scheduler.get_lr()[0]:.8f}, '
                  f'eta: {eta_str}')
            writer.add_scalar('Train/Loss', loss.item(), train_step)
            writer.add_scalar('Train/Accuracy', (sum(accuracies) / len(accuracies)), train_step)
            train_step += 1
        start = time.time()

    s = time.time()
    acc, _ ,_ = evaluate(model, eval_loader)
    eta_str = str(timedelta(seconds=int(time.time() - s)))
    print('='*70)
    print(f'[{datetime.now()}] Test {epoch}, accuracy: {acc:.5f} time: {eta_str}')
    print('='*70)
    writer.add_scalar('Test/Accuracy', acc, test_step)

    writer.add_scalar('Train/Learning rate', scheduler.get_lr()[0], epoch)
    test_step += 1
    
    
    scheduler.step()

    save_path = os.path.join(args.save_model_dir, args.use_model)
    
    
    if acc > acc_save_model:
        save_path_1 = 'model.pt'
        print("Save model acc>acc_save" + str(acc))
        acc_save_model = acc
        best_model = copy.deepcopy(model)
        save_model(save_path_1, best_model, optimizer, epoch)
        
    if len(device_ids) > 1:
        print("Save model de>1")
        save_model(save_path, model.module, optimizer, epoch)
    else:
        print("Save model de<1")
        save_model(save_path, model, optimizer, epoch)



[2022-06-13 13:17:02.708153] Train epoch [0/100], batch: [0/52], loss: 0.69333, accuracy: 0.52734, lr: 0.00100000, eta: 11:37:00
[2022-06-13 13:18:04.227575] Train epoch [0/100], batch: [30/52], loss: 0.34344, accuracy: 0.93284, lr: 0.00100000, eta: 0:47:28
[2022-06-13 13:18:57.201923] Test 0, accuracy: 0.94745 time: 0:00:15
Save model acc>acc_save0.9474500240847784
Save model de<1
[2022-06-13 13:19:06.159999] Train epoch [1/100], batch: [0/52], loss: 0.22104, accuracy: 0.98047, lr: 0.00099951, eta: 12:16:11
[2022-06-13 13:19:58.579912] Train epoch [1/100], batch: [30/52], loss: 0.24485, accuracy: 0.95728, lr: 0.00099951, eta: 1:25:04
[2022-06-13 13:20:39.723473] Test 1, accuracy: 0.95331 time: 0:00:10
Save model acc>acc_save0.9533093990847784
Save model de<1
[2022-06-13 13:20:48.947786] Train epoch [2/100], batch: [0/52], loss: 0.22561, accuracy: 0.95703, lr: 0.00099827, eta: 12:26:41
[2022-06-13 13:21:42.042546] Train epoch [2/100], batch: [30/52], loss: 0.21064, accuracy: 0.96510, l

In [22]:

test_dataset = CustomDataset(test_fns,test_lbs,
                             feature_method=args.feature_method,
                             mode='eval',
                             sr=8000,
                             chunk_duration=2)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=args.batch_size,
                         collate_fn=collate_fn,
                         num_workers=args.num_workers)

In [23]:
submit_lbs,submit_fns = get_fn_submit()
submit_dataset = CustomDataset(submit_fns,submit_lbs,
                             feature_method=args.feature_method,
                             mode='eval',
                             sr=8000,
                             chunk_duration=2)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=args.batch_size,
                         collate_fn=collate_fn,shuffle=False,
                         num_workers=args.num_workers)

NameError: name 'submit_fns' is not defined

In [24]:
acc, pred, lbs = evaluate(model, test_loader)

In [26]:
from sklearn.metrics import classification_report
print(classification_report(lbs,pred,digits=4))

              precision    recall  f1-score   support

           0     0.9662    0.9620    0.9641       684
           1     0.9663    0.9701    0.9682       769

    accuracy                         0.9663      1453
   macro avg     0.9663    0.9660    0.9662      1453
weighted avg     0.9663    0.9663    0.9663      1453



In [27]:
print(acc)

0.9656001625722542


In [29]:
acc, pred, lbs = evaluate(best_model, test_loader)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(lbs,pred,digits=4))

              precision    recall  f1-score   support

           0     0.9652    0.9722    0.9687       684
           1     0.9751    0.9688    0.9720       769

    accuracy                         0.9704      1453
   macro avg     0.9701    0.9705    0.9703      1453
weighted avg     0.9704    0.9704    0.9704      1453



In [None]:
# acc, pred, lbs = evaluate(best_model, test_loader)
# print(acc)