In [1]:
import librosa
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import os
import numpy as np

from torch.utils.data import DataLoader, Dataset

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [42]:
from torch.nn.utils.rnn import pad_sequence 
def custom_spec_collate_fn(data):
    """
       data: is a list of tuples with (example, label, length)
             where 'example' is a tensor of arbitrary shape
             and label/length are scalars
    """
    features = [torch.tensor(d['data']) for d in data] #(3)
    labels = torch.tensor([d['target']  for d in data]) 
    new_features = pad_sequence([f.T for f in features], batch_first=True).squeeze()

    return  {
        'data': new_features.to(device), 
        'target': labels.to(device)
    }


In [16]:
class VCC2018SpecDatasetLoad(Dataset):
    def __init__(self, audio_names: list, labels: list):
        self.audio_names = audio_names
        self.labels = labels
        self.transformation = torchaudio.transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=1024,
            hop_length=512,
            n_mels=80
        )
        #self.label_to_id = dict((mos,id) for id, mos in enumerate(labels))
        
    def __len__(self):
        return len(self.audio_names)

    def __getitem__(self, idx):
        filename = self.audio_names[idx]
        #waveform, sample_rate = torchaudio.load(filename)
        wav, sr = torchaudio.load(filename)
        mel_spec = self.transformation(wav).unsqueeze(1)

        target = self.labels[idx]
        
        return {"data": mel_spec, "target": target}

    def get_transform_spec(self, idx, data_dir='./'):
        # get audio path
        specs = []
        audio_name = self.audio_names[idx]
        audio_path = os.path.join(data_dir, audio_name)
          
        # load audio and get its melspectrogram
        audio_wave, sr = librosa.load(audio_path, sr=None)

        mel_spec = librosa.feature.melspectrogram(y=audio_wave, sr=sr)
        
        mel_spec = librosa.power_to_db(mel_spec)
        mel_spec = torch.Tensor(self.normalize(mel_spec)).unsqueeze(0)
        return mel_spec      

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [18]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, 1, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, 3, 1, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

        self._init_weights()
        
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.zeros_(m.bias)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = F.avg_pool2d(x, 2)
        return x

class CNN2D_MODEL(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        self.conv = nn.Sequential(
            ConvBlock(in_channels=1, out_channels=64),
            ConvBlock(in_channels=64, out_channels=128),
            ConvBlock(in_channels=128, out_channels=256),
            ConvBlock(in_channels=256, out_channels=512),
        )
        
        self.fc = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.1),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        x = self.conv(x)
        x = torch.mean(x, dim=3)
        x, _ = torch.max(x, dim=2)
        x = self.fc(x)
        return x


In [31]:
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'

In [32]:
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [33]:
train_data['filepath'] 

0        VCC2018_MOS_preprocessed/wav/N09_VCC2TF1_VCC2S...
1        VCC2018_MOS_preprocessed/wav/N11_VCC2TF2_VCC2S...
2        VCC2018_MOS_preprocessed/wav/N16_VCC2TM1_VCC2S...
3        VCC2018_MOS_preprocessed/wav/N04_VCC2TM1_VCC2S...
4        VCC2018_MOS_preprocessed/wav/N11_VCC2TF1_VCC2S...
                               ...                        
18517    VCC2018_MOS_preprocessed/wav/D04_VCC2TF1_VCC2S...
18518    VCC2018_MOS_preprocessed/wav/N04_VCC2TF2_VCC2S...
18519    VCC2018_MOS_preprocessed/wav/N12_VCC2TF2_VCC2S...
18520    VCC2018_MOS_preprocessed/wav/N18_VCC2TM2_VCC2S...
18521    VCC2018_MOS_preprocessed/wav/D05_VCC2TM2_VCC2S...
Name: filepath, Length: 18522, dtype: object

In [34]:
train_data['filepath'] = '../../../DATASETS/MOS/' + train_data['filepath']
test_data['filepath'] = '../../../DATASETS/MOS/' + test_data['filepath']

In [35]:
categories_train = train_data['score'].to_list()
audio_names_train = train_data['filepath'].to_list()

categories_test = test_data['score'].to_list()
audio_names_test = test_data['filepath'].to_list()

In [36]:
num_classes = len(pd.concat([train_data, test_data])['score'].unique())

In [47]:
dataset_train = VCC2018SpecDatasetLoad(audio_names_train, categories_train)
loader_train = DataLoader(dataset_train, batch_size=48, shuffle=True, collate_fn=custom_spec_collate_fn)

dataset_test = VCC2018SpecDatasetLoad(audio_names_test, categories_test)
loader_test = DataLoader(dataset_test, batch_size=8, shuffle=True, collate_fn=custom_spec_collate_fn)


In [38]:
model = CNN2D_MODEL(num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-1)
criterion = nn.CrossEntropyLoss()

In [48]:
def train(model, iterator, optimizer, criterion, scheduler, epoch=0):
    model.train()
    epoch_loss, accuracy, f1, recall, precision = 0, 0, 0, 0, 0

    total_steps = len(iterator)
    for i, batch in enumerate(iterator):
        data = batch['data'].to(device, dtype=torch.float32).unsqueeze(1)
        labels = batch['target'].to(device)

        optimizer.zero_grad()

        output = model(data)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # calculate metrics
        output = F.softmax(output, 1)
        result = output.argmax(1)

        accuracy += accuracy_score(result.cpu(), labels.cpu())
        f1 += f1_score(result.cpu(), labels.cpu(), average='micro')
        recall += recall_score(result.cpu(), labels.cpu(), average='micro')
        precision += precision_score(result.cpu(), labels.cpu(), average='micro')

        if (i % 100 == 0):
            step_loss = "{:.5f}".format(epoch_loss / (i + 1))
            step_acc = "{:.5f}".format(accuracy / (i + 1))
            step_f1 = "{:.5f}".format(f1 / (i + 1))
            step_recall = "{:.5f}".format(recall / (i + 1))
            step_precision = "{:.5f}".format(precision / (i + 1))
            print(
                f"Train step {i} loss: {step_loss} acc: {step_acc} f1: {step_f1} recall {step_recall} precision: {step_precision}")
    # wandb.log({"loss_train": epoch_loss / (i+1), "accuracy_train": accuracy / (i+1), "f1_train": f1 / (i+1), "recall_train": recall / (i+1), "precision_train": precision / (i+1)})

    accuracy /= (i + 1)
    f1 /= (i + 1)
    epoch_loss /= (i + 1)
    recall /= (i + 1)
    precision /= (i + 1)

    return epoch_loss, accuracy, f1, recall, precision


def evaluate(model, iterator, criterion, epoch):
    model.eval()
    epoch_loss, accuracy, f1, recall, precision = 0, 0, 0, 0, 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            data = batch['data'].to(device, dtype=torch.float32).unsqueeze(1)
            batch_size = data.shape[0]

            labels = batch['target'].to(device)
            output = model(data)
            loss = criterion(output, labels)

            epoch_loss += loss.item()
            result = output.argmax(1)

            accuracy += accuracy_score(result.cpu(), labels.cpu())
            f1 += f1_score(result.cpu(), labels.cpu(), average='micro')
            recall += recall_score(result.cpu(), labels.cpu(), average='micro')
            precision += precision_score(result.cpu(), labels.cpu(), average='micro')

    # wandb.log({"loss_train": epoch_loss / (i+1), "accuracy_train": accuracy / (i+1), "f1_train": f1 / (i+1), "recall_train": recall / (i+1), "precision_train": precision / (i+1)})

    accuracy /= (i + 1)
    f1 /= (i + 1)
    epoch_loss /= (i + 1)
    recall /= (i + 1)
    precision /= (i + 1)

    return epoch_loss, accuracy, f1, recall, precision


In [40]:
lambda2 = lambda epoch: epoch * 0.95
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,  lr_lambda=[lambda2])


In [45]:
import warnings
warnings.filterwarnings("ignore")

In [53]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

EPOCHS = 1

train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
train_f1s, val_f1s = [], []

best_valid_loss = float('inf')

# wandb.watch(model)
for epoch in range(EPOCHS):
    train_loss, train_accuracy, train_f1, train_recall, train_precision = train(model, loader_train, optimizer,
                                                                                criterion, scheduler, epoch)
    val_loss, val_accuracy, val_f1, val_recall, val_precision = evaluate(model, loader_test, criterion, epoch)


    # fill data
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_accuracy)
    train_f1s.append(train_f1)
    val_accuracies.append(val_accuracy)
    val_f1s.append(val_f1)

    torch.save(model.state_dict(), 'best-val-model.pt')

    print(f'Epoch: {epoch + 1:02}')
    print(
        f'\tTrain Loss: {train_loss}, accuracy: {train_accuracy}, f1 {train_f1}, recall {train_recall}, precision {train_precision}')
    print(
        f'\t Val. Loss: {val_loss}, accuracy: {val_accuracy}, f1 {val_f1}, recall {val_recall}, precision {val_precision}')


Train step 0 loss: 3.16676 acc: 0.08333 f1: 0.08333 recall 0.08333 precision: 0.08333
Train step 100 loss: 3.21965 acc: 0.03837 f1: 0.03837 recall 0.03837 precision: 0.03837
Train step 200 loss: 3.20451 acc: 0.04239 f1: 0.04239 recall 0.04239 precision: 0.04239
Train step 300 loss: 3.20255 acc: 0.04298 f1: 0.04298 recall 0.04298 precision: 0.04298
Epoch: 01
	Train Loss: 3.2022181174915687, accuracy: 0.04302368615840121, f1 0.04302368615840121, recall 0.04302368615840121, precision 0.04302368615840121
	 Val. Loss: 3.096637265626774, accuracy: 0.04844961240310078, f1 0.04844961240310078, recall 0.04844961240310078, precision 0.04844961240310078
