In [None]:
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import os
import numpy as np

from torch.utils.data import DataLoader, Dataset

In [None]:
!nvidia-smi

Sat May 14 01:43:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
!pip install -U pip gdown

[0m

In [None]:
# https://drive.google.com/file/d/1eLVEr6dvycpzCBsW7qcpYE5LoSahgz_W/view?usp=sharing
import gdown
import os

os.chdir('/content')
# Named `file_id` (not `id`) so the built-in `id()` is not shadowed for the
# rest of the kernel session.
file_id = '1eLVEr6dvycpzCBsW7qcpYE5LoSahgz_W'
url = "https://drive.google.com/uc?id={}".format(file_id)
output = "VCC2018_MOS_preprocessed.tar.bz"

gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1eLVEr6dvycpzCBsW7qcpYE5LoSahgz_W
To: /content/VCC2018_MOS_preprocessed.tar.bz
100%|██████████| 1.32G/1.32G [00:13<00:00, 97.7MB/s]


'VCC2018_MOS_preprocessed.tar.bz'

In [None]:
import tarfile

# The context manager closes the archive handle even if extraction fails
# part-way through.
with tarfile.open(output, "r:bz2") as tar:
    tar.extractall()

In [None]:
import pandas as pd

# The MOS list has no header row; its two columns are named below.
mos_list_path = '/content/VCC2018_MOS_preprocessed/mos_list.txt'
wav_prefix = '/content/VCC2018_MOS_preprocessed/wav/'

df = pd.read_csv(mos_list_path, header=None)
df.columns = ['filepath', 'score']
# Prefix the wav directory so every entry is a full path.
df['filepath'] = wav_prefix + df['filepath']
df.head()

Unnamed: 0,filepath,score
0,/content/VCC2018_MOS_preprocessed/wav/N14_VCC2...,1.25
1,/content/VCC2018_MOS_preprocessed/wav/N14_VCC2...,2.75
2,/content/VCC2018_MOS_preprocessed/wav/N14_VCC2...,3.5
3,/content/VCC2018_MOS_preprocessed/wav/N14_VCC2...,1.75
4,/content/VCC2018_MOS_preprocessed/wav/N14_VCC2...,4.0


In [None]:
num_classes = len(df['score'].unique())
num_classes

21

In [None]:
from sklearn import preprocessing

# Replace each distinct score by an integer class id (fit and transform in one call).
le = preprocessing.LabelEncoder()
df['score'] = le.fit_transform(df['score'])

In [None]:
df['score']

0         1
1         7
2        11
3         3
4        14
         ..
22355    20
22356    20
22357    20
22358    20
22359    20
Name: score, Length: 22360, dtype: int64

In [None]:
df['score'].shape

(22360,)

In [None]:
len(df)

22360

In [None]:
from tqdm import tqdm
import torchaudio

data = []
# (filepath, error message) for every file that torchaudio failed to load,
# so dropped samples are visible instead of silently disappearing.
unreadable_files = []

wavs_folder = '/content/VCC2018_MOS_preprocessed/wav/'
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    filepath = row['filepath']
    score = row['score']
    try:
        # There are some broken files; loading is used purely as a validity
        # check, the decoded audio is discarded.
        torchaudio.load(filepath)
    except Exception as e:
        unreadable_files.append((filepath, str(e)))
        continue
    data.append({
        "filepath": filepath,
        "score": score
    })

print(f"Kept {len(data)} readable files, skipped {len(unreadable_files)} unreadable files")

100%|██████████| 22360/22360 [00:23<00:00, 958.31it/s] 


In [None]:
data = pd.DataFrame(data)

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed keeps the train/test partition (and therefore every reported
# metric) reproducible across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.10, random_state=42)

In [None]:
# Plain Python lists of labels and file paths for each split.
categories_train, audio_names_train = train_data['score'].tolist(), train_data['filepath'].tolist()

categories_test, audio_names_test = test_data['score'].tolist(), test_data['filepath'].tolist()

In [None]:
# get all unique categories
categories_types = np.sort(df['score'].unique())
categories_types, categories_types.shape

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20]), (21,))

In [None]:
# We precompute and save mel spectrograms so that the audio does not have to be reloaded and converted every time it is needed.
def normalize(spec, eps=1e-6):
    """Standardize a spectrogram and rescale it to the 8-bit range [0, 255].

    Args:
        spec: numpy array of spectrogram values.
        eps: small constant added to the standard deviation so that
            standardization never divides by zero.

    Returns:
        ``np.uint8`` array with the same shape as ``spec``. The smallest value
        maps to 0 and the largest to 255 (fractions are truncated by the cast).
        A constant input has no dynamic range and maps to all zeros.
    """
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if value_range == 0:
        # Constant input: min-max scaling would compute 0/0 (NaN), which has
        # no defined uint8 representation. Return an explicit all-zero image.
        return np.zeros(spec_norm.shape, dtype=np.uint8)
    spec_scaled = 255 * (spec_norm - spec_min) / value_range
    return spec_scaled.astype(np.uint8)

def get_transform_specs(audio_names, data_dir='./'):
    """Load audio files and convert each one to a normalized mel spectrogram.

    Every clip is brought to exactly 5 seconds at its native sampling rate
    (reflect-padded when shorter, truncated when longer) so that all clips
    with the same sampling rate yield spectrograms of the same width.

    Args:
        audio_names: iterable of file names/paths, joined onto ``data_dir``.
        data_dir: directory prefix for the entries of ``audio_names``.

    Returns:
        List with one array per file: the output of ``normalize`` with a
        leading axis of length 1 added.
    """
    specs = []
    for audio_name in audio_names:
        audio_path = os.path.join(data_dir, audio_name)

        # sr=None keeps the file's native sampling rate. It is passed by
        # keyword because recent librosa releases reject it positionally.
        audio_wave, sr = librosa.load(audio_path, sr=None)

        target_len = 5 * sr
        missing = target_len - audio_wave.shape[0]
        if missing > 0:
            # Split the padding so both sides sum to exactly `missing`.
            # Padding ceil(missing / 2) on each side overshoots by one sample
            # whenever `missing` is odd.
            right = missing // 2
            left = missing - right
            audio_wave = np.pad(audio_wave, (left, right), mode='reflect')
        else:
            audio_wave = audio_wave[:target_len]

        mel_spec = librosa.feature.melspectrogram(
            y=audio_wave, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300
        )

        mel_spec = librosa.power_to_db(mel_spec, top_db=80)
        mel_spec = normalize(mel_spec)
        specs.append(mel_spec[np.newaxis, ...])

    return specs

In [None]:
# Position of each category value within `categories_types`.
mos_to_id = {mos: position for position, mos in enumerate(categories_types)}

In [None]:
# If the spectrograms are not precomputed and saved, use this dataset instead.
class VCC2018MOSWavDatasetLoad(Dataset):
    """Dataset that reads waveforms from disk on demand.

    Item ``idx`` pairs the audio loaded from ``audio_names[idx]`` with
    ``labels[idx]``.
    """

    def __init__(self, audio_names: list, labels: list):
        self.audio_names, self.labels = audio_names, labels

    def __len__(self):
        return len(self.audio_names)

    def __getitem__(self, idx):
        # torchaudio also returns the sample rate; it is not used here.
        wav, _ = torchaudio.load(self.audio_names[idx])
        return {"wav": wav, "target": self.labels[idx]}

In [None]:
from torch.nn.utils.rnn import pad_sequence 

def wav_collate_fn(data):
    """Collate variable-length waveform samples into one zero-padded batch.

    Args:
        data: list of dicts with keys ``'wav'`` and ``'target'``, as produced
            by the dataset's ``__getitem__``. Each ``'wav'`` is assumed to be
            laid out as (channels, time), with a single channel.

    Returns:
        Dict with ``'wav'`` (samples right-padded with zeros to the longest
        one in the batch) and ``'target'`` (1-D tensor of labels), both moved
        to the module-level ``device``.
    """
    # Reuse tensors as they are: torch.tensor(existing_tensor) copies the data
    # and emits a "copy construct" UserWarning.
    features = [
        d['wav'] if isinstance(d['wav'], torch.Tensor) else torch.tensor(d['wav'])
        for d in data
    ]
    labels = torch.tensor([d['target'] for d in data])

    # pad_sequence pads along the first axis, so time has to come first.
    padded = pad_sequence([f.T for f in features], batch_first=True)
    # Drop only the trailing channel axis. A bare .squeeze() would also remove
    # the batch axis when the batch holds a single sample (e.g. the last,
    # incomplete batch of an epoch).
    new_features = padded.squeeze(-1)

    return {
        'wav': new_features.to(device),
        'target': labels.to(device)
    }

In [None]:
dataset_train = VCC2018MOSWavDatasetLoad(audio_names_train, categories_train)
loader_train = DataLoader(dataset_train, batch_size=15, shuffle=True, collate_fn=wav_collate_fn)

dataset_test = VCC2018MOSWavDatasetLoad(audio_names_test, categories_test)
# Evaluation gains nothing from shuffling; a fixed order keeps it deterministic.
loader_test = DataLoader(dataset_test, batch_size=2, shuffle=False, collate_fn=wav_collate_fn)


In [None]:
# Peek at the first test batch: type, shape and device index of each tensor.
for batch in loader_test:
    for key in ('wav', 'target'):
        print(type(batch[key]))
        print(batch[key].shape)
        print(batch[key].get_device())
    break

  if __name__ == '__main__':


<class 'torch.Tensor'>
torch.Size([2, 40652])
0
<class 'torch.Tensor'>
torch.Size([2])
0


In [None]:
# Peek at the first training batch: type, shape and device index of each tensor.
for batch in loader_train:
    for key in ('wav', 'target'):
        print(type(batch[key]))
        print(batch[key].shape)
        print(batch[key].get_device())
    break
    

<class 'torch.Tensor'>
torch.Size([15, 84722])
0
<class 'torch.Tensor'>
torch.Size([15])
0


  if __name__ == '__main__':


In [None]:
# Fetch the sample once: each dataset lookup reads and decodes the wav file,
# so indexing twice would load the same file twice.
sample = dataset_train[10]
one_spec = sample['wav']
one_label = sample['target']

In [None]:
#one_spec = torch.tensor(one_spec, dtype=torch.int32)
one_spec = one_spec.cpu().numpy() 

In [None]:
one_spec.shape

(1, 99200)

In [None]:
!pip install transformers

[0m

In [None]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

# Checkpoint name whose configuration is loaded below.
model_name_or_path = "facebook/wav2vec2-base-960h"
# NOTE(review): this string is not read by any of the visible cells — confirm whether it is still needed.
pooling_mode = "mean"

# NOTE(review): only referenced by the commented-out label2id/id2label arguments below.
label_list = categories_types
# Load the checkpoint's configuration, overriding the number of output labels
# with the number of distinct categories.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=len(categories_types),
    #label2id={label: i for i, label in enumerate(label_list)},
    #id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)

In [None]:
config.num_labels

21

In [None]:
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: dropout -> linear -> tanh -> dropout -> linear.

    Layer sizes and the dropout probability are read from ``config``
    (``hidden_size``, ``final_dropout``, ``num_labels``).
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(config.final_dropout)
        self.classifier = nn.Linear(hidden, config.num_labels)

    def forward(self, features, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        projected = self.dense(self.dropout(features))
        activated = torch.tanh(projected)
        return self.classifier(self.dropout(activated))


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """wav2vec 2.0 encoder, mean pooling over time, then a classification head."""

    def __init__(self, config):
        super().__init__(config)
        # Read the class count from the config instead of hard-coding it, so
        # the attribute always matches the width of the classifier output.
        self.num_labels = config.num_labels

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Call ``_freeze_parameters()`` on the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def forward(
            self,
            input_values,
            attention_mask=None
    ):
        """Compute class logits for a batch of waveforms.

        Args:
            input_values: batch of waveforms, passed to the encoder as is.
            attention_mask: optional mask forwarded to the encoder
                (``None`` keeps the encoder's default behaviour).

        Returns:
            Unnormalized logits with one row per input and ``num_labels``
            columns. No softmax is applied: ``nn.CrossEntropyLoss`` applies
            log-softmax itself, so feeding it probabilities squashes the
            gradients and pins the loss near ln(num_labels). Apply
            ``F.softmax(logits, 1)`` at the call site when probabilities are
            needed.
        """
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask
        )
        hidden_states = outputs.last_hidden_state
        # Mean (not sum) over the time axis, so the pooled magnitude does not
        # grow with clip length or with the amount of padding in the batch.
        pooled = hidden_states.mean(dim=1)
        return self.classifier(pooled)

In [None]:
os.listdir('/content/VCC2018_MOS_preprocessed/wav')[:10]

['N12_VCC2TF1_VCC2SM2_30002_HUB.wav',
 'N19_VCC2TM2_VCC2SM1_30027_HUB.wav',
 'N15_VCC2TF2_VCC2SF1_30020_HUB.wav',
 'N13_VCC2TM1_VCC2SM4_30028_SPO.wav',
 'N04_VCC2TM2_VCC2SM4_30005_SPO.wav',
 'S00_VCC2TFX_VCC2SM1_30012_NAT.wav',
 'N09_VCC2TM1_VCC2SF2_30034_HUB.wav',
 'D01_VCC2TM2_VCC2SF1_30016_HUB.wav',
 'N03_VCC2TF2_VCC2SF3_30034_SPO.wav',
 'N04_VCC2TM2_VCC2SM1_30009_HUB.wav']

In [None]:
# Start from the pretrained checkpoint: building the model from `config` alone
# leaves every weight randomly initialized. The classification head is not
# part of the checkpoint, so it is still freshly initialized (transformers
# reports this with a warning).
wav2vec_classifier = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path, config=config)
wav2vec_classifier = wav2vec_classifier.to(device)

In [None]:
ex_audio, sr = librosa.load('/content/VCC2018_MOS_preprocessed/wav/D03_VCC2TM1_VCC2SF1_30030_HUB.wav')
ex_audio.shape

(115089,)

In [None]:
ex_audio = torch.Tensor(ex_audio).unsqueeze(0)

In [None]:
ex_audio.shape

torch.Size([1, 115089])

In [None]:
output = wav2vec_classifier(ex_audio.to(device))
output.shape

torch.Size([1, 21])

In [None]:
output.shape

torch.Size([1, 21])

In [None]:
F.softmax(output, 1)

tensor([[0.0453, 0.0922, 0.0948, 0.0489, 0.0613, 0.0290, 0.0271, 0.0425, 0.0646,
         0.0136, 0.0354, 0.0263, 0.0269, 0.0607, 0.0276, 0.0410, 0.0733, 0.0407,
         0.0360, 0.0555, 0.0571]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [None]:
tmp = batch['wav']
tmp.shape

torch.Size([3, 64881])

In [None]:
output = wav2vec_classifier(tmp)
output.shape

torch.Size([3, 21])

In [None]:
len(loader_train)

6174

# Training

In [None]:
import time
import math
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
def train(model, iterator, optimizer, criterion, scheduler, epoch=0):
    """Run one training epoch and return batch-averaged loss and metrics.

    Args:
        model: module called as ``model(wavs)``; its output is passed to
            ``criterion`` and arg-maxed over dim 1 to obtain predictions.
        iterator: sized iterable of batches, each a dict holding ``'wav'``
            and ``'target'`` tensors.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(output, labels)``.
        scheduler: learning-rate scheduler, stepped once per batch.
        epoch: unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(epoch_loss, accuracy, f1, recall, precision)``, each averaged
        over the batches of the epoch.

    Raises:
        ValueError: if ``iterator`` contains no batches.
    """
    model.train()

    epoch_loss, accuracy, f1, recall, precision = 0, 0, 0, 0, 0
    total_steps = len(iterator)
    if total_steps == 0:
        raise ValueError("train() received an empty iterator; nothing to train on")
    # Report about 100 times per epoch. max() keeps the modulus positive when
    # the epoch has fewer than 100 batches.
    log_every = max(1, total_steps // 100)

    for i, batch in enumerate(iterator):
        specs = batch['wav'].to(device, dtype=torch.float32)
        labels = batch['target'].to(device)

        optimizer.zero_grad()

        output = model(specs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()

        # Move predictions/labels to the CPU once and reuse them for every
        # metric; sklearn metrics take (y_true, y_pred) in that order.
        y_pred = output.argmax(1).cpu()
        y_true = labels.cpu()

        batch_accuracy = accuracy_score(y_true, y_pred)
        accuracy += batch_accuracy
        f1 += f1_score(y_true, y_pred, average='micro')
        recall += recall_score(y_true, y_pred, average='micro')
        precision += precision_score(y_true, y_pred, average='micro')

        if i % log_every == 0:
            print("Train step {0} / {1}  loss: {2:.5f} acc: {3:.5f}".format(i, total_steps, loss.item(), batch_accuracy))

    # Average every accumulator, including f1, over the number of batches.
    accuracy /= total_steps
    f1 /= total_steps
    epoch_loss /= total_steps
    recall /= total_steps
    precision /= total_steps

    return epoch_loss, accuracy, f1, recall, precision

def evaluate(model, iterator, criterion, epoch):
    """Evaluate the model without gradient tracking.

    Args:
        model: module called as ``model(wavs)``; its output is passed to
            ``criterion`` and arg-maxed over dim 1 to obtain predictions.
        iterator: iterable of batches, each a dict holding ``'wav'`` and
            ``'target'`` tensors.
        criterion: loss called as ``criterion(output, labels)``.
        epoch: unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(epoch_loss, accuracy, f1, recall, precision)``, each averaged
        over the batches seen.

    Raises:
        ValueError: if ``iterator`` yields no batches.
    """
    model.eval()
    epoch_loss, accuracy, f1, recall, precision = 0, 0, 0, 0, 0
    n_batches = 0

    with torch.no_grad():
        for batch in iterator:
            specs = batch['wav'].to(device, dtype=torch.float32)
            labels = batch['target'].to(device)
            output = model(specs)

            epoch_loss += criterion(output, labels).item()

            # sklearn metrics take (y_true, y_pred) in that order.
            y_pred = output.argmax(1).cpu()
            y_true = labels.cpu()

            accuracy += accuracy_score(y_true, y_pred)
            f1 += f1_score(y_true, y_pred, average='micro')
            recall += recall_score(y_true, y_pred, average='micro')
            precision += precision_score(y_true, y_pred, average='micro')
            n_batches += 1

    if n_batches == 0:
        # Without this guard the averaging below would fail with an
        # unrelated-looking error.
        raise ValueError("evaluate() received an empty iterator; cannot average metrics over zero batches")

    accuracy /= n_batches
    f1 /= n_batches
    epoch_loss /= n_batches
    recall /= n_batches
    precision /= n_batches

    return epoch_loss, accuracy, f1, recall, precision

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Both results are truncated to integers; the seconds part is what is left
    after removing the whole minutes.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# wandb.init(project='emble_audio_classification', entity='miana')
# config = wandb.config

# change model below
# model = CNNNetwork().to(device)
model = wav2vec_classifier.to(device)
# A small learning rate, as is usual for fine-tuning a transformer encoder;
# Adam at 5e-2 drives the network into a constant prediction within a few steps.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# `train` calls scheduler.step() once per batch, so the argument of the lambda
# counts optimizer steps, not epochs. The multiplier starts at 1 and decays by
# a factor of 0.95 per epoch's worth of batches. (A multiplier of
# `step * 0.95` is 0 on the first step and then grows without bound.)
steps_per_epoch = max(1, len(loader_train))
lambda2 = lambda step: 0.95 ** (step / steps_per_epoch)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,  lr_lambda=[lambda2])

In [None]:
N_EPOCHS = 10

train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
train_f1s, val_f1s = [], []

best_valid_loss = float('inf')

#wandb.watch(model)
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_accuracy, train_f1, train_recall, train_precision = train(model, loader_train, optimizer, criterion, scheduler, epoch)
    val_loss, val_accuracy, val_f1, val_recall, val_precision = evaluate(model, loader_test, criterion, epoch)

    end_time = time.time()

    # fill data
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_accuracy)
    train_f1s.append(train_f1)
    val_accuracies.append(val_accuracy)
    val_f1s.append(val_f1)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so that
    # 'best-val-model.pt' really holds the best epoch and is not overwritten
    # by later, worse ones.
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'best-val-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins} m {epoch_secs} s')
    print(f'\tTrain Loss: {train_loss}, accuracy: {train_accuracy}, f1 {train_f1}, recall {train_recall}, precision {train_precision}')
    print(f'\t Val. Loss: {val_loss}, accuracy: {val_accuracy}, f1 {val_f1}, recall {val_recall}, precision {val_precision}')

Train step 0 / 1235  loss: 3.04410 acc: 0.06667
Train step 12 / 1235  loss: 3.12317 acc: 0.00000
Train step 24 / 1235  loss: 2.98984 acc: 0.13333
Train step 36 / 1235  loss: 3.12317 acc: 0.00000
Train step 48 / 1235  loss: 2.98984 acc: 0.13333
Train step 60 / 1235  loss: 3.12317 acc: 0.00000
Train step 72 / 1235  loss: 3.12317 acc: 0.00000
Train step 84 / 1235  loss: 3.05650 acc: 0.06667
Train step 96 / 1235  loss: 3.12317 acc: 0.00000
Train step 108 / 1235  loss: 3.12317 acc: 0.00000
Train step 120 / 1235  loss: 3.05650 acc: 0.06667
Train step 132 / 1235  loss: 3.05650 acc: 0.06667
Train step 144 / 1235  loss: 2.98984 acc: 0.13333
Train step 156 / 1235  loss: 2.98984 acc: 0.13333
Train step 168 / 1235  loss: 3.12317 acc: 0.00000
Train step 180 / 1235  loss: 3.05650 acc: 0.06667
Train step 192 / 1235  loss: 3.12317 acc: 0.00000
Train step 204 / 1235  loss: 3.12317 acc: 0.00000
Train step 216 / 1235  loss: 3.12317 acc: 0.00000
Train step 228 / 1235  loss: 3.05650 acc: 0.06667
Train step

KeyboardInterrupt: ignored