## Import Library

In [None]:
import os
import math
import time
import pickle
import random

import librosa
from scipy.io import wavfile
import numpy as np
import pandas as pd
from PIL import Image
import sklearn.metrics
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from fastprogress import master_bar, progress_bar
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms

## Constants

In [None]:
# Clip-length limits (seconds) and their sample-count equivalents; these
# are consumed by read_audio.
duration = 3
duration_long = 5
SAMPLE_RATE = 44100  # passed to librosa as `sr`
SAMPLE_DATA_CNT = SAMPLE_RATE * duration            # 132300 samples
SAMPLE_DATA_CNT_LONG = SAMPLE_RATE * duration_long  # 220500 samples

## spectrogram
n_mels = 128
fmin = 20
fmax = SAMPLE_RATE // 2
n_fft = n_mels * 20  # 2560

## Data Path

In [None]:
# Competition data: CSV label files and the matching audio directories.
DATA_PATH = '../input/freesound-audio-tagging-2019/'

CURATED_CSV = f'{DATA_PATH}train_curated.csv'
CURATED_DIR = f'{DATA_PATH}train_curated/'
NOISY_CSV = f'{DATA_PATH}train_noisy.csv'
NOISY_DIR = f'{DATA_PATH}train_noisy/'
TEST_CSV = f'{DATA_PATH}sample_submission.csv'
TEST_DIR = f'{DATA_PATH}test/'


# Precomputed spectrogram / MFCC pickles and the CSV of the selected noisy subset.
SPEC_DATA_PATH = "../input/preprocessed-data-spectrogram/"

BEST_50_CSV = f"{SPEC_DATA_PATH}trn_noisy_best50s.csv"
CURATED_TRAIN_MEL = f"{SPEC_DATA_PATH}mels_train_curated.pkl"
NOISY_BEST50_TRAIN_MEL = f"{SPEC_DATA_PATH}mels_trn_noisy_best50s.pkl"
CURATED_TRAIN_MFCC = f"{SPEC_DATA_PATH}mfcc_train_curated.pkl"
NOISY_BEST50_TRAIN_MFCC = f"{SPEC_DATA_PATH}mfcc_trn_noisy_best50s.pkl"

## Load preprocessed spectrogram

In [None]:
def load_pkl(filename):
    """Return the object stored in the pickle file at ``filename``.

    NOTE: unpickling executes code embedded in the file, so only use this
    on files from a trusted source.
    """
    with open(filename, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
%%time
# Precomputed mel-spectrogram features (curated set, then noisy subset)
x_train_curated_mels, x_train_noisy_mels = (
    load_pkl(pkl_path) for pkl_path in (CURATED_TRAIN_MEL, NOISY_BEST50_TRAIN_MEL)
)

# Precomputed MFCC features, same order
x_train_curated_mfcc, x_train_noisy_mfcc = (
    load_pkl(pkl_path) for pkl_path in (CURATED_TRAIN_MFCC, NOISY_BEST50_TRAIN_MFCC)
)

# Label tables for the two sets
train_curated_csv, train_noisy_csv = (
    pd.read_csv(csv_path) for csv_path in (CURATED_CSV, BEST_50_CSV)
)

## Drop corrupted / wrongly labeled files

In [None]:
%%time
# Curated clips to exclude from the label table.
# NOTE(review): only the CSV rows are removed here; the pickled feature lists
# are presumably already filtered the same way -- confirm.
drop_audio_file = ['f76181c4.wav', '77b925c2.wav', '6a1f682a.wav', 'c7db12aa.wav', '7752cc8a.wav','1d44b0bd.wav']
is_dropped = train_curated_csv['fname'].isin(drop_audio_file)
drop_index = train_curated_csv.index[is_dropped].values
train_curated_csv = train_curated_csv.drop(index=drop_index)

## Concatenate two lists

In [None]:
%%time
# Append the noisy features to the curated list in place, then drop the old
# names so only the combined list stays referenced.
x_train_mels = x_train_curated_mels
x_train_mels += x_train_noisy_mels
del x_train_curated_mels, x_train_noisy_mels

x_train_mfcc = x_train_curated_mfcc
x_train_mfcc += x_train_noisy_mfcc
del x_train_curated_mfcc, x_train_noisy_mfcc

# Stack the label tables in the same order (curated first) and expand the
# comma-separated label strings into one indicator column per label.
train_csv = pd.concat((train_curated_csv, train_noisy_csv), ignore_index=True)
y_train = train_csv['labels'].str.get_dummies(sep=',')
del train_csv, train_curated_csv, train_noisy_csv

# The three lengths are printed so they can be compared by eye.
print(len(x_train_mels), len(x_train_mfcc), len(y_train))

## Preprocess Data

In [None]:
## OPEN AND TRIM THE AUDIO FILE
def read_audio(pathdir, trim_long_data):
    """Load a WAV file, strip silence and bring it to a usable length.

    Leading/trailing silence is removed with ``librosa.effects.trim``.
    Afterwards:

    * clips longer than ``SAMPLE_DATA_CNT_LONG`` samples are cut down to
      exactly that many samples near a peak-amplitude sample when
      ``trim_long_data`` is true, and returned at full length otherwise;
    * clips shorter than ``SAMPLE_DATA_CNT`` samples are repeated as often as
      fits and then zero-padded (at a random offset) to ``SAMPLE_DATA_CNT``;
    * clips in between are returned unchanged;
    * clips that are empty after trimming yield ``SAMPLE_DATA_CNT`` zeros.

    Args:
        pathdir: Path of the WAV file to read.
        trim_long_data: Whether to cut clips longer than
            ``SAMPLE_DATA_CNT_LONG`` samples.

    Returns:
        The processed samples as a float array (assumes mono input --
        TODO confirm for multi-channel files).
    """
    _, data = wavfile.read(pathdir)
    data = data.astype(float)

    # workaround: 0 length causes error in librosa.effects.trim
    if len(data) > 0:
        data, _ = librosa.effects.trim(data)  # trim, top_db=default(60)

    if len(data) == 0:
        # Nothing to tile or pad from; previously this divided by zero.
        return np.zeros(SAMPLE_DATA_CNT, dtype=float)

    if len(data) > SAMPLE_DATA_CNT_LONG:
        if trim_long_data:
            data = _crop_near_peak(data, SAMPLE_DATA_CNT_LONG)
    elif len(data) < SAMPLE_DATA_CNT:
        # Repeat the clip as many whole times as fit (at least once here).
        # Clips that already reach SAMPLE_DATA_CNT skip this branch: tiling
        # them zero times used to replace the audio with pure padding.
        data = np.tile(data, SAMPLE_DATA_CNT // len(data))
        if len(data) < SAMPLE_DATA_CNT:
            missing = SAMPLE_DATA_CNT - len(data)
            offset = np.random.randint(missing)
            data = np.pad(data, (offset, missing - offset), "constant")

    return data


def _crop_near_peak(data, length):
    """Cut a ``length``-sample window of ``data`` near a peak-amplitude sample.

    Requires ``len(data) > length``. The window position is randomised but
    always lies fully inside ``data``.
    """
    magnitude = np.abs(data)
    peak_value = magnitude.max()
    # Several samples may share the peak value; pick one of them at random.
    peak_positions = np.flatnonzero(magnitude == peak_value)
    peak = peak_positions[np.random.randint(len(peak_positions))]
    max_offset = len(data) - length

    if peak_value == 0:
        return data[:length]

    tail = len(data) - peak
    if tail < peak:
        # Peak lies in the second half: choose how far the window extends
        # to the right of the peak.
        if tail < max_offset:
            offset = np.random.randint(tail)
        else:
            offset = tail - np.random.randint(max_offset)
        return data[peak - (length - offset):peak + offset]

    # Peak lies in the first half: choose how far the window extends to the
    # left of the peak.
    if peak == 0:
        offset = 0
    elif peak < max_offset:
        offset = np.random.randint(peak)
    else:
        offset = peak - np.random.randint(max_offset)
    return data[peak - offset:peak + (length - offset)]

def audio_to_melspectrogram(audio):
    """Compute a dB-scaled mel spectrogram and MFCC-derived features.

    Args:
        audio: Audio samples, passed to librosa as ``y``.

    Returns:
        Tuple ``(spectrogram, mfcc)``: ``spectrogram`` is the mel power
        spectrogram converted to dB relative to its maximum; ``mfcc`` is the
        second-order delta of 13 MFCCs computed from that spectrogram (the
        raw coefficients themselves are not returned).
    """
    # `y` is passed by keyword: newer librosa releases no longer accept the
    # audio as a positional argument, while older ones accept the keyword.
    spectrogram = librosa.feature.melspectrogram(
        y=audio,
        sr=SAMPLE_RATE,
        n_mels=n_mels,
        n_fft=n_fft,
        fmin=fmin,
        fmax=fmax,
    )
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=13)
    mfcc = librosa.feature.delta(mfcc, order=2)
    return spectrogram, mfcc

def read_as_melspectrogram(pathdir, trim_long_data):
    """Read the WAV file at ``pathdir`` and return its ``(mels, mfcc)`` features.

    ``trim_long_data`` is forwarded to ``read_audio``; the features come from
    ``audio_to_melspectrogram``.
    """
    samples = read_audio(pathdir, trim_long_data)
    return audio_to_melspectrogram(samples)

In [None]:
# Convert a single-channel array into a 3-channel uint8 (RGB-like) image
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardize ``X``, rescale it to [0, 255] and replicate it to 3 channels.

    Args:
        X: Input array; it is stacked three times along a new last axis.
        mean: Value subtracted before scaling. Defaults to the mean of ``X``.
        std: Value the centered data is divided by (plus ``eps``). Defaults
            to the standard deviation of the centered data.
        norm_max: Upper clipping bound of the standardized data. Defaults to
            its maximum.
        norm_min: Lower clipping bound of the standardized data. Defaults to
            its minimum.
        eps: Small constant guarding the division by ``std`` and used as the
            threshold below which the data counts as constant.

    Returns:
        ``np.uint8`` array of shape ``X.shape + (3,)``. All zeros when the
        standardized data is (almost) constant.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Defaults are detected with `is None` so that explicit
    # zeros (e.g. mean=0 or norm_min=0) are honoured instead of being
    # silently replaced by the data statistics.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Clip to the requested range and normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        return V.astype(np.uint8)

    # Just zero
    return np.zeros_like(Xstd, dtype=np.uint8)

In [None]:
def convert_wav_to_image(csv_path):
    """Build mel-spectrogram images and MFCC features for the files in a CSV.

    The CSV's ``fname`` column names WAV files inside ``TEST_DIR``. Clips are
    read with ``trim_long_data=True``.

    Returns:
        Tuple ``(X_mels, X_mfcc)`` of lists in CSV order: the mel
        spectrograms after ``mono_to_color`` and the MFCC features as
        returned by ``read_as_melspectrogram``.
    """
    mel_images, mfcc_features = [], []
    for wav_name in pd.read_csv(csv_path)['fname']:
        mel_db, mfcc_feat = read_as_melspectrogram(TEST_DIR + wav_name, trim_long_data=True)
        mel_images.append(mono_to_color(mel_db))
        mfcc_features.append(mfcc_feat)
    return mel_images, mfcc_features

def save_as_pkl_binary(obj, filename):
    """Pickle ``obj`` to ``filename`` using the highest available protocol."""
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Whether CUDA is available

In [None]:
# Ask PyTorch whether a CUDA device is usable; the boolean is shown as the cell output.
torch.cuda.is_available()

## NN model define

In [None]:
## convolution block
class ConvBlock(nn.Module):
    """Two 3x3 conv -> batch-norm -> ReLU stages followed by 2x2 average pooling.

    The first stage maps ``in_channels`` to ``out_channels``; the second keeps
    ``out_channels``. The convolutions preserve the spatial size and the final
    pooling halves it.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = self._make_stage(in_channels, out_channels)
        self.conv2 = self._make_stage(out_channels, out_channels)
        self._init_weights()

    @staticmethod
    def _make_stage(n_in, n_out):
        """Build one conv(3x3, stride 1, padding 1) -> batch-norm -> ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(n_in, n_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(n_out),
            nn.ReLU(),
        )

    def _init_weights(self):
        """Kaiming-normal conv weights, zero biases, unit batch-norm scale."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.zeros_(layer.bias)
            elif isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply both stages, then average-pool with a 2x2 window."""
        return F.avg_pool2d(self.conv2(self.conv1(x)), 2)

In [None]:
## Classifier
class Classifier(nn.Module):
    """Five ConvBlocks, global pooling and a two-layer head producing logits.

    Args:
        num_classes: Number of output logits per sample.
        n_dim: Number of input channels.
    """

    def __init__(self, num_classes, n_dim):
        super().__init__()

        # Channel widths of consecutive ConvBlocks: n_dim -> 32 -> ... -> 512
        widths = [n_dim, 32, 64, 128, 256, 512]
        self.conv = nn.Sequential(*[
            ConvBlock(in_channels=c_in, out_channels=c_out)
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ])

        self.fc = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.1),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return raw (pre-sigmoid) class scores for a batch of images."""
        features = self.conv(x)
        # Collapse the spatial axes: mean over the last one, then max over
        # the remaining one, leaving one 512-vector per sample.
        pooled = features.mean(dim=3).max(dim=2)[0]
        return self.fc(pooled)

In [None]:
## Show the model structure
# Built with 80 output classes and 3 input channels; the notebook displays the module's repr.
Classifier(80,3)

## lwlrap Function
 [reference](https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8?fbclid=IwAR0CDfIoN_LJyTFEmW50-wRLoqRJIsdh79oSV2xpir_3ajyJbzwoV2xK21U#scrollTo=5HfziEYbodWk)

In [None]:
def _one_sample_positive_class_precisions(scores, truth):
  """Calculate precisions for each true class for a single sample.

  Args:
    scores: np.array of (num_classes,) giving the individual classifier scores.
    truth: np.array of (num_classes,) bools indicating which classes are true.

  Returns:
    pos_class_indices: np.array of indices of the true classes for this sample.
    pos_class_precisions: np.array of precisions corresponding to each of those
      classes.
  """
  num_classes = scores.shape[0]
  pos_class_indices = np.flatnonzero(truth > 0)
  # Only calculate precisions if there are some true classes.
  if not len(pos_class_indices):
    return pos_class_indices, np.zeros(0)
  # Retrieval list of classes for this sample.
  retrieved_classes = np.argsort(scores)[::-1]
  # class_rankings[top_scoring_class_index] == 0 etc.
  # The builtin int/bool/float replace the np.int/np.bool/np.float aliases,
  # which no longer exist in current NumPy releases.
  class_rankings = np.zeros(num_classes, dtype=int)
  class_rankings[retrieved_classes] = np.arange(num_classes)
  # Which of these is a true label?
  retrieved_class_true = np.zeros(num_classes, dtype=bool)
  retrieved_class_true[class_rankings[pos_class_indices]] = True
  # Num hits for every truncated retrieval list.
  retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
  # Precision of retrieval list truncated at each hit, in order of pos_labels.
  precision_at_hits = (
      retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
      (1 + class_rankings[pos_class_indices].astype(float)))
  return pos_class_indices, precision_at_hits

In [None]:
def calculate_per_class_lwlrap(truth, scores):
  """Compute per-class label-weighted label-ranking average precision.

  Arguments:
    truth: np.array of (num_samples, num_classes); entries > 0 mark the
      classes present in each sample.
    scores: np.array of the same shape with the classifier's real-valued
      score for each class of each sample.

  Returns:
    per_class_lwlrap: np.array of (num_classes,) with the lwlrap of each class.
    weight_per_class: np.array of (num_classes,) with each class's share of
      all true labels. The overall lwlrap is
      np.sum(per_class_lwlrap * weight_per_class).
  """
  assert truth.shape == scores.shape
  num_samples, num_classes = scores.shape
  # One precision slot per (sample, class); only the slots of classes that
  # are true for the sample get filled in, the rest stay zero.
  precision_table = np.zeros((num_samples, num_classes))
  for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
    hit_classes, hit_precisions = _one_sample_positive_class_precisions(
        sample_scores, sample_truth)
    precision_table[row, hit_classes] = hit_precisions
  labels_per_class = (truth > 0).sum(axis=0)
  total_labels = float(labels_per_class.sum())
  weight_per_class = labels_per_class / total_labels
  # Column average over the labelled samples only; max(1, .) avoids dividing
  # by zero for classes that never occur.
  per_class_lwlrap = (
      precision_table.sum(axis=0) / np.maximum(1, labels_per_class))
  return per_class_lwlrap, weight_per_class

## DataSet

問題：<br>
15秒的音訊檔轉出來的spectrogram大小為(128,1292)<br>
而該kernel只能承受(128,128)的圖片大小<br>
如果使用resize是否會造成圖片太過壓縮?<br><br>

解決方式1: img.resize((width, height),Image.ANTIALIAS) //平滑處理 <br>
解決方式2: 先切割出一段區間的音訊檔，再使用resize

In [None]:
class FATTrainDataset(Dataset):
    """Dataset yielding a random 256-column crop of a spectrogram and its label.

    Args:
        mels: Indexable collection of arrays accepted by ``Image.fromarray``.
        labels: Indexable collection of numpy label vectors, parallel to ``mels``.
        transforms: Callable applied to the cropped PIL image; its result must
            support ``.div_`` (i.e. be a tensor).
    """

    def __init__(self, mels, labels, transforms):
        super().__init__()
        self.mels, self.labels, self.transforms = mels, labels, transforms

    def __len__(self):
        return len(self.mels)

    def __getitem__(self, idx):
        spec = Image.fromarray(self.mels[idx])
        width, height = spec.size
        # Random horizontal window of 256 columns over the full height.
        left = random.randint(0, width - 256)
        window = spec.crop([left, 0, left + 256, height])

        tensor = self.transforms(window).div_(255)
        target = torch.from_numpy(self.labels[idx]).float()
        return tensor, target

In [None]:
class FATTestDataset(Dataset):
    """Dataset serving every test spectrogram ``tta`` times with random crops.

    Index ``i`` maps to file ``i % len(fnames)``, so one pass over the dataset
    visits each file ``tta`` times, each time with a fresh random
    256-column crop.

    Args:
        fnames: Indexable collection of file names, parallel to ``mels``.
        mels: Indexable collection of arrays accepted by ``Image.fromarray``.
        transforms: Callable applied to the cropped PIL image; its result must
            support ``.div_`` (i.e. be a tensor).
        tta: Number of times each file is served.
    """

    def __init__(self, fnames, mels, transforms, tta=5):
        super().__init__()
        self.fnames, self.mels = fnames, mels
        self.transforms, self.tta = transforms, tta

    def __len__(self):
        return self.tta * len(self.fnames)

    def __getitem__(self, idx):
        file_idx = idx % len(self.fnames)

        spec = Image.fromarray(self.mels[file_idx])
        width, height = spec.size
        # Random horizontal window of 256 columns over the full height.
        left = random.randint(0, width - 256)
        window = spec.crop([left, 0, left + 256, height])

        tensor = self.transforms(window).div_(255)
        return tensor, self.fnames[file_idx]

In [None]:
# Shared image pipeline: random horizontal flip (p=0.5), then PIL image -> tensor.
tr = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
])

## Train class

In [None]:
def train_model(x_train, y_train, train_transforms, dataset='mels'):
    """Train a ``Classifier`` on a random 80/20 split and keep the best weights.

    After every epoch the model is scored on the validation split with
    lwlrap; whenever the score improves, the state dict is written to
    ``<dataset>_weight_best.pt`` in the working directory.

    Args:
        x_train: Sequence of feature arrays, one per sample.
        y_train: pandas DataFrame with one indicator column per class and one
            row per sample (its ``.values`` and ``.shape[1]`` are used).
        train_transforms: Transform applied by ``FATTrainDataset``. Note that
            it is used for the validation split as well.
        dataset: ``'mels'`` selects a 3-channel input, anything else a
            1-channel input; also used as the checkpoint file prefix.

    Returns:
        Dict with ``'best_epoch'`` (1-based, -1 if no epoch scored above 0)
        and ``'best_lwlrap'``.
    """
    model_name = dataset + '_weight_best.pt'
    num_epochs = 80
    batch_size = 64
    test_batch_size = 256
    lr = 3e-3
    eta_min = 1e-5
    t_max = 10

    # Use the GPU when there is one, otherwise fall back to the CPU instead
    # of failing on .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_classes = y_train.shape[1]
    n_dim = 3 if dataset == 'mels' else 1

    x_trn, x_val, y_trn, y_val = train_test_split(x_train, y_train.values, test_size=0.2)

    train_dataset = FATTrainDataset(x_trn, y_trn, train_transforms)
    valid_dataset = FATTrainDataset(x_val, y_val, train_transforms)

    # Mini-batch loaders; only the training data is shuffled.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=test_batch_size, shuffle=False)

    model = Classifier(num_classes=num_classes, n_dim=n_dim).to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    best_epoch = -1
    best_lwlrap = 0.
    mb = master_bar(range(num_epochs))

    for epoch in mb:

        start_time = time.time()

        # ---- training pass ----
        model.train()
        avg_loss = 0.
        for x_batch, y_batch in progress_bar(train_loader, parent=mb):

            preds = model(x_batch.to(device))            # forward propagation
            loss = criterion(preds, y_batch.to(device))  # compute the loss

            optimizer.zero_grad()                        # reset gradients
            loss.backward()                              # back propagation
            optimizer.step()                             # update parameters

            avg_loss += loss.item() / len(train_loader)

        # ---- validation pass: measures how well this epoch did ----
        model.eval()
        valid_preds = np.zeros((len(x_val), num_classes))
        avg_val_loss = 0.
        # No gradients are needed here; no_grad avoids building autograd
        # graphs for every validation batch.
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                preds = model(x_batch.to(device))
                loss = criterion(preds, y_batch.to(device))

                preds = torch.sigmoid(preds)
                valid_preds[i * test_batch_size: (i+1) * test_batch_size] = preds.cpu().numpy()

                avg_val_loss += loss.item() / len(valid_loader)

        score, weight = calculate_per_class_lwlrap(y_val, valid_preds)
        lwlrap = (score * weight).sum()

        scheduler.step()

        # Progress report every fifth epoch.
        if (epoch + 1) % 5 == 0:
            elapsed = time.time() - start_time
            mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  val_lwlrap: {lwlrap:.6f}  time: {elapsed:.0f}s')

        if lwlrap > best_lwlrap:
            best_epoch = epoch + 1
            best_lwlrap = lwlrap
            # Save only the parameters (fast and small).
            torch.save(model.state_dict(), model_name)

    return {
        'best_epoch': best_epoch,
        'best_lwlrap': best_lwlrap,
    }

In [None]:
# Display the shape of the first training mel-spectrogram entry.
x_train_mels[0].shape

In [None]:
%%time
# Train on the mel-spectrogram features; the best weights go to 'mels_weight_best.pt'.
result_mels = train_model(x_train_mels, y_train, tr, 'mels')

In [None]:
%%time
# Train on the MFCC features; the best weights go to 'mfcc_weight_best.pt'.
result_mfcc = train_model(x_train_mfcc, y_train, tr, 'mfcc')

## Predict class

In [None]:
%%time
# Extract the test-set features from the raw audio and cache both lists on disk.
x_test_mels, x_test_mfcc = convert_wav_to_image(TEST_CSV)
for test_features, pkl_name in (
    (x_test_mels, 'x_test_mels.pkl'),
    (x_test_mfcc, 'x_test_mfcc.pkl'),
):
    save_as_pkl_binary(test_features, pkl_name)

In [None]:
# The sample submission supplies the test file names (fname) and the label columns to fill in.
y_test = pd.read_csv(TEST_CSV)

In [None]:
def predict_model(test_fnames, x_test, test_transforms, num_classes, *, tta=5, dataset='mels'):
    """Predict class probabilities for the test files with test-time augmentation.

    Loads the weights from ``<dataset>_weight_best.pt``, runs every file
    ``tta`` times through ``FATTestDataset`` (random crops) and averages the
    sigmoid outputs per file.

    Args:
        test_fnames: pandas Series of file names (its ``.values`` are used).
        x_test: Sequence of feature arrays, parallel to ``test_fnames``.
        test_transforms: Transform applied by ``FATTestDataset``.
        num_classes: Number of model outputs.
        tta: Number of augmented passes per file.
        dataset: ``'mels'`` selects a 3-channel input, anything else a
            1-channel input; also used as the checkpoint file prefix.

    Returns:
        DataFrame with one row per distinct file name (the index) and columns
        ``'0'`` .. ``str(num_classes - 1)`` holding the mean probabilities.
        Rows come out of a ``groupby`` on the index, so they are grouped by
        file name rather than kept in input order.
    """
    model_name = dataset + '_weight_best.pt'
    n_dim = 3 if dataset == 'mels' else 1
    batch_size = 256

    # Use the GPU when there is one, otherwise fall back to the CPU instead
    # of failing on .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    test_dataset = FATTestDataset(test_fnames.values, x_test, test_transforms, tta=tta)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = Classifier(num_classes=num_classes, n_dim=n_dim)
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_name, map_location=device))
    model.to(device)
    model.eval()

    all_outputs, all_fnames = [], []

    # Inference only: no_grad avoids building autograd graphs per batch.
    with torch.no_grad():
        for images, fnames in progress_bar(test_loader):
            preds = torch.sigmoid(model(images.to(device)))
            all_outputs.append(preds.cpu().numpy())
            all_fnames.extend(fnames)

    test_preds = pd.DataFrame(data=np.concatenate(all_outputs),
                              index=all_fnames,
                              columns=[str(i) for i in range(num_classes)])

    # Each file name occurs `tta` times in the index (one row per augmented
    # pass), so group the rows by name and average them.
    test_preds = test_preds.groupby(level=0).mean()

    return test_preds

In [None]:
%%time
# Predict with both models: 80 classes, 20 augmented passes per file.
test_preds_mels, test_preds_mfcc = (
    predict_model(y_test.fname, test_features, tr, 80, tta=20, dataset=feature_kind)
    for feature_kind, test_features in (('mels', x_test_mels), ('mfcc', x_test_mfcc))
)

In [None]:
labels = y_test.columns[1:].tolist()
# The prediction frames are indexed by file name and come out of a groupby,
# so their row order need not match the submission file. Look the rows up by
# fname instead of copying them positionally.
# NOTE(review): columns are still matched by position; this assumes the label
# columns of the submission file are in the same order as the training label
# columns -- confirm.
mels_scores = test_preds_mels.loc[y_test.fname].values
mfcc_scores = test_preds_mfcc.loc[y_test.fname].values
# Weighted blend: 80% mel-spectrogram model, 20% MFCC model.
y_test[labels] = mels_scores*0.8+mfcc_scores*0.2
y_test.to_csv('submission.csv', index=False)
y_test.head()