In [1]:
import numpy as np
import pandas as pd
import os
import copy
from torch.utils.data import Dataset
from torch.utils.data import SubsetRandomSampler, DataLoader
import re
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.stats import spearmanr as corr

Step 9b: Multitask Learning

In [2]:
# Helper functions to read fused, mel, and chromagram
def read_fused_spectrogram(spectrogram_file):
    spectrogram = np.load(spectrogram_file)
    return spectrogram.T

def torch_train_val_split(
        dataset, batch_train, batch_eval,
        val_size=.2, shuffle=True, seed=None):
    # Creating data indices for training and validation splits:
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    val_split = int(np.floor(val_size * dataset_size))
    if shuffle:
        np.random.seed(seed)
        np.random.shuffle(indices)
    train_indices = indices[val_split:]
    val_indices = indices[:val_split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(dataset,
                              batch_size=batch_train,
                              sampler=train_sampler)
    val_loader = DataLoader(dataset,
                            batch_size=batch_eval,
                            sampler=val_sampler)
    return train_loader, val_loader


class PaddingTransform(object):
    def __init__(self, max_length, padding_value=0):
        self.max_length = max_length
        self.padding_value = padding_value

    def __call__(self, s):
        if len(s) == self.max_length:
            return s

        if len(s) > self.max_length:
            return s[:self.max_length]

        if len(s) < self.max_length:
            s1 = copy.deepcopy(s)
            pad = np.zeros((self.max_length - s.shape[0], s.shape[1]), dtype=np.float32)
            s1 = np.vstack((s1, pad))
            return s1
        
        
# Pytorch Dataset Class for creating the dataset
class SpectrogramDatasetEmotion(Dataset):
    def __init__(self, path, target, train=True, max_length=-1, read_spec_fn=read_fused_spectrogram):
        t = 'train' if train else 'test'
        p = os.path.join(path, t)
        self.index = os.path.join(path, "{}_labels.txt".format(t))
        self.files, labels = self.get_files_labels(self.index, target)
        self.feats = [read_spec_fn(os.path.join(p, f)) for f in self.files]
        self.feat_dim = self.feats[0].shape[1]
        self.lengths = [len(i) for i in self.feats]
        self.max_length = max(self.lengths) if max_length <= 0 else max_length
        self.zero_pad_and_stack = PaddingTransform(self.max_length)
        if isinstance(labels, (list, tuple)):
            self.labels = np.array(np.array(labels).astype('float')).reshape(-1, 3)

    def get_files_labels(self, txt, target):
        with open(txt, 'r') as fd:
            lines = [l.rstrip().split(',') for l in fd.readlines()[1:]]
        files, labels = [], []
        for l in lines:
            if target=='valence':
                label = l[1]
            elif target=='energy':
                label = l[2]
            elif target=='danceability':
                label = l[3]
            else:
                label = l[1:]
            # Kaggle automatically unzips the npy.gz format so this hack is needed
            _id = l[0]
            npy_file = '{}.fused.full.npy'.format(_id)
            files.append(npy_file)
            labels.append(label)
        return files, labels
    
    def __getitem__(self, item):
        l = min(self.lengths[item], self.max_length)
        return self.zero_pad_and_stack(self.feats[item]), self.labels[item], l

    def __len__(self):
        return len(self.labels)

In [3]:
# load data with all three labels for multitask
specs_multi = SpectrogramDatasetEmotion(
         '../input/patreco3-multitask-affective-music/data/multitask_dataset/',
         target='multi',
         train=True,
         max_length=-1,
         read_spec_fn=read_fused_spectrogram)
    
train_loader_multi, val_loader_multi = torch_train_val_split(specs_multi, 32 ,32)

In [4]:
# CNN for multitask predictions with shared convolutional layers
class ConvBlock(nn.Module):
  def __init__(self, in_channels, out_channels):
      super(ConvBlock, self).__init__()

      self.conv =  nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
      self.batch = nn.BatchNorm2d(out_channels)
      self.relu = nn.ReLU()
      self.pool = nn.MaxPool2d(kernel_size=3, stride=3)
      
  def forward(self, x):
      return self.pool(self.relu(self.batch(self.conv(x))))
      

class MultitaskCNN(nn.Module):
    def __init__(self, output_dim):
        super(MultitaskCNN, self).__init__()

        self.conv = nn.Sequential(
            ConvBlock(in_channels=1, out_channels=16),
            ConvBlock(in_channels=16, out_channels=32),
            ConvBlock(in_channels=32, out_channels=64),
            ConvBlock(in_channels=64, out_channels=128),
        )
        
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(1920, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, output_dim),
        )
        self.fc2 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(1920, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, output_dim),
        )
        self.fc3 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(1920, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, output_dim),
        )

    def forward(self, x):
        x = x.view(x.size(0), 1, x.size(1), x.size(2)) # reshape to have 1 channel
        x = self.conv(x)
        #print(x.size())
        return self.fc1(x), self.fc2(x), self.fc3(x)

In [5]:
def train_cnn_multitask(train_dl, val_dl, checkpoint_name):
    
    # hyper parameters
    L2 = 0.0001
    EPOCHS = 100  # max epochs
    PATIENCE = 5  # for early stopping
    

    model = MultitaskCNN(output_dim=1)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=L2)
    
     # set device
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)

    avg_train_losses = []  # track train loss in each epoch
    avg_val_losses = []  # track validation loss in each epoch
    min_val_loss = np.Inf
    epochs_no_improve = 0

    # train model
    model.train()
    for epoch in range(1, EPOCHS + 1):
        train_losses = []
        val_losses = []
        for i, data in enumerate(train_dl):
            X_batch, y_batch, lengths = data
            X_batch, y_batch, lengths = X_batch.to(device), y_batch.to(device), lengths.to(device)
            optimizer.zero_grad()
            out1, out2, out3 = model(X_batch.float())
            
            loss1, loss2, loss3 = criterion(out1, y_batch[:, 0].reshape(-1, 1).float()), criterion(out2, y_batch[:, 1].reshape(-1, 1).float()), criterion(out3, y_batch[:, 2].reshape(-1, 1).float())
            loss = 0.4*loss1 + 0.2*loss2 + 0.2*loss3
            loss.backward()
            optimizer.step()
            # track loss
            train_losses.append(loss.detach().item())
        avg_train_loss = np.average(train_losses)
        avg_train_losses.append(avg_train_loss)

        # calculate validation loss and accuracy
        with torch.no_grad():
            for i, data in enumerate(val_dl):
                X_batch, y_batch, lengths = data
                X_batch, y_batch, lengths = X_batch.to(device), y_batch.to(device), lengths.to(device)
                out1, out2, out3 = model(X_batch.float())
                loss1, loss2, loss3 = criterion(out1, y_batch[:, 0].reshape(-1, 1).float()), criterion(out2, y_batch[:, 1].reshape(-1, 1).float()), criterion(out3, y_batch[:, 2].reshape(-1, 1).float())
                loss = 0.4*loss1 + 0.2*loss2 + 0.2*loss3
                val_losses.append(loss.detach().item())
        avg_val_loss = np.average(val_losses)
        avg_val_losses.append(avg_val_loss)

        # print information
        print("Epoch: {}  -  loss: {}  -  val_loss: {}".format(epoch, avg_train_loss, avg_val_loss))

        # early stopping
        if avg_val_loss < min_val_loss:
            torch.save(model, checkpoint_name)  # save checkpoint
            epochs_no_improve = 0
            min_val_loss = avg_val_loss
        else:
            epochs_no_improve += 1
        if epoch > PATIENCE and epochs_no_improve == PATIENCE:
            print('Early stopping')
            break

In [6]:
def evaluate_cnn_multitask(test_dl, checkpoint_name):
    # load best model
    model = torch.load(checkpoint_name)
    
    # set device
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    
    # predict test
    model.eval()
    test_predictions1 = []
    y_test1 = []
    test_predictions2 = []
    y_test2 = []
    test_predictions3 = []
    y_test3 = []
    with torch.no_grad():
        for i, data in enumerate(test_dl):
            X_batch, y_batch, lengths = data
            X_batch, y_batch, lengths = X_batch.to(device), y_batch.to(device), lengths.to(device)
            out1, out2, out3 = model(X_batch.float())

            test_predictions1.extend(out1.detach().cpu().numpy())
            y_test1.extend(y_batch[:, 0].tolist())
            test_predictions2.extend(out2.detach().cpu().numpy())
            y_test2.extend(y_batch[:, 1].tolist())
            test_predictions3.extend(out3.detach().cpu().numpy())
            y_test3.extend(y_batch[:, 2].tolist())
        
    corr1 = corr(np.array(y_test1), np.array(test_predictions1)).correlation
    corr2 = corr(np.array(y_test2), np.array(test_predictions2)).correlation
    corr3 = corr(np.array(y_test3), np.array(test_predictions3)).correlation
    
    print('Valence:', corr1)
    print('Energy:', corr2)
    print('Danceability:', corr3)
    print('Mean spearman correlation =', (corr1 + corr2 + corr3) / 3)

In [7]:
# train cnn for multitask
train_cnn_multitask(train_loader_multi, val_loader_multi, 'cnn_multi.th')

Epoch: 1  -  loss: 0.4936372668578707  -  val_loss: 0.2272947302886418
Epoch: 2  -  loss: 0.17821302701686992  -  val_loss: 0.155379661491939
Epoch: 3  -  loss: 0.1405390583235642  -  val_loss: 0.10373443790844508
Epoch: 4  -  loss: 0.1026848796112784  -  val_loss: 0.09816538648945945
Epoch: 5  -  loss: 0.0859972325892284  -  val_loss: 0.08278473679508482
Epoch: 6  -  loss: 0.0769959803799103  -  val_loss: 0.07660923153162003
Epoch: 7  -  loss: 0.0688127401316988  -  val_loss: 0.06652159935661725
Epoch: 8  -  loss: 0.07206780579069565  -  val_loss: 0.06907495962721961
Epoch: 9  -  loss: 0.0668307777879567  -  val_loss: 0.070331195635455
Epoch: 10  -  loss: 0.05698122238290721  -  val_loss: 0.06156805796282632
Epoch: 11  -  loss: 0.057076468293009136  -  val_loss: 0.05214791532073702
Epoch: 12  -  loss: 0.05206534433467635  -  val_loss: 0.05508314871362278
Epoch: 13  -  loss: 0.05386177694489216  -  val_loss: 0.05465691430228097
Epoch: 14  -  loss: 0.05068119056522846  -  val_loss: 0.04

In [8]:
# evaluate cnn for valence
valence_corr = evaluate_cnn_multitask(val_loader_multi, 'cnn_multi.th')

Valence: 0.5874439689660799
Energy: 0.7652111243929229
Danceability: 0.7091738898704709
Mean spearman correlation = 0.6872763277431578


Step 10: Kaggle submission

In [9]:
class TestDatasetEmotion(Dataset):
    def __init__(self, path, max_length=-1, read_spec_fn=read_fused_spectrogram):
        p = os.path.join(path, 'test')
        self.files = os.listdir(p)
        self.feats = [read_spec_fn(os.path.join(p, f)) for f in self.files]
        self.files = [file+'.gz' for file in os.listdir(p)]              
        self.feat_dim = self.feats[0].shape[1]
        self.lengths = [len(i) for i in self.feats]
        self.max_length = max(self.lengths) if max_length <= 0 else max_length
        self.zero_pad_and_stack = PaddingTransform(self.max_length)
    
    def __getitem__(self, item):
        l = min(self.lengths[item], self.max_length)
        return self.zero_pad_and_stack(self.feats[item]), self.files[item], l

    def __len__(self):
        return len(self.files)

In [10]:
# load data with all three labels for multitask
specs_multi_test = TestDatasetEmotion(
         '../input/patreco3-multitask-affective-music/data/multitask_dataset/',
         read_spec_fn=read_fused_spectrogram)

test_loader_multi = DataLoader(specs_multi_test, batch_size=32)

In [11]:
# create submission file
# load best model
model = torch.load('cnn_multi.th')
    
# set device
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
    
# predict test
model.eval()
test_predictions1 = []
test_predictions2 = []
test_predictions3 = []
filenames = []
with torch.no_grad():
    for i, data in enumerate(test_loader_multi):
        X_batch, files, lengths = data
        X_batch, lengths = X_batch.to(device), lengths.to(device)
        out1, out2, out3 = model(X_batch.float())
        
        test_predictions1.extend(out1.detach().cpu().numpy())
        test_predictions2.extend(out2.detach().cpu().numpy())
        test_predictions3.extend(out3.detach().cpu().numpy())
        filenames.extend(files)
            
submission = pd.DataFrame()
submission["Id.fused.full.npy.gz"] = filenames
submission["valence"] = np.array(test_predictions1).flatten()
submission["energy"] = np.array(test_predictions2).flatten()
submission["danceability"] = np.array(test_predictions3).flatten()
submission.to_csv('solution2.txt', index=False)
        