In [1]:
import gc
import math
import os
import pickle
import random
import time
from csv import writer
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import IPython
import IPython.display
# import PIL

import torch
import torch.nn as nn
import torch.nn.functional as F

from fastai import *
from fastai.vision import *
from fastai.vision.data import *

In [2]:
# start_time = time.time()

In [3]:
# Ensemble members: (directory, file-name template) pairs. The `{fold}`
# placeholder is filled with each fold number in the inference loop, where every
# resulting file is loaded with `load_learner` and its predictions are averaged.
models_list = (
    (Path('../input/fat2019ssl4multistage/work/work'), 'stage-2_fold-{fold}.pkl'),
    (Path('../input/fat2019ssl4multistage/work/work'), 'stage-10_fold-{fold}.pkl'),
    (Path('../input/fat2019ssl4multistage/work/work'), 'stage-11_fold-{fold}.pkl'),
    (Path('../input/fat2019ssl8vgg16full/work/work'), 'stage-2_fold-{fold}.pkl'),
    (Path('../input/fat2019ssl8vgg16full/work/work'), 'stage-10_fold-{fold}.pkl'),
    (Path('../input/fat2019ssl8vgg16full/work/work'), 'stage-11_fold-{fold}.pkl'),
)

In [4]:
# Inference configuration.
TTA_SHIFT = 48  # TTA: predict every TTA_SHIFT
n_splits = 10  # number of folds loaded per entry of models_list
DATA = Path('../input/freesound-audio-tagging-2019')
DATA_TEST = DATA/'test'
CSV_SUBMISSION = DATA/'sample_submission.csv'
# The sample submission supplies the test file names (`fname`) and the output columns.
test_df = pd.read_csv(CSV_SUBMISSION)

In [5]:
# # visually check everything is present in datasets
# for work, name in models_list:
#     print('-'*10, work, name, '-'*10)
#     !ls {work}

In [6]:
def read_audio(conf, pathname, trim_long_data):
    """Load a clip, strip silence and bring it to at least `conf.samples` samples.

    Args:
      conf: settings object providing `sampling_rate` and `samples`.
      pathname: audio file to load with `librosa.load`.
      trim_long_data: when true, clips longer than `conf.samples` are cut to
        their first `conf.samples` samples; otherwise they are returned whole.

    Returns:
      The waveform as returned by librosa. Clips not longer than
      `conf.samples` are zero-padded on both sides up to that length.
    """
    signal, _ = librosa.load(pathname, sr=conf.sampling_rate)

    # Workaround: trimming a zero-length signal raises, so skip it in that case.
    if len(signal) > 0:
        signal, _ = librosa.effects.trim(signal)  # default top_db (60)

    target = conf.samples
    if len(signal) > target:
        # Long enough already: optionally keep only the leading window.
        return signal[:target] if trim_long_data else signal

    # Too short (or exactly sized): centre the clip inside a zero-filled window.
    missing = target - len(signal)
    left = missing // 2
    return np.pad(signal, (left, missing - left), 'constant')

def audio_to_melspectrogram(conf, audio):
    """Convert a waveform into a float32 log-scaled mel spectrogram.

    Args:
      conf: settings object providing `sampling_rate`, `n_mels`, `hop_length`,
        `n_fft`, `fmin` and `fmax`.
      audio: waveform handed to `librosa.feature.melspectrogram`.

    Returns:
      The output of `librosa.power_to_db` applied to the mel spectrogram,
      cast to `np.float32`.
    """
    # Pass the signal by keyword: recent librosa releases make `y` keyword-only,
    # while the keyword form is accepted by older releases as well.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def show_melspectrogram(conf, mels, title='Log-frequency power spectrogram'):
    """Plot a mel spectrogram with a dB colour bar and show the figure.

    Args:
      conf: settings object providing `sampling_rate`, `hop_length`, `fmin`
        and `fmax`, forwarded to `librosa.display.specshow`.
      mels: spectrogram to draw.
      title: figure title.
    """
    axis_options = dict(x_axis='time', y_axis='mel')
    scale_options = dict(sr=conf.sampling_rate,
                         hop_length=conf.hop_length,
                         fmin=conf.fmin,
                         fmax=conf.fmax)
    librosa.display.specshow(mels, **axis_options, **scale_options)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    """Read an audio file and return its mel spectrogram.

    Args:
      conf: settings object forwarded to `read_audio` and
        `audio_to_melspectrogram`.
      pathname: audio file to read.
      trim_long_data: forwarded to `read_audio`.
      debug_display: when true, additionally show an audio player for the
        loaded waveform and plot the spectrogram.

    Returns:
      The spectrogram produced by `audio_to_melspectrogram`.
    """
    waveform = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, waveform)
    if not debug_display:
        return mels

    # Debug aid: audible playback followed by the spectrogram plot.
    player = IPython.display.Audio(waveform, rate=conf.sampling_rate)
    IPython.display.display(player)
    show_melspectrogram(conf, mels)
    return mels


class conf:
    """Preprocessing settings shared by the audio loading and spectrogram helpers."""
    # Waveform
    sampling_rate = 44100
    duration = 2
    samples = duration * sampling_rate  # length every short clip is padded to

    # Spectrogram
    n_mels = 128
    n_fft = 20 * n_mels
    hop_length = duration * 347  # to make time steps 128
    fmin = 20
    fmax = sampling_rate // 2

# example
# x = read_as_melspectrogram(conf, DATA_CURATED/'0006ae4e.wav', trim_long_data=False, debug_display=True)

In [7]:
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is replicated along a new last axis, standardized, clipped to
    `[norm_min, norm_max]` and rescaled to 0..255.

    Args:
      X: array to convert; the three channels are stacked on a new last axis.
      mean: value subtracted before scaling; the mean of the stacked array
        when None.
      std: value the centred data is divided by (plus `eps`); the standard
        deviation of the stacked array when None.
      norm_max: upper clipping bound in standardized units; the standardized
        maximum when None.
      norm_min: lower clipping bound in standardized units; the standardized
        minimum when None.
      eps: guard against division by zero, also the threshold below which the
        standardized data counts as constant.

    Returns:
      `np.uint8` array of shape `X.shape + (3,)`; all zeros when the
      standardized data has (almost) no dynamic range.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Compare against None explicitly: a caller-supplied 0 is a
    # legitimate value and must not fall back to the data statistics.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min
    if (_max - _min) > eps:
        # Scale to [0, 255]
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def convert_wav_to_image(df, source, img_dest):
    """Convert every file listed in `df.fname` into a 3-channel spectrogram image.

    Args:
      df: DataFrame with an `fname` column naming the audio files.
      source: directory the file names are joined onto.
      img_dest: only used in the progress message; nothing is written there
        (the pickle dump is disabled).

    Returns:
      List with one `mono_to_color` image per row of `df`, in row order.
    """
    print(f'Converting {source} -> {img_dest}')
    images = []
    rows = tqdm_notebook(df.iterrows(), total=df.shape[0])
    for _, record in rows:
        wav_path = source/str(record.fname)
        mels = read_as_melspectrogram(conf, wav_path, trim_long_data=False)
        images.append(mono_to_color(mels))
    # Disabled: pickle.dump(images, open(img_dest, 'wb'))
    return images

In [8]:
# Convert every test clip to a spectrogram image kept in memory; img_dest=None
# because convert_wav_to_image does not write anything to disk.
X_test = convert_wav_to_image(test_df, source=DATA_TEST, img_dest=None)

Converting ../input/freesound-audio-tagging-2019/test -> None


HBox(children=(IntProgress(value=0, max=1120), HTML(value='')))




In [9]:
class MyMixUpCallback(LearnerCallback):
    """Training-time augmentation that pastes bands from a shuffled partner sample.

    For each sample in a training batch, one band of rows and one band of
    columns are overwritten with the same region of another sample from the
    batch, and the target becomes a blend of both targets weighted by the
    fraction of the image that was replaced.
    """
    def __init__(self, learn:Learner):
        super().__init__(learn)
#         self.num_mask=2
        # Upper bound, as a fraction of height / width, of each pasted band.
        self.masking_max_percentage=0.25
    
    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        """Return the augmented input and blended target for a training batch.

        Returns None (batch left untouched) when `train` is false; otherwise a
        dict with the new `last_input` and `last_target`.
        """
        if not train: return

        # Random partner for every sample in the batch.
        shuffle = torch.randperm(last_target.size(0)).to(last_input.device)
        x1, y1 = last_input[shuffle], last_target[shuffle]

        batch_size, channels, height, width = last_input.size()
        # Per-sample band sizes as fractions of the height and of the width.
        h_percentage = np.random.uniform(low=0., high=self.masking_max_percentage, size=batch_size)
        w_percentage = np.random.uniform(low=0., high=self.masking_max_percentage, size=batch_size)
#         alpha = self.num_mask * (h_percentage + w_percentage) - (self.num_mask*self.num_mask) * ((h_percentage * w_percentage))
        # Fraction covered by the union of the two bands (their overlap counted once).
        alpha = (h_percentage + w_percentage) - (h_percentage * w_percentage)
        alpha = last_input.new(alpha)
        # Shape (batch, 1) so it broadcasts over the class axis of the targets.
        alpha = alpha.unsqueeze(1)
        
        new_input = last_input.clone()
        
        for i in range(batch_size):
            # Band of rows copied from the partner, at a random vertical position.
            h_mask = int(h_percentage[i] * height)
            h = int(np.random.uniform(0.0, height - h_mask))
            new_input[i, :, h:h + h_mask, :] = x1[i, :, h:h + h_mask, :]

            # Band of columns copied from the partner, at a random horizontal position.
            w_mask = int(w_percentage[i] * width)
            w = int(np.random.uniform(0.0, width - w_mask))
            new_input[i, :, :, w:w + w_mask] = x1[i, :, :, w:w + w_mask]
        
#         new_target = torch.max(last_target, y1)
        # Blend the targets in proportion to the replaced fraction.
        new_target = (1-alpha) * last_target + alpha*y1
        return {'last_input': new_input, 'last_target': new_target}

In [10]:
# from official code https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8#scrollTo=cRCaCIb9oguU
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(np.float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Compute label-weighted label-ranking average precision per class.

    Arguments:
      truth: np.array of (num_samples, num_classes); entries greater than zero
        mark the classes present in each sample.
      scores: np.array of (num_samples, num_classes) with the real-valued score
        of the classifier under test for each class of each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) with the lwlrap of each class.
      weight_per_class: np.array of (num_classes,) with each class's share of
        all true labels. The overall unbalanced lwlrap is
        np.sum(per_class_lwlrap * weight_per_class).
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape

    # One precision per (sample, class); only the true classes of a sample are
    # filled in, every other cell stays zero.
    precision_table = np.zeros((num_samples, num_classes))
    for row, (sample_scores, sample_truth) in enumerate(zip(scores, truth)):
        hit_classes, hit_precisions = _one_sample_positive_class_precisions(
            sample_scores, sample_truth)
        precision_table[row, hit_classes] = hit_precisions

    labels_per_class = np.sum(truth > 0, axis=0)
    total_labels = float(np.sum(labels_per_class))
    weight_per_class = labels_per_class / total_labels

    # Column averages: the mean precision over the labels of each class
    # (classes without any label divide by 1 and stay at zero).
    per_class_lwlrap = np.sum(precision_table, axis=0) / np.maximum(1, labels_per_class)
    return per_class_lwlrap, weight_per_class


# Accumulator object version.

class lwlrap_accumulator(object):
  """Accumulate batches of test samples into per-class and overall lwlrap.

  The per-class accumulators are created by the first call to
  `accumulate_samples`, which fixes the number of classes.
  """

  def __init__(self):
    self.num_classes = 0
    self.total_num_samples = 0
  
  def accumulate_samples(self, batch_truth, batch_scores):
    """Cumulate a new batch of samples into the metric.
    
    Args:
      batch_truth: np.array of (num_samples, num_classes) giving boolean
        ground-truth of presence of that class in that sample for this batch.
      batch_scores: np.array of (num_samples, num_classes) giving the
        classifier-under-test's real-valued score for each class for each
        sample.
    """
    assert batch_scores.shape == batch_truth.shape
    num_samples, num_classes = batch_truth.shape
    if not self.num_classes:
      self.num_classes = num_classes
      self._per_class_cumulative_precision = np.zeros(self.num_classes)
      # Builtin int instead of the np.int alias, which newer NumPy releases
      # no longer provide.
      self._per_class_cumulative_count = np.zeros(self.num_classes,
                                                  dtype=int)
    assert num_classes == self.num_classes
    for truth, scores in zip(batch_truth, batch_scores):
      pos_class_indices, precision_at_hits = (
        _one_sample_positive_class_precisions(scores, truth))
      self._per_class_cumulative_precision[pos_class_indices] += (
        precision_at_hits)
      self._per_class_cumulative_count[pos_class_indices] += 1
    self.total_num_samples += num_samples

  def per_class_lwlrap(self):
    """Return a vector of the per-class lwlraps for the accumulated samples."""
    return (self._per_class_cumulative_precision / 
            np.maximum(1, self._per_class_cumulative_count))

  def per_class_weight(self):
    """Return a normalized weight vector for the contributions of each class."""
    return (self._per_class_cumulative_count / 
            float(np.sum(self._per_class_cumulative_count)))

  def overall_lwlrap(self):
    """Return the scalar overall lwlrap for cumulated samples."""
    return np.sum(self.per_class_lwlrap() * self.per_class_weight())


In [11]:
class Lwlrap(Callback):
    """Metric callback that reports the overall lwlrap of each epoch."""

    def on_epoch_begin(self, **kwargs):
        # Start every epoch with an empty accumulator.
        self.accumulator = lwlrap_accumulator()

    def on_batch_end(self, last_output, last_target, **kwargs):
        # The model output is passed through a sigmoid before it is scored.
        targets = last_target.cpu().numpy()
        probabilities = torch.sigmoid(last_output).cpu().numpy()
        self.accumulator.accumulate_samples(targets, probabilities)

    def on_epoch_end(self, last_metrics, **kwargs):
        epoch_score = self.accumulator.overall_lwlrap()
        return add_metrics(last_metrics, epoch_score)

In [12]:
class ConvBlock(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU stages followed by 2x2 average pooling."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Built in this order (conv1 first) so parameter creation and
        # initialisation consume random numbers in a fixed sequence.
        self.conv1 = self._conv_bn_relu(in_channels, out_channels)
        self.conv2 = self._conv_bn_relu(out_channels, out_channels)
        self._init_weights()

    @staticmethod
    def _conv_bn_relu(n_in, n_out):
        # 3x3 convolution with stride 1 and padding 1, so height/width are kept.
        return nn.Sequential(
            nn.Conv2d(n_in, n_out, 3, 1, 1),
            nn.BatchNorm2d(n_out),
            nn.ReLU(),
        )

    def _init_weights(self):
        """Kaiming-normal conv weights, zero biases, unit batch-norm scale."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)
                continue
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        features = self.conv2(self.conv1(x))
        return F.avg_pool2d(features, 2)
    
class Classifier(nn.Module):
    """Four-stage ConvBlock network with global average pooling and an MLP head.

    Args:
      num_classes: size of the output layer.
    """
    def __init__(self, num_classes=1000): # <======== modification to comply with fast.ai
        super().__init__()
        
        self.conv = nn.Sequential(
            ConvBlock(in_channels=3, out_channels=64),
            ConvBlock(in_channels=64, out_channels=128),
            ConvBlock(in_channels=128, out_channels=256),
            ConvBlock(in_channels=256, out_channels=512),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) # <======== modification to comply with fast.ai
        self.fc = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.PReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.1),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return the (batch, num_classes) head output for an image batch."""
        x = self.conv(x)
        #x = torch.mean(x, dim=3)   # <======== modification to comply with fast.ai
        #x, _ = torch.max(x, dim=2) # <======== modification to comply with fast.ai
        x = self.avgpool(x)         # <======== modification to comply with fast.ai
        # The pooled tensor is (batch, 512, 1, 1); flatten it to (batch, 512) so
        # the first Linear layer of the head receives 512 features per sample.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [13]:
# !!! use globals CUR_X_FILES, CUR_X
def open_fat2019_image(fn, convert_mode, after_open)->Image:
    """Build an `Image` from the in-memory spectrograms instead of reading a file.

    Relies on the globals `CUR_X_FILES` (file names) and `CUR_X` (the images at
    the same positions). The last path component of `fn` is either `<fname>` or
    `<fname>!<crop_x>`; without an explicit crop offset (or with offset -1) a
    random offset is drawn. `convert_mode` and `after_open` are accepted but
    not used.

    Returns:
      `Image` wrapping a float32 tensor in channel-first layout scaled by 1/255,
      cropped to a window as wide as the image is tall.
    """
    name = fn.split('/')[-1]
    crop_x = -1
    if '!' in name:
        name, offset_text = name.split('!')
        crop_x = int(offset_text)

    spec = CUR_X[CUR_X_FILES.index(name)]
    base_dim, time_dim, _ = spec.shape
    if crop_x == -1:
        crop_x = random.randint(0, time_dim - base_dim)
    window = spec[0:base_dim, crop_x:crop_x+base_dim, :]

    # (rows, cols, channels) -> (channels, rows, cols) in a single transpose.
    channels_first = np.transpose(window, (2, 0, 1))
    # standardize
    return Image(torch.from_numpy(channels_first.astype(np.float32, copy=False)).div_(255))


# Replace fastai's image opener with the in-memory loader defined above
# (presumably so the test ImageList is served from CUR_X instead of disk -- confirm).
vision.data.open_image = open_fat2019_image

In [14]:
# Globals read by open_fat2019_image: file names and their images, aligned by position.
CUR_X_FILES, CUR_X = list(test_df.fname.values), X_test

In [15]:
# Test-time augmentation index: every test row is repeated once per crop, with
# the crop offset appended to the file name as `<fname>!<crop_x>` (the format
# parsed by open_fat2019_image). Rows are written to an in-memory CSV and read
# back as `test_df_multi`.
output = StringIO()
csv_writer = writer(output)
csv_writer.writerow(test_df.columns)

# Position of the first occurrence of each file name: the same answer as
# CUR_X_FILES.index(fname), without rescanning the list for every row.
fname_to_idx = {}
for _pos, _name in enumerate(CUR_X_FILES):
    fname_to_idx.setdefault(_name, _pos)

for _, row in tqdm_notebook(test_df.iterrows(), total=test_df.shape[0]):
    fname = row.fname
    time_dim = CUR_X[fname_to_idx[fname]].shape[1]
    # Largest valid crop offset; clamped at 0 so an image narrower than the
    # crop window still gets exactly one crop instead of none.
    max_offset = max(time_dim - conf.n_mels, 0)
    # Number of crops so that neighbouring offsets are at most TTA_SHIFT apart.
    s = math.ceil(max_offset / TTA_SHIFT) + 1

    if s == 1:
        crop_offsets = [0]
    else:
        # Offsets spread evenly from 0 to max_offset, both ends included.
        crop_offsets = [int(np.around(max_offset * x / (s - 1))) for x in range(s)]
    for crop_x in crop_offsets:
        row.fname = fname + '!' + str(crop_x)
        csv_writer.writerow(row)

output.seek(0)
test_df_multi = pd.read_csv(output)

# Free what is no longer needed. Note that test_df is deleted here, so this
# cell cannot be re-run without re-running the cell that loads test_df.
del row, test_df, output, csv_writer, fname_to_idx; gc.collect();

HBox(children=(IntProgress(value=0, max=1120), HTML(value='')))




In [16]:
# Test set over the TTA-expanded rows.
# NOTE(review): the path argument looks like a placeholder, since images appear
# to be served from memory by the patched open_image -- confirm.
test = ImageList.from_df(test_df_multi, models_list[0][0])

# Sum the test predictions of every fold of every entry in models_list ...
for model_nb, (work, name) in enumerate(models_list):
    for fold in range(n_splits):
        learn = load_learner(work, name.format(fold=fold), test=test)
        preds, _ = learn.get_preds(ds_type=DatasetType.Test)
        preds = preds.cpu().numpy()
        if (fold == 0) and (model_nb == 0):
            # The first prediction array becomes the accumulator.
            predictions = preds
        else:
            predictions += preds

# ... and turn the sum into the mean over all loaded learners.
predictions /= (n_splits * len(models_list))

In [17]:
# Store the averaged predictions in the class columns (class order taken from
# the last loaded learner), then strip the `!<crop_x>` suffix so all crops of a
# clip share one file name again.
test_df_multi[learn.data.classes] = predictions
test_df_multi['fname'] = test_df_multi.fname.apply(lambda x: x.split('!')[0])

In [18]:
# Average the crop-level rows of each file into a single row per file.
submission = test_df_multi.infer_objects().groupby('fname').mean().reset_index()

In [19]:
# Write the submission file and preview its first rows.
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,Bus,Buzz,Car_passing_by,Cheering,Chewing_and_mastication,Child_speech_and_kid_speaking,Chink_and_clink,Chirp_and_tweet,Church_bell,Clapping,Computer_keyboard,Crackle,Cricket,Crowd,Cupboard_open_or_close,Cutlery_and_silverware,Dishes_and_pots_and_pans,Drawer_open_or_close,Drip,Electric_guitar,Fart,Female_singing,Female_speech_and_woman_speaking,Fill_(with_liquid),Finger_snapping,Frying_(food),Gasp,Glockenspiel,Gong,...,Harmonica,Hi-hat,Hiss,Keys_jangling,Knock,Male_singing,Male_speech_and_man_speaking,Marimba_and_xylophone,Mechanical_fan,Meow,Microwave_oven,Motorcycle,Printer,Purr,Race_car_and_auto_racing,Raindrop,Run,Scissors,Screaming,Shatter,Sigh,Sink_(filling_or_washing),Skateboard,Slam,Sneeze,Squeak,Stream,Strum,Tap,Tick-tock,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,000ccb97.wav,0.001941,0.000921,0.002814,0.001162,0.00275,0.257441,0.009502,0.002431,0.007561,0.001752,0.002224,0.003056,0.001508,0.001223,0.001648,0.001843,0.007828,0.009056,0.001794,0.007403,0.00316,0.001917,0.011625,0.001001,0.006529,0.011305,0.005562,0.002285,0.006799,0.002401,0.004229,0.003069,0.010614,0.000623,0.105709,0.001859,0.003534,0.002745,0.008638,...,0.00167,0.194247,0.02032,0.009104,0.002947,0.001481,0.004079,0.006988,0.001554,0.001626,0.00237,0.002056,0.002939,0.001215,0.001276,0.008302,0.001354,0.020556,0.001235,0.005285,0.003444,0.001469,0.001105,0.001783,0.001757,0.002113,0.001626,0.002377,0.012088,0.003301,0.001544,0.001471,0.000544,0.001775,0.001731,0.002054,0.022429,0.007898,0.005344,0.001348
1,0012633b.wav,0.262763,0.001583,0.001652,0.003722,0.003652,0.003049,0.006014,0.001869,0.002695,0.003252,0.008313,0.038465,0.013339,0.003118,0.002842,0.00265,0.002586,0.001792,0.002682,0.002333,0.002395,0.004831,0.002565,0.007489,0.002759,0.001993,0.001596,0.009065,0.001293,0.002456,0.017001,0.002069,0.002191,0.000965,0.001536,0.003037,0.00876,0.000815,0.0019,...,0.003096,0.001783,0.025436,0.004147,0.001683,0.00633,0.007449,0.001261,0.004152,0.002136,0.003462,0.417701,0.003156,0.01084,0.039275,0.001315,0.003657,0.002529,0.00515,0.00422,0.004251,0.001988,0.004157,0.003661,0.004122,0.005163,0.001125,0.001295,0.001959,0.001023,0.002615,0.013086,0.000788,0.006448,0.002092,0.005499,0.001739,0.005254,0.00326,0.097219
2,001ed5f1.wav,0.002876,0.001479,0.001354,0.005268,0.002224,0.015066,0.001932,0.00439,0.003603,0.001229,0.006308,0.003078,0.002266,0.003651,0.001749,0.002326,0.010445,0.00287,0.002581,0.010817,0.014099,0.008141,0.00439,0.003157,0.028855,0.003681,0.006554,0.025327,0.00265,0.002,0.000637,0.001125,0.003362,0.003238,0.004076,0.005473,0.001246,0.001218,0.001938,...,0.001401,0.002721,0.01133,0.004906,0.037562,0.002154,0.003559,0.002684,0.003866,0.002902,0.0236,0.004579,0.005855,0.001568,0.002787,0.003123,0.466026,0.002709,0.003338,0.004109,0.001097,0.004155,0.005396,0.047048,0.002381,0.006022,0.005678,0.001149,0.018794,0.003556,0.002844,0.004499,0.004217,0.0135,0.004245,0.004288,0.001593,0.002299,0.002686,0.00199
3,00294be0.wav,0.00054,0.000288,0.001012,0.000787,0.001976,0.000868,0.00076,0.000561,0.000356,0.002222,0.000793,0.001446,0.000454,0.000509,0.0074,0.001071,0.001185,0.001357,0.000784,0.00074,0.000938,0.000818,0.005322,0.000974,0.000485,0.000797,0.000572,0.002646,0.00045,0.000489,0.001034,0.000947,0.001207,0.000666,0.000452,0.000458,0.000502,0.00048,0.000926,...,0.000456,0.000591,0.001139,0.000951,0.000946,0.000517,0.000578,0.001279,0.000421,0.045293,0.000623,0.000953,0.000864,0.917425,0.000399,0.000434,0.000781,0.002042,0.000325,0.000273,0.000174,0.001196,0.00047,0.000753,0.000404,0.001099,0.00085,0.001059,0.000507,0.002908,0.00087,0.000384,0.000584,0.001253,0.000761,0.000517,0.002146,0.00376,0.00099,0.007289
4,003fde7a.wav,0.001529,0.001032,0.002001,0.000763,0.004146,0.002796,0.001352,0.001338,0.630423,0.001866,0.002088,0.004402,0.001818,0.000978,0.000845,0.001794,0.010479,0.00106,0.003156,0.000905,0.000794,0.00148,0.001604,0.001319,0.001856,0.003416,0.002963,0.002895,0.00116,0.001337,0.000971,0.001895,0.001893,0.000944,0.001259,0.000981,0.002724,0.200024,0.003545,...,0.003709,0.002464,0.001913,0.005101,0.003128,0.001787,0.00286,0.016426,0.001309,0.001123,0.0061,0.001985,0.001972,0.000731,0.001183,0.000975,0.001614,0.000887,0.004373,0.00241,0.002797,0.00147,0.000805,0.001879,0.002384,0.001721,0.000688,0.001751,0.000609,0.001351,0.001213,0.001279,0.000746,0.001483,0.001316,0.000951,0.00103,0.001012,0.002485,0.001265


In [20]:
# Sanity check: the highest-scoring class for every file.
submission.set_index('fname').idxmax(1)

fname
000ccb97.wav                           Bass_drum
0012633b.wav                          Motorcycle
001ed5f1.wav                                 Run
00294be0.wav                                Purr
003fde7a.wav                        Bicycle_bell
0040ccc9.wav                     Electric_guitar
0046b732.wav                                Meow
004f3bbc.wav                         Bass_guitar
00526050.wav                           Bass_drum
00559da4.wav                   Zipper_(clothing)
00582bbe.wav                               Knock
0064aedf.wav                Drawer_open_or_close
0065512b.wav                                Slam
006a91d2.wav                              Stream
006ea9ee.wav                               Strum
006f9dca.wav                Water_tap_and_faucet
007450dc.wav                           Screaming
00979c8a.wav                                Purr
00992464.wav                  Fill_(with_liquid)
00b44a8a.wav              Cutlery_and_silverware
00bfaaaf.wav  

In [21]:
# print('Done in', time.time() - start_time, 'seconds')