In [1]:
import os
import sys
import shutil
import random
import numpy as np
import pandas as pd
import re

import h5py

data_hdf5 = h5py.File('work/FSDnoisy18k/data_hdf5/FSDnoisy18k_test.hdf5', mode='r')

args = {'extract': {
    'n_mels': 96,
    'patch_hop': 50,
    'patch_len': 101,
}}

def fetch_file_2_tensor(mel_spec, label_original, patch_hop, patch_len):
    """
    Given a mel_spec, perform slicing into T-F patches, and store them in a list. Create a list of labels of same shape
    inheriting clip-level labels.
    :param mel_spec:
    :return: two lists of patches and labels of same shape.
    """

    idx = 0
    start = 0
    im_TF_patches = []
    labels = []

    while (start + patch_len) <= mel_spec.shape[0]:
        im_TF_patches.append(mel_spec[start: start + patch_len])
        labels.append(label_original)
        # update indexes
        start += patch_hop
        idx += 1

    return im_TF_patches, labels



data = []
for i in range(len(data_hdf5['binary_data'])):
    im = np.ascontiguousarray(np.rec.array(np.frombuffer(data_hdf5['binary_data'][i])).reshape(-1, args['extract']['n_mels']))
    im_TF_patches, labels_TF_patches = fetch_file_2_tensor(im, 0, args['extract']['patch_hop'], args['extract']['patch_len'])
    data.append(np.asarray(im_TF_patches).astype('float32'))


In [3]:
labels_name = np.array(data_hdf5['labels'])

# create dicts such that key: value is as follows
# label: int
# int: label
list_labels = np.unique(labels_name)
label_to_int = {k: v for v, k in enumerate(list_labels)}
int_to_label = {v: k for k, v in label_to_int.items()}

labels_original = []
for k in labels_name:
    labels_original.append(label_to_int[k])
labels_original = np.asarray(labels_original)
per_class_samples_accum = np.zeros((20,))
per_class_samples_num = [[] for i in range(20)]
per_class_samples_idx = [[] for i in range(20)]
val_sum = 0.0
num_values = 0.0
## First loop to calculate data shape
for i in range(len(data_hdf5['binary_data'])):

    im = np.ascontiguousarray(np.rec.array(np.frombuffer(data_hdf5['binary_data'][i])).reshape(-1, args['extract']['n_mels']))
    val_sum += im.sum()
    num_values += im.size
    file_frames = float(im.shape[0])  # number of time frames
    # number of patches within clip
    nb_inst = np.maximum(1, int(np.ceil((file_frames - args['extract']['patch_len']) / args['extract']['patch_hop'])))
    per_class_samples_accum[labels_original[i]] += nb_inst
    per_class_samples_num[labels_original[i]].append(nb_inst) #number of TF patches for each class and spectogram
    per_class_samples_idx[labels_original[i]].append(i) #overall indexes per-class

spec_mean = val_sum / num_values
val_std = 0.0


train_size = int(per_class_samples_accum.sum())
# initialize data and labels to be returned
data = np.zeros((train_size, args['extract']['patch_len'], args['extract']['n_mels'])).astype('float32')
labels = np.zeros((train_size,)).astype(int)
train_count = 0
for i in range(len(data_hdf5['binary_data'])):
    im = np.ascontiguousarray(np.rec.array(np.frombuffer(data_hdf5['binary_data'][i])).reshape(-1, args['extract']['n_mels']))
    val_std += ((im - spec_mean) ** 2).sum()

spec_std = np.sqrt(val_std) / num_values

print(spec_mean, spec_std, np.mean(np.vstack(data)), num_values)

-1.3956809560512295 0.00033731180361859254 0.0 48888192.0


- Possible issue: std might be too small because dividing too big num_values.
- The mean/std calculated from the training set was: [-1.4, 6e-5].
    - Note that the std is much smaller than test set's.

In [5]:
from torchvision import transforms
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(-1.3956809560512295, 0.00033731180361859254),
])

In [7]:
import FSDnoisy18k.dataset.FSDnoisy18k
import yaml
args = yaml.load(open('config/params_supervised_lineval.yaml'))
testset = FSDnoisy18k.dataset.FSDnoisy18k.FSDnoisy18k(args, mode='test', hdf5_path='work/FSDnoisy18k/data_hdf5/')
testset.transform = test_transform
testset[0]

  args = yaml.load(open('config/params_supervised_lineval.yaml'))


(tensor([[[ 4142.4980,  3218.7227,  -731.8396,  ...,  4195.0850,
            4912.4370,  3942.5151],
          [ 6402.0044,  3796.8489,  -375.2035,  ...,  5460.7759,
            5689.5244,  4117.4517],
          [ 5786.2642,  3043.6792,  -145.8010,  ...,  5909.9009,
            6809.0645,  5598.4883],
          ...,
          [ 5782.3354,  5604.1104,  5531.8413,  ...,  9478.7383,
            9411.0635,  8599.8369],
          [ 9423.4443,  7158.6445,  4928.6548,  ..., 11146.4424,
           11065.3125,  9907.1582],
          [ 9782.8154,  5573.3979,  5427.0586,  ...,  9104.5752,
            8962.4521,  8343.3369]]]),
 array([9]),
 0)

In [9]:
x = testset[0][0]
x.mean(), x.std(), x.max(), x.min()

(tensor(580.5177), tensor(6605.0996), tensor(11756.6436), tensor(-14670.8662))