# Extract `logmelspectrogram_64` features for `fe_03_p1m`

## Setup

Requires some manual inputs

## Imports

In [1]:
# from __future__ import division, print_function  # not running this on py27
import os
import sys

sys.path.append(os.environ['RENNET_LOCAL_ROOT'])  # the supporting library

%load_ext autoreload
%autoreload 1  # reload only what has been marked for it

In [2]:
import glob
import warnings
from collections import namedtuple
from itertools import starmap

import librosa as lr
import numpy as np
import dask as d
import dask.array as da
from distributed import Client
import h5py as h

import dask.diagnostics as dg
import time
from bokeh.io import output_notebook

In [3]:
%aimport rennet.datasets.fisher
import rennet.datasets.fisher as fe

%aimport rennet.utils.audio_utils
import rennet.utils.audio_utils as au

%aimport rennet.utils.np_utils
import rennet.utils.np_utils as nu

In [4]:
# dask progress bars and other diagnostics
# Progress bar, registered so that it is active for the computations that follow.
pb = dg.ProgressBar()
pb.register()

# Send bokeh output to the notebook
# (presumably so the profiler plots render inline -- TODO confirm).
output_notebook()

# Task profiler, registered for the computations that follow.
pr = dg.Profiler()
pr.register()

# Resource profiler created with dt=0.1 (the sampling step -- see dask docs for units).
rpr = dg.ResourceProfiler(dt=0.1)
rpr.register()

## Params 

For feature extraction and related tasks, like output file naming.

In [5]:
# The parameters that will be used for extracting logmelspectrogram
win_sec = 0.032  # analysis window duration, in seconds
hop_sec = 0.010  # hop between consecutive windows, in seconds
nmels = 64  # number of mel bands
window = 'hann'  # window name forwarded to the feature extractor
sr = samplerate = 8000  # expected audio samplerate; both names are used below
nchannels = 1  # expected number of audio channels
chunking = 2**14  # window length used when striding features/labels into chunks
chunkovl = 2**10  # overlap between consecutive chunks

# Window and hop expressed in samples; n_fft is set equal to the window length.
n_fft = win_len = int(win_sec * sr)
hop_len = int(hop_sec * sr)
# Step between chunk starts, so that consecutive chunks share `chunkovl` entries.
chunkstep = chunking - chunkovl
# Mel band frequencies up to sr//2; only printed below for inspection in this cell.
melfreq = lr.mel_frequencies(n_mels=nmels, fmax=sr//2)

print('win-len', win_len)
print('hop-len', hop_len)
print()
print('chunking', chunking)
print('chunkovl', chunkovl)
print('chunkstep', chunkstep)
print()
print('mel-frequencies ({})\n'.format(len(melfreq)), melfreq)

win-len 256
hop-len 80

chunking 16384
chunkovl 1024
chunkstep 15360

mel-frequencies (64)
 [    0.            37.21032838    74.42065675   111.63098513   148.8413135
   186.05164188   223.26197025   260.47229863   297.682627     334.89295538
   372.10328375   409.31361213   446.5239405    483.73426888   520.94459725
   558.15492563   595.365254     632.57558238   669.78591075   706.99623913
   744.20656751   781.41689588   818.62722426   855.83755263   893.04788101
   930.25820938   967.46853776  1004.83686549  1044.14602321  1084.99295281
  1127.43781185  1171.54311121  1217.37380723  1264.99739732  1314.48401939
  1365.90655514  1419.34073739  1474.86526164  1532.56190193  1592.51563132
  1654.814747    1719.55100033  1786.819732    1856.72001241  1929.35478758
  2004.83103078  2083.25990007  2164.75690202  2249.44206177  2337.44009988
  2428.88061595  2523.8982795   2622.63302834  2725.23027461  2831.84111897
  2942.62257316  3057.73779117  3177.35630958  3301.65429723  3430.814814

## Roots & Paths

### Sources 

Where the pre-split wav and label exports are

In [6]:
# Setting up roots for sources

# RENNET_X_ROOT is only printed here; the working data dir below is built from
# RENNET_X_LOCAL_ROOT instead.
# NOTE(review): confirm that using two different roots is intended.
rennet_x_root = os.environ['RENNET_X_ROOT']
print('RENNET_X', rennet_x_root, '', sep='\n')

working_data_dir = os.path.join(os.environ['RENNET_X_LOCAL_ROOT'], 'data', 'working')

# Path components identifying the export that will be processed.
provider = 'fisher'
dataset = 'fe_03_p1'
export_name = 'wav-8k-mono-000_012'


# Root path to where the splits are

splits_root = os.path.join(working_data_dir, provider, dataset, export_name)
# check if it exists
# Fail early: every path built later hangs off splits_root.
if not os.path.exists(splits_root):
    raise RuntimeError("SPLITS_ROOT not found at: \n {}".format(splits_root))
    
print('SPLITS_ROOT', splits_root, '', sep='\n')

# No errors means all okay

RENNET_X
/nm-raid/audio/work/abdullah/nm-rennet/rennet-x

SPLITS_ROOT
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012



In [7]:
# Making glob queries for audio, label and calldata files
# Each split has the layout <split>/audios/data/<group>/*.wav and
# <split>/labels/data/<group>/*.txt.
# NOTE: these patterns are later passed to glob.glob() without recursive=True, where
# "**" behaves like "*", i.e. it matches exactly one directory level.

# Validation split
export_val_dir = os.path.join(splits_root, 'val')
val_audios_dir = os.path.join(export_val_dir, 'audios', 'data')
val_audios_glob_str = str(os.path.join(val_audios_dir, "**", "*.wav"))
val_labels_dir = os.path.join(export_val_dir, 'labels', 'data')
val_labels_glob_str = str(os.path.join(val_labels_dir, "**", "*.txt"))

# Training split (the directory is named 'train', variables use the 'trn' prefix)
export_trn_dir = os.path.join(splits_root, 'train')
trn_audios_dir = os.path.join(export_trn_dir, 'audios', 'data')
trn_audios_glob_str = str(os.path.join(trn_audios_dir, "**", "*.wav"))
trn_labels_dir = os.path.join(export_trn_dir, 'labels', 'data')
trn_labels_glob_str = str(os.path.join(trn_labels_dir, "**", "*.txt"))

# Test split (the directory is named 'test', variables use the 'tst' prefix)
export_tst_dir = os.path.join(splits_root, 'test')
tst_audios_dir = os.path.join(export_tst_dir, 'audios', 'data')
tst_audios_glob_str = str(os.path.join(tst_audios_dir, "**", "*.wav"))
tst_labels_dir = os.path.join(export_tst_dir, 'labels', 'data')
tst_labels_glob_str = str(os.path.join(tst_labels_dir, "**", "*.txt"))

# The calldata table is looked up in the val split's labels directory only.
calldata_glob_str = str(os.path.join(export_val_dir, 
                                         'labels', "*calldata.tbl"))

# Print every query so it can be checked by eye before any file is touched.
print("\nCHECK IF THE QUERIES MAKE SENSE\n")
print("Calldata Query:\n", calldata_glob_str)
print("\n")
print("Validation Audios Query:\n", val_audios_glob_str)
print()
print("Validation Labels Query:\n", val_labels_glob_str)
print("\n")
print("Train Audios Query:\n", trn_audios_glob_str)
print()
print("Train Labels Query:\n", trn_labels_glob_str)
print("\n")
print("Test Audios Query:\n", tst_audios_glob_str)
print()
print("Test Labels Query:\n", tst_labels_glob_str)

# You should look at the queries


CHECK IF THE QUERIES MAKE SENSE

Calldata Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/labels/*calldata.tbl


Validation Audios Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/audios/data/**/*.wav

Validation Labels Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/labels/data/**/*.txt


Train Audios Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/audios/data/**/*.wav

Train Labels Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/labels/data/**/*.txt


Test Audios Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/test/audios/data/**/*.wav

Test Labels Query:
 /home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/test/labels/data/**/*.txt


In [8]:
# find the calldata.tbl file
# NOTE: We only use the calldata.tbl from the val split
# it is the same file for all the splits

# Sorted so that the choice is deterministic when several files match.
calldata_matches = sorted(glob.glob(calldata_glob_str))

if len(calldata_matches) < 1:
    raise RuntimeError("Calldata file was not found")
if len(calldata_matches) > 1:
    warnings.warn("More than one calldata file found")

# Always reduce the matches to a single path: previously, when more than one file
# matched, `calldata_fp` stayed a list and was handed to `from_file` as-is.
calldata_fp = calldata_matches[0]
print("CALLDATA filepath:", calldata_fp, sep='\n')

calldatas = fe.AllCallData.from_file(calldata_fp)

print("\n\nCALLDATA read for {} callids\n".format(len(calldatas.allcalldata)))
print("\nExample calldata:\n")
print(calldatas['00086'], sep='\n')

# No errors means all okay reading call data

CALLDATA filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/labels/fe_03_p1_calldata.tbl


CALLDATA read for 5850 callids


Example calldata:

fisher.CallData(callid='00086', topicid='ENG37', signalgrade=4.0, convgrade=4.0, channelspeakers=
		[fisher.Speaker(pin='9998', gender='m', dialect='a'), 
		 fisher.Speaker(pin='9399', gender='f', dialect='o')])


In [9]:
# helpers

# Lightweight record tying one audio file's metadata to its parsed labels.
AudioLabelPair = namedtuple('AudioLabelPair', ['audio', 'label'])

def fn_for_fp(fp):
    """Return the file name of `fp` up to (not including) its first dot."""
    stem, _, _ = os.path.basename(fp).partition(".")
    return stem

# Reader for an audio file's metadata (no local wrapper needed).
audio_metas = au.get_audio_metadata


def parse_label(fp):
    """Parse the label file at `fp` into active-speaker labels, attaching the call data."""
    return fe.ActiveSpeakers.from_file(fp, allcalldata=calldatas, warn_duplicates=False)

def find_validate_pair_split(audios_glob_str, labels_glob_str, split_name, n_expected_pairs=None):
    """Find, pair up and sanity-check the audio and label files of one split.

    Both glob queries are expanded and sorted, then paired by position.

    Parameters
    ----------
    audios_glob_str, labels_glob_str : glob patterns for the audio / label files.
    split_name : name used (upper-cased) in the printed summary.
    n_expected_pairs : exact number of files expected for each query; when None,
        the queries only have to be non-empty.

    Returns
    -------
    list of AudioLabelPair (audio metadata, parsed labels).

    Raises
    ------
    AssertionError if a count is off, or if the file names of any pair differ.
    """
    audio_fps = sorted(glob.glob(audios_glob_str))
    label_fps = sorted(glob.glob(labels_glob_str))
    n_audios, n_labels = len(audio_fps), len(label_fps)

    if n_expected_pairs is None:
        assert n_audios > 0, "No audio files found, check your query"
        assert n_labels > 0, "No label files found, check your query"
    else:
        assert n_audios == n_expected_pairs,\
            "{} audio files found, expected {}, check your query".format(n_audios, n_expected_pairs)
        assert n_labels == n_expected_pairs,\
            "{} label files found, expected {}, check your query".format(n_labels, n_expected_pairs)

    assert n_audios == n_labels,\
        "\nMISMATCH: audios : {} v/s {} labels".format(n_audios, n_labels)

    # Pair by position in the two sorted lists.
    pairs = [AudioLabelPair(audio_metas(audio_fp), parse_label(label_fp))
             for audio_fp, label_fp in zip(audio_fps, label_fps)]

    # The file name carries the callid, so equal names mean the pair belongs together.
    assert all(fn_for_fp(pair.audio.filepath) == fn_for_fp(pair.label.sourcefile) for pair in pairs),\
        "\nMISMATCH: callids between audio and label, check filenames, ordering, or missing/substitutions"

    # Summary for feedback: the first two pairs, with the first three labels of each.
    examples = "\n...\n\n".join("{}\n{}".format(audio, label[:3])
                                for audio, label in pairs[:2])
    print("{}:#############################################################\n\n".format(split_name.upper()),
          "Total Pairs: {}\n\n".format(len(pairs)),
          "For Example:\n\n{}\n\n...".format(examples))
    return pairs

In [10]:
# VAL: collect and validate every (audio, label) pair of the validation split
val_pairs = find_validate_pair_split(
    audios_glob_str=val_audios_glob_str,
    labels_glob_str=val_labels_glob_str,
    split_name='val',
    n_expected_pairs=99,
)

# Getting here without an AssertionError means the VAL pairs are consistent

VAL:#############################################################

 Total Pairs: 99

 For Example:

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/audios/data/000/fe_03_00001.wav', format='wav', samplerate=8000, nchannels=1, seconds=608.484, nsamples=4867872)
Source filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/labels/data/000/fe_03_00001.txt

Calldata:
fisher.CallData(callid='00001', topicid='ENG34', signalgrade=2.0, convgrade=2.0, channelspeakers=
		[fisher.Speaker(pin='2602', gender='m', dialect='a'), 
		 fisher.Speaker(pin='1790', gender='f', dialect='a')])

fisher.ActiveSpeakers with sample rate 100
Start    - End      : Label
376.0000 - 554.0000 : [1 0]
554.0000 - 582.0000 : [0 0]
582.0000 - 648.0000 : [1 0]
...

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/val/audios/data/000/fe_03_00002.wav', format='wav', sample

In [11]:
# TRN: collect and validate every (audio, label) pair of the training split
trn_pairs = find_validate_pair_split(
    audios_glob_str=trn_audios_glob_str,
    labels_glob_str=trn_labels_glob_str,
    split_name='trn',
    n_expected_pairs=1200,
)

# Getting here without an AssertionError means the TRN pairs are consistent

TRN:#############################################################

 Total Pairs: 1200

 For Example:

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/audios/data/001/fe_03_00100.wav', format='wav', samplerate=8000, nchannels=1, seconds=719.952, nsamples=5759616)
Source filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/labels/data/001/fe_03_00100.txt

Calldata:
fisher.CallData(callid='00100', topicid='ENG37', signalgrade=4.0, convgrade=4.0, channelspeakers=
		[fisher.Speaker(pin='7078', gender='f', dialect='a'), 
		 fisher.Speaker(pin='4295', gender='f', dialect='a')])

fisher.ActiveSpeakers with sample rate 100
Start    - End      : Label
89.0000  - 141.0000 : [0 1]
141.0000 - 184.0000 : [0 0]
184.0000 - 273.0000 : [1 0]
...

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/audios/data/001/fe_03_00101.wav', format='wav'

In [12]:
# TST: collect and validate every (audio, label) pair of the test split
tst_pairs = find_validate_pair_split(
    audios_glob_str=tst_audios_glob_str,
    labels_glob_str=tst_labels_glob_str,
    split_name='tst',
    n_expected_pairs=551,
)

# Getting here without an AssertionError means the TST pairs are consistent

TST:#############################################################

 Total Pairs: 551

 For Example:

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/test/audios/data/053/fe_03_05300.wav', format='wav', samplerate=8000, nchannels=1, seconds=599.952, nsamples=4799616)
Source filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/test/labels/data/053/fe_03_05300.txt

Calldata:
fisher.CallData(callid='05300', topicid='ENG40', signalgrade=4.0, convgrade=4.0, channelspeakers=
		[fisher.Speaker(pin='58144', gender='m', dialect='a'), 
		 fisher.Speaker(pin='61421', gender='m', dialect='a')])

fisher.ActiveSpeakers with sample rate 100
Start    - End      : Label
64.0000  - 155.0000 : [0 1]
155.0000 - 176.0000 : [1 1]
176.0000 - 294.0000 : [1 0]
...

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/test/audios/data/053/fe_03_05301.wav', format='wav', 

### Sinks

Where the exported features will be saved

In [13]:
pickles_root = os.path.join(splits_root, 'pickles')  # Root where the directory with h5 will go
os.makedirs(pickles_root, exist_ok=True)

# Pattern for naming the dir inside the pickles root
# The datestamp is pinned by hand; the commented call shows how to generate today's.
datestamp = "20170804"  # time.strftime("%Y%m%d")
pattern = "{}-logmel{}-win{}ms-hop{}ms".format(datestamp, nmels, 
                                                        int(win_sec*1000), 
                                                        int(hop_sec*1000))

pickles_dir = os.path.join(pickles_root, pattern)

# NOTE: this message is printed before the directory is actually created
# (creation happens on the last line of the cell).
print("Export Directory (was created)\n", pickles_dir, sep='\n')
print('\n\n')

# One HDF5 file per split, named after the split.
fn_h5 = lambda splitname: "{}.h5".format(splitname)
val_h5 = os.path.join(pickles_dir, fn_h5("val"))
trn_h5 = os.path.join(pickles_dir, fn_h5("trn"))
tst_h5 = os.path.join(pickles_dir, fn_h5("tst"))

print("Export Filepaths\n", 
      val_h5, '',
      trn_h5, '',
      tst_h5, '',
      sep='\n')

# exist_ok=False makes this raise FileExistsError when the export directory already
# exists, so re-running the cell fails.
# NOTE(review): presumably a guard against overwriting an earlier export -- confirm.
os.makedirs(pickles_dir, exist_ok=False)

Export Directory (was created)

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms



Export Filepaths

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/val.h5

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/trn.h5

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/tst.h5



## Main Functions

These are the main functions that will be used in loading audio, extracting features, and inferring labels.

These will be used in pre-flight checks, and later `dask.delayed`.

In [14]:
# main Python / numpy functions

def readaudiodata(pair):
    """Load the audio of `pair`, trimmed to the span covered by its labels.

    The slice runs from the label start (not below 0) to the label end (not beyond
    the number of samples in the metadata), both taken at the audio's samplerate.

    NOTE: the pre-flight checks further down exercise this function in more detail;
    fix things here if they report a problem.
    """
    audio_meta, speech = pair.audio, pair.label

    # Label bounds are read while the labels are expressed at the audio samplerate.
    with speech.samplerate_as(audio_meta.samplerate):
        first = int(max(0, speech.min_start))
        last = int(min(audio_meta.nsamples, speech.max_end))

    # sr=None is passed so that the loader does not resample.
    samples = lr.load(audio_meta.filepath, sr=None)[0]
    return samples[first:last]


def extractfeat(audiodata):
    """Compute log-mel-spectrogram features for `audiodata` using the notebook's parameters."""
    # win_len is intentionally not passed (marked as not necessary by the author).
    params = dict(sr=sr, n_fft=n_fft, hop_len=hop_len, window=window, n_mels=nmels)
    return au.logmelspectrogram(y=audiodata, **params)

def readlabelsdata(pair):
    """Return the labels of `pair` sampled at the positions given by `fe.samples_for_labelsat`.

    The number of samples considered is the label span, capped at the audio length,
    measured at the audio's samplerate.
    """
    audio_meta, speech = pair.audio, pair.label
    rate = audio_meta.samplerate

    with speech.samplerate_as(rate):
        span = min(audio_meta.nsamples, speech.max_end) - speech.min_start

    frame_ends = fe.samples_for_labelsat(nsamples=span, hop_len=hop_len, win_len=win_len)

    # Labels are queried with their start moved to 0, matching the trimmed audio.
    with speech.min_start_as(0, samplerate=rate):
        return speech.labels_at(frame_ends, samplerate=rate)

def strided_feat(feat):
    """Return `nu.strided_view` of `feat` with window `chunking` and step `chunkstep`."""
    view_kwargs = {'win_shape': chunking, 'step_shape': chunkstep}
    return nu.strided_view(feat, **view_kwargs)


# Labels are chunked exactly like the features, so the same function serves both.
strided_label = strided_feat
    
def expected_featlen(pair):
    """Number of feature frames expected for `pair`.

    Computed from the label span (capped at the audio length, at the audio's
    samplerate) as 1 + (nsamples - win_len) // hop_len.
    """
    audio_meta, speech = pair.audio, pair.label

    with speech.samplerate_as(audio_meta.samplerate):
        last = min(audio_meta.nsamples, speech.max_end)
        first = speech.min_start

    usable = last - first
    return (usable - win_len) // hop_len + 1

def expected_stridedfeat_shape(pair):
    """Expected shape, (nchunks, chunking, nmels), of the chunked features of `pair`."""
    nframes = expected_featlen(pair)
    # With chunkstep == chunking - chunkovl this equals 1 + (nframes - chunking) // chunkstep;
    # assumed to match what nu.strided_view produces (the pre-flight checks compare them).
    return ((nframes - chunkovl) // chunkstep, chunking, nmels)

def expected_stridedlabel_shape(pair):
    """Expected shape of the chunked labels: the feature shape with its last axis replaced by 2."""
    leading = expected_stridedfeat_shape(pair)[:-1]
    return tuple(leading) + (2,)

In [15]:
# dask.delayed versions of the above ... notice minimum number of args for each

d_readaudiodata = d.delayed(readaudiodata, name='audio')  # args=(pair,)

d_extractfeat = d.delayed(extractfeat, name='feat')  # args=(audiodata,)
# NOTE: Yes, we could split the multiple steps involved in extractfeat
# and delay each of them ... but ... let's not.
# Most of the time is going to be spent in stft, so it would make
# a lot more sense to do that the dask way ...
# and that needs another round of making decisions after checks.
# Some other day ...

d_stridefeat = d.delayed(strided_feat, name='stridef')  # args=(feat,)

d_readlabelsdata = d.delayed(readlabelsdata, name='label')  # args=(pair,)

d_stridelabel = d.delayed(strided_label, name='stridel')  # args=(label,)

In [16]:
def steps(pair):
    """Build the lazy dask pipeline for one audio/label pair.

    Nothing is read or computed here: the delayed functions are only wired together
    and wrapped as dask arrays using the shapes computed from the pair's metadata.

    Parameters
    ----------
    pair : AudioLabelPair

    Returns
    -------
    ((audio_path, feat), (label_path, label)) where the paths are the dataset paths
    inside the HDF5 file ("audios/<groupid>/<callid>" and "labels/<groupid>/<callid>")
    and feat / label are dask arrays with the chunks concatenated along the first axis.
    """
    # extract and stride features
    audiodata = d_readaudiodata(pair)
    feat = d_extractfeat(audiodata)
    feat = d_stridefeat(feat)

    # extract and stride labels
    label = d_readlabelsdata(pair)
    label = d_stridelabel(label)

    # for feat: the shape must be declared up front, since the data is still delayed
    xfeatshape = expected_stridedfeat_shape(pair)
    feat = da.from_delayed(feat, xfeatshape, np.float64)
    # iterating the (nchunks, chunking, nmels) array yields its chunks, which are
    # joined along the first axis
    feat = da.concatenate(feat)

    # for labels
    # NOTE: builtin `int` replaces `np.int`, which was only an alias of it and has been
    # deprecated since NumPy 1.20 and removed in NumPy 1.24; the dtype is the same.
    xlabelshape = expected_stridedlabel_shape(pair)
    label = da.from_delayed(label, xlabelshape, int)
    label = da.concatenate(label)

    # dset paths in hdf5
    callid = pair.label.callid
    groupid = fe.groupid_for_callid(callid)
    topath = "{}/{}".format(groupid, callid)

    apath = "audios/{}".format(topath)
    lpath = "labels/{}".format(topath)

    return (apath, feat), (lpath, label)

In [17]:
def do_job_for_split(pairs, tofile):
    """Export the features and labels of every pair in `pairs` to the HDF5 file `tofile`.

    The datasets of all pairs are collected first and then written with a single
    `da.to_hdf5` call, using lzf compression and fletcher32 checksums.
    """
    dsets = {}
    for audio_item, label_item in map(steps, pairs):
        # each item is a (dataset path, dask array) tuple
        dsets.update((audio_item, label_item))

    da.to_hdf5(tofile, dsets, compression='lzf', fletcher32=True)

## Pre-Flight Checks

### Audios

In [18]:
# Detailed analysis : Do it for a few files, because it might take a lot of time


# First, the helpers
def validate_audio_detailed(pair):
    """Load the audio of `pair` and check it against the notebook's expectations.

    Checks the samplerate, the number of channels, that the audio is at least as long
    as the labels, and that the samples look normalized (within [-1, 1], mean near 0).

    Raises
    ------
    AssertionError describing the first check that fails.
    """
    assert nchannels <= 2, "nchannels set by you is > 2, and librosa may do something weird"

    a, asr = lr.load(pair.audio.filepath, sr=None)
    ashape = a.shape
    amin, amax = a.min(axis=-1), a.max(axis=-1)
    amean = a.mean(axis=-1)
    del a  # only the shape and the summary statistics are needed from here on

    # check if the samplerates match
    assert asr == samplerate, "samplerate mismatch {} v/s {}".format(asr, samplerate)
    assert asr == pair.audio.samplerate, "samplerate mismatch {} v/s {}".format(asr, pair.audio.samplerate)

    # check if nchannels match
    # HACK: don't rely on this, cuz librosa, I think, forces things to be at most stereo
    # NOTE: the message uses `ashape`; `a` has been deleted above, so formatting
    # `a.shape` here raised a NameError instead of reporting the mismatch.
    assert len(ashape) == nchannels, "nchannels mismatch {} v/s {}".format(ashape, nchannels)
    assert len(ashape) == pair.audio.nchannels, "nchannels mismatch {} v/s {}".format(ashape, pair.audio.nchannels)

    # check that the audio is at least as long as the labels (nsamples >= labels.max_end)
    with pair.label.samplerate_as(asr):
        me = pair.label.max_end

    assert ashape[-1] >= me, "nsamples mismatch {} v/s {}".format(ashape, me)

    # assert that the read audio was normalized
    assert np.all(amin >= -1), "amin not >= -1 at {}".format(amin)
    assert np.all(amax <=  1), "amax not <= +1 at {}".format(amax)
    assert np.allclose(amean, 0, atol=1e-3), "amean not close to zero at {}".format(amean)
    
    
def validate_audio_detailed_pairs(pairs):
    """Run `validate_audio_detailed` on every pair, printing (not raising) each failure."""
    for idx, pair in enumerate(pairs):
        try:
            validate_audio_detailed(pair)
        except AssertionError as err:
            # position in `pairs`, file name, failure message, then a blank line
            print('AssertionErrors with pair at {}'.format(idx),
                  fn_for_fp(pair.audio.filepath), err, '', sep='\n')

In [19]:
# Detailed (file-loading) audio validation over the whole validation split.
validate_audio_detailed_pairs(val_pairs)

# Nothing printed means all okay with the audio files ... hopefully

In [20]:
# The quick analysis assumes that our audio-metadata reader is reliable,
# so no audio is loaded: every check below uses the metadata only.
# Yes, we should validate everything, and ... hopefully ... it will be all quick!

def validate_audio_quick(pair):
    """Check samplerate, channel count and length of `pair` from its metadata alone.

    Raises
    ------
    AssertionError describing the first check that fails.
    """
    meta = pair.audio
    rate = meta.samplerate
    # Shape the audio array would have: (nsamples,) for mono, (nchannels, nsamples) otherwise.
    if meta.nchannels > 1:
        shape = (meta.nchannels, meta.nsamples)
    else:
        shape = (meta.nsamples,)

    # check if the samplerates match
    assert rate == samplerate, "samplerate mismatch {} v/s {}".format(rate, samplerate)

    # check if nchannels match
    # HACK: don't rely on this, cuz librosa, I think, forces things to be at most stereo
    assert len(shape) == nchannels, "nchannels mismatch {} v/s {}".format(shape, nchannels)

    # check that the audio is at least as long as the labels (nsamples >= labels.max_end)
    with pair.label.samplerate_as(rate):
        label_end = pair.label.max_end

    assert shape[-1] >= label_end, "nsamples mismatch {} v/s {}".format(shape, label_end)
    
    
def validate_audio_quick_pairs(pairs):
    """Run `validate_audio_quick` on every pair, printing each failure.

    Returns
    -------
    list of the positions (within `pairs`) of the pairs that failed.
    """
    failed = []
    for idx, pair in enumerate(pairs):
        try:
            validate_audio_quick(pair)
        except AssertionError as err:
            # position in `pairs`, file name, failure message, then a blank line
            print('AssertionErrors with pair at {}'.format(idx),
                  fn_for_fp(pair.audio.filepath), err, '', sep='\n')
            failed.append(idx)

    return failed
    

In [21]:
# Quick (metadata-only) validation of the validation split; the displayed value is
# the list of positions that failed.
validate_audio_quick_pairs(val_pairs)

# An empty list means all okay with the audio files ... hopefully

[]

In [22]:
# Keep the failing positions of the train split: they are inspected in the cells below.
trn_with_err = validate_audio_quick_pairs(trn_pairs)
print(trn_with_err)
# An empty list means all okay with the audio files ... hopefully

AssertionErrors with pair at 296
fe_03_00396
nsamples mismatch (4799616,) v/s 4799760

AssertionErrors with pair at 301
fe_03_00401
nsamples mismatch (4799616,) v/s 4799760

AssertionErrors with pair at 457
fe_03_00557
nsamples mismatch (4799616,) v/s 4799760

[296, 301, 457]


In [23]:
# Quick (metadata-only) validation of the test split; the displayed value is the
# list of positions that failed (it is not stored in a variable here).
validate_audio_quick_pairs(tst_pairs)

# An empty list means all okay with the audio files ... hopefully

AssertionErrors with pair at 148
fe_03_05448
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 233
fe_03_05533
nsamples mismatch (4799328,) v/s 4799360

AssertionErrors with pair at 245
fe_03_05545
nsamples mismatch (4799328,) v/s 4799360

AssertionErrors with pair at 264
fe_03_05564
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 293
fe_03_05593
nsamples mismatch (4799808,) v/s 4799840

AssertionErrors with pair at 298
fe_03_05598
nsamples mismatch (4799328,) v/s 4799360

AssertionErrors with pair at 311
fe_03_05611
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 314
fe_03_05614
nsamples mismatch (4799424,) v/s 4799440

AssertionErrors with pair at 318
fe_03_05618
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 329
fe_03_05629
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 371
fe_03_05671
nsamples mismatch (4799904,) v/s 4799920

AssertionErrors with pair at 446
fe_03_0574

[148, 233, 245, 264, 293, 298, 311, 314, 318, 329, 371, 446, 479, 499, 538]

#### Some errors?

In [24]:
# List the train files that failed the quick check.
# NOTE: the number printed next to each name is its position within trn_with_err,
# not its index in trn_pairs.
print('Files with errors:', 
      *[(i, fn_for_fp(trn_pairs[p].audio.filepath)) 
        for i, p in enumerate(trn_with_err)], 
      sep='\n')
print()
# Re-run the detailed (file-loading) validation on just those pairs; the positions it
# reports are again relative to trn_with_err.
validate_audio_detailed_pairs((trn_pairs[i] for i in trn_with_err))

Files with errors:
(0, 'fe_03_00396')
(1, 'fe_03_00401')
(2, 'fe_03_00557')

AssertionErrors with pair at 0
fe_03_00396
nsamples mismatch (4799616,) v/s 4799760

AssertionErrors with pair at 1
fe_03_00401
nsamples mismatch (4799616,) v/s 4799760

AssertionErrors with pair at 2
fe_03_00557
nsamples mismatch (4799616,) v/s 4799760



In [25]:
# Pick one of the failing train pairs (by position in trn_with_err) for a closer look.
i = 0
el = trn_pairs[trn_with_err[i]].label  # its parsed labels
am = trn_pairs[trn_with_err[i]].audio  # its audio metadata
# Load the audio itself; sr=None is passed so that the loader does not resample.
a, asr = lr.load(am.filepath, sr=None)

In [26]:
# Last ten label segments, shown at the audio samplerate with the label start moved to 0.
with el.min_start_as(0, samplerate=asr): print(el[-10:])

Source filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/labels/data/003/fe_03_00396.txt

Calldata:
fisher.CallData(callid='00396', topicid='ENG28', signalgrade=4.0, convgrade=3.0, channelspeakers=
		[fisher.Speaker(pin='2215', gender='f', dialect='o'), 
		 fisher.Speaker(pin='4722', gender='f', dialect='a')])

fisher.ActiveSpeakers with sample rate 8000
Start    - End      : Label
4746320.0000 - 4761200.0000 : [1 1]
4761200.0000 - 4772880.0000 : [1 0]
4772880.0000 - 4774960.0000 : [1 1]
4774960.0000 - 4776240.0000 : [0 1]
4776240.0000 - 4778160.0000 : [0 0]
4778160.0000 - 4783920.0000 : [1 0]
4783920.0000 - 4785280.0000 : [0 0]
4785280.0000 - 4785760.0000 : [1 0]
4785760.0000 - 4789040.0000 : [1 1]
4789040.0000 - 4789760.0000 : [1 0]


In [27]:
# Shape of the loaded audio, to compare with the last label end printed above.
a.shape

(4799616,)

In [28]:
# Metadata of the same file, to compare its nsamples with the loaded shape.
am

AudioMetadata(filepath='/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/audios/data/003/fe_03_00396.wav', format='wav', samplerate=8000, nchannels=1, seconds=599.952, nsamples=4799616)

In [29]:
# The same ten segments with samplerate 1, i.e. presumably in seconds -- compare with am.seconds.
with el.samplerate_as(1): print(el[-10:])

Source filepath:
/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/train/labels/data/003/fe_03_00396.txt

Calldata:
fisher.CallData(callid='00396', topicid='ENG28', signalgrade=4.0, convgrade=3.0, channelspeakers=
		[fisher.Speaker(pin='2215', gender='f', dialect='o'), 
		 fisher.Speaker(pin='4722', gender='f', dialect='a')])

fisher.ActiveSpeakers with sample rate 1
Start    - End      : Label
594.5400 - 596.4000 : [1 1]
596.4000 - 597.8600 : [1 0]
597.8600 - 598.1200 : [1 1]
598.1200 - 598.2800 : [0 1]
598.2800 - 598.5200 : [0 0]
598.5200 - 599.2400 : [1 0]
599.2400 - 599.4100 : [0 0]
599.4100 - 599.4700 : [1 0]
599.4700 - 599.8800 : [1 1]
599.8800 - 599.9700 : [1 0]


In [30]:
# Duration of the audio according to its metadata.
am.seconds

599.952

##### Report

It is possible that the labels are wrong and the final channels were chopped early.

It is also possible that our exporting procedure did this when merging the two channels from 
the original sph files. But there is no time to rectify that, even if there is a way.

Actually, our exporting procedure would have raised an error if the `sph2pipe` passed it along
with the missing samples ... but ... oh well ...

We will have to keep it in mind when slicing the audio data before feature extraction,
and hence, can only rely on the `nsamples` from the metadata (they are correct!), and not
from the `max_end` of the labels.

Files with errors (train split):
0. 'fe_03_00396.wav'
1. 'fe_03_00401.wav'
2. 'fe_03_00557.wav'

### Features

We want to make sure that the features we will be calculating have the right shape.

The values ... well ... I hope you are using reliable ones, because we can't do that deterministically.

Finally, the validations will have to extract the features, and hence, will be slow.
You will have to settle for smaller set of pairs.

Do check for the pairs that caused issues earlier though!

In [31]:
def validate_featshape_detailed(pair):
    """Extract the features of `pair` and check their shapes against the expected ones.

    Both the plain feature shape, (expected_featlen, nmels), and the chunked shape
    from `expected_stridedfeat_shape` are checked.

    Raises
    ------
    AssertionError describing the first shape that does not match.
    """
    # NOTE: the unused locals `am, lb = pair.audio, pair.label` were dropped.
    ad = readaudiodata(pair)
    feat = extractfeat(ad)
    featshape = feat.shape

    strfeat = strided_feat(feat)
    strfeatshape = strfeat.shape

    # only the shapes are needed from here on
    del feat
    del strfeat

    # expectations
    xfeatshape = (expected_featlen(pair), nmels)
    xstrfeatshape = expected_stridedfeat_shape(pair)

    # assert that they are the expected shapes
    assert featshape == xfeatshape, "Mismatch in featshape: {} v/s {}".format(featshape, xfeatshape)
    assert strfeatshape == xstrfeatshape, "Mismatch in strfeatshape: {} v/s {}".format(strfeatshape, xstrfeatshape)
    
    
def validate_featshape_detailed_pairs(pairs):
    """Run `validate_featshape_detailed` on every pair, printing (not raising) each failure."""
    for idx, pair in enumerate(pairs):
        try:
            validate_featshape_detailed(pair)
        except AssertionError as err:
            # position in `pairs`, file name, failure message, then a blank line
            print('AssertionErrors with pair at {}'.format(idx),
                  fn_for_fp(pair.audio.filepath), err, '', sep='\n')

In [32]:
# This might be a long running one ... choose few

validate_featshape_detailed_pairs(val_pairs)  # use val_pairs[:5] for a quicker spot check

# No printouts means all okay with the shape of the features ... hopefully

In [33]:
# Do check for the earlier problem ones!
# (the train pairs whose audio is shorter than their labels)

validate_featshape_detailed_pairs([trn_pairs[i] for i in trn_with_err])

# No printouts means all okay with the shape of the features ... hopefully

### Labels

Validate the label shapes.

In [34]:
def validate_labelshape(pair):
    """Read the labels of `pair` and check their plain and chunked shapes.

    Expected shapes are (expected_featlen, 2) and `expected_stridedlabel_shape`.

    Raises
    ------
    AssertionError describing the first shape that does not match.
    """
    labels = readlabelsdata(pair)
    got = labels.shape
    got_strided = strided_label(labels).shape

    want = (expected_featlen(pair), 2)
    want_strided = expected_stridedlabel_shape(pair)

    assert got == want, "Mismatch in labelshape: {} v/s {}".format(got, want)
    assert got_strided == want_strided, "Mismatch in strlabelshape: {} v/s {}".format(got_strided, want_strided)
    
def validate_labelshape_pairs(pairs):
    """Run `validate_labelshape` on every pair, printing (not raising) each failure."""
    for idx, pair in enumerate(pairs):
        try:
            validate_labelshape(pair)
        except AssertionError as err:
            # position in `pairs`, file name, failure message, then a blank line
            print('AssertionErrors with pair at {}'.format(idx),
                  fn_for_fp(pair.audio.filepath), err, '', sep='\n')

In [35]:
# This should be quick
validate_labelshape_pairs(val_pairs)

# No printouts means all okay with the shape of the labels ... hopefully

In [36]:
# This should be quick
validate_labelshape_pairs(trn_pairs)

# No printouts means all okay with the shape of the labels ... hopefully

In [37]:
# This should be quick
validate_labelshape_pairs(tst_pairs)

# No printouts means all okay with the shape of the labels ... hopefully

### Dask Steps

In [38]:
def validate_dasksteps_detailed(pair):
    """Check that the dask pipeline built by `steps` matches the plain numpy pipeline.

    Verifies, for one pair: the HDF5 dataset paths, the chunking and shape of the
    dask arrays, and finally the computed label and feature values.

    Raises
    ------
    AssertionError describing the first check that fails.
    """
    xsfs = expected_stridedfeat_shape(pair)
    xsls = expected_stridedlabel_shape(pair)
    
    # Expected dask chunks after concatenation: one chunk of length xs*s[1] per stride
    # along the first axis, and a single chunk spanning each remaining axis.
    xdatachunks = ((xsfs[1], ) * xsfs[0], xsfs[2:])
    xlabelchunks = ((xsls[1], ) * xsls[0], xsls[2:])
    
    # Expected shapes after concatenation: the first two axes are merged into one.
    xdatashape = (xsfs[0] * xsfs[1], ) + xsfs[2:]
    xlabelshape = (xsls[0] * xsls[1], ) + xsls[2:]
    
    # Expected dataset paths, rebuilt here independently of `steps`.
    callid = pair.label.callid
    groupid = fe.groupid_for_callid(callid)
    xap = "{}/{}/{}".format('audios', groupid, callid)
    xlp = "{}/{}/{}".format('labels', groupid, callid)
    
    (ap, a), (lp, l) = steps(pair)
    
    # assert paths in h5
    assert ap == xap, "Mismatch in audios path: {} v/s {}".format(ap, xap)
    assert lp == xlp, "Mismatch in labels path: {} v/s {}".format(lp, xlp)
    
    # assert chunking
    assert a.chunks == xdatachunks, "Mismatch in audio chunks:\n{} v/s\n{}".format(a.chunks, xdatachunks)
    assert l.chunks == xlabelchunks, "Mismatch in label chunks:\n{} v/s\n{}".format(l.chunks, xlabelchunks)
    
    # assert shape
    assert a.shape == xdatashape, "Mismatch in audio shape: {} v/s {}".format(a.shape, xdatashape)
    assert l.shape == xlabelshape, "Mismatch in label shape: {} v/s {}".format(l.shape, xlabelshape)
    
    # assert label values
    # The reference is computed with the plain functions, then compared with the dask result.
    xlabels = np.concatenate(strided_label(readlabelsdata(pair)))
    labels = l.compute()
    assert np.all(labels == xlabels), "Mismatch in labels data"
    # release the label arrays before the feature arrays are computed
    del xlabels
    del labels
    
    # assert audio values
    # Features are floats, hence allclose instead of exact equality.
    xdata = np.concatenate(strided_feat(extractfeat(readaudiodata(pair))))
    data = a.compute()
    assert np.allclose(data, xdata), "Mismatch in audio data"
    del xdata
    del data
    
def validate_dasksteps_detailed_pairs(pairs):
    """Run `validate_dasksteps_detailed` over `pairs`, reporting failures without stopping.

    For every pair whose validation raises an AssertionError, the pair's
    position, the name derived from its audio filepath, and the assertion
    message are printed, followed by an empty line. Other exceptions propagate.
    """
    for idx, candidate in enumerate(pairs):
        try:
            validate_dasksteps_detailed(candidate)
        except AssertionError as failure:
            print('AssertionErrors with pair at {}'.format(idx))
            # name, then the message, then one empty separator line
            print(fn_for_fp(candidate.audio.filepath), failure, sep='\n', end='\n\n')

In [188]:
# The detailed check computes the dask results and the references, so it may run long;
# only the first 5 validation pairs are checked here
validate_dasksteps_detailed_pairs(val_pairs[:5])

# No printed AssertionErrors means all okay with the dask steps results

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.5s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.7s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.7s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  1.0s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s


In [187]:
# May run long, so only the training pairs at the indices listed in `trn_with_err` are checked
validate_dasksteps_detailed_pairs([trn_pairs[i] for i in trn_with_err])

# No printed AssertionErrors means all okay with the dask steps results

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.6s


## HDF5 Export

If all the pre-flight checks passed, it's time we finally export the HDF5 files.

We check the exported results afterwards. However, an export that takes unusually long is already a warning sign.

In [192]:
# pr.clear()
# rpr.clear()

In [39]:
%%time
# Print the target path for the record, then run the job for the validation pairs into that file
print(val_h5)
do_job_for_split(val_pairs, tofile=val_h5)
# Show what the dask profilers recorded for this run ...
dg.visualize([pr, rpr])
# ... and clear both so the next run does not carry these records along
pr.clear()
rpr.clear()

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/val.h5
[########################################] | 100% Completed |  1min  1.3s


CPU times: user 1min 58s, sys: 1min 5s, total: 3min 4s
Wall time: 1min 1s


In [40]:
%%time
# Print the target path for the record, then run the job for the training pairs into that file
print(trn_h5)
do_job_for_split(trn_pairs, tofile=trn_h5)
# Show what the dask profilers recorded for this run ...
dg.visualize([pr, rpr])
# ... and clear both so the next run does not carry these records along
pr.clear()
rpr.clear()

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/trn.h5
[########################################] | 100% Completed | 38min 26.6s


CPU times: user 22min 24s, sys: 12min 21s, total: 34min 46s
Wall time: 38min 45s


In [41]:
%%time
# Print the target path for the record, then run the job for the test pairs into that file
print(tst_h5)
do_job_for_split(tst_pairs, tofile=tst_h5)
# Show what the dask profilers recorded for this run ...
dg.visualize([pr, rpr])
# ... and clear both so later runs do not carry these records along
pr.clear()
rpr.clear()

/home/aabdullah/delve/rennet-x/data/working/fisher/fe_03_p1/wav-8k-mono-000_012/pickles/20170804-logmel64-win32ms-hop10ms/tst.h5
[########################################] | 100% Completed | 20min 54.2s


CPU times: user 10min 33s, sys: 4min 59s, total: 15min 33s
Wall time: 20min 57s


## Check HDF5 & Add Label Infos

```
fisher.CallData(callid='00396', topicid='ENG28', signalgrade=4.0, convgrade=3.0, channelspeakers=
		[fisher.Speaker(pin='2215', gender='f', dialect='o'), 
		 fisher.Speaker(pin='4722', gender='f', dialect='a')])
         
```

In [42]:
def checkh5_addinfo(pair, h5):
    """Check one call's data in an exported HDF5 file, then attach call metadata.

    The total length and the per-chunk lengths reported by the data provider
    for this call are compared with the expected strided feature/label
    shapes. If all checks pass, speaker and call information from
    `pair.label.calldata` is written as attributes on the call's label
    dataset.

    Parameters
    ----------
    pair
        An audio/label pair; `pair.label` supplies `callid` and `calldata`.
    h5
        Path of the HDF5 file. It is handed to the data provider for reading
        and then opened in append mode to write the attributes.

    Raises
    ------
    AssertionError
        When a total length or a chunk length differs from the expected one.
        No attributes are written in that case.
    """
    xsfs = expected_stridedfeat_shape(pair)
    xsls = expected_stridedlabel_shape(pair)

    # Expected rows per chunk, and expected total rows (chunk count * rows per chunk)
    xdatachunklen = xsfs[1]
    xlabelchunklen = xsls[1]
    xdatalen = xsfs[0] * xsfs[1]
    xlabellen = xsls[0] * xsls[1]

    l = pair.label
    # NOTE(review): presumably restricts the provider to this single call — confirm in rennet
    dp = fe.PerSampleDataProvider.for_callids(
        h5,
        l.callid,
    )

    # The messages print the two lengths that are actually being compared
    assert dp.totlen == xdatalen, "Mismatch in audio len: {} v/s {}".format(dp.totlen, xdatalen)
    assert dp.totlen == xlabellen, "Mismatch in label len: {} v/s {}".format(dp.totlen, xlabellen)

    for c in dp.chunkings:
        lend = c.dataslice[0].stop - c.dataslice[0].start
        assert lend == xdatachunklen, "Mismatch in audio chunking: {} v/s {}".format(lend, xdatachunklen)

        lenl = c.labelslice[0].stop - c.labelslice[0].start
        assert lenl == xlabelchunklen, "Mismatch in label chunking: {} v/s {}".format(lenl, xlabelchunklen)

    # Add infos
    lp = dp.chunkings[0].labelpath
    calldata = l.calldata
    speakers = calldata.channelspeakers

    with h.File(h5, 'a') as f:
        attrs = f[lp].attrs

        # np.bytes_ is the same type the removed np.string_ alias pointed to,
        # so the stored values are unchanged and this also runs on NumPy >= 2.0
        attrs['speaker_pins'] = np.array([np.bytes_(s.pin) for s in speakers])
        attrs['speaker_genders'] = np.array([np.bytes_(s.gender) for s in speakers])
        attrs['speaker_dialects'] = np.array([np.bytes_(s.dialect) for s in speakers])

        attrs['topicid'] = calldata.topicid
        attrs['signalgrade'] = calldata.signalgrade
        attrs['convgrade'] = calldata.convgrade

        # TODO: dimension labels and scales are not written. An earlier attempt
        # to label axis 1 of the label dataset ('active_speaker_channel', scale
        # [0, 1]) and of the audio dataset ('mel_frequencies', scale `melfreq`)
        # was abandoned and left disabled; the reason is not recorded here.

        f.flush()
        
    
def checkh5_addinfo_pairs(pairs, h5):
    """Apply `checkh5_addinfo` to each of `pairs` against `h5`, stopping at the first failure.

    When a check raises an AssertionError, the pair's position and the name
    derived from its audio filepath are printed before the error is re-raised.
    """
    for idx, candidate in enumerate(pairs):
        try:
            checkh5_addinfo(candidate, h5)
        except AssertionError:
            # identify the offending pair, then let the failure propagate
            print('AssertionErrors with pair at {}'.format(idx))
            print(fn_for_fp(candidate.audio.filepath))
            raise

In [43]:
# Check every validation pair against val_h5 and add its call info; stops at the first failed check
checkh5_addinfo_pairs(val_pairs, val_h5)

In [44]:
# Check every training pair against trn_h5 and add its call info; stops at the first failed check
checkh5_addinfo_pairs(trn_pairs, trn_h5)

In [45]:
# Check every test pair against tst_h5 and add its call info; stops at the first failed check
checkh5_addinfo_pairs(tst_pairs, tst_h5)