In [1]:
import os
import soundfile
import librosa

import tensorflow as tf
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim
slim = tf.contrib.slim

import numpy as np
import h5py

from collections import Counter
from pathlib import Path

import boto3

In [2]:
SEQ_LEN = 9
NNDATA_DIR = '/media/Extra_Drive/fushigi_nn_data'
NNDATA_NAME = '13Feb2019'
DATA_DIR = '/media/Extra_Drive/fushigi'
BUCKET = 'fushigi-audio'
USERS = ['dane']

In [3]:
# get users file names.
boto3.setup_default_session(profile_name='hlnetworks')
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
user2keys = {}
bucket = s3_resource.Bucket(BUCKET)
for o in bucket.objects.all():
    if not o.key.endswith('.pcm'):
        continue
    user = o.key.split('/')[0]
    if user not in USERS:
        continue
    if user not in user2keys:
        user2keys[user] = []
    user2keys[user].append(o.key)

In [4]:
def read_audio(path):
    (audio, fs) = soundfile.read(path)

    #if audio.ndim > 1:
    #    audio = np.mean(audio, axis=1)
        
    #if target_fs is not None and fs != target_fs:
    #    audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs)
    #    fs = target_fs
        
    return audio, fs

In [5]:
"""Extract log mel spectrogram features. 
"""

# Arguments & parameters
#mel_bins = vggish_params.NUM_BANDS
#sample_rate = vggish_params.SAMPLE_RATE
#input_len = vggish_params.NUM_FRAMES
#embedding_size = vggish_params.EMBEDDING_SIZE

'''You may modify the EXAMPLE_HOP_SECONDS in vggish_params.py to change the 
hop size. '''

# Paths
checkpoint_path = os.path.join('../vggish_model.ckpt')
pcm_params_path = os.path.join('../vggish_pca_params.npz')

if not os.path.isfile(checkpoint_path):
    raise Exception('Please download vggish_model.ckpt from '
        'https://storage.googleapis.com/audioset/vggish_model.ckpt '
        'and put it in the root of this codebase. ')

if not os.path.isfile(pcm_params_path):
    raise Exception('Please download pcm_params_path from '
    'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
    'and put it in the root of this codebase. ')

# Load model
sess = tf.Session()

vggish_slim.define_vggish_slim(training=False)
vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

pproc = vggish_postprocess.Postprocessor(pcm_params_path)

INFO:tensorflow:Restoring parameters from ../vggish_model.ckpt


In [6]:
# Read audio and map to embeddings
import re
def get_fname(key):
    return key.split('/')[-1]

outputs = []
clip_ids = []
errors = []

nndata_path = Path(NNDATA_DIR)
nndata_path.mkdir(exist_ok=True)
datapath = Path(DATA_DIR)

for user in USERS:
    for k, key in enumerate(user2keys[user]):
        pcm_fname = get_fname(key)
        audio_dir = datapath/user/pcm_fname  # (<-- a directory of wav files)
        ordered_fnames = sorted(os.listdir(audio_dir), 
                                key=lambda fn: int(re.search('(\d+).wav', fn).group(1))
                               )
        for wav_fname in ordered_fnames:
            try:
                audio_path = os.path.join(audio_dir, wav_fname)
                audio, sample_rate = read_audio(audio_path)

                # Extract log mel feature
                logmel = vggish_input.waveform_to_examples(audio, sample_rate)

                # Extract embedding feature
                [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: logmel})

                # PCA
                postprocessed_batch = pproc.postprocess(embedding_batch)

                clip_id = "%s_%s_%s" % (user, pcm_fname, wav_fname)
                clip_ids.append(clip_id)
                outputs.append(postprocessed_batch)
            except:
                print('error processing: ', str(audio_dir/wav_fname))
                errors.append(str(audio_dir/wav_fname))

        print('extracted embeddings from ', k, ' of ', len(user2keys[user]), ' recordings')

extracted embeddings from  0  of  48  recordings
extracted embeddings from  1  of  48  recordings
extracted embeddings from  2  of  48  recordings
extracted embeddings from  3  of  48  recordings
extracted embeddings from  4  of  48  recordings
extracted embeddings from  5  of  48  recordings
extracted embeddings from  6  of  48  recordings
extracted embeddings from  7  of  48  recordings
extracted embeddings from  8  of  48  recordings
extracted embeddings from  9  of  48  recordings
extracted embeddings from  10  of  48  recordings
error processing:  /media/Extra_Drive/fushigi/dane/recording_08_32_58_20190211.pcm/out00032.wav
extracted embeddings from  11  of  48  recordings
extracted embeddings from  12  of  48  recordings
extracted embeddings from  13  of  48  recordings
extracted embeddings from  14  of  48  recordings
extracted embeddings from  15  of  48  recordings
extracted embeddings from  16  of  48  recordings
extracted embeddings from  17  of  48  recordings
extracted embe

In [15]:
errors

['/media/Extra_Drive/fushigi/dane/recording_08_32_58_20190211.pcm/out00032.wav',
 '/media/Extra_Drive/fushigi/dane/recording_12_25_34_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_13_09_17_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_13_31_08_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_13_52_59_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_14_14_50_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_14_36_41_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_14_58_32_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_16_25_57_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_16_47_49_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_17_09_40_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi/dane/recording_17_31_31_20190208.pcm/out00262.wav',
 '/media/Extra_Drive/fushigi

In [16]:
num_feats = set([o.shape[1] for o in outputs])
assert len(num_feats) == 1
num_feats = num_feats.pop()

for i, o in enumerate(outputs):
    l = o.shape[0]
    if l > SEQ_LEN:
        outputs[i] = o[:SEQ_LEN]
    elif l < SEQ_LEN:
        outputs[i] = np.vstack([o, np.zeros([SEQ_LEN - l, num_feats])])

In [17]:
outshape = set([o.shape for o in outputs])
assert len(outshape) == 1
outshape

{(9, 128)}

In [18]:
x = np.dstack(outputs)
x = np.swapaxes(x, 0,2)
x = np.swapaxes(x, 1,2)
x.shape

(6212, 9, 128)

In [19]:
len(outputs), len(clip_ids)

(6212, 6212)

In [20]:
assert len(outputs) == len(clip_ids)

In [21]:
# create fake y's
y = np.array([[False]*527]*x.shape[0])
y.shape

(6212, 527)

In [22]:
with h5py.File((Path(NNDATA_DIR)/NNDATA_NAME).with_suffix('.h5'), 'w') as f:
    f.create_dataset('x', data=x)
    f.create_dataset('y', data=y)
    f.create_dataset('video_id_list', 
                     data=[bytes(f, encoding='utf8') for f in clip_ids])

In [23]:
clip_ids[:100]

['dane_recording_07_32_50_20190211.pcm_out00000.wav',
 'dane_recording_07_32_50_20190211.pcm_out00001.wav',
 'dane_recording_07_32_50_20190211.pcm_out00002.wav',
 'dane_recording_07_32_50_20190211.pcm_out00003.wav',
 'dane_recording_07_32_50_20190211.pcm_out00004.wav',
 'dane_recording_07_32_50_20190211.pcm_out00005.wav',
 'dane_recording_07_32_50_20190211.pcm_out00006.wav',
 'dane_recording_07_32_50_20190211.pcm_out00007.wav',
 'dane_recording_07_32_50_20190211.pcm_out00008.wav',
 'dane_recording_07_32_50_20190211.pcm_out00009.wav',
 'dane_recording_07_32_50_20190211.pcm_out00010.wav',
 'dane_recording_07_32_50_20190211.pcm_out00011.wav',
 'dane_recording_07_32_50_20190211.pcm_out00012.wav',
 'dane_recording_07_32_50_20190211.pcm_out00013.wav',
 'dane_recording_07_32_50_20190211.pcm_out00014.wav',
 'dane_recording_07_32_50_20190211.pcm_out00015.wav',
 'dane_recording_07_32_50_20190211.pcm_out00016.wav',
 'dane_recording_07_32_50_20190211.pcm_out00017.wav',
 'dane_recording_07_32_50_20