In [1]:
import pandas as pd
import numpy as np
import six
import soundfile
import tensorflow.compat.v1 as tf
import os
from tqdm.notebook import tqdm


from models.vggish import vggish_input, vggish_params, vggish_slim

In [2]:
# Input: directory holding one "<video_id>.wav" audio file per video.
AUDIO_DIR = "training_set/Audio"
# Output: directory that receives one "<video_id>.csv" embedding file per video.
VGGISH_EMBEDDING_DIR = "training_set/Features/VGGish"
# Checkpoint whose weights are restored into the VGGish graph before extraction.
PRETRAINED_VGGISH_CKPT = "models/vggish/vggish_model.ckpt"

In [3]:
# Only the "video_id" column of the scores table is needed: it drives which audio files get processed.
video_ids = pd.read_csv("training_set/scores_v2.csv").loc[:, "video_id"]

In [4]:
def extract_VGGish_features(video_ids, audio_dir = AUDIO_DIR, embedding_dir = VGGISH_EMBEDDING_DIR,
                            checkpoint_path = PRETRAINED_VGGISH_CKPT):
    """Extract VGGish embeddings from each video's audio file and save them as text files.

    For every id in `video_ids`, the file `{audio_dir}/{id}.wav` is converted to
    VGGish input examples, fed through the VGGish network restored from
    `checkpoint_path`, and the resulting embedding batch is written with
    `np.savetxt` to `{embedding_dir}/{id}.csv`. A video is skipped (with a printed
    message) when its audio file is missing or its embedding file already exists,
    so the function can be re-run to resume an interrupted extraction.

    Parameters
    ----------
    video_ids : iterable
        Identifiers used to build the wav and csv file names. Consumed once, so
        one-shot iterators are fine.
    audio_dir : str
        Directory containing the `<video_id>.wav` files.
    embedding_dir : str
        Directory receiving the `<video_id>.csv` files; created (including any
        missing parents) if it does not exist.
    checkpoint_path : str
        Path of the VGGish checkpoint to restore.

    Returns
    -------
    None
        Results are written to disk; min/max/mean embedding counts of the
        videos processed in this call are printed.
    """
    # makedirs (unlike mkdir) creates missing parent directories, and exist_ok
    # avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(embedding_dir, exist_ok=True)

    # Build both paths in a single pass so a generator of ids is not exhausted
    # by the first comprehension and left empty for the second.
    jobs = [
        (f"{audio_dir}/{video_id}.wav", f"{embedding_dir}/{video_id}.csv")
        for video_id in video_ids
    ]

    lengths = []
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training = False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

        for wav_filename, embedding_filename in tqdm(jobs):
            if os.path.exists(wav_filename) and not os.path.exists(embedding_filename):
                examples_batch = vggish_input.wavfile_to_examples(wav_filename)
                [embedding_batch] = sess.run([embedding_tensor],
                                            feed_dict={features_tensor: examples_batch})
                lengths.append(embedding_batch.shape[0])
                # Write to a temporary file and rename it into place: an interrupted
                # run must not leave a truncated csv that later runs would treat as
                # already extracted.
                tmp_filename = embedding_filename + ".tmp"
                np.savetxt(tmp_filename, embedding_batch)
                os.replace(tmp_filename, embedding_filename)
            else:
                print(f"Skipped extracting {embedding_filename} from {wav_filename}")

    # Statistics only cover videos processed in this call; nothing to report
    # (and np.max/np.min would fail) when every video was skipped.
    if lengths:
        print("Max number of VGGish embedding for a video:", np.max(lengths))
        print("Min number of VGGish embedding for a video:", np.min(lengths))
        print("Avg number of VGGish embedding for a video:", np.mean(lengths))

In [5]:
# Run the extraction for every id loaded from the scores file, using the default
# audio/embedding directories; videos with a missing wav or an existing csv are skipped.
extract_VGGish_features(video_ids)

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Flatten instead.
INFO:tensorflow:Restoring parameters from models/vggish/vggish_model.ckpt


HBox(children=(FloatProgress(value=0.0, max=590.0), HTML(value='')))

Skipped extracting training_set/Features/VGGish/8.csv from training_set/Audio/8.wav
Skipped extracting training_set/Features/VGGish/26.csv from training_set/Audio/26.wav
Skipped extracting training_set/Features/VGGish/33.csv from training_set/Audio/33.wav
Skipped extracting training_set/Features/VGGish/46.csv from training_set/Audio/46.wav
Skipped extracting training_set/Features/VGGish/64.csv from training_set/Audio/64.wav
Skipped extracting training_set/Features/VGGish/70.csv from training_set/Audio/70.wav
Skipped extracting training_set/Features/VGGish/74.csv from training_set/Audio/74.wav
Skipped extracting training_set/Features/VGGish/117.csv from training_set/Audio/117.wav
Skipped extracting training_set/Features/VGGish/139.csv from training_set/Audio/139.wav
Skipped extracting training_set/Features/VGGish/143.csv from training_set/Audio/143.wav
Skipped extracting training_set/Features/VGGish/148.csv from training_set/Audio/148.wav
Skipped extracting training_set/Features/VGGish/