In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch

from tfrecord.torch.dataset import TFRecordDataset

In [2]:
sys.path.append('..')

from preprocess import get_all_parents, preprocess_ontology
from utils import get_project_root

In [3]:
data_dir = get_project_root()

In [4]:
def no_sufix(query_id):
    sufix = query_id.split('_')[-1]
    return query_id[:-len(sufix) - 1]

In [6]:
audioset_train_strong = pd.read_csv(os.path.join(data_dir, 'metadata/audioset_train_strong.tsv'), sep='\t')
audioset_train_strong['video_id'] = audioset_train_strong['segment_id'].map(no_sufix)
audioset_train_strong = audioset_train_strong[['video_id', 'start_time_seconds', 'end_time_seconds', 'label']]

In [7]:
audioset_train_strong.head()

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,label
0,b0RFKhbpFJA,0.0,10.0,/m/03m9d0z
1,b0RFKhbpFJA,4.753,5.72,/m/05zppz
2,b0RFKhbpFJA,0.0,10.0,/m/07pjwq1
3,b0RFKhbpFJA,6.899,7.01,/m/07qjznt
4,b0RFKhbpFJA,8.534,9.156,/t/dd00092


In [8]:
index_to_id = {}
index_to_classname = {}
id_to_index = {}
class_name_to_index = {}
class_labels_indices = pd.read_csv(os.path.join(data_dir, 'metadata/class_labels_indices.csv'))

def add_to_index_map(series):
    index_to_id[series['index']] = series['mid']
    id_to_index[series['mid']] = series['index']
    index_to_classname[series['index']] = series['display_name']
    class_name_to_index[series['display_name']] = series['index']

class_labels_indices.apply(add_to_index_map, axis=1)
len(id_to_index)


527

In [9]:
context_description = {"video_id": "byte", "labels": "int"}
sequence_description = {"audio_embedding": "byte"}

ont, highest_category = preprocess_ontology(os.path.join(data_dir, 'metadata/'))

In [10]:
class_1 = []
class_2 = []
for id_ in highest_category:
    class_2.append(ont[id_]['name'])
    for id1_ in ont[id_]['child_ids']:
        class_1.append(ont[id1_]['name'])


class_1 = np.unique(class_1)
class_1 = np.concatenate((class_1[:29], class_1[30::]))
class_2 = np.unique(class_2)

print(class_1)
print(class_2)

class_to_index_1 = {class_1[i]: i for i in range(len(class_1))}
class_to_index_2 = {class_2[i]: i for i in range(len(class_2))}


['Acoustic environment' 'Alarm' 'Bell' 'Deformable shell' 'Digestive'
 'Domestic animals, pets' 'Domestic sounds, home sounds' 'Engine'
 'Explosion' 'Fire' 'Generic impact sounds' 'Glass' 'Hands'
 'Heart sounds, heartbeat' 'Human group actions' 'Human locomotion'
 'Human voice' 'Liquid' 'Livestock, farm animals, working animals'
 'Mechanisms' 'Miscellaneous sources' 'Music genre' 'Music mood'
 'Music role' 'Musical concepts' 'Musical instrument' 'Noise'
 'Onomatopoeia' 'Other sourceless' 'Respiratory sounds' 'Silence'
 'Sound reproduction' 'Specific impact sounds' 'Surface contact'
 'Thunderstorm' 'Tools' 'Vehicle' 'Water' 'Whistling' 'Wild animals'
 'Wind' 'Wood']
['Animal' 'Channel, environment and background' 'Human sounds' 'Music'
 'Natural sounds' 'Sounds of things' 'Source-ambiguous sounds']


In [11]:
def get_strong_labeled_data(features_dir, audioset_df_strong):
    video_id_set = set(audioset_df_strong['video_id'].values)
    tfrecord_files = os.listdir(data_dir + features_dir)
    data_strong = []
    labels_1_strong = []
    labels_2_strong = []
    data_point_added = 0
    for filename in tfrecord_files:
        tfrecord_path = os.path.join(data_dir, features_dir, filename)
        dataset = TFRecordDataset(tfrecord_path, index_path=None,
                                  description=context_description, sequence_description=sequence_description)
        loader = torch.utils.data.DataLoader(dataset, batch_size=1)

        for data in iter(loader):
            tag = data[0]
            video_id = "".join([chr(i) for i in tag['video_id'][0]])
            if video_id in video_id_set:
                labels_df = audioset_df_strong.loc[audioset_df_strong['video_id'] == video_id]
                # For audioset, where each data point is a 10 sec clip with features extracted at 1 Hz
                labels = [[] for _ in range(10)]

                def add_label(series):
                    starting_index = int(
                        np.floor(series['start_time_seconds']))
                    ending_index = int(np.ceil(series['end_time_seconds']))
                    if series['label'] in id_to_index.keys():
                        for i in range(starting_index, ending_index):
                            labels[i].append(id_to_index[series['label']])

                labels_df.apply(add_label, axis=1)

                label_1 = []
                label_2 = []
                for l in labels:
                    parent_names = []
                    for ll in l:
                        get_all_parents(index_to_id[ll], ont, parent_names)

                    class_1_label = list(set(class_1) & set(parent_names))
                    class_2_label = list(set(class_2) & set(parent_names))
                    target_1_label = [class_to_index_1[c]
                                      for c in class_1_label]
                    target_2_label = [class_to_index_2[c]
                                      for c in class_2_label]
                    label_1.append(target_1_label)
                    label_2.append(target_2_label)

                # Only add the data point if all segment have labeled in multiple levels
                if np.all([len(l) > 0 for l in label_1]) and np.all([len(l) > 0 for l in label_2]):
                    file_data = np.vstack([p.numpy()
                                          for p in data[1]['audio_embedding']])

                    # Edge pad the end of data point to of length 10 if needed
                    if len(file_data) < 10:
                        # import ipdb
                        # ipdb.set_trace()
                        file_data = np.concatenate(
                            [file_data, np.asanyarray([file_data[-1] for _ in range(10 - len(file_data))])])
                        # label_1 = np.concatenate(
                        #     [label_1, [label_1[-1] for _ in range(10 - len(label_1))]])
                        # label_2 = np.concatenate(
                        #     [label_2, [label_2[-1] for _ in range(10 - len(label_2))]])
                    data_strong.append(file_data)
                    labels_1_strong.append(label_1)
                    labels_2_strong.append(label_2)

                    data_point_added += 1
                    print(f'data point added: {data_point_added}')

    return np.array(data_strong), labels_1_strong, labels_2_strong


In [20]:
train_features_dir = 'audioset_v1_embeddings/bal_train/'
data_train_strong, labels_1_train_strong, labels_2_train_strong = get_strong_labeled_data(
    train_features_dir, audioset_train_strong)
data_train_strong.shape


data point added: 1
data point added: 2
data point added: 3
data point added: 4
data point added: 5
data point added: 6
data point added: 7
data point added: 8
data point added: 9
data point added: 10
data point added: 11
data point added: 12
data point added: 13
data point added: 14
data point added: 15
data point added: 16
data point added: 17
data point added: 18
data point added: 19
data point added: 20
data point added: 21
data point added: 22
data point added: 23
data point added: 24
data point added: 25
data point added: 26
data point added: 27
data point added: 28
data point added: 29
data point added: 30
data point added: 31
data point added: 32
data point added: 33
data point added: 34
data point added: 35
data point added: 36
data point added: 37
data point added: 38
data point added: 39
data point added: 40
data point added: 41
data point added: 42
data point added: 43
data point added: 44
data point added: 45
data point added: 46
data point added: 47
data point added: 48
d

(5169, 10, 128)

In [15]:
np.save(os.path.join(data_dir, 'data/audioset_strong_train_data.npy'), data_train_strong)
np.save(os.path.join(data_dir, 'data/audioset_strong_train_labels_1.npy'), labels_1_train_strong)
np.save(os.path.join(data_dir, 'data/audioset_strong_train_labels_2.npy'), labels_2_train_strong)


  return array(a, dtype, copy=False, order=order, subok=True)


In [12]:
audioset_eval_strong = pd.read_csv(
    os.path.join(data_dir, 'metadata/audioset_eval_strong.tsv'), sep='\t')
audioset_eval_strong['video_id'] = audioset_eval_strong['segment_id'].map(
    no_sufix)
audioset_eval_strong = audioset_eval_strong[[
    'video_id', 'start_time_seconds', 'end_time_seconds', 'label']]
audioset_eval_strong.head(10)


Unnamed: 0,video_id,start_time_seconds,end_time_seconds,label
0,s9d-2nhuJCQ,0.0,10.0,/m/04rlf
1,s9d-2nhuJCQ,2.627,7.237,/m/053hz1
2,s9d-2nhuJCQ,2.627,9.239,/m/03qtwd
3,s9d-2nhuJCQ,5.634,6.649,/m/01w250
4,s9d-2nhuJCQ,7.201,8.56,/m/0l15bq
5,s9d-2nhuJCQ,8.089,9.23,/m/01w250
6,YxlGt805lTA,0.0,9.378,/m/04rlf
7,YxlGt805lTA,2.331,2.591,/m/09l8g
8,YxlGt805lTA,2.782,3.905,/m/02zsn
9,YxlGt805lTA,3.523,3.714,/m/07rgkc5


In [15]:
eval_features_dir = 'audioset_v1_embeddings/eval/'
data_eval_strong, labels_1_eval_strong, labels_2_eval_strong = get_strong_labeled_data(
    eval_features_dir, audioset_eval_strong)
data_eval_strong.shape


data point added: 1
data point added: 2
data point added: 3
data point added: 4
data point added: 5
data point added: 6
data point added: 7
data point added: 8
data point added: 9
data point added: 10
data point added: 11
data point added: 12
data point added: 13
data point added: 14
data point added: 15
data point added: 16
data point added: 17
data point added: 18
data point added: 19
data point added: 20
data point added: 21
data point added: 22
data point added: 23
data point added: 24
data point added: 25
data point added: 26
data point added: 27
data point added: 28
data point added: 29
data point added: 30
data point added: 31
data point added: 32
data point added: 33
data point added: 34
data point added: 35
data point added: 36
data point added: 37
data point added: 38
data point added: 39
data point added: 40
data point added: 41
data point added: 42
data point added: 43
data point added: 44
data point added: 45
data point added: 46
data point added: 47
data point added: 48
d

(11564, 10, 128)

In [21]:
np.save(os.path.join(data_dir, 'data/audioset_strong_eval_data.npy'), data_eval_strong)
np.save(os.path.join(data_dir, 'data/audioset_strong_eval_labels_1.npy'), labels_1_eval_strong)
np.save(os.path.join(data_dir, 'data/audioset_strong_eval_labels_2.npy'), labels_2_eval_strong)


  return array(a, dtype, copy=False, order=order, subok=True)
