In [1]:
import os

import tensorflow as tf
import numpy as np

def round_even(x):
    if x % 2 == 0:
        return x + 1
    else:
        return x + 2
    
class ConvRegressor(tf.keras.Model):
    def __init__(
        self, filters, kernel_size, 
        conv_type=tf.keras.layers.Conv2D, activation=tf.nn.relu, 
        **kwargs
    ):
        super().__init__()
        self.filters = filters
        self.conv_type = conv_type
        self.convs = []
        for n_filter in filters:
            self.convs.append(
                conv_type(n_filter, round_even(kernel_size), strides=2, padding='SAME', activation=activation, **kwargs)
            )
            self.convs.append(conv_type(n_filter, kernel_size, padding='SAME', activation=activation, **kwargs))
        self.convs.append(tf.keras.layers.Conv2D(n_filter, kernel_size, padding='SAME'))

    @tf.function
    def call(self, x, training=False):
        y = x
        for conv in self.convs:
            y = conv(y, training=training)
        return y


In [2]:
import importlib
from libs.layers import resnet
from libs.layers import cbam_resnet
from libs.layers import conv_attention_resnet

ResNet = resnet.ResNet34
ResNetCBAM = cbam_resnet.ResNetCBAM34
AttentionResNet = conv_attention_resnet.AttentionResNet34
AxialAttentionResNet = conv_attention_resnet.AxialAttentionResNet34

In [3]:
fold_index = 0
task = 'home'
model_type = 'attention'

model_dir = './model'
data_dir = './data'

save_without_train = True
allow_memory_growth = False

In [4]:
tf.config.experimental.set_memory_growth(
    tf.config.list_physical_devices('GPU')[0], allow_memory_growth
)

In [5]:
from pathlib import Path
model_dir = Path(model_dir)
data_dir = Path(data_dir)
model_dir.mkdir(exist_ok=True, parents=True)

In [6]:
# Load training setup
import pandas as pd
from pathlib import Path
from libs.misc import wavio
from tqdm.notebook import tqdm

root_dir = data_dir / 'evaluation_setup'

data_cache = {}

def get_annotation(task, fold_index, target):
    df = pd.read_csv(
        root_dir / f'{task}_fold{fold_index+1}_{target}.txt', sep='\t', 
        header=None, names=['file', 'class', 'start', 'end', 'event']
    )
    df['id'] = df['file'].apply(lambda x: Path(x).stem)
    return df

def load_dataset(target):
    if target not in data_cache:
        df = get_annotation(task, fold_index, target)
        wav_dict = {}
        for file in tqdm(df['file'].unique()):
            wav_dict[Path(file).stem] = wavio.readwav(str(data_dir / file))
        data_cache[target] = (df, wav_dict)
        return df, wav_dict
    else:
        return data_cache[target]
df, wav_dict = load_dataset('train')

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [7]:
from itertools import product
import numpy as np

sound_events = []

for fold, task_name in product([0,1,2,3], ['train', 'evaluate', 'test']):
    sound_events.extend(get_annotation('home', fold, task_name)['event'].unique())
sound_events = np.unique(sound_events)
print(sound_events)

['(object) rustling' '(object) snapping' 'cupboard' 'cutlery' 'dishes'
 'drawer' 'glass jingling' 'nan' 'object impact' 'people walking'
 'washing dishes' 'water tap running']


In [8]:
from functools import partial

n_augmentation = 5
perturbation = (0.0, 0.05)

def parse_wave(series, wav_dict, perturbation=(0.0, 0.0), offset=0.0):
    sp, ep = np.random.normal(loc=perturbation[0], scale=perturbation[1], size=2)
    
    result = series.to_dict()
    sr, bw, audio = wav_dict[series['id']]
    result['start_perturbation'] = sp
    result['end_perturbation'] = ep
    
    start = int((series['start'] - sp + offset) * sr)
    end = int((series['end'] + ep + offset) * sr)
    result['start_index'] = max(start, 0)
    result['end_index'] = min(end, len(audio))
    result['sr'] = sr
    result['bw'] = bw
    result['audio'] = audio[result['start_index']:result['end_index']]
    return pd.Series(result)

def parse_wave_multiple(series, wav_dict, window_size, hop_size, perturbation=(0.0, 0.0)):
    dt = series['end'] - series['start']
    result = []
    if dt > window_size:
        for start in np.arange(0, dt, hop_size):
            if start + window_size > dt:
                break
            result.append(parse_wave(series, wav_dict, perturbation, offset=start))
    else:
        result.append(parse_wave(series, wav_dict, perturbation=perturbation))
    return result        

def parse_wave_with_augmentation(series, wav_dict, n_augmentation, with_multiple=False, **kwargs):
    if with_multiple:
        result = []
        for i in range(n_augmentation):
            result.extend(parse_wave_multiple(series, wav_dict, perturbation=perturbation, **kwargs))
        df = pd.DataFrame(result)
        df['original_index'] = series.name
    else:
        df = pd.DataFrame([
            parse_wave(series, wav_dict, perturbation=perturbation)
            for i in range(n_augmentation)
        ])
        df['original_index'] = series.name
    return df

_id = df.loc[0, 'id']
sr = wav_dict[_id][0]
# display(Audio(wav_dict[_id][2][:, 0], rate=sr))
# display(Audio(
#     parse_wave(df.loc[0], wav_dict=wav_dict)['audio'][:, 0], rate=sr
# ))
# audio_df = df.apply(partial(parse_wave, wav_dict=wav_dict, window_size=5.0, hop_size=1.25), axis=1)

In [9]:
from functools import partial
import librosa

n_mels = 128
n_sampling=4096
hop_length = n_sampling // 4
window='hann'
pad='constant'
max_time = 1.0
batch_size = 2

def preprocess(wav, sampling_rate):
    return np.concatenate([
        mono_preprocess(wav[..., 0], sampling_rate)[..., np.newaxis],
        mono_preprocess(wav[..., 1], sampling_rate)[..., np.newaxis],
    ], axis=-1)
    
def mono_preprocess(wav, sampling_rate):
#     F = librosa.stft(wav, n_sampling, hop_length=hopsize, window=window, pad_mode=pad)
#     mag = np.abs(F)[..., np.newaxis]
#     phase = np.angle(F)[..., np.newaxis]
#     mag_phase = np.concatenate([mag, phase], axis=2)
    
    mag = librosa.feature.melspectrogram(
        wav, sr=sampling_rate, hop_length=hop_length, n_mels=n_mels,
        fmin=0.0, fmax=20e3,
    )
    logmag = np.log(mag + 1e8)
    return logmag

def normalize_time(audio_df, max_time):
    results = []
    for audio, sr in zip(audio_df['audio'], audio_df['sr']):
        max_len = int(sr * max_time)

        pos = min(len(audio), max_len)
        result = np.zeros((max_len, 2), np.float32)
        result[:pos, :] = audio[:pos]
        results.append(result)
    return results


def get_dataset(target, n_augmentation=1, with_original=False):
    df, wav_dict = load_dataset(target)
    if n_augmentation != 1:
        audio_df = df.groupby(level=0).apply(
            lambda df: parse_wave_with_augmentation(
                df.iloc[0],
                wav_dict=wav_dict, n_augmentation=n_augmentation,
            )
        ).reset_index()
    else:
        results = []
        for i in range(len(df)):
            results.extend(
                parse_wave_multiple(
                    df.loc[i], wav_dict=wav_dict, window_size=max_time, 
                    hop_size=max_time / 4
                )
            )
        audio_df = pd.DataFrame(results)

    encoding_dict = {c: i for i, c in enumerate(sound_events)}

    normed_audio = normalize_time(audio_df, max_time=5.0)
    audios = np.concatenate([
        preprocess(audio, sr)[np.newaxis] 
        for audio, sr in tqdm(zip(normed_audio, audio_df['sr']), total=len(normed_audio))
    ], axis=0)
    
    events = audio_df['event'].replace({
        event: i for i, event in enumerate(sound_events)
    })
    if with_original:
        return (
            audios.astype(np.float32), 
            np.array([audio for audio in normed_audio]).astype(np.float32),
            events.to_numpy().astype(np.int32),
        )
    else:
        return audios.astype(np.float32), events.to_numpy().astype(np.int32)

def get_tf_dataset(target, batch_size=32, shuffle=False, **kwargs):
    data = get_dataset(target, **kwargs)
    ds = tf.data.Dataset.from_tensor_slices(data).batch(
        batch_size, drop_remainder=shuffle)
    if shuffle:
        ds = ds.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
    return ds


In [10]:
train_dataset = get_tf_dataset('train', batch_size=batch_size, n_augmentation=1, with_original=True)
test_dataset = get_tf_dataset('evaluate', batch_size=batch_size, with_original=True)

HBox(children=(FloatProgress(value=0.0, max=1885.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1046.0), HTML(value='')))




In [11]:
from collections import defaultdict

class PlotCallback(tf.keras.callbacks.Callback):
    is_higher_better = {
        'accuracy'
    }
    is_linear = {
        'accuracy',
        'sparse_categorical_accuracy',
    }
    def __init__(self, targets=None, n_step=1):
        super().__init__()
        self.fig = None
        self.axes = None
        self.axes_index = {}
        self.n_step = n_step
        self.targets = targets
        self.epochs = []
        self.history = defaultdict(list)

    def plot_and_display(self):
        for ax in self.axes.flat:
            ax.clear()
        for i, (label, values) in enumerate(self.history.items()):
            if any(name in label for name in self.is_higher_better):
                get_best_value = np.amax
            else:
                get_best_value = np.amin
            
            if label.startswith('val_'):
                _label = label[4:]
            else:
                _label = label
            
            ax = self.axes.flat[self.axes_index[_label]]
            ax.plot(self.epochs, values, label=label, color=f'C{i}')
            best_value = get_best_value(values)
            ax.axhline(best_value, linestyle='--', color=f'C{i}')
            ax.text(0.0, best_value, f'{best_value:.3f}')
            
            if _label not in self.is_linear:
                ax.set_yscale('log')

        if self.epochs[-1] == 0:
            self.fig.legend()

        io = BytesIO()
        self.fig.savefig(io, format='png')

        clear_output(wait=True)
        display_png(Image(io.getvalue()))
        
    def on_epoch_end(self, epoch, logs):
        if epoch == 0:
            self.fig, self.axes = plt.subplots(len(logs) // 2, 1, figsize=(8, 4 * len(logs) // 2))
            self.axes_index = {}
            for label in logs:
                if label.startswith('val_'):
                    _label = label[4:]
                else:
                    _label = label
                if _label not in self.axes_index:
                    self.axes_index[_label] = len(self.axes_index)

        for key, value in logs.items():
            self.history[key].append(value)

        self.epochs.append(epoch)
        if (epoch % self.n_step) == 0:
            self.plot_and_display()

class BalancedSparseCategoricalAccuracy(tf.keras.metrics.SparseCategoricalAccuracy):
    def __init__(self, name='balanced_sparse_categorical_accuracy', dtype=None):
        super().__init__(name, dtype=dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_flat = y_true
        if y_true.shape.ndims == y_pred.shape.ndims:
            y_flat = tf.squeeze(y_flat, axis=[-1])
        y_true_int = tf.cast(y_flat, tf.int32)

        cls_counts = tf.math.bincount(y_true_int)
        cls_counts = tf.math.reciprocal_no_nan(tf.cast(cls_counts, self.dtype))
        weight = tf.gather(cls_counts, y_true_int)
        return super().update_state(y_true, y_pred, sample_weight=weight)
    
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [12]:
def create_cnn_model():
    return tf.keras.Sequential([
        ConvRegressor([8, 16, 32, 64, 128, 256], kernel_size=7),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(len(sound_events)),
    ])

def create_resnet_cnn_model():
    return tf.keras.Sequential([
        ResNet(kernel_size=5),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(len(sound_events)),
    ])

def create_cbam_model():
    return tf.keras.Sequential([
        ResNetCBAM(kernel_size=5),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(len(sound_events)),
    ])

def create_attention_model():
    return tf.keras.Sequential([
        AttentionResNet(kernel_size=1),
        tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
        tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
        tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(len(sound_events)),
    ])

def create_axial_attention_model():
    return tf.keras.Sequential([
        AxialAttentionResNet(kernel_size=1),
        tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
        tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(1000, activation=tf.nn.relu),
        tf.keras.layers.Dense(len(sound_events)),
    ])

model_type = 'attention'
if model_type == 'cnn':
    learning_rate = 1e-4
    create_model = create_cnn_model
elif model_type == 'resnet':
    learning_rate = 1e-4
    create_model = create_resnet_cnn_model
elif model_type == 'cbam':
    learning_rate = 1e-4
    create_model = create_cbam_model
elif model_type == 'attention':
    create_model = create_attention_model
    learning_rate = CustomSchedule(128)
elif model_type == 'axial-attention':
    learning_rate = 1e-4
    create_model = create_axial_attention_model
else:
    raise NotImplementedError()

In [13]:
# net = tf.keras.Sequential([
#         AttentionResNet(kernel_size=1),
#         tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
#         tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
#         tf.keras.layers.Conv2D(1024, kernel_size=7, strides=2, activation=tf.nn.relu),
# ])
# print(net(np.zeros((4, 128, 216, 2), dtype=np.float32)).shape)

In [None]:
import IPython 
from collections import defaultdict 
from itertools import product
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import time
import seaborn as sns
from io import BytesIO
import imageio
from IPython.display import Image, display_png, clear_output

#%matplotlib widget
%matplotlib inline
#%matplotlib notebook
if 'model' in globals():
    del model
epochs = 2000
model = create_model()
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.5, beta_2=0.99)

checkpoint_name = f'{task}_cnn_{model_type}_fold{fold_index}'
cur_model_dir = model_dir / checkpoint_name
cur_model_dir.mkdir(exist_ok=True, parents=True)

plot_callback = PlotCallback(n_step=3)
with tf.device('/GPU:0'):
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.SUM),
        optimizer=optimizer,
        metrics=BalancedSparseCategoricalAccuracy(),
#         options=tf.distribute.RunOptions(report_tensor_allocations_upon_oom = True)
    )
    
    mode = 'max'
    model.fit(
        train_dataset.map(lambda *vars_list: (vars_list[0], vars_list[-1])),
        batch_size=batch_size, epochs=2000, shuffle=True,
        validation_data=test_dataset.map(lambda *vars_list: (vars_list[0], vars_list[-1])),
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                patience=10, 
                #monitor='val_sparse_categorical_accuracy',
                monitor='val_balanced_sparse_categorical_accuracy',
                mode=mode
            ),
            plot_callback,
            tf.keras.callbacks.TerminateOnNaN(),
            tf.keras.callbacks.ModelCheckpoint(
                str(cur_model_dir / (checkpoint_name + '.model')),
                monitor='val_balanced_sparse_categorical_accuracy', 
#                 monitor='val_loss',
                save_best_only=True,
                save_weights_only=True,
                mode=mode, 
            )
        ]
    )

plot_callback.fig.tight_layout()
plot_callback.fig.savefig(str(cur_model_dir / (checkpoint_name + '.png')))

Epoch 1/2000

In [None]:
from itertools import product
from sklearn.metrics import accuracy_score
accuracy_data = []
model = create_model()
model.load_weights(str(cur_model_dir / (checkpoint_name + '.model')))
model.compile()

results = {}
for target_name, dataset in zip(['train', 'test'], (train_dataset, test_dataset)):
    pred_logits = model.predict(
        dataset.map(lambda audios, norm_audios, labels: (audios, labels))
    )
    pred_labels = tf.argmax(tf.nn.softmax(pred_logits, axis=1), axis=1)
    
    audio = []
    truth_labels = []
    for batch in dataset:
        audio.extend(batch[1].numpy())
        truth_labels.extend(batch[2].numpy())
    metric = tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(
        tf.convert_to_tensor(np.array(truth_labels).astype(np.int32)), 
        tf.convert_to_tensor(pred_logits.astype(np.float32))
    )).numpy()

    truth_labels = np.array(sound_events).take(truth_labels)
    pred_labels = np.array(sound_events).take(pred_labels)
    
    agg_df = pd.crosstab(
        pd.Series(truth_labels, name='Truth'),
        pd.Series(pred_labels, name='Prediction'),
    )
    agg_df = agg_df.reindex(columns=sound_events, index=sound_events, fill_value=0)
    display(target_name)
    display(agg_df)
    
    accuracy = {}
    for name in sound_events:
        mask = truth_labels == name
        accuracy[name] = accuracy_score(truth_labels[mask], pred_labels[mask]) 
    accuracy['Metric'] = metric
    accuracy_data.append(pd.Series(accuracy, name=target_name))
    
    results[target_name] = {
        'Audio': None if save_without_train and target_name == 'train' else audio,
        'Prediction': pd.DataFrame({
            'Prediction': pred_labels,
            'Truth': truth_labels,
        }),
        'Agg': agg_df,
        'Accuracy': accuracy_data,
    }
accuracy_df = pd.DataFrame(accuracy_data)
accuracy_df['Mean'] = accuracy_df.mean(axis=1)
display(accuracy_df.T)

In [77]:
import cloudpickle as pickle
with open(cur_model_dir / f'result_metric.pickle', 'wb+') as fp:
    pickle.dump(results, fp)

In [75]:
save_without_train and 'train' == 'train'

True