# Transfer learning with Yamnet

### Colab preparations

In [None]:
# Fetch the project repository and switch into its yamnet/ directory.
! git clone https://github.com/fbnspl/rfcx-rainforest.git
%cd rfcx-rainforest/yamnet/
# Install the pinned tensorflow-io release.
! pip install tensorflow-io==0.16

### Imports

In [None]:
# Imports.
import numpy as np
import pandas as pd
import librosa
import glob
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import tensorflow_io as tfio
import tensorflow as tf
import tensorflow.keras.layers.experimental.preprocessing as kp

from sklearn.model_selection import train_test_split

from functions.augment import time_mask, freq_mask, mixup_one_hot
from functions.metrics import LWLRAP

import params as yamnet_params
import yamnet as yamnet_model


### Functions: LRAP Metric and tensorflow audio read

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import label_ranking_average_precision_score

def get_lrap(X_val, y_one_hot_val, clf):
    """Print LRAP and a classification report for a scikit-learn style classifier.

    Args:
        X_val: Validation features handed to ``clf.predict_proba`` and ``clf.predict``.
        y_one_hot_val: One-hot validation labels; class ids are recovered with
            an argmax over axis 1.
        clf: Classifier exposing ``predict_proba`` and ``predict``.
    """
    scores = clf.predict_proba(X_val)
    predicted = clf.predict(X_val)
    expected = np.argmax(y_one_hot_val, axis=1)

    # ranking metric on the scores, then the per-class report on hard predictions
    lrap = label_ranking_average_precision_score(y_one_hot_val, scores)
    print('LRAP: %s' % lrap)
    print(classification_report(expected, predicted))
    
def get_lrap_keras(X_val, y_one_hot_val, clf):
    """Print LRAP and a classification report for a model whose ``predict`` returns scores.

    Args:
        X_val: Validation features handed to ``clf.predict``.
        y_one_hot_val: One-hot validation labels; class ids are recovered with
            an argmax over axis 1.
        clf: Model exposing ``predict``; hard predictions are the argmax of its output.
    """
    scores = clf.predict(X_val)
    predicted = np.argmax(scores, axis=1)
    expected = np.argmax(y_one_hot_val, axis=1)

    # ranking metric on the scores, then the per-class report on hard predictions
    lrap = label_ranking_average_precision_score(y_one_hot_val, scores)
    print('LRAP: %s' % lrap)
    print(classification_report(expected, predicted))
    
def tf_read_audio(path, sr, target_sr):
    """Load an audio file with tensorflow-io and return it resampled as a NumPy array.

    Args:
        path: File location handed to ``tfio.audio.AudioIOTensor``.
        sr: Rate passed to the resampler as the input rate.
        target_sr: Rate passed to the resampler as the output rate.

    Returns:
        NumPy array of float32 samples, divided by 32768.0.
    """
    # read everything and drop the trailing axis
    samples = tf.squeeze(tfio.audio.AudioIOTensor(path)[:], axis=[-1])
    resampled = tfio.audio.resample(samples, sr, target_sr)
    # NOTE(review): the 32768 divisor presumably rescales 16-bit PCM — confirm the input dtype.
    scaled = tf.cast(resampled, tf.float32) / 32768.0
    return scaled.numpy()

---

## 0. Load data dict from pickle

In [None]:
# Restore the full data dict from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with Path('data/data.pickle').open('rb') as handle:
    data = pickle.load(handle)

print(data.keys())

In [None]:
# Restore the training dict from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with Path('data/data_train.pickle').open('rb') as handle:
    data_train = pickle.load(handle)

print(data_train.keys())

In [None]:
# Restore the validation dict from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with Path('data/data_val.pickle').open('rb') as handle:
    data_val = pickle.load(handle)

print(data_val.keys())

---

## 1. Get data paths and split to sets, sample rate 

### Paths

In [None]:
# Root folder of the data set; later cells read train_tp/*.csv, train/*.flac and test/*.flac below it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_path = '/Users/fabianseipel/aic/data/rfcx-species-audio-detection/'

### Params

In [None]:
# setup sample rates: `sr` for the recordings as stored, `target_sr` for the resampled audio
# NOTE(review): the 60-second and test feature cells pass 48000/16000 literally instead of these names.
sr = 48000
target_sr = 16000

### Data Dict

In [None]:
# init data dict
# NOTE(review): this rebinds `data`, discarding anything loaded from data/data.pickle in section 0.
data = {}

### Csvs

In [None]:
# get all csvs
csvs_all = sorted(glob.glob(data_path + 'train_tp/*.csv'))

# split files into training and validation sets (20 % validation, fixed random seed)
data['csvs_train'], data['csvs_val'] = train_test_split(csvs_all, test_size=0.2, random_state=11)

---

## 2. Get yamnet model with in-built preprocessing

In [None]:
# The graph is designed for a sampling rate of 16 kHz, but higher rates should work too.
# We also generate scores at a 10 Hz frame rate (patch_hop_seconds=0.1).
# Set up the YAMNet model and load its weights from yamnet.h5.
params = yamnet_params.Params(sample_rate=16000, patch_hop_seconds=0.1)
class_names = yamnet_model.class_names('yamnet_class_map.csv')
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')

---

## 3. Get features and labels only from active 1-sec parts of true positives

#### Training and validation data

In [None]:
# Build per-second examples from the annotated (non-zero) entries of every CSV.
# For each non-zero entry the matching one-second audio frame is passed through
# YAMNet and the model's third output is stored as the feature.
# NOTE(review): the third output is presumably the log-mel spectrogram — confirm in yamnet.py.

# samples removed at both ends of every one-second frame before calling YAMNet
trim = int(0.02 * target_sr)

# iterate over splits
for split in ['train', 'val']:
    csvs = data['csvs_' + split]

    # init lists
    X, y = [], []

    # iterate through csvs
    for csv_path in tqdm(csvs, position=0, leave=True):

        # annotation matrix of a single recording
        gt = pd.read_csv(csv_path, index_col=0).to_numpy()

        # column indices (frames) of the non-zero entries; row indices are not needed.
        # A frame with several non-zero rows shows up once per row.
        _, frames = gt.nonzero()

        # load the audio and cut it into non-overlapping one-second frames
        audio = tf_read_audio(data_path + f'train/{Path(csv_path).stem}.flac', sr, target_sr)
        audio_frames = librosa.util.frame(audio, frame_length=target_sr, hop_length=target_sr)

        # iterate through active tp frames
        for frame in frames:
            audio_frame = audio_frames[trim:-trim, frame]
            _, _, spec = yamnet(audio_frame)
            # store the feature with a trailing channel axis
            X.append(np.expand_dims(spec.numpy(), axis=-1))
            # label vector: the full annotation column of this frame
            y.append(gt[:, frame])

    # convert to numpy arrays: features, class ids (argmax of each label vector)
    # and the label vectors themselves
    labels = np.array(y)
    data['X_' + split] = np.array(X)
    data['y_' + split] = np.argmax(labels, axis=1)
    data['y_one_hot_' + split] = labels

    print(data['X_' + split].shape, data['y_' + split].shape)


---

## 4. Get features and labels for the whole 60-second recordings

#### Training and validation data

In [None]:
# Extract features for every one-second frame of each annotated recording.
for split in ('train', 'val'):
    csvs = data['csvs_' + split]

    # per-recording feature stacks and annotation matrices
    X = []
    y = []

    for csv in tqdm(csvs, position=0, leave=True):

        # the complete annotation matrix is the label of the recording
        gt = pd.read_csv(csv, index_col=0).to_numpy()
        print(gt.shape)
        y.append(gt)

        # read the audio and cut it into non-overlapping one-second frames
        flac_path = data_path + f'train/{Path(csv).stem}.flac'
        audio = tf_read_audio(flac_path, 48000, 16000)
        audio_frames = librosa.util.frame(audio, frame_length=16000, hop_length=16000)

        # one feature per frame, each with a trailing channel axis
        X_list = []
        for frame in range(audio_frames.shape[1]):
            audio_frame = audio_frames[int(0.02*16000):-int(0.02*16000), frame]
            _, _, spec = yamnet(audio_frame)
            spec = spec.numpy()[..., np.newaxis]
            X_list.append(spec)

        X_list = np.array(X_list)
        X.append(X_list)

    # store features, argmax over axis 1 of the stacked annotations, and the annotations
    stacked_gt = np.array(y)
    data['X_60_' + split] = np.array(X)
    data['y_60_' + split] = np.argmax(stacked_gt, axis=1)
    data['y_60_one_hot_' + split] = stacked_gt

    print(data['X_60_' + split].shape, data['y_60_one_hot_' + split].shape)


#### Test data

In [None]:
# per-file feature stacks
X = []

# all test recordings, in sorted order
files = sorted(glob.glob(data_path + 'test/*.flac'))

for file in tqdm(files, position=0, leave=True):

    # read the audio and cut it into non-overlapping one-second frames
    audio = tf_read_audio(file, 48000, 16000)
    audio_frames = librosa.util.frame(audio, frame_length=16000, hop_length=16000)

    # one feature per frame, each with a trailing channel axis
    specs = []
    for frame in range(audio_frames.shape[1]):
        audio_frame = audio_frames[int(0.02*16000):-int(0.02*16000), frame]
        _, _, spec = yamnet(audio_frame)
        spec = spec.numpy()[..., np.newaxis]
        specs.append(spec)

    specs = np.array(specs)
    X.append(specs)

# stack all files into one array
data['X_60_test'] = np.array(X)

print(data['X_60_test'].shape)


---

## 5. Save data

#### save all data dict to pickle

In [None]:
# The dump below is wrapped in a string literal, so running this cell writes nothing.
'''
with open('data/data.pickle', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
'''

In [None]:
# list the entries currently held in the data dict
print(data.keys())

#### save train/val data dict to pickle

In [None]:
# Persist the per-second training arrays.
data_train = {key: data[key] for key in ('X_train', 'y_train', 'y_one_hot_train')}

with open('data/data_train.pickle', 'wb') as out_file:
    pickle.dump(data_train, out_file, pickle.HIGHEST_PROTOCOL)

# Persist the per-second validation arrays.
data_val = {key: data[key] for key in ('X_val', 'y_val', 'y_one_hot_val')}

with open('data/data_val.pickle', 'wb') as out_file:
    pickle.dump(data_val, out_file, pickle.HIGHEST_PROTOCOL)


---

## 6. Transfer learn with yamnet

In [None]:
def build_model(num_classes=24):
    """Build a transfer-learning classifier from the YAMNet layers.

    A fresh YAMNet frames model is created and ``yamnet.h5`` is loaded into it.
    Its layers from index 79 up to (excluding) the last two are re-applied to a
    new ``(96, 64, 1)`` input that first passes through a random-contrast
    augmentation layer; a new sigmoid ``Dense`` layer forms the output.

    Args:
        num_classes: Number of units of the new output layer. Defaults to 24.

    Returns:
        The un-compiled ``tf.keras.Model``.
    """
    # The graph is designed for a sampling rate of 16 kHz, but higher rates should work too.
    # We also generate scores at a 10 Hz frame rate.
    yamnet_cfg = yamnet_params.Params(sample_rate=16000, patch_hop_seconds=0.1)
    base_model = yamnet_model.yamnet_frames_model(yamnet_cfg)
    base_model.load_weights('yamnet.h5')

    # NOTE(review): the slice presumably skips the waveform/feature front-end and
    # drops YAMNet's own output layers — confirm the indices against base_model.layers.
    core_layers = base_model.layers[79:-2]

    # add new input layer followed by the contrast augmentation
    input_layer = tf.keras.Input(shape=(96, 64, 1), name='Input')
    x = kp.RandomContrast(factor=0.2)(input_layer)

    # attach the reused layers again, in their original order
    for layer in core_layers:
        x = layer(x)

    # add new prediction layer
    x = tf.keras.layers.Dense(num_classes, activation='sigmoid')(x)

    # construct model
    yamnet_tl = tf.keras.Model(inputs=input_layer, outputs=x)

    # Optional (disabled): freeze the first layers of the new model.
    # for layer in yamnet_tl.layers[:50]:
    #     layer.trainable = False

    return yamnet_tl

#### Make dataset

In [None]:
# let tf.data choose the parallelism of the map calls
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_dataset = tf.data.Dataset.from_tensor_slices((data_train['X_train'], data_train['y_one_hot_train']))
# NOTE(review): build_model feeds a (96, 64, 1) input, so `n_mels` receives the first
# axis here — verify which axis is mel and which is time before enabling the masks below.
n_mels, n_frames, n_channels = train_dataset.element_spec[0].shape

# cache, shuffle, cast both tensors to float32 and drop the channel axis
train_dataset = (
    train_dataset
    .cache()
    .shuffle(buffer_size=4096)
    .map(
        lambda mel_spec, y: (
            tf.squeeze(tf.cast(mel_spec, tf.float32), axis=2),
            tf.cast(y, tf.float32),
        ),
        num_parallel_calls=AUTOTUNE,
    )
)

# MIXUP: applied on batches of 32, then split back into single examples
train_dataset = (
    train_dataset
    .batch(32)
    .map(lambda mel_spec, y: mixup_one_hot(mel_spec, y, 0.5), num_parallel_calls=AUTOTUNE)
    .unbatch()
)

# SPEC AUGMENTATIONS
# train_dataset = train_dataset.map(lambda mel_spec, y: (tf.roll(mel_spec, tf.random.uniform((), minval=-15, maxval=15, dtype=tf.dtypes.int32), axis=1), y), num_parallel_calls=AUTOTUNE)
# train_dataset = train_dataset.map(lambda mel_spec, y: (time_mask(mel_spec, param=int(n_frames * 0.1)), y), num_parallel_calls=AUTOTUNE)
# train_dataset = train_dataset.map(lambda mel_spec, y: (time_mask(mel_spec, param=int(n_frames * 0.1)), y), num_parallel_calls=AUTOTUNE)
# train_dataset = train_dataset.map(lambda mel_spec, y: (freq_mask(mel_spec, param=int(n_mels * 0.1)), y), num_parallel_calls=AUTOTUNE)
# train_dataset = train_dataset.map(lambda mel_spec, y: (freq_mask(mel_spec, param=int(n_mels * 0.1)), y), num_parallel_calls=AUTOTUNE)
# train_dataset = train_dataset.map(lambda mel_spec, y: (freq_mask(mel_spec, param=int(n_mels * 0.1)), y), num_parallel_calls=AUTOTUNE)

# restore the channel axis and form the training batches
train_dataset = (
    train_dataset
    .map(lambda mel_spec, y: (tf.expand_dims(mel_spec, axis=2), y), num_parallel_calls=AUTOTUNE)
    .batch(32)
)

print(train_dataset)
print(data_train['X_train'].shape)
print(data_train['y_one_hot_train'].shape)


In [None]:
# get model
yamnet_tl = build_model()
# yamnet_tl.summary()

# lwlrap metric plus precision, recall and categorical accuracy
metrics = [LWLRAP(num_classes=24),
           tf.metrics.Precision(),
           tf.metrics.Recall(),
           tf.metrics.CategoricalAccuracy()]

# callbacks
# LWLRAP is a higher-is-better score. mode='auto' only infers "max" for
# accuracy-like metric names and would treat `val_lwlrap` as a quantity to
# minimise, so the direction is stated explicitly.
# NOTE(review): assumes the LWLRAP metric reports under the name 'lwlrap' — confirm in functions/metrics.py.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_lwlrap',
                                                     min_delta=0,
                                                     patience=25,
                                                     verbose=1,
                                                     mode='max',
                                                     baseline=None,
                                                     restore_best_weights=True)

reduce_lro_cb = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_lwlrap',
                                                     factor=0.1, patience=10, verbose=1,
                                                     mode='max', min_delta=0.0001,
                                                     cooldown=0, min_lr=0)

# optimizer (`learning_rate` replaces the deprecated `lr` argument)
opt = tf.keras.optimizers.Nadam(learning_rate=0.001, clipnorm=1.0)

# compile model
# NOTE(review): build_model ends in a sigmoid layer while the loss is categorical
# cross-entropy — confirm this pairing is intended.
yamnet_tl.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=metrics)
# train model; the callbacks defined above are now actually handed to fit()
yamnet_tl.fit(
              # data['X_train'], data['y_one_hot_train'],
              train_dataset,
              epochs=50,
              verbose=1,
              callbacks=[early_stopping_cb, reduce_lro_cb],
              validation_data=(data_val['X_val'], data_val['y_one_hot_val']))

---

## 7. Predict on 60-second features

#### Predict validation data

In [None]:
# Run the model on every validation recording (leading axis of X_60_val)
# and stack the per-recording predictions.
X_60_val = data['X_60_val']

y_prob_val = np.array([
    yamnet_tl.predict(X_60_val[n, :])
    for n in tqdm(range(X_60_val.shape[0]), position=0, leave=True)
])
y_prob_val.shape

#### Aggregate probabilities over 60-sec with max, mean and min

In [None]:
# Reduce axis 1 of the stacked predictions with three different aggregations.
y_prob_val_max = y_prob_val.max(axis=1)
y_prob_val_mean = y_prob_val.mean(axis=1)
y_prob_val_min = y_prob_val.min(axis=1)
y_prob_val_max.shape

#### Get ground truth, aggregate over 60-sec with max

In [None]:
# Ground truth per recording: maximum over axis 2 of the stored annotation matrices.
y_60_one_hot_val = data['y_60_one_hot_val']
y_60_one_hot_val_max = y_60_one_hot_val.max(axis=2)
y_60_one_hot_val_max.shape

In [None]:
# LRAP of each aggregation (max / mean / min) against the aggregated ground truth
print('LRAP Max  : %s' % label_ranking_average_precision_score(y_60_one_hot_val_max, y_prob_val_max))
print('LRAP Mean : %s' % label_ranking_average_precision_score(y_60_one_hot_val_max, y_prob_val_mean))
print('LRAP Min. : %s' % label_ranking_average_precision_score(y_60_one_hot_val_max, y_prob_val_min))

#### Predict test data

In [None]:
# Predict a single test recording (index 3).
# The cell previously read `X_train`, a name that is never defined in this notebook.
# NOTE(review): data['X_60_test'] is assumed to be the intended input — confirm.
y_test_prob = yamnet_tl.predict(data['X_60_test'][3, :])
y_test_prob.shape

In [None]:
# plot the predictions computed in the previous cell
plt.figure(figsize=(20,6))
plt.plot(y_test_prob)
# presumably here so the cell does not echo the return value of plt.plot
print()