# Setup

## Installs

In [2]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Imports

In [20]:
import gdown
import glob
import csv
import os
import re
import wandb
import random
import math
import librosa
import shutil
import sed_eval
import dcase_util
import tensorflow as tf
import numpy as np
import soundfile as sf
from subprocess import Popen, PIPE
from tqdm import tqdm
from zipfile import ZipFile
from keras.regularizers import l2
from SpecAugment import spec_augment_tensorflow

## Download

In [4]:
output1 = "/content/new-dataset/test_data1.zip"
gdown.download(id='17PvyXLEkpIgBLxRkuTIk25MGL3uM3kS1', output=output1, quiet=False)

output2= "/content/new-dataset/test_data2.zip"
gdown.download(id='14abMPBH3EVmcU-3jPD4jWEpa4pQ38OY9', output=output2, quiet=False)

output3 = "/content/new-dataset/test_data3.zip"
gdown.download(id='1TU4CoJuFy40-zJopo3R4U-YgZFKFrtxB', output=output3, quiet=False)

output4 = "/content/new-dataset/test_data4.zip"
gdown.download(id='1E5595RX2NwpuckXvl2o1V_dm9Ja58arF', output=output4, quiet=False)

output5 = "/content/new-dataset/test_data5.zip"
gdown.download(id='16VJhkCV2-ILcHxiF8a2ygEiHxXyF9RtM', output=output5, quiet=False)

output6 = "/content/new-dataset/test_data6.zip"
gdown.download(id='1kZyXyZVTHnSTg-gdrlf91V5ioYMr50Mp', output=output6, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=17PvyXLEkpIgBLxRkuTIk25MGL3uM3kS1
To: /content/new-dataset/test_data1.zip
100%|██████████| 1.38G/1.38G [00:16<00:00, 84.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=14abMPBH3EVmcU-3jPD4jWEpa4pQ38OY9
To: /content/new-dataset/test_data2.zip
100%|██████████| 1.38G/1.38G [00:10<00:00, 133MB/s]
Downloading...
From: https://drive.google.com/uc?id=1TU4CoJuFy40-zJopo3R4U-YgZFKFrtxB
To: /content/new-dataset/test_data3.zip
100%|██████████| 1.34G/1.34G [00:10<00:00, 126MB/s]
Downloading...
From: https://drive.google.com/uc?id=1E5595RX2NwpuckXvl2o1V_dm9Ja58arF
To: /content/new-dataset/test_data4.zip
100%|██████████| 1.32G/1.32G [00:10<00:00, 124MB/s] 
Downloading...
From: https://drive.google.com/uc?id=16VJhkCV2-ILcHxiF8a2ygEiHxXyF9RtM
To: /content/new-dataset/test_data5.zip
100%|██████████| 1.29G/1.29G [00:11<00:00, 112MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kZyXyZVTHnSTg-gdrlf91V5ioYMr50Mp
To: /content/new-datas

'/content/new-dataset/test_data6.zip'

In [5]:
def unzip_data(download_path, extract_path):

  # create glob
  final_glob = glob.glob(f"{download_path}*.zip")

  for zip_name in final_glob:
    with ZipFile(zip_name, 'r') as zip:
      zip.extractall(extract_path)

In [6]:
download_test_path = '/content/new-dataset/'
extract_test_path = '/content/extracted-data'

unzip_data(download_test_path, extract_test_path)

# Preprocessing

## Parse files

In [21]:
def convert_annotations_to_events(filename):
    events = []
    with open(filename, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in spamreader:
            row.append(row[0])
            row.pop(0)
            row[1] = str((float(row[1])/1000))
            row[0] = str((float(row[0])/1000))
            events.append(row)
    return events

In [22]:
events = convert_annotations_to_events("/content/extracted-data/outputs/0.txt")
print(events)

[['0.0', '8.027', 'footsteps'], ['8.027', '10.922', 'rainforest'], ['10.922', '16.762', 'car'], ['16.762', '25.854', 'footsteps'], ['25.854', '34.946', 'footsteps'], ['34.946', '38.773', 'crowds'], ['38.773', '48.773', 'aircraft'], ['48.773', '58.633', 'car'], ['58.633', '68.084', 'rainforest'], ['68.084', '75.751', 'aircraft'], ['75.751', '80.37', 'clocks'], ['80.37', '83.271', 'car']]


In [23]:
audio_files = glob.glob("/content/extracted-data/outputs/*.wav")
text_files = glob.glob("/content/extracted-data/outputs/*.txt")

In [24]:
target_dir = os.path.dirname(audio_files[0]).replace("outputs", "outputs-mono")

if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [25]:
for sound in tqdm(audio_files):
  temp_file = sound.replace("outputs", "outputs-mono")
  command = command = "sox " + sound + " " + temp_file + " channels 1"
  p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
  output, err = p.communicate()

100%|██████████| 1000/1000 [02:16<00:00,  7.33it/s]


In [26]:
audio_files_mono = glob.glob("/content/extracted-data/outputs-mono/*.wav")
random.shuffle(audio_files_mono)

In [27]:
print(audio_files_mono)

['/content/extracted-data/outputs-mono/789.wav', '/content/extracted-data/outputs-mono/718.wav', '/content/extracted-data/outputs-mono/197.wav', '/content/extracted-data/outputs-mono/26.wav', '/content/extracted-data/outputs-mono/522.wav', '/content/extracted-data/outputs-mono/923.wav', '/content/extracted-data/outputs-mono/44.wav', '/content/extracted-data/outputs-mono/178.wav', '/content/extracted-data/outputs-mono/124.wav', '/content/extracted-data/outputs-mono/232.wav', '/content/extracted-data/outputs-mono/458.wav', '/content/extracted-data/outputs-mono/800.wav', '/content/extracted-data/outputs-mono/640.wav', '/content/extracted-data/outputs-mono/710.wav', '/content/extracted-data/outputs-mono/422.wav', '/content/extracted-data/outputs-mono/659.wav', '/content/extracted-data/outputs-mono/772.wav', '/content/extracted-data/outputs-mono/102.wav', '/content/extracted-data/outputs-mono/223.wav', '/content/extracted-data/outputs-mono/464.wav', '/content/extracted-data/outputs-mono/109

## Construct Dataset

In [28]:
fold1_train_files = []
fold1_val_files = []
fold1_test_files = []
print(len(audio_files_mono))
audio_files_mono = audio_files_mono[:100]
i = 0
for f in audio_files_mono:
  if i < 70:
    fold1_train_files.append(f)
  elif i < 90:
    fold1_val_files.append(f)
  else:
    fold1_test_files.append(f)
  i += 1

1000


In [29]:
def construct_examples(audio_path, win_len = 2.56, hop_len = 1.0, sr = 44100.0):
  # here win_len is the window_length and hop_len is the hop_length between the examples.
  # sr is the sampling rate

  window_length_t = win_len
  hop_length_t = hop_len

  window_length = int(sr*window_length_t)
  hop_length = int(sr*hop_length_t)

  audio, sr = sf.read(audio_path)

  # handle padding
  if audio.shape[0] < window_length:
    audio_padded = np.zeros((window_length, ))
    audio_padded[0:audio.shape[0]] = audio 

  else:
    no_of_hops = math.ceil((audio.shape[0] - window_length) / hop_length)
    audio_padded = np.zeros((int(window_length + hop_length*no_of_hops), ))
    audio_padded[0:audio.shape[0]] = audio  

  audio_example = [audio_padded[i - window_length : i] for i in range(window_length, audio_padded.shape[0]+1, hop_length)]
  win_ranges = [((i - window_length)/sr, i/sr) for i in range(window_length, audio_padded.shape[0]+1, hop_length)]

  return audio_example, win_ranges

In [30]:
def construct_labels(annotation_path, win_start, win_end, win_len):
  # takes the annotation_path, window_start, window_end and window_length
  events = convert_annotations_to_events(annotation_path)

  annotation_vals = [[float(e[0]), float(e[1]), e[2]] for e in events]

  curr_annotation = []

  for annotation in annotation_vals:
    if annotation[1] > win_start and annotation[0] <= win_end: 
      curr_start = max(annotation[0] - win_start, 0.0)
      curr_end = min(annotation[1] - win_start, win_len)
      curr_annotation.append([curr_start, curr_end, annotation[2]])    

  # get current class set from annotations
  class_set = set([c[2] for c in curr_annotation])
  class_wise_events = {}

  for c in list(class_set):
    class_wise_events[c] = []


  for c in curr_annotation:
    class_wise_events[c[2]].append(c)
    
  max_event_silence = 0.0
  all_events = []

  for k in list(class_wise_events.keys()):
    curr_events = class_wise_events[k]
    count = 0

    while count < len(curr_events) - 1:
      if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
        curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
        del curr_events[count + 1]
      else:
        count += 1

    all_events += curr_events

  for i in range(len(all_events)):
    # round all the values so that they are not arbitarily long
    all_events[i][0] = round(all_events[i][0], 3)
    all_events[i][1] = round(all_events[i][1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events

In [31]:
def get_universal_labels(events, class_dict, ex_length = 10.0, no_of_div = 32):
  # returns all labels from events
  win_length = ex_length/no_of_div
  labels = np.zeros((no_of_div, len(class_dict.keys()) * 3))
  
  for e in events:

    start_time = float(e[0])
    stop_time = float(e[1])
    if (float(e[0]) == 2.56):
      start_time = float(e[0] - 0.00001)
    if (float(e[1] == 2.56)):
      stop_time = float(e[1]- 0.000001)
     

    start_bin = int(start_time // win_length)
    stop_bin = int(stop_time // win_length)

    start_time_2 = start_time - start_bin * win_length
    stop_time_2 = stop_time - stop_bin * win_length

    n_bins = stop_bin - start_bin

    if n_bins == 0:
      labels[start_bin, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, start_time_2, stop_time_2]    

    elif n_bins == 1:
      labels[start_bin, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, start_time_2, win_length]

      if stop_time_2 > 0.0:
        labels[stop_bin, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, 0.0, stop_time_2]

    elif n_bins > 1:
      labels[start_bin, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, start_time_2, win_length]

      for i in range(1, n_bins):
        labels[start_bin + i, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, 0.0, win_length]

      if stop_time_2 > 0.0:
        labels[stop_bin, class_dict[e[2]] * 3:class_dict[e[2]] * 3 + 3] = [1, 0.0, stop_time_2]

  # divide all time values by window_length
  for labelIndex in range(len(labels)):
    for valIndex in range(len(labels[labelIndex])):
      if valIndex % 3 != 0:
        labels[labelIndex][valIndex] /= win_length

  return labels

In [32]:
CLASS_ENCODING = {"car": 0, "aircraft": 1, "crowds": 2, "footsteps": 3, "clocks": 4, "rainforest": 5}

In [33]:
def construct_data_set(fold_files, path):
  shutil.rmtree(path, ignore_errors=True)
  os.mkdir(path)

  window_length = 2.56
  hop_length = 1.0
  a_examples_train = []
  a_labels_train = []


  for i, audio in enumerate(fold_files):
    a, window_ranges = construct_examples(audio,win_len=window_length, hop_len=hop_length)
    a_examples_train += a

    for w in window_ranges:
      labels_t = construct_labels(audio.replace(".wav", ".txt").replace('outputs-mono', 'outputs'), w[0], w[1], win_len=window_length)
      ll = get_universal_labels(labels_t, CLASS_ENCODING, ex_length=window_length, no_of_div = 9)
      a_labels_train.append(ll)
  return a_examples_train, a_labels_train

In [34]:
examples_train, labels_train = construct_data_set(fold1_train_files, '/content/train-data')
examples_val, labels_val = construct_data_set(fold1_val_files, '/content/val-data')
examples_test, labels_test = construct_data_set(fold1_test_files, '/content/test-data')

## Extract MelSpectrogram

In [35]:
def get_log_melspectrogram(audio, sr = 44100, hop_length = 441, win_length = 1764, n_fft = 2048, n_mels = 40, fmin = 0, fmax = 22050):
    """Return the log-scaled Mel bands of an audio signal."""
    audio_2 = librosa.util.normalize(audio)
    bands = librosa.feature.melspectrogram(
        y=audio_2, sr=sr, hop_length=hop_length, win_length = win_length, n_fft=n_fft, n_mels=n_mels)
    return librosa.core.power_to_db(bands)

In [44]:
# save melspectrograms for entire set
def save_example_mel(example_set, save_path):
  for i, audio in tqdm(enumerate(example_set), total=len(example_set)):
    M = get_log_melspectrogram(audio).T
    # print(M.shape)
    np.save(save_path + str(i) + ".npy", M)

# save labels in numpy format
def save_labels_np(label_set, save_path):
  for i, audio in tqdm(enumerate(label_set), total=len(label_set)):
    np.save(save_path + str(i) + ".npy", audio)

In [45]:
# save labels for entire train set
train_path_ex = '/content/train-data/ex-'
train_path_labels = '/content/train-data/label-'
val_path_ex = '/content/val-data/ex-'
val_path_labels = '/content/val-data/label-'
test_path_ex = '/content/test-data/ex-'
test_path_labels = '/content/test-data/label-'

save_example_mel(examples_train, train_path_ex)

save_labels_np(labels_train, train_path_labels)
save_example_mel(examples_val, val_path_ex)
save_labels_np(labels_val, val_path_labels)
save_example_mel(examples_test, test_path_ex)
save_labels_np(labels_test, test_path_labels)

100%|██████████| 4926/4926 [01:15<00:00, 65.17it/s]
100%|██████████| 4926/4926 [00:00<00:00, 7872.61it/s]
100%|██████████| 1356/1356 [00:19<00:00, 70.28it/s]
100%|██████████| 1356/1356 [00:00<00:00, 7959.70it/s]
100%|██████████| 698/698 [00:09<00:00, 70.00it/s]
100%|██████████| 698/698 [00:00<00:00, 8353.40it/s]


## Sort and Partition Datasets


In [46]:
def intOrVal(s):
    try:
        return int(s)
    except ValueError:
        return s
    
def alphanum_key(init_string):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [intOrVal(c) for c in re.split('([0-9]+)', init_string)]

def sort_nicely(l):
    """ Sort the given list in the way that humans expect.
    """
    l.sort(key=alphanum_key)

In [47]:
def get_sorted_data(regex_path):
  data = glob.glob(regex_path) 
  sort_nicely(data)
  return data

"""
Load the individual numpy arrays into partition
"""
train_data_examples_regex_path = "/content/train-data/ex-*.npy"
train_data_labels_regex_path = "/content/train-data/label-*.npy"
val_data_examples_regex_path = "/content/val-data/ex-*.npy"
val_data_labels_regex_path = "/content/val-data/label-*.npy"
test_data_examples_regex_path = "/content/test-data/ex-*.npy"
test_data_labels_regex_path = "/content/test-data/label-*.npy"

train_data = get_sorted_data(train_data_examples_regex_path)
train_labels = get_sorted_data(train_data_labels_regex_path)

val_data = get_sorted_data(val_data_examples_regex_path)
val_labels = get_sorted_data(val_data_labels_regex_path)

test_data = get_sorted_data(test_data_examples_regex_path)
test_labels = get_sorted_data(test_data_labels_regex_path)

training_examples = [(train_data[i], train_labels[i]) for i in range(len(train_data))]
validation_examples = [(val_data[i], val_labels[i]) for i in range(len(val_data))]
test_examples = [(test_data[i], test_labels[i]) for i in range(len(test_data))]

# shuffle all training examples
random.seed(7)
random.shuffle(training_examples)
random.shuffle(validation_examples)
random.shuffle(test_examples)
partition = {}
partition['train'] = training_examples
partition['validation'] = validation_examples
partition['test'] = test_examples

## Setup Data Generators

In [48]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, list_examples, batch_size=128, shuffle=True):
        # self.dim = (1,)
        self.batch_size = batch_size
        self.list_examples = list_examples
        self.shuffle = shuffle

        # initial shuffle
        self.on_epoch_end()

    def __len__(self):
        '''Denotes the number of batches per epoch'''
        return int(np.floor(len(self.list_examples) / self.batch_size))

    def __getitem__(self, index):
        '''Generate one batch of data'''
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_examples[k] for k in indexes]

        # Generate data
        X, Y = self.generate_data(list_IDs_temp)

        return X, Y
        
    def on_epoch_end(self):
      self.indexes = np.arange(len(self.list_examples))

      # shuffle indexes at end of epoch
      if self.shuffle == True:
          np.random.shuffle(self.indexes)

    def generate_data(self, list_IDs_temp):
        # 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        X = np.empty([self.batch_size, 257, 40, 1], dtype=np.float64)
        Y = np.empty([self.batch_size, 9, 18], dtype=np.float64)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
          # Store sample
          # load npy array
          np_x = np.load(ID[0])

          X[i, :, :, 0] = np_x

          # load class label
          np_y = np.load(ID[1])
          Y[i, :, :] = np_y

        tau = X.shape[1]          
        v = X.shape[2]

        # frequency and time masking of X values
        warped_frequency_spectrogram = spec_augment_tensorflow.frequency_masking(X, v=v,  frequency_masking_para=8, frequency_mask_num=1)
        warped_frequency_time_spectrogram = spec_augment_tensorflow.time_masking(warped_frequency_spectrogram, tau=tau, time_masking_para=25, time_mask_num=2)

        X = warped_frequency_time_spectrogram

        return X, Y

In [49]:
# Parametersa
params = {'batch_size': 128, 'shuffle': True}

# Generators
training_generator = DataGenerator(partition['train'], **params)
validation_generator = DataGenerator(partition['validation'], **params)
test_generator = DataGenerator(partition['test'], **params)

# Model Setup

## Loss function

In [52]:
def square_difference_loss(y_true, y_pred):
  squared_difference = tf.square(y_true - y_pred)

  ss_True = squared_difference[:, :, 0] * 0 + 1

  # get every 3 value of y_true
  ss_0 = y_true[:, :, 0]
  ss_1 = y_true[:, :, 3]
  ss_2 = y_true[:, :, 6]
  ss_3 = y_true[:, :, 9]
  ss_4 = y_true[:, :, 12]
  ss_5 = y_true[:, :, 15]

  # stack values
  stacked_ss = tf.stack((ss_True, ss_0, ss_0,
                         ss_True, ss_1, ss_1,
                         ss_True, ss_2, ss_2,
                         ss_True, ss_3, ss_3,
                         ss_True, ss_4, ss_4,
                         ss_True, ss_5, ss_5), axis = 2)
  
  squared_difference =  tf.multiply(squared_difference, stacked_ss)

  return tf.reduce_sum(squared_difference, axis=[-1, -2])

## Training/Inference

In [53]:
# Creates mel spctrograms for validation fold for training
def create_train_melspectrograms():
  win_length = 2.56
  hop_size = 1.96
  mss_ins = []
  win_ranges_list = []


  for ii, audio in enumerate(fold1_val_files): # why val?
    a, win_ranges = construct_examples(audio, win_len=win_length,hop_len=hop_size)

    mss_in = np.zeros((len(a), 257, 40))

    preds = np.zeros((len(a), 9, 18))

    for i in range(len(a)):
      M = get_log_melspectrogram(a[i])
      mss_in[i, :, :] = M.T
    mss_ins.append(mss_in)
    win_ranges_list.append(win_ranges)
  return mss_ins,win_ranges_list


BASE_MSS_INS, BASE_WIN_RANGE = create_train_melspectrograms()

def mk_preds_YOHO_mel(model, ind, window_range_list=BASE_WIN_RANGE, mss_ins=BASE_MSS_INS, no_of_div = 9, hop_size = 1.96, discard = 0.3, win_length = 2.56, max_event_silence = 0.3, sampling_rate = 44100):
  preds = model.predict(mss_ins[ind])
  events = []

  for i in range(len(preds)):
    p = preds[i, :, :]
    events_curr = []
    win_width = win_length / no_of_div
    for j in range(len(p)):
      for jjj in range(0, 6):
        if p[j][jjj*3] >= 0.5:
          start = win_width * j + win_width * p[j][jjj*3+1] + window_range_list[ind][i][0]
          end = p[j][jjj*3+2] * win_width + start
          events_curr.append([start, end, rev_class_list[jjj]])

    events += events_curr


  class_set = set([c[2] for c in events])
  class_wise_events = {}

  for c in list(class_set):
    class_wise_events[c] = []


  for c in events:
    class_wise_events[c[2]].append(c)
    
  
  all_events = []

  for k in list(class_wise_events.keys()):
    curr_events = class_wise_events[k]
    count = 0

    while count < len(curr_events) - 1:
      if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
        curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
        del curr_events[count + 1]
      else:
        count += 1

    all_events += curr_events

  for i in range(len(all_events)):
    all_events[i][0] = round(all_events[i][0], 3)
    all_events[i][1] = round(all_events[i][1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events

In [54]:
rev_class_list = list(CLASS_ENCODING.keys())
print(rev_class_list)

['car', 'aircraft', 'crowds', 'footsteps', 'clocks', 'rainforest']


In [55]:
def frames_to_time(f, sr = 44100.0, hop_size = 441):
  return f * hop_size / sr

def preds_to_se(p, win_start, audio_clip_length = 2.56):
  start_dicts = [-100, -100, -100, -100, -100, -100]
  stop_dicts = [-100, -100, -100, -100, -100, -100]


  start_speech = -100
  start_music = -100
  stop_speech = -100
  stop_music = -100

  audio_events = []

  n_frames = p.shape[0]

  for j in range(p.shape[1]):
    if p[0, j] >= 0.5:
      start_dicts[j] = 0

  for j in range(p.shape[1]):
    for i in range(n_frames - 1):
      if p[i, j] < 0.5 and p[i+1, j] >= 0.5:
        start_dicts[j] = i+1

      elif p[i, j] >= 0.5 and p[i + 1, j] < 0.5:
        stop_dicts[j] = i
        start_time = frames_to_time(start_dicts[j])
        stop_time = frames_to_time(stop_dicts[j])

        audio_events.append([start_time+win_start, stop_time+win_start, rev_class_list[j]])
        start_dicts[j] = -100
        stop_dicts[j] = -100

    if start_dicts[j] != -100:
      start_time = frames_to_time(start_dicts[j])
      stop_time = audio_clip_length
      audio_events.append([start_time+win_start, stop_time+win_start, rev_class_list[j]])
      start_dicts[j] = -100
      stop_dicts[j] = -100

  audio_events.sort(key = lambda x: x[0]) 
  return audio_events

In [56]:
def extract_eval_labels_2(annotation_path):
  events = convert_annotations_to_events(annotation_path)

  ann = [[float(e[0]), float(e[1]), e[2]] for e in events]
  
  n_label = "/content/eval-files-2/" + os.path.basename(annotation_path)

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in ann))

In [57]:
shutil.rmtree('/content/eval-files-2/', ignore_errors=True)
os.mkdir("/content/eval-files-2/")

In [58]:
for audio in fold1_val_files:
  extract_eval_labels_2(audio.replace(".wav", ".txt").replace("outputs-mono", "outputs"))

## YOHO Model

In [50]:
class YOHOBlock:
  def __init__(self, stride, num_filters, index, input):
      X = tf.keras.layers.DepthwiseConv2D(kernel_size = [3,3], 
                                          strides = stride, 
                                          depth_multiplier = 1, 
                                          padding = 'same', 
                                          use_bias = False,
                                          activation = None, 
                                          name = "layer" + str(index + 2) + "/depthwise_conv")(input)

      X = tf.keras.layers.BatchNormalization(center = True, 
                                             scale = False, 
                                             epsilon = 1e-4, 
                                             name = "layer" + str(index + 2) + "/depthwise_conv/bn")(X)

      X = tf.keras.layers.ReLU(name = "layer" + str(index + 2) + "/depthwise_conv/relu")(X)

      X = tf.keras.layers.Conv2D(filters = num_filters, 
                                 kernel_size = [1, 1], 
                                 strides = 1, 
                                 padding = 'same', 
                                 use_bias = False, 
                                 activation = None,
                                 name = "layer" + str(index + 2) + "/pointwise_conv",
                                 kernel_regularizer = l2(0.01), 
                                 bias_regularizer = l2(0.01))(X)
                                 
      X = tf.keras.layers.BatchNormalization(center = True, 
                                             scale = False, 
                                             epsilon = 1e-4, 
                                             name = "layer" + str(index + 2) + "/pointwise_conv/bn")(X)

      self.output = tf.keras.layers.ReLU(name = "layer" + str(index + 2) + "/pointwise_conv/relu")(X)

class Network:
  def __init__(self) -> None:
    self.NETWORK_BLOCK_LAYERS = [
      # (stride, num_filters)
      (1,   64),
      (2,  128),
      (1,  128),
      (2,  256),
      (1,  256),
      (2,  512),
      (1,  512),
      (1,  512),
      (1,  512),
      (1,  512),
      (1,  512),
      (2, 1024),
      (1, 1024),
      (1, 512),
      (1, 256),
      (1, 128),
    ]

    self.m_features = tf.keras.Input(shape = (257, 40), 
                                     name = "mel_input")
    X = self.m_features
    X = tf.keras.layers.Reshape((257, 40, 1))(X)
    X = tf.keras.layers.Conv2D(filters = 32, 
                               kernel_size = [3, 3], 
                               strides = 2, 
                               padding = 'same', 
                               use_bias = False,
                               activation = None, 
                               name = "layer1/conv",
                               kernel_regularizer = l2(1e-3), 
                               bias_regularizer = l2(1e-3))(X)
    X = tf.keras.layers.BatchNormalization(center = True, 
                                           scale = False, 
                                           epsilon = 1e-4, 
                                           name = "layer1/bn")(X)
    X = tf.keras.layers.ReLU(name = "layer1/relu")(X)

    X = tf.keras.layers.SpatialDropout2D(0.1)(X)
    
    for index in range(len(self.NETWORK_BLOCK_LAYERS)):
      X = YOHOBlock(stride = self.NETWORK_BLOCK_LAYERS[index][0], 
                    num_filters = self.NETWORK_BLOCK_LAYERS[index][1], 
                    index = index, 
                    input = X).output

    _, _, sx, sy = X.shape
    X = tf.keras.layers.Reshape((-1, int(sx * sy)))(X)
    self.pred = tf.keras.layers.Conv1D(18, 
                                       kernel_size = 1, 
                                       activation = "sigmoid")(X)


model_arch = Network()
model = tf.keras.Model(name = 'YOHO', 
                       inputs = model_arch.m_features, 
                       outputs = [model_arch.pred])

In [None]:
# model.summary()

In [59]:
class KerasFinalCallback(tf.keras.callbacks.Callback):
  def __init__(self):
    super(KerasFinalCallback, self).__init__()
    self.best_f1 = 0.0
    self.best_error = np.inf
    self.best_accuracy = 0.0
    
  def on_train_begin(self, logs=None):
    pass

  def on_train_end(self, logs=None):
    pass

  def on_epoch_end(self, epoch, logs=None):
    if epoch > 1:
      for ii, audio in enumerate(fold1_val_files):
        audio_file_path = audio
        see = mk_preds_YOHO_mel(self.model, ii)
        n_label = "/content/eval-files-2/" + os.path.basename(audio_file_path).replace(".wav" ,"") + "-se-prediction.txt"

        with open(n_label, 'w') as fp:
          fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))

      destination = "/content/eval-files-2/"
      test_set = glob.glob(destination + "*[0-9].txt")

      eval_path = "/content/"

      file_list = [{'reference_file': tt,
                    'estimated_file': tt.replace(".txt","-se-prediction.txt")}
                   for tt in test_set]

      data = []

      # Get used event labels
      all_data = dcase_util.containers.MetaDataContainer()
      for file_pair in file_list:
          reference_event_list = sed_eval.io.load_event_list(filename=file_pair['reference_file'])
          estimated_event_list = sed_eval.io.load_event_list(filename=file_pair['estimated_file'])

          data.append({'reference_event_list': reference_event_list,
                       'estimated_event_list': estimated_event_list})

          all_data += reference_event_list

      event_labels = all_data.unique_event_labels

      # Start evaluating

      # Create metrics classes, define parameters
      segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(event_label_list=event_labels,
                                                                       time_resolution=1.0)

      event_based_metrics = sed_eval.sound_event.EventBasedMetrics(event_label_list=event_labels,
                                                                   t_collar=1.0)

      # Go through files
      for file_pair in data:
          segment_based_metrics.evaluate(reference_event_list=file_pair['reference_event_list'],
                                         estimated_event_list=file_pair['estimated_event_list'])

          event_based_metrics.evaluate(reference_event_list=file_pair['reference_event_list'],
                                       estimated_event_list=file_pair['estimated_event_list'])
          

      # Get only certain metrics
      overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
      curr_f1 = overall_segment_based_metrics['f_measure']['f_measure']
      curr_error = overall_segment_based_metrics['error_rate']['error_rate']
      curr_accuracy = overall_segment_based_metrics['accuracy']['accuracy']
      
      if curr_f1 > self.best_f1:
          self.best_f1 = curr_f1
          self.model.save_weights("/content/model-best-f1.h5")

      if curr_error < self.best_error:
          self.best_error = curr_error
          self.model.save_weights("/content/model-best-error.h5")

      if curr_accuracy > self.best_accuracy:
          self.best_accuracy = curr_accuracy
          self.model.save_weights("/content/model-best-accuracy.h5")

      print("F-measure: {:.3f} vs {:.3f}".format(curr_f1, self.best_f1))
      print("Error rate: {:.3f} vs {:.3f}".format(curr_error, self.best_error))
      print("Accuracy: {:.3f}".format(curr_accuracy))

      wandb.log({
        'curr_f1': curr_f1,  
        'curr_error': curr_error,
        'curr_accuracy': curr_accuracy
      })


# Train Network

## Config

In [None]:
config = {}
config['experiment_name'] = 'yoho_new_data'
config['learning_rate'] = 1e-3

## WandB Setup

In [60]:
wandb.login(key="16d33cba1aa49db8432f676f71a538c59af40964")

[34m[1mwandb[0m: Currently logged in as: [33mmarlies-goes[0m ([33meshetty-11785[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [61]:
run = wandb.init(
    name = config['experiment_name'], ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = 'must' ### You need this to resume previous runs, but comment out reinit = True when using this
    project = 'project-runs', ### Project should be created in your wandb account 
    config = config, ### Wandb Config for your run
)

## Training

In [62]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=config['learning_rate']), 
              loss=square_difference_loss)

In [None]:
model.fit(training_generator, validation_data=validation_generator, epochs=1000, callbacks=[KerasFinalCallback()], verbose=1)

Epoch 1/1000




 6/38 [===>..........................] - ETA: 6s - loss: 77.7045







Epoch 2/1000
Epoch 3/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 4/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 5/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 6/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 7/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 8/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 9/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 10/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 11/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 12/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 13/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 14/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 15/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 16/1000
Error rate: 1.000 vs 1.000
Accuracy: 0.810
Epoch 17/1000
Error rate: 0.960 vs 0.960
Accuracy: 0.790
Epoch 18/1000
Error rate: 0.900 vs 0.900
Accuracy: 0.765
Epoch 19/1000
Error rate: 0.815 vs 0.815
Accuracy: 0.823
Epoch 20/1000
Error rate:

## Evaluation

In [None]:
loss = model.evaluate(test_generator)
print('loss:', loss)