# Extract Whistles From Box 2 Box

We need to extract whistles played through the ocean and recorded by a box.

### !!!! CAREFUL: THE CUTS WON'T BE PERFECT AND NEITHER WILL THE LABELING. CHECK ALL FILES

## Basic Idea
Use the original file to find gaps.
Align the recorded file to the original file (manually).
Find all gaps that are large enough in the original file:
    $$(gap_{i, start}, gap_{i, stop})$$.
    
Use the gaps to find whistles. Since the extractor is not perfect, we classify each region by its whistle trace,
using a nearest-neighbor classifier based on dynamic time warping (DTW).

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

from mimic_utils.spectrogram import *
from mimic_utils.whistle_tracer import *
from mimic_utils.params import * 
from scipy.io import wavfile
from scipy.spatial.distance import euclidean

from fastdtw import fastdtw


## The algorithms to extract the audio

In [None]:
def wtrace(audio):
    """Return the min-max normalized whistle trace of an audio signal.

    The signal is turned into a spectrogram with ``fwd_spectrogram`` and
    traced with ``trace``; the last trace entry is dropped, non-positive
    entries are removed, and the remainder is rescaled to ``[0, 1]``.

    Args:
        audio: Single-channel audio samples, passed straight to
            ``fwd_spectrogram`` (presumably a 1-D array -- confirm against
            ``mimic_utils``).

    Returns:
        Array of the positive trace values rescaled so that the minimum is
        0 and the maximum is 1. A flat trace (all values equal) yields all
        zeros.

    Raises:
        ValueError: If no positive trace value remains after filtering.
    """
    spec = fwd_spectrogram(audio, FFT_WIN, FFT_STEP)
    whistle_trace, _ = trace(spec, TRACE_RAD, SMOOTH_ENT)
    # Drop the final entry, then keep only the strictly positive values.
    whistle_trace = whistle_trace[:-1]
    whistle_trace = whistle_trace[whistle_trace > 0.0]
    if len(whistle_trace) == 0:
        # np.max would raise an opaque ValueError here; say what went wrong.
        raise ValueError("wtrace: no positive trace values found in audio")
    hi = np.max(whistle_trace)
    lo = np.min(whistle_trace)
    if hi == lo:
        # A flat trace has zero range; avoid 0/0 producing an all-NaN trace
        # that can never match any template.
        return np.zeros_like(whistle_trace, dtype=float)
    return (whistle_trace - lo) / (hi - lo)


class Classifier:
    """Nearest-neighbour whistle classifier using DTW against templates.

    Each ``.wav`` file in the template folder becomes one template: the
    file name without ``.wav`` is the label and the normalized whistle
    trace of its first channel (see ``wtrace``) is the reference sequence.
    """

    def __init__(self, folder):
        """Load one template per ``.wav`` file found directly in *folder*.

        Args:
            folder: Directory containing the template recordings.
        """
        # Maps label -> normalized whistle trace returned by wtrace().
        self.templates = {}
        for filename in os.listdir(folder):
            if filename.endswith(".wav"):
                print("Loading Classifier Template: {}".format(filename))
                basename = filename.split("/")[-1].replace(".wav", "")
                _, data = wavfile.read("{}/{}".format(folder, filename))
                # wavfile.read returns 1-D data for mono files; only index
                # the channel axis when there is one.
                channel = data if data.ndim == 1 else data[:, 0]
                self.templates[basename] = wtrace(channel)

    def nn(self, trace):
        """Return the label of the template nearest to *trace* under DTW.

        Args:
            trace: Sequence to classify; compared with every template via
                ``fastdtw`` using the Euclidean point distance.

        Returns:
            The label with the smallest DTW distance, or ``-1`` when no
            template produced a finite-smaller distance (for example when
            no templates are loaded).
        """
        min_label = -1
        min_dist = float('inf')
        for label, template in self.templates.items():
            # NOTE(review): the traces appear to be 1-D, so euclidean() is
            # called on scalars -- confirm the installed SciPy accepts that.
            dist, _ = fastdtw(trace, template, dist=euclidean)
            # The original passed label/dist as extra print() arguments, so
            # the "{} {}" placeholders were printed literally.
            print("{} {}".format(label, dist))
            if dist < min_dist:
                min_dist = dist
                min_label = label
        print("Min Dist: {} Min Label: {}".format(min_dist, min_label))
        return min_label
# Build the shared classifier from the reference recordings in "originals/".
namer = Classifier("originals/")

# Overlay every loaded template trace in a single figure for a visual check.
for template_trace in namer.templates.values():
    plt.plot(template_trace)
plt.show()

## Extract the 00 variations

In [None]:
def extract_all(original, recorded, output, classifier, border = 30000, min_gap_size = 100000, th = 0.2):
    """Cut labelled whistle regions out of a recording and save them.

    The first channel of *original* is rescaled with
    ``(x + 32768) / 65535`` and scanned with a 10-sample moving mean. A
    "gap" ``[start_i, i]`` is recorded whenever the mean drops below *th*
    more than *min_gap_size* samples after it last rose to *th* or above.
    The region between the end of one gap and the start of the next is
    traced with ``wtrace`` (on the original), labelled with
    ``classifier.nn`` and the same sample range of *recorded* is written
    to ``{output}/{label}_{recorded basename}_{start}.wav``.

    The same sample indices are used for both files, so the recording is
    assumed to be aligned with the original beforehand.

    Args:
        original: Path of the original (played) wav file; indexed as
            multi-channel data.
        recorded: Path of the recorded wav file; its sample rate is used
            for the printed times and for the written files.
        output: Directory for the extracted wav files; created if missing.
        classifier: Object exposing ``templates`` (label -> trace) and
            ``nn(trace)``, e.g. a ``Classifier``.
        border: Number of samples added on each side of a region.
        min_gap_size: Minimum length in samples for a gap to count.
        th: Threshold applied to the moving mean of the rescaled signal.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    basename = recorded.split("/")[-1].replace(".wav", "")
    print(basename)
    _, data_original = wavfile.read(original)
    fs, data_box = wavfile.read(recorded)
    # Cast to float before adding the offset: with NumPy 2 promotion rules
    # `int16_array + 32768` raises OverflowError instead of upcasting.
    data = (data_original[:, 0].astype(np.float64) + 32768) / (32768 + 32767)

    last_sample = 0
    start_i = 0
    gaps = []

    print("Using templates")
    for label, t in classifier.templates.items():
        plt.title(label)
        plt.plot(t)
        plt.show()

    print("Searching gaps")
    n = len(data_original)
    for i in range(10, n):
        # Mean of the 10 samples preceding index i.
        sample = np.sum(data[i - 10:i]) / 10
        if sample < th and last_sample >= th and i - start_i > min_gap_size:
            print("STOP {} {} {} %done {} {}".format(start_i / fs, i / fs, (i - start_i) / fs, (i / n) * 100, len(gaps)))
            gaps.append([start_i, i])
        if sample >= th and last_sample < th:
            start_i = i
        last_sample = sample

    print("Processing gaps")
    for i in range(1, len(gaps)):
        # A region spans from the end of one gap to the start of the next.
        _, start = gaps[i - 1]
        stop, _ = gaps[i]
        print("Tracing {} {}".format(start / fs, stop / fs))
        # Clamp the lower bound: a negative index would wrap around to the
        # end of the array and silently yield a wrong (often empty) slice.
        lo = max(0, start - border)
        hi = stop + border
        # Named region_trace so the imported trace() function is not shadowed.
        region_trace = wtrace(data_original[lo:hi, 0])
        print("Traced!")
        name = classifier.nn(region_trace)
        filename = '{}/{}_{}_{}.wav'.format(output, name, basename, start)
        print("DETECTED: ", filename)
        if output:
            # wavfile.write does not create missing directories.
            os.makedirs(output, exist_ok=True)
        wavfile.write(filename, fs, data_box[lo:hi, 0])

# Inputs for the "00" run: the synthesized original and the box recording
# (machine-specific absolute paths), plus the folder receiving the cuts.
original = '/Users/daniel.kohlsdorf/Desktop/00-all-whistles-2019-synth--04--18.wav'
recorded = '/Users/daniel.kohlsdorf/Desktop/00_chat1-2019-06-23T123304-192k.wav'
output = "00"

# Extract and label every whistle region using the default thresholds.
extract_all(original=original, recorded=recorded, output=output, classifier=namer)