In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm

import _pickle as pickle
def save(file, name, folder=""):
    """Pickle ``file`` to ``<name>.pickle``, optionally inside ``./<folder>/``.

    Parameters
    ----------
    file : object
        Any picklable object.
    name : str
        File name (or path) without the ``.pickle`` extension.
    folder : str, optional
        Sub-folder of the working directory. When empty, ``name`` is used as-is.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # The context manager flushes and closes the handle; the previous version
    # referenced ``outfile.close`` without calling it.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder=""):
    """Load the object stored in ``<name>.pickle``, optionally from ``./<folder>/``.

    Parameters
    ----------
    name : str
        File name (or path) without the ``.pickle`` extension.
    folder : str, optional
        Sub-folder of the working directory. When empty, ``name`` is used as-is.

    Returns
    -------
    object
        The unpickled object.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files you produced.
    # The context manager closes the handle (``outfile.close`` was never called before).
    with open(path, 'rb') as infile:
        return pickle.load(infile)

import noisereduce as nr
import time
from copy import deepcopy
import random
from multiprocess import Pool
import gc
import librosa

from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from tensorflow.keras.optimizers import SGD,Adam

import simpleaudio as sa

from scipy.signal import resample

(dico_bird_label, dico_label_bird) = load('dico_labels')

def bird_to_label(labels):
    """Encode space-separated bird-code strings as a multi-hot matrix.

    Parameters
    ----------
    labels : sequence of str
        One string per sample, bird codes separated by single spaces.

    Returns
    -------
    numpy.ndarray
        Float array with one row per sample and one column per entry of
        ``dico_label_bird``; ``y[i, dico_bird_label[bird]]`` is 1 for every
        known code found in ``labels[i]``.
    """
    y = np.zeros((len(labels), len(dico_label_bird.keys())))

    for i, elt in enumerate(labels):
        for bird in elt.split(' '):
            try:
                y[i, dico_bird_label[bird]] = 1
            except KeyError:
                # Unknown codes are deliberately ignored (best effort); any
                # other error now surfaces instead of being silently swallowed.
                continue
    return y

def label_to_bird(y, tres):
    """Decode score rows into space-separated bird codes.

    For every row of ``y``, the codes ``dico_label_bird[j]`` of all columns
    whose score is ``>= tres`` are joined with spaces; a row with no such
    column is decoded as ``'nocall'``. Returns a list with one string per row.
    """
    decoded = []
    for scores in y:
        names = [dico_label_bird[col] for col, score in enumerate(scores) if score >= tres]
        joined = " ".join(names)
        decoded.append('nocall' if joined == "" else joined)
    return decoded

In [None]:
import pydub 
import numpy as np

def read(f, normalized=False):
    """Decode an MP3 file into a mono numpy array.

    Returns ``(frame_rate, samples)``. For two-channel audio only one channel
    is kept: the second one if its peak sample is strictly larger than the
    first one's, otherwise the first. With ``normalized=True`` the samples are
    returned as float32 divided by 2**15.
    """
    segment = pydub.AudioSegment.from_mp3(f)
    samples = np.array(segment.get_array_of_samples())
    if segment.channels == 2:
        stereo = samples.reshape((-1, 2))
        first, second = stereo[:, 0], stereo[:, 1]
        samples = second if second.max() > first.max() else first

    rate = segment.frame_rate
    if not normalized:
        return rate, samples
    return rate, np.float32(samples) / 2**15

def write(f, sr, x, normalized=False):
    """Encode a numpy array as a 320 kbit/s MP3.

    ``x`` is written as two channels when it is 2-D with exactly two columns,
    otherwise as one. With ``normalized=True`` each item should be a float in
    [-1, 1) and is scaled by 2**15 before the int16 cast.
    """
    is_stereo = x.ndim == 2 and x.shape[1] == 2
    pcm = np.int16(x * 2 ** 15) if normalized else np.int16(x)
    song = pydub.AudioSegment(
        pcm.tobytes(),
        frame_rate=sr,
        sample_width=2,
        channels=2 if is_stereo else 1,
    )
    song.export(f, format="mp3", bitrate="320k")
    
# audio_file = 'XC2628.mp3'
# sr, x = read(audio_file)


## Learning to load and clean the data

In [None]:
sr1, x1 = read('adfly_XC2628.mp3')
sr2, x2 = read('amebit_XC127371.mp3')
sr3, x3 = read('Bullori.mp3')


In [None]:
def build_creneau(x, sample_rate=44100, window=0.5, threshold=2500):
    """Build a 0/1 activity gate ("creneau") over a signal.

    The signal is cut into consecutive, non-overlapping windows of
    ``window`` seconds; every window whose maximum exceeds ``threshold`` is
    set to 1 in the returned gate. A trailing partial window stays at 0.

    Parameters
    ----------
    x : array-like
        1-D signal.
    sample_rate : int, optional
        Samples per second of ``x`` (default 44100).
    window : float, optional
        Window length in seconds (default 0.5).
    threshold : float, optional
        Amplitude a window must strictly exceed to be marked active.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``x`` containing 0.0 / 1.0.

    Raises
    ------
    ValueError
        If ``sample_rate * window`` is shorter than one sample.
    """
    x = np.asarray(x)
    gate = np.zeros(len(x))

    period = int(sample_rate * window)
    if period <= 0:
        raise ValueError("sample_rate * window must be at least one sample")

    n_windows = int(len(x) / (sample_rate * window))
    for k in range(n_windows):
        lo, hi = k * period, (k + 1) * period
        if x[lo:hi].max() > threshold:
            gate[lo:hi] = 1
    return gate

In [None]:
clip = x2.astype('float')
# The clip comes from the second recording, so its own frame rate (sr2, not
# sr1) must be used for the time axis of the plots below.
sr = sr2
reduced_noise = nr.reduce_noise(audio_clip=clip, noise_clip=clip, verbose=False)
creneau = build_creneau(reduced_noise)

In [None]:
plt.plot(np.array(range(len(clip)))/sr, clip)

In [None]:
a = np.array(range(len(reduced_noise)))/sr
plt.plot(a, reduced_noise)
# plt.plot(a, np.ones(len(a))*2500)
plt.plot(a, creneau*5000)

## Collecting individual bird samples of 0.5 seconds

In [None]:
def f(element):
    """Worker: slice one training recording into labelled 0.5 s windows.

    Parameters
    ----------
    element : tuple
        ``(bird, file)``: the species folder under ``./train_audio`` and the
        MP3 file name inside it.

    Returns
    -------
    tuple
        ``(label, X)`` as two parallel lists. ``label[k]`` is ``'noise'`` or
        the bird name and ``X[k]`` is the matching slice of the denoised
        signal. Both lists are empty when the file could not be processed.
    """
    # Imports and the MP3 reader are local to the function; presumably so the
    # worker is self-contained when run through ``Pool.map`` -- TODO confirm.
    import pydub
    import numpy as np
    import noisereduce as nr

    def read(f, normalized=False):
        """MP3 to numpy array"""
        a = pydub.AudioSegment.from_mp3(f)
        y = np.array(a.get_array_of_samples())
        if a.channels == 2:
            y = y.reshape((-1, 2))

            if y[:,1].max() > y[:,0].max():
                y = y[:,1]
            else:
                y = y[:,0]

        if normalized:
            return a.frame_rate, np.float32(y) / 2**15
        else:
            return a.frame_rate, y

    treshold = 2500   # amplitude that counts as the onset of a call
    windows = 0.5     # window length in seconds
    # NOTE(review): 44100 Hz is assumed here; the frame rate returned by
    # read() is not used -- confirm every training file is 44.1 kHz.
    period = int(44100*windows)
    passe = 100       # hop (in samples) between two onset checks
    lead = 5000       # samples kept before the detected onset

    label = []
    X = []
    try:
        bird = element[0]
        file = element[1]
        title = bird
        sr1, x1 = read('./train_audio/'+str(bird)+'/'+str(file))
        clip = x1.astype('float')
        x1 = nr.reduce_noise(audio_clip=clip, noise_clip=clip, verbose=False)

        # Pass 1: quiet, non-overlapping windows are kept as 'noise' samples.
        seconds = int(len(x1)/(44100*windows))
        for elt in range(seconds):
            chunk = x1[elt*period:(elt+1)*period]
            if chunk.max() < treshold/4:
                label.append('noise')
                X.append(chunk)

        # Pass 2: scan in hops of `passe`; on an onset keep one window that
        # starts `lead` samples earlier, then wait `period` samples before
        # looking for the next onset.
        cond = True
        count = 0
        for i in range(1,int(len(x1)/passe-period/passe)):
            if cond == True:
                if x1[passe*i-100:passe*i].max() >= treshold:
                    start = passe*i - lead
                    # A negative start used to wrap around and append an
                    # empty slice; skip it but keep the cooldown so the
                    # non-empty windows are unchanged.
                    if start >= 0:
                        X.append(x1[start:start + period])
                        label.append(title)
                    cond = False
                    count = 0
            count += passe
            if count >= period:
                cond = True
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole pool job,
        # but say which one failed instead of hiding it.
        print('skipping ' + repr(element) + ': ' + repr(exc))
        label = []
        X = []

    return (label, X)

In [None]:
## Building one raw dataset per species ('noise' windows + call windows):
import gc
# NOTE(review): the [54:] offset skips the first 54 folders -- looks like a
# manual resume point; confirm before a full re-run.
for bird in tqdm(os.listdir('./train_audio')[54:]):
    title = bird

    print(bird)

    listing = [(bird, elt) for elt in os.listdir('./train_audio/'+str(bird))]

    # The context manager releases the worker processes even when map() raises;
    # map() itself blocks until every file has been processed.
    with Pool(6) as p:
        results = p.map(f, listing)

    X = []
    label = []
    for elt in results:
        label.extend(elt[0])
        X.extend(elt[1])

    df = pd.DataFrame({'label' : label, 'audio' : X})
    save(df, title, 'raw_data')

    # Free the per-species buffers before the next iteration.
    del X
    del label
    del df
    gc.collect()

In [None]:
os.listdir('./train_audio')[54:]

In [None]:
def integer(x):
    """Return a copy of array ``x`` cast to 32-bit integers."""
    as_int32 = x.astype(np.int32)
    return as_int32

def shape(x):
    """Return the size of the first axis of ``x``."""
    dims = x.shape
    return dims[0]

In [None]:
df = load('aldfly', 'raw_data')
print(df.shape)
df['shape'] = df['audio'].apply(shape)
# df['audio'] = df['audio'].apply(integer)
df = df[df['shape'] != 0]
df = df[df['label'] == 'aldfly']
print(df.shape)
save(df, 'aldfly')

# del df
# import gc
# gc.collect()

In [None]:
birdname = 'aldfly'

def f(birdname):
    """Worker: down-sample one species' raw dataset into bird and noise sets.

    Reads ``./raw_data1/<birdname>.pickle``, drops rows whose audio is empty,
    then writes up to 600 randomly chosen rows labelled ``birdname`` to
    ``./raw_data/<birdname>.pickle`` and up to 150 randomly chosen ``'noise'``
    rows to ``./noise/noise_<birdname>.pickle``.

    Parameters
    ----------
    birdname : str
        Species name; a trailing ``.pickle`` (as returned by ``os.listdir``)
        is stripped.
    """
    import pandas as pd
    import _pickle as pickle

    def save(file, name, folder=""):
        if folder != "":
            path = './'+folder+'/'+name+'.pickle'
        else:
            path = name+'.pickle'
        # `with` closes the handle; `outfile.close` was never actually called.
        with open(path, 'wb') as outfile:
            pickle.dump(file, outfile)

    def load(name, folder=""):
        if folder != "":
            path = './'+folder+'/'+name+'.pickle'
        else:
            path = name+'.pickle'
        with open(path, 'rb') as infile:
            return pickle.load(infile)

    def shape(x):
        return x.shape[0]

    birdname = birdname.replace('.pickle', '')
    df = load(birdname, 'raw_data1')
    df['shape'] = df['audio'].apply(shape)
    df = df[df['shape'] != 0]

    # Cap the sample size at what is available, like the bird rows below;
    # sample(n=150) raised ValueError for recordings with fewer noise windows.
    df_noise = df[df['label'] == 'noise']
    df_noise = df_noise.sample(n = min(150, df_noise.shape[0]))
    df_bird = df[df['label'] == birdname]
    df_bird = df_bird.sample(n = min(600, df_bird.shape[0]))

    save(df_bird, birdname, 'raw_data')
    save(df_noise, 'noise_'+str(birdname), 'noise')

In [None]:
p = Pool(6)
p.map(f, os.listdir('./raw_data1'))
p.close()

## Building a first dataset made of individual samples

In [None]:
df = []


for elt in tqdm(os.listdir('noise')):
    df1 = load(elt.replace('.pickle', ''), 'noise')
    df1 = df1.sample(n=100)
    df.append(df1)
    
df = pd.concat(df)

save(df, 'noise_100', 'datasets')

In [None]:
df = []


for elt in tqdm(os.listdir('raw_data')):
    df1 = load(elt.replace('.pickle', ''), 'raw_data')
    df1 = df1.sample(n=250, replace = True)
    df.append(df1)
    
df = pd.concat(df)

save(df, 'birds_250', 'datasets')

In [None]:
del df

In [None]:
import gc
gc.collect()

## Predict with image features

In [None]:
df1 = load('noise_100', 'datasets')
df1['criterion'] = df1['audio'].apply(lambda x: x.max()/abs(x).mean())
df1['audio'] = df1['audio'].apply(lambda x : x*random.randint(1,30))
df1['label'] = 'nocall'
df1  =df1.sample(n = 1500)
print(df1['label'].unique())
# df = pd.concat([load('noise_100', 'datasets'), load('birds_100', 'datasets')])
df = load('birds_250', 'datasets')
df['audio'] = df['audio'].apply(lambda x : x*random.uniform(0.3,2))

df = pd.concat([df, df1])

df = df.sample(n = df.shape[0])
df = df[['label', 'audio']]

del df1
gc.collect()

In [None]:
X = np.array([np.array(x) for x in df['audio'].values])
Y = df['label'].values

# Multi-hot encoding through the label dictionary. The earlier
# np_utils.to_categorical(Y) call was removed: it expects integer class ids
# (Y holds bird-code strings) and its result was overwritten on the next line.
y = bird_to_label(Y)
del df
gc.collect()

In [None]:
X.shape

In [None]:
y

In [None]:
# Turn every waveform into a 3-channel "image": MFCC, log-mel spectrogram and
# chromagram, each with 64 rows.
X1 = []
for elt in tqdm(X):
    mfccs = librosa.feature.mfcc(y=elt, sr=44100, n_mfcc=64)

    # Pass the signal by keyword: recent librosa releases no longer accept it
    # positionally.
    melspec = librosa.feature.melspectrogram(y=elt, sr=44100, n_mels = 64)
    melspec = librosa.power_to_db(melspec)

    # NOTE(review): no `sr` is passed here, so librosa's default sampling rate
    # is used while the other two features use 44100 -- confirm this is intended.
    chroma = librosa.feature.chroma_stft(y = elt, n_chroma = 64)

    temp = np.zeros((mfccs.shape[0], mfccs.shape[1], 3))
    temp[:,:,0] = mfccs
    temp[:,:,1] = melspec
    temp[:,:,2] = chroma

    X1.append(temp)
X = np.array(X1)

del X1
gc.collect()

In [None]:
save(X[:33000], 'test')

In [None]:
X1 = load('test')

In [None]:
X[:33000].shape

In [None]:
save(X[:33000], 'dataset_with_features1')
save(X[33000:], 'dataset_with_features2')
save((Y, y), 'dataset_with_features_labels')

In [None]:
X = np.concatenate([load('dataset_with_features1'), load('dataset_with_features2')], axis = 0)
Y, y =load('dataset_with_features_labels')

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
len(y_test)

In [None]:
X_train.shape

In [None]:
y_test.shape

In [None]:
class audio_file_generator:
    """Batch generator turning raw waveforms into (optionally augmented) MFCC images.

    The constructor builds an ``audiomentations`` pipeline (Gaussian noise,
    time stretch, pitch shift, shift); each ``*_min`` / ``*_max`` / ``*_p``
    argument is forwarded to the matching transform. ``SAMPLE_RATE`` is the
    sampling rate handed to the augmenter and to the MFCC computation.
    """
    def __init__(self, 
                noise_min = 0.001,
                noise_max = 0.015,
                noise_p = 0.5,
                stretch_min_rate = 0.8,
                stretch_max_rate = 1.25,
                stretch_p = 0.5,
                pitch_min_semitones = -4,
                pitch_max_semitones = 4,
                pitch_p = 0.5,
                shift_min_fraction = -0.5,
                shift_max_fraction = 0.5,
                shift_p = 0.5,
                SAMPLE_RATE = 44100
                ):
        # Imported lazily so the notebook only needs audiomentations when a
        # generator is actually created. (The unused local `import librosa`
        # was dropped; the methods use the module-level import.)
        from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
        self.augmenter = Compose([
                AddGaussianNoise(min_amplitude=noise_min, max_amplitude=noise_max, p=noise_p),
                TimeStretch(min_rate=stretch_min_rate, max_rate=stretch_max_rate, p=stretch_p),
                PitchShift(min_semitones=pitch_min_semitones, max_semitones=pitch_max_semitones, p=pitch_p),
                Shift(min_fraction=shift_min_fraction, max_fraction=shift_max_fraction, p=shift_p),
            ])
        self.sample_rate = SAMPLE_RATE

    def _mfcc_image(self, sound):
        """Return the 44-coefficient MFCC of ``sound`` with a trailing channel axis."""
        # Uses the configured rate instead of a second hard-coded 44100.
        mfccs = librosa.feature.mfcc(y=sound, sr=self.sample_rate, n_mfcc=44)
        return mfccs.reshape(mfccs.shape[0], mfccs.shape[1], 1)

    def _build_batch(self, file, label, indices, augment):
        """Build ``(batch_x, batch_y)`` arrays for the given sample indices."""
        batch_x = []
        batch_y = []
        for elt in indices:
            sound = np.array(file[elt])
            if augment:
                sound = self.augmenter(samples=sound, sample_rate=self.sample_rate)
            batch_x.append(self._mfcc_image(sound))
            batch_y.append(label[elt])
        return np.array(batch_x), np.array(batch_y)

    def get_batch(self, file, label, batch_size):
        """Return one augmented batch of ``batch_size`` samples drawn with replacement."""
        batch_paths  = np.random.choice(a    = range(len(file)), 
                                          size = batch_size)
        return self._build_batch(file, label, batch_paths, augment=True)

    def flow(self, file, label, batch_size):
        """Yield augmented ``(batch_x, batch_y)`` batches forever."""
        while True:
            yield self.get_batch(file, label, batch_size)

    def valid_generator(self, file, label):
        """Return the MFCC images of every sample, in order and without augmentation."""
        return self._build_batch(file, label, range(len(file)), augment=False)

In [None]:
aug = audio_file_generator()



In [None]:
# NOTE(review): at this point X_test comes from train_test_split on the
# (mfcc, mel, chroma) feature tensor loaded above, while valid_generator
# computes MFCCs from raw waveforms -- this cell looks like a leftover from the
# MFCC-only experiment; confirm it should still run before model.fit.
X_test, y_test = aug.valid_generator(file = X_test, label = y_test)

In [None]:
del X
del y
del Y

gc.collect()

In [None]:
import resnet
build = resnet.ResnetBuilder()
model = build.build_resnet_12((64,44,3),265)

In [None]:
model.summary()

In [None]:


optimizer = SGD(0.0001)

model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])


In [None]:
import tensorflow
# Stop when val_loss has not improved by 0.001 for 6 epochs and roll back to
# the best weights.
stop = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.001, patience=6, verbose=1, mode='auto',
    baseline=None, restore_best_weights=True
)
# min_lr must be below the optimizer's starting rate (SGD(0.0001) in the
# compile cell); with min_lr=0.0001 the callback could never lower it.
reduce = tensorflow.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, 
                                                     mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.000001)

import time
batch_size = 32
epochs = 50
t0 = time.time()
# history = model.fit_generator(aug.flow(X_train, y_train, batch_size=batch_size),
#     validation_data=(X_test, y_test), steps_per_epoch=len(X_train) // batch_size,
#     epochs=epochs, callbacks = [stop, reduce])

history = model.fit(X_train, y_train, batch_size=batch_size,
    validation_data=(X_test, y_test),  epochs=epochs, callbacks = [stop, reduce])


t1 = time.time()
print(t1-t0)

In [None]:
(t1-t0)/60

In [None]:
def top_5_accuracy(true, pred):
    """Fraction of samples whose true class index is among the 5 highest scores.

    ``true`` holds one class index per sample and ``pred`` one row of scores
    per sample.
    """
    best_five = np.argsort(pred, axis = 1)[:,-5:]
    hits = [target in best_five[row] for row, target in enumerate(true)]
    return sum(hits)/len(hits)

In [None]:
model.save_weights('./checkpoints/resnet for label/check')

In [None]:
pred = model.predict(X_test)
# 

In [None]:
true = np.argmax(y_test, axis = 1)

In [None]:
top_5_accuracy(true, pred)

In [None]:
pred = np.argmax(pred, axis = 1)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
plt.imshow(confusion_matrix(true, pred), cmap='hot')

In [None]:
len(true)

In [None]:
from sklearn.metrics import f1_score

f1_score(true, pred, average = 'macro')

In [None]:
(true == 0).sum()

In [None]:
confusion_matrix(true, pred)

In [None]:
# Class index -> bird code, to read the confusion matrix rows/columns.
# (`dico_label` is never defined in this notebook; the mapping loaded in the
# first cell is `dico_label_bird`.)
dico_label_bird

In [None]:
(true == 2).sum()

In [None]:
import seaborn as sns
sns.heatmap(confusion_matrix(true, pred))

## Create fake 5-second samples containing between 0 and 4 bird species (as drawn by `generate_sample`), with between 1 and 4 calls per species

In [None]:
df_noise = load('noise_100', 'datasets')
df_birds = load('birds_250', 'datasets')

noises = np.array([x for x in df_noise['audio'].values]).astype('int32')

del df_noise

bird_list = df_birds['label'].unique()

dico_bird = {}
for elt in tqdm(bird_list):
    dico_bird[elt] = np.array([x for x in df_birds[df_birds['label'] == elt]['audio'].values]).astype('int32')
    

In [None]:
dico_bird['aldfly'].shape

In [None]:
save((noises, dico_bird, bird_list), 'samples_for_creation')

In [None]:
(noises, dico_bird, bird_list) = load('samples_for_creation')

In [None]:
44100*5

In [None]:
np.random.randint(0,5,3)

In [None]:
import random
random.randint(0,3)

In [None]:
np.random.choice(bird_list, 3).astype(str)

In [None]:
import random
def generate_sample():
    """Synthesize one labelled clip by overlaying bird calls on a noise bed.

    Ten clips are drawn from ``noises`` and concatenated; then 0 to 4 species
    are drawn from ``bird_list`` and, for each, 1 to 4 calls from
    ``dico_bird`` are added at random positions.

    Returns
    -------
    tuple
        ``(sample, label, feature)``: the waveform cast to int, the
        space-separated species names (``'nocall'`` when no species was
        drawn) and the stacked MFCC / log-mel / chroma array.

    Notes
    -----
    Species are drawn with replacement (``np.random.choice`` default), so a
    name can appear more than once in ``label``.
    """
    ## create a first sample with noise
    noise_size = len(noises)
    # np.random.randint excludes its upper bound, so `noise_size` (not
    # `noise_size - 1`) is needed for the last noise clip to be selectable.
    ind = np.random.randint(0, noise_size, 10)

    sample = np.concatenate([noises[i] for i in ind])

    ## Choosing the number of birds to play
    n_birds = random.randint(0,4)

    bird_names = np.random.choice(bird_list, n_birds).astype(str)

    label = ' '.join([bird for bird in bird_names])
    if label == '':
        label = 'nocall'

    for bird in bird_names:
        ## define the number of calls of the given bird (weighted towards 1)
        n_calls = random.choice([1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,4])

        for calls in range(n_calls):
            c = random.choice(dico_bird[bird])

            start = random.randint(0,len(sample) - len(c))

            sample[start:start+len(c)] = sample[start:start+len(c)] + c

    ## Building features of the sample
    sample = sample.astype('float')

    mfccs = librosa.feature.mfcc(y=sample, sr=44100, n_mfcc=64)
    # Signal passed by keyword: recent librosa releases reject it positionally.
    melspec = librosa.feature.melspectrogram(y=sample, sr=44100, n_mels = 64)
    melspec = librosa.power_to_db(melspec)
    # NOTE(review): no `sr` is passed, so librosa's default rate is used here
    # while the other features use 44100 -- confirm this is intended.
    chroma = librosa.feature.chroma_stft(y = sample, n_chroma = 64)

    feature = np.zeros((mfccs.shape[0], mfccs.shape[1], 3))
    feature[:,:,0] = mfccs
    feature[:,:,1] = melspec
    feature[:,:,2] = chroma

    return sample.astype(int), label, feature
    
    
    

In [None]:
import resnet
build = resnet.ResnetBuilder()
model = build.build_resnet_12((64,44,3),265)
model.load_weights('./checkpoints/resnet for label/check')

In [None]:
label

In [None]:
# def predict(audio):
#     features = []
#     windows = []
#     for i in tqdm(range(19)):
#         sample = audio[11025*i : 11025*i + 22050]
#         if sample.max() >= 2500:
#             sample = sample.astype('float')
#             windows.append((11025*i, 11025*i + 22050))
#             mfccs = librosa.feature.mfcc(y=sample, sr=44100, n_mfcc=64)
#             melspec = librosa.feature.melspectrogram(sample, sr=44100, n_mels = 64)
#             melspec = librosa.power_to_db(melspec)    
#             chroma = librosa.feature.chroma_stft(y = sample, n_chroma = 64)

#             feature = np.zeros((mfccs.shape[0], mfccs.shape[1], 3))
#             feature[:,:,0] = mfccs
#             feature[:,:,1] = melspec
#             feature[:,:,2] = chroma
#             features.append(feature)
#     features = np.array(features)
#     pred = model.predict(features)
#     return pred, windows

In [None]:
def predict(audio):
    """Detect call onsets in ``audio`` and classify a 0.5 s window around each.

    The signal is scanned in hops of ``passe`` samples. When the last hop
    reaches ``treshold``, a window of ``period`` samples starting ``offset``
    samples before the current position is featurized (MFCC, log-mel,
    chroma) and detection is paused for ``period`` samples.

    Parameters
    ----------
    audio : numpy.ndarray
        1-D waveform; 44100 Hz is assumed by the window and feature settings.

    Returns
    -------
    tuple
        ``(pred, windows1)``: the output of the global ``model`` for every
        detected window and the matching ``(start, stop)`` sample indices.
        With no detection, ``pred`` has zero rows.
    """
    features = []
    windows1 = []
    treshold = 2000
    windows = 0.5
    period = int(44100*windows)
    offset = 5000   # samples kept before the detected onset
    passe = 100     # hop between two onset checks

    count = 0
    cond = True
    # The lower bound keeps `passe*i - offset` non-negative, the upper bound
    # keeps the window inside the signal.
    for i in tqdm(range(int(offset/passe)+1,int((len(audio) - period + offset)/passe) )):
        if cond == True:
            if audio[passe*i-passe:passe*i].max() >= treshold:
                start = passe*i - offset
                sample = audio[start:start + period].astype(float)
                windows1.append((start, start + period))
                mfccs = librosa.feature.mfcc(y=sample, sr=44100, n_mfcc=64)
                # Signal passed by keyword: recent librosa rejects it positionally.
                melspec = librosa.feature.melspectrogram(y=sample, sr=44100, n_mels = 64)
                melspec = librosa.power_to_db(melspec)
                # NOTE(review): no `sr` is passed here (librosa default) --
                # kept as is so the features match the training cells.
                chroma = librosa.feature.chroma_stft(y = sample, n_chroma = 64)

                feature = np.zeros((mfccs.shape[0], mfccs.shape[1], 3))
                feature[:,:,0] = mfccs
                feature[:,:,1] = melspec
                feature[:,:,2] = chroma
                features.append(feature)

                cond = False
                count = 0
        count += passe
        if count >= period:
            cond = True

    if len(features) == 0:
        # Nothing crossed the threshold: return an empty prediction instead of
        # feeding an empty array to the network.
        # assumes `model` has a single output tensor -- TODO confirm
        return np.zeros((0, model.output_shape[-1])), windows1

    features = np.array(features)
    pred = model.predict(features)
    return pred, windows1

In [None]:
sample, label, feature1 = generate_sample()

In [None]:
(row, label38, data, features38) = load('validation_set')

In [None]:
ind = 36
sample = data[ind]
label = label38[ind]

In [None]:
pred, windows = predict(sample)

In [None]:
def add_windows(wind):
    """Return one 0/1 mask per ``(start, stop)`` pair in ``wind``.

    Each mask has the length of the global ``sample`` and is 1 on
    ``[start, stop)``.
    """
    def _mask(bounds):
        gate = np.zeros(len(sample))
        gate[bounds[0]:bounds[1]] = 1
        return gate

    return [_mask(bounds) for bounds in wind]

In [None]:
wind = add_windows(windows)

plt.figure(figsize=(20,10))
plt.plot(sample)

for elt in wind:
    plt.plot(elt*1000)

In [None]:
def label_to_bird_softmax(y):
    """Decode each score row to its top class.

    Returns ``(labels, scores)``: the bird code ``dico_label_bird[argmax]``
    of every row and the maximum of every row along the last axis.
    """
    winners = np.argmax(y,axis = -1)
    names = [dico_label_bird[idx] for idx in winners]
    return names, y.max(axis = -1)

In [None]:
label

In [None]:
label1, proba = label_to_bird_softmax(pred)

In [None]:
for i, elt in enumerate(label1):
    print(elt + '      '+str(proba[i]))

In [None]:
label

In [None]:
plt.plot(sample)

In [None]:
plt.imshow(feature[:,:,0])

In [None]:
plt.imshow(feature[:,:,1])

In [None]:
plt.imshow(feature[:,:,2])

In [None]:
import numpy as np
import simpleaudio as sa
fs = 44100
play_obj = sa.play_buffer(sample, 1, 2, fs)
play_obj.wait_done()

In [None]:
for i in tqdm(range(10)):
    sample, label, feature = generate_sample()

In [None]:
import time
size = 1000
t0 = time.time()
X = list(np.zeros(size))
y = list(np.zeros(size))

for i in tqdm(range(size)):
    sample, label, feature = generate_sample()
    X[i] = feature
    y[i] = label

X = np.array(X)
y = np.array(y)

t1 = time.time()

In [None]:
save((X, y), 'testing_dataset')

In [None]:
for i in tqdm(range(4,17)):
    size = 6000
    X = list(np.zeros(size))
    y = list(np.zeros(size))
    for j in tqdm(range(size)):
        sample, label, feature = generate_sample()
        X[j] = feature
        y[j] = label
    X = np.array(X)
    y = np.array(y)
    save((X, y), 'D:/bird_recognition/dataset/batch_'+str(i))

## Preparing the validation set

In [None]:
def read(f, normalized=False):
    """Load an MP3 as ``(frame_rate, mono_samples)``.

    Two-channel files are reduced to the channel with the larger peak (the
    first channel wins ties). ``normalized=True`` returns float32 samples
    divided by 2**15 instead of the raw integer samples.
    """
    audio = pydub.AudioSegment.from_mp3(f)
    data = np.array(audio.get_array_of_samples())
    if audio.channels == 2:
        data = data.reshape((-1, 2))
        keep = 1 if data[:,1].max() > data[:,0].max() else 0
        data = data[:,keep]

    if normalized:
        data = np.float32(data) / 2**15
    return audio.frame_rate, data

In [None]:
audio = './example_test_audio/BLKFR-10-CPL_20190611_093000.pt540.mp3'
audio1 = './example_test_audio/ORANGE-7-CAP_20190606_093000.pt623.mp3'

sr, sample = read(audio1, normalized=False)

In [None]:
def prepare_test_set(audio_ids=('ORANGE-7-CAP_20190606_093000',)):
    """Cut the example test recordings into labelled 5-second clips with features.

    For every selected ``audio_id`` of ``test_example.csv`` the MP3 is
    centred, rescaled to a peak of 30000, resampled to 44100 Hz and denoised;
    each CSV row then yields the 5 seconds ending at ``seconds`` (left-padded
    with zeros when shorter) plus its MFCC / log-mel / chroma features.

    Parameters
    ----------
    audio_ids : iterable of str or None, optional
        Recordings to process. Defaults to the single recording that was
        previously hard-coded; ``None`` processes every id in the CSV.

    Returns
    -------
    tuple
        ``(row, label, data, features)``: parallel lists holding the
        ``row_id``, the ``labels`` string, the waveform and the feature array
        of every clip.
    """
    df = pd.read_csv('test_example.csv')

    row = []
    label = []
    data = []
    features = []
    clip_len = 44100*5

    for ids in df['audio_id'].unique():
        if audio_ids is not None and ids not in audio_ids:
            continue

        df1 = df[df['audio_id'] == ids]

        sr, sample = read('./example_test_audio/'+ids+'.mp3')
        sample = sample - sample.mean()
        sample = sample*30000/sample.max()

        ## resample the data to 44100 Hz
        duration = len(sample) / sr
        n_target = int(duration * 44100)

        sample = resample(sample.astype(int), n_target)

        sr = 44100
        ## clean
        clip = sample.astype(float)
        reduced_noise = nr.reduce_noise(audio_clip=clip, noise_clip=clip, verbose=False)

        ## cut and build features
        for i, line in df1.iterrows():
            row.append(line['row_id'])
            label.append(line['labels'])

            seconds = line['seconds']

            # Slice bounds must be non-negative integers: cast in case the CSV
            # column is parsed as float, and clamp so a clip ending before
            # 5 s is zero-padded below instead of wrapping around.
            stop = int(seconds*sr)
            start = max(int((seconds-5)*sr), 0)
            s = reduced_noise[start:stop]

            if len(s) > clip_len:
                s = s[-clip_len:]
            if len(s) < clip_len:
                s = np.concatenate([np.zeros(clip_len-len(s)), s])

            data.append(s)
            ## build features
            mfccs = librosa.feature.mfcc(y=s, sr=sr, n_mfcc=64)
            # Signal passed by keyword: recent librosa rejects it positionally.
            melspec = librosa.feature.melspectrogram(y=s, sr=sr, n_mels = 64)
            melspec = librosa.power_to_db(melspec)
            # NOTE(review): `sr` is passed here but not in the chroma calls of
            # generate_sample()/predict() -- confirm which one is intended.
            chroma = librosa.feature.chroma_stft(y = s,sr = sr, n_chroma = 64)

            feature = np.zeros((mfccs.shape[0], mfccs.shape[1], 3))
            feature[:,:,0] = mfccs
            feature[:,:,1] = melspec
            feature[:,:,2] = chroma
            features.append(feature)

    # The unused DataFrame that duplicated all four lists was dropped.
    return row, label, data, features

In [None]:
row, label, data, features = prepare_test_set()

In [None]:
save((row, label, data, features), 'validation_set')

## Preparing the final classification

In [None]:
# dico_bird_label = {'nocall':0}
# dico_label_bird = {0:'nocall'}

# for i, elt in enumerate(bird_list):
#     dico_bird_label[elt] = i+1
#     dico_label_bird[i+1] = elt

(dico_bird_label, dico_label_bird) = load('dico_labels')

def bird_to_labels(labels):
    """Encode space-separated bird-code strings as a multi-hot matrix.

    Alias kept for the training cells below; delegates to ``bird_to_label``
    (defined in the first cell) so there is a single implementation instead of
    two verbatim copies.
    """
    return bird_to_label(labels)

def labels_to_birds(y, tres):
    """Decode score rows into space-separated bird codes (``'nocall'`` if none).

    Alias kept for the training cells below; delegates to ``label_to_bird``
    (defined in the first cell) so there is a single implementation instead of
    two verbatim copies.
    """
    return label_to_bird(y, tres)
        
save((dico_bird_label, dico_label_bird), 'dico_labels')   

## Training

In [None]:
(X, Y) = load('./batch/batch_'+str(0))

In [None]:
y = bird_to_labels(Y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

del X
del y
gc.collect()

In [None]:
import resnet
build = resnet.ResnetBuilder()
model = build.build_resnet_12((64,431,3),265)

In [None]:
model.summary()

In [None]:

optimizer = SGD(0.1)
import tensorflow as tf
# NOTE(review): the targets built by bird_to_labels() are multi-hot (a clip can
# contain several species or none), whereas categorical_crossentropy expects a
# single class per sample. A sigmoid output with binary cross-entropy (see the
# commented alternative) looks like the intended setup -- confirm against the
# final activation used by resnet.ResnetBuilder.
model.compile(loss='categorical_crossentropy',#tf.nn.sigmoid_cross_entropy_with_logits,
              optimizer=optimizer,
              metrics=['accuracy'])

In [None]:
import tensorflow
stop = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.001, patience=6, verbose=1, mode='auto',
    baseline=None, restore_best_weights=True
)
reduce = tensorflow.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, 
                                                     mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.001)

import time
batch_size = 32
epochs = 30
t0 = time.time()
# history = model.fit_generator(aug.flow(X_train, y_train, batch_size=batch_size),
#     validation_data=(X_test, y_test), steps_per_epoch=len(X_train) // batch_size,
#     epochs=epochs, callbacks = [stop, reduce])

history = model.fit(X_train, y_train, batch_size=batch_size,
    validation_data=(X_test, y_test),  epochs=epochs, callbacks = [stop, reduce])


t1 = time.time()
print(t1-t0)

In [None]:
true = labels_to_birds(y_test,0.5)

In [None]:
pred = model.predict(X_test)

In [None]:
pred = labels_to_birds(pred,0.5)

In [None]:
true

In [None]:
pred

In [None]:
for i, elt in enumerate(pred):
    print(elt + '      '+true[i])
    

In [None]:
def row_wise_f1_score_micro(y_true, y_pred):
    """Mean per-row F1 between space-separated label strings.

    Original author: @shonenkov.

    Parameters
    ----------
    y_true, y_pred : sequence of str
        One string of whitespace-separated labels per row.

    Returns
    -------
    float
        Mean over rows of ``2*TP / (2*TP + FN + FP)``. A row where both
        strings are empty counts as a perfect match (1.0) instead of raising
        ``ZeroDivisionError``.
    """
    F1 = []
    for preds, trues in zip(y_pred, y_true):
        preds = preds.split()
        trues = trues.split()
        TP = sum(1 for label in trues if label in preds)
        FN = len(trues) - TP
        FP = sum(1 for label in preds if label not in trues)
        denominator = 2*TP + FN + FP
        # denominator is 0 only when both label lists are empty.
        F1.append(2*TP / denominator if denominator else 1.0)
    return np.mean(F1)

In [None]:
row_wise_f1_score_micro(true, pred)

In [None]:
(row, label, data, features) = load('validation_set')

In [None]:
label = bird_to_labels(label)
label = labels_to_birds(label, 0.5)

In [None]:
np.array(features).shape

In [None]:
pred_val = model.predict(np.array(features))
pred_val = labels_to_birds(pred_val,0.5)

In [None]:
plt.plot(data[0])

In [None]:
for i, elt in enumerate(pred_val):
    print(elt + '      '+label[i])

In [None]:
row_wise_f1_score_micro(label, pred_val)