# Imports

In [1]:
import os
from pathlib import Path

In [2]:
import tensorflow as tf
import tensorflow_io as tfio

In [3]:
from IPython.display import Audio

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact

In [5]:
%load_ext autoreload 
%autoreload 2
%reload_ext autoreload

In [6]:
from birds.preproc import generate_tensor, generate_spectrogram
from birds.preproc import generate_mel_spectrogram, generate_db_scale_mel_spectrogram

# Tensorflow Dataset

## Import functions from `spectrogram.py`

In [55]:
df_meta = pd.read_csv('/home/phil/code/El-Doro/birds/raw_data/metadata_train.csv')
dfmeta = dfmeta[['Species', 'Target']].drop_duplicates(keep='first')
dico = dict(zip(list(dfmeta2['Species']), list(dfmeta2['Target'])))
dico

{'Sonus naturalis': 0,
 'Fringilla coelebs': 1,
 'Parus major': 2,
 'Turdus merula': 3,
 'Turdus philomelos': 4,
 'Sylvia communis': 5,
 'Emberiza citrinella': 6,
 'Sylvia atricapilla': 7,
 'Emberiza calandra': 8,
 'Phylloscopus trochilus': 9,
 'Luscinia megarhynchos': 10,
 'Strix aluco': 11,
 'Phylloscopus collybita': 12,
 'Carduelis carduelis': 13,
 'Erithacus rubecula': 14,
 'Chloris chloris': 15,
 'Sylvia borin': 16,
 'Acrocephalus arundinaceus': 17,
 'Acrocephalus dumetorum': 18,
 'Oriolus oriolus': 19,
 'Troglodytes troglodytes': 20,
 'Bubo bubo': 21,
 'Ficedula parva': 22,
 'Linaria cannabina': 23,
 'Luscinia svecica': 24,
 'Alauda arvensis': 25,
 'Luscinia luscinia': 26,
 'Phoenicurus phoenicurus': 27,
 'Aegolius funereus': 28,
 'Cyanistes caeruleus': 29,
 'Hirundo rustica': 30,
 'Emberiza cirlus': 31,
 'Locustella naevia': 32,
 'Cuculus canorus': 33,
 'Sylvia curruca': 34,
 'Loxia curvirostra': 35,
 'Emberiza hortulana': 36,
 'Carpodacus erythrinus': 37,
 'Athene noctua': 38,


In [56]:
TARGET_DICT = {'Sonus naturalis': 0,
               'Fringilla coelebs': 1,
               'Parus major': 2,
               'Turdus merula': 3,
               'Turdus philomelos': 4,
               'Sylvia communis': 5,
               'Emberiza citrinella': 6,
               'Sylvia atricapilla': 7,
               'Emberiza calandra': 8,
               'Phylloscopus trochilus': 9,
               'Luscinia megarhynchos': 10,
               'Strix aluco': 11,
               'Phylloscopus collybita': 12,
               'Carduelis carduelis': 13,
               'Erithacus rubecula': 14,
               'Chloris chloris': 15,
               'Sylvia borin': 16,
               'Acrocephalus arundinaceus': 17,
               'Acrocephalus dumetorum': 18,
               'Oriolus oriolus': 19,
               'Troglodytes troglodytes': 20,
               'Bubo bubo': 21,
               'Ficedula parva': 22,
               'Linaria cannabina': 23,
               'Luscinia svecica': 24,
               'Alauda arvensis': 25,
               'Luscinia luscinia': 26,
               'Phoenicurus phoenicurus': 27,
               'Aegolius funereus': 28,
               'Cyanistes caeruleus': 29,
               'Hirundo rustica': 30,
               'Emberiza cirlus': 31,
               'Locustella naevia': 32,
               'Cuculus canorus': 33,
               'Sylvia curruca': 34,
               'Loxia curvirostra': 35,
               'Emberiza hortulana': 36,
               'Carpodacus erythrinus': 37,
               'Athene noctua': 38,
               'Crex crex': 39,
               'Acrocephalus schoenobaenus': 40,
               'Acrocephalus palustris': 41,
               'Periparus ater': 42,
               'Phylloscopus sibilatrix': 43,
               'Emberiza schoeniclus': 44,
               'Hippolais icterina': 45,
               'Pyrrhula pyrrhula': 46,
               'Caprimulgus europaeus': 47,
               'Ficedula hypoleuca': 48,
               'Glaucidium passerinum': 49}

In [10]:
directory = '../raw_data/data_30s/train/'
df = pd.read_csv(directory + 'y_train.csv')
df.head()

Unnamed: 0,Path,Target
0,Sonus-naturalis-447407_tens.ogg,0
1,Sonus-naturalis-387437_tens.ogg,0
2,Sonus-naturalis-383228_tens.ogg,0
3,Sonus-naturalis-358240_tens.ogg,0
4,Sonus-naturalis-397031_tens.ogg,0


In [84]:
def create_df_train_df_val_from_directory(directory, sound_filetype = 'ogg',train_val_ratio = 0.8):
    '''
    Objective : Generate two data frames (train and val) with list of audio files and associated target number,
                taking into account potential class imbalances
    Inputs : directory :  file directory containing audio files
             sound_file_type : by default ogg
             train_val_ratio : ratio used to split between training and validation data
    Output : train and val dataframes
    '''
    # create dataframe with directory audio file list, target names derived from files name and target number
    data = pd.DataFrame(sorted([file for file in os.listdir(directory) if file.endswith(sound_filetype)])
                        ,columns=['Path'])
    data['Target_name'] = data['Path'].apply(lambda x : ' '.join(x.split(sep='-')[0:2]))
    target_list = list(pd.unique(data['Target_name']))
    # data['Target'] = data['Target_name'].apply(lambda x: target_list.index(x))
    data['Target'] = data['Target_name'].map(TARGET_DICT) # On récupère les numéros de classe originaux
    
    # create intermediate dataframe to calculate split indexes by target using train_val_ratio
    subdf_count_by_target = pd.pivot_table(data,index=['Target_name'],aggfunc={'Target' : 'count'})\
                                            .rename(columns={'Target':'target_size'})
    subdf_cummul_sum_by_target = data.groupby(by=['Target_name']).sum()\
                                            .rename(columns={'Target':'start_index'})
    split_index_df = subdf_count_by_target.merge(subdf_cummul_sum_by_target, left_index=True, right_index=True)
    split_index_df['split_index'] = split_index_df['start_index']+round(train_val_ratio*split_index_df['target_size'])
    print(split_index_df.head())
    
    # create train and val from first dataframe using indexes calculated in split_index_df
    df_train = data.iloc[0:0]
    for target in list(pd.unique(data['Target_name'])):
        start_index = int(split_index_df.loc[target].start_index)
        split_index = int(split_index_df.loc[target].split_index)
        df_train = df_train.append(data.iloc[start_index:split_index])

    df_val = pd.concat([data,df_train]).drop_duplicates(keep=False)
    
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    
    return df_train, df_val, split_index_df

In [85]:
df_train, df_val, split_index_df = create_df_train_df_val_from_directory(directory)

                            target_size  start_index  split_index
Target_name                                                      
Acrocephalus arundinaceus            33          561        587.0
Acrocephalus dumetorum               33          594        620.0
Acrocephalus palustris               33         1353       1379.0
Acrocephalus schoenobaenus           33         1320       1346.0
Aegolius funereus                    33          924        950.0


In [86]:
split_index_df

Unnamed: 0_level_0,target_size,start_index,split_index
Target_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acrocephalus arundinaceus,33,561,587.0
Acrocephalus dumetorum,33,594,620.0
Acrocephalus palustris,33,1353,1379.0
Acrocephalus schoenobaenus,33,1320,1346.0
Aegolius funereus,33,924,950.0
Alauda arvensis,33,825,851.0
Athene noctua,33,1254,1280.0
Bubo bubo,33,693,719.0
Caprimulgus europaeus,33,1551,1577.0
Carduelis carduelis,33,429,455.0


In [83]:
df_train

Unnamed: 0,Path,Target_name,Target
0,Emberiza-citrinella-131944_tens.ogg,Emberiza citrinella,6
1,Emberiza-citrinella-167478_tens.ogg,Emberiza citrinella,6
2,Emberiza-citrinella-176746_tens.ogg,Emberiza citrinella,6
3,Emberiza-citrinella-176810_tens.ogg,Emberiza citrinella,6
4,Emberiza-citrinella-177985_tens.ogg,Emberiza citrinella,6
...,...,...,...
1295,Aegolius-funereus-508094_tens.ogg,Aegolius funereus,28
1296,Aegolius-funereus-508096_tens.ogg,Aegolius funereus,28
1297,Aegolius-funereus-508108_tens.ogg,Aegolius funereus,28
1298,Aegolius-funereus-508274_tens.ogg,Aegolius funereus,28


In [77]:
df_train['Target_name'] = df_train['Target_name'].str.replace('-',' ')
df_train['Target_number'] = df_train['Target_name'].map(TARGET_DICT)
df_train

Unnamed: 0,Path,Target_name,Target,Target_number
0,Acrocephalus-arundinaceus-131536_tens.ogg,Acrocephalus arundinaceus,0,17
1,Acrocephalus-arundinaceus-136005_tens.ogg,Acrocephalus arundinaceus,0,17
2,Acrocephalus-arundinaceus-178787_tens.ogg,Acrocephalus arundinaceus,0,17
3,Acrocephalus-arundinaceus-178793_tens.ogg,Acrocephalus arundinaceus,0,17
4,Acrocephalus-arundinaceus-178869_tens.ogg,Acrocephalus arundinaceus,0,17
...,...,...,...,...
1295,Turdus-philomelos-309586_tens.ogg,Turdus philomelos,49,4
1296,Turdus-philomelos-317110_tens.ogg,Turdus philomelos,49,4
1297,Turdus-philomelos-325362_tens.ogg,Turdus philomelos,49,4
1298,Turdus-philomelos-336236_tens.ogg,Turdus philomelos,49,4


In [72]:
df_train['Target3'] = df_train['Target_name'].apply(lambda x : x.split()[0])
df_train['Target3'].nunique()

34

In [27]:
df_train = df_train.reset_index(drop=True)
df_train.head(60)

Unnamed: 0,Path,Target_name,Target
0,Acrocephalus-arundinaceus-131536_tens.ogg,Acrocephalus-arundinaceus,0
1,Acrocephalus-arundinaceus-136005_tens.ogg,Acrocephalus-arundinaceus,0
2,Acrocephalus-arundinaceus-178787_tens.ogg,Acrocephalus-arundinaceus,0
3,Acrocephalus-arundinaceus-178793_tens.ogg,Acrocephalus-arundinaceus,0
4,Acrocephalus-arundinaceus-178869_tens.ogg,Acrocephalus-arundinaceus,0
5,Acrocephalus-arundinaceus-178959_tens.ogg,Acrocephalus-arundinaceus,0
6,Acrocephalus-arundinaceus-181959_tens.ogg,Acrocephalus-arundinaceus,0
7,Acrocephalus-arundinaceus-181961_tens.ogg,Acrocephalus-arundinaceus,0
8,Acrocephalus-arundinaceus-215950_tens.ogg,Acrocephalus-arundinaceus,0
9,Acrocephalus-arundinaceus-233487_tens.ogg,Acrocephalus-arundinaceus,0


In [22]:
df_train.reset_index(drop=True)
df_train.head(60)

Unnamed: 0,index,Path,Target_name,Target
0,0,Acrocephalus-arundinaceus-131536_tens.ogg,Acrocephalus-arundinaceus,0
1,1,Acrocephalus-arundinaceus-136005_tens.ogg,Acrocephalus-arundinaceus,0
2,2,Acrocephalus-arundinaceus-178787_tens.ogg,Acrocephalus-arundinaceus,0
3,3,Acrocephalus-arundinaceus-178793_tens.ogg,Acrocephalus-arundinaceus,0
4,4,Acrocephalus-arundinaceus-178869_tens.ogg,Acrocephalus-arundinaceus,0
5,5,Acrocephalus-arundinaceus-178959_tens.ogg,Acrocephalus-arundinaceus,0
6,6,Acrocephalus-arundinaceus-181959_tens.ogg,Acrocephalus-arundinaceus,0
7,7,Acrocephalus-arundinaceus-181961_tens.ogg,Acrocephalus-arundinaceus,0
8,8,Acrocephalus-arundinaceus-215950_tens.ogg,Acrocephalus-arundinaceus,0
9,9,Acrocephalus-arundinaceus-233487_tens.ogg,Acrocephalus-arundinaceus,0


In [14]:
df_train.head(60)

Unnamed: 0,Path,Target_name,Target
0,Acrocephalus-arundinaceus-131536_tens.ogg,Acrocephalus-arundinaceus,0
1,Acrocephalus-arundinaceus-136005_tens.ogg,Acrocephalus-arundinaceus,0
2,Acrocephalus-arundinaceus-178787_tens.ogg,Acrocephalus-arundinaceus,0
3,Acrocephalus-arundinaceus-178793_tens.ogg,Acrocephalus-arundinaceus,0
4,Acrocephalus-arundinaceus-178869_tens.ogg,Acrocephalus-arundinaceus,0
5,Acrocephalus-arundinaceus-178959_tens.ogg,Acrocephalus-arundinaceus,0
6,Acrocephalus-arundinaceus-181959_tens.ogg,Acrocephalus-arundinaceus,0
7,Acrocephalus-arundinaceus-181961_tens.ogg,Acrocephalus-arundinaceus,0
8,Acrocephalus-arundinaceus-215950_tens.ogg,Acrocephalus-arundinaceus,0
9,Acrocephalus-arundinaceus-233487_tens.ogg,Acrocephalus-arundinaceus,0


In [15]:
df_val

Unnamed: 0,Path,Target_name,Target
26,Acrocephalus-arundinaceus-417152_tens.ogg,Acrocephalus-arundinaceus,0
27,Acrocephalus-arundinaceus-417156_tens.ogg,Acrocephalus-arundinaceus,0
28,Acrocephalus-arundinaceus-422018_tens.ogg,Acrocephalus-arundinaceus,0
29,Acrocephalus-arundinaceus-422019_tens.ogg,Acrocephalus-arundinaceus,0
30,Acrocephalus-arundinaceus-448264_tens.ogg,Acrocephalus-arundinaceus,0
...,...,...,...
1645,Turdus-philomelos-463476_tens.ogg,Turdus-philomelos,49
1646,Turdus-philomelos-481254_tens.ogg,Turdus-philomelos,49
1647,Turdus-philomelos-528246_tens.ogg,Turdus-philomelos,49
1648,Turdus-philomelos-529788_tens.ogg,Turdus-philomelos,49


In [8]:
import os
def create_filepath_target_df(directory, soundfiletype = 'ogg'):
    data = pd.DataFrame(sorted([file for file in os.listdir(directory) if file.endswith(soundfiletype)])
                        ,columns=['Path'])
    data['Target_name'] = data['Path'].apply(lambda x : '-'.join(x.split(sep='-')[0:2]))
    data['Target'] = data['Target_name'].apply(lambda x: list(pd.unique(data['Target_name'])).index(x))
    return data

data = create_filepath_target_df(directory)

In [12]:
def create_filepath_target_train_val_df(directory, sound_filetype = 'ogg',train_val_ratio = 0.8):
    '''
    Objective : Generate two data frames (train and val) with list of audio files and associated target number
    Inputs : directory :  file directory containing audio files
             sound_file_type : by default ogg
             train_val_ratio : ratio used to split between training and validation data
    Output : train and val dataframes
    '''
    
    # create dataframe with directory audio file list, target names derived from files name and target number
    data = pd.DataFrame(sorted([file for file in os.listdir(directory) if file.endswith(sound_filetype)])
                        ,columns=['Path'])
    data['Target_name'] = data['Path'].apply(lambda x : '-'.join(x.split(sep='-')[0:2]))
    target_list = list(pd.unique(data['Target_name']))
    data['Target'] = data['Target_name'].apply(lambda x: target_list.index(x))
    
    
    # create intermediate dataframe to calculate split indexes by target using train_val_ratio
    subdf_count_by_target = pd.pivot_table(data,index=['Target_name'],aggfunc={'Target' : 'count'})\
                                            .rename(columns={'Target':'target_size'})
    subdf_cummul_sum_by_target = data.groupby(by=['Target_name']).sum()\
                                            .rename(columns={'Target':'start_index'})
    split_index_df = subdf_count_by_target.merge(subdf_cummul_sum_by_target, left_index=True, right_index=True)
    split_index_df['split_index'] = split_index_df['start_index']+round(train_val_ratio*split_index_df['target_size'])
    print(split_index_df.head())
    
    # create train and val from first dataframe using indexes calculated in split_index_df
    df_train = data.iloc[0:0]
    for target in list(pd.unique(data['Target_name'])):
        start_index = int(split_index_df.loc[target].start_index)
        split_index = int(split_index_df.loc[target].split_index)
        df_train = df_train.append(data.iloc[start_index:split_index])
    
    df_val = pd.concat([data,df_train]).drop_duplicates(keep=False)
    
    return df_train, df_val
    

In [13]:
df_train, df_val = create_filepath_target_train_val_df(directory)
df_train.head(50)

                            target_size  start_index  split_index
Target_name                                                      
Acrocephalus-arundinaceus            33            0         26.0
Acrocephalus-dumetorum               33           33         59.0
Acrocephalus-palustris               33           66         92.0
Acrocephalus-schoenobaenus           33           99        125.0
Aegolius-funereus                    33          132        158.0


Unnamed: 0,Path,Target_name,Target
0,Acrocephalus-arundinaceus-131536_tens.ogg,Acrocephalus-arundinaceus,0
1,Acrocephalus-arundinaceus-136005_tens.ogg,Acrocephalus-arundinaceus,0
2,Acrocephalus-arundinaceus-178787_tens.ogg,Acrocephalus-arundinaceus,0
3,Acrocephalus-arundinaceus-178793_tens.ogg,Acrocephalus-arundinaceus,0
4,Acrocephalus-arundinaceus-178869_tens.ogg,Acrocephalus-arundinaceus,0
5,Acrocephalus-arundinaceus-178959_tens.ogg,Acrocephalus-arundinaceus,0
6,Acrocephalus-arundinaceus-181959_tens.ogg,Acrocephalus-arundinaceus,0
7,Acrocephalus-arundinaceus-181961_tens.ogg,Acrocephalus-arundinaceus,0
8,Acrocephalus-arundinaceus-215950_tens.ogg,Acrocephalus-arundinaceus,0
9,Acrocephalus-arundinaceus-233487_tens.ogg,Acrocephalus-arundinaceus,0


In [None]:
sum(df_train.value_counts('Path'))

In [None]:
file_paths = directory + df['Path'].values
labels = df['Target'].values
print(file_paths[50],labels[50])
generate_mel_spectrogram(file_paths[50],labels[50])

In [None]:
ds_train = tf.data.Dataset.from_tensor_slices((file_paths, labels))

ds_train_mel = ds_train.map(generate_mel_spectrogram).batch(5)
ds_train_db_mel = ds_train.map(generate_db_scale_mel_spectrogram).batch(5)

In [None]:
spectrogram, label = next(iter(ds_train))

In [None]:
spectrogram.numpy().shape

In [None]:
plt.imshow(tf.math.log(spectrogram.numpy()[3, :, :]));

# First model

In [None]:
ds_train = tf.data.Dataset.from_tensor_slices((file_paths, labels))
# Splitting the dataset for training and testing.
def is_test(x, _):
    return x % 4 == 0

def is_train(x, y):
    return not is_test(x, y)

recover = lambda x, y: y

# Split the dataset for training.
test_dataset = ds_train.enumerate() \
    .filter(is_test) \
    .map(recover)

# Split the dataset for testing/validation.
train_dataset = ds_train.enumerate() \
    .filter(is_train) \
    .map(recover)

In [None]:
from tensorflow.keras import models, layers, optimizers

In [None]:
model = models.Sequential()
model.add(layers.Conv2D(6, kernel_size=(3, 3), activation='relu', input_shape=(128, 625, 1)))
model.add(layers.Conv2D(4, kernel_size=(3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(50, activation='softmax')) 
model.summary()

In [None]:
opt = optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [None]:
from tensorflow.keras import callbacks
es = callbacks.EarlyStopping(patience=1, restore_best_weights=True)

# history = model.fit(ds_train,
                    batch_size=16,
                    epochs=2,
                    callbacks=[es],
                    verbose=1)