# Machine Learning of Bird Calls

In [None]:
import os
import time
import json
import pickle
import numpy as np

import tensorflow as tf
import tensorflow.keras as keras

import tensorflow.keras.models as models
import tensorflow.keras.layers as layers
import tensorflow.keras.preprocessing as preproc
import tensorflow.keras.callbacks as callbacks

In [None]:
#Load preprocessed data
checklistdir = '/home/dphogan/data_science/species_list'
datafile = 'traintestvalid.p'
datapath = os.path.join(checklistdir, datafile)
#NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
#The context manager closes the file handle as soon as loading finishes.
with open(datapath, 'rb') as picklefile:
    (xtrain, ytrain, xvalid, yvalid, xtest, ytest, molabels) = pickle.load(picklefile)
#Keep the original y-values as loaded (these are needed for computing weights below)
ztrain = ytrain
zvalid = yvalid
ztest = ytest
#Change output from index lists to one-hot vectors
ytrain = keras.utils.to_categorical(ytrain)
yvalid = keras.utils.to_categorical(yvalid)
ytest = keras.utils.to_categorical(ytest)
#Add a length-1 dimension to input data  (so it will have same format as a grayscale image)
xtrain = xtrain.reshape(xtrain.shape[0], xtrain.shape[1], xtrain.shape[2], 1)
xvalid = xvalid.reshape(xvalid.shape[0], xvalid.shape[1], xvalid.shape[2], 1)
xtest = xtest.reshape(xtest.shape[0], xtest.shape[1], xtest.shape[2], 1)
#Don't accidentally use test data yet: overwrite the test inputs and one-hot labels
#with a placeholder (the original labels are still available as ztest)
xtest = ytest = 0
print(np.shape(xtrain))
print(np.shape(ytrain))

In [None]:
#Compute weights for each split from the original (pre-one-hot) labels
def _labelweights(labels):
    """Return (counts, weights by element, weights by class, class->weight dict) for *labels*.

    Each weight is (number of labels / number of count bins) / count of that label,
    where the counts come from np.bincount.
    """
    counts = np.bincount(labels)
    scale = float(len(labels))/len(counts)
    byelement = scale/counts[labels]
    byclass = scale/counts
    return counts, byelement, byclass, dict(enumerate(byclass))

ctrain, wtrain, ltrain, dtrain = _labelweights(ztrain)
cvalid, wvalid, lvalid, dvalid = _labelweights(zvalid)
ctest, wtest, ltest, dtest = _labelweights(ztest)

In [None]:
#Dataset augmentation
datagen = preproc.image.ImageDataGenerator(
    data_format = 'channels_last',
    width_shift_range = 6,
    height_shift_range = 2,
    fill_mode = 'constant',
    cval = 0
    )
datagen.fit(xtrain)

In [None]:
#"Checkpoint" callback: with save_best_only=True the full model (not just the weights)
#is written to checkpointpath only when the monitored validation loss improves.
#To keep one file per epoch instead, use the templated path below.
#checkpointpath = 'model_epoch{epoch:02d}.hdf5'
checkpointpath = 'model.hdf5'
#The former period=1 argument is omitted: checking every epoch is already the default,
#and newer Keras releases deprecate that argument.
checkpoint = callbacks.ModelCheckpoint(checkpointpath, monitor='val_loss', mode='min',
                                       verbose=0, save_best_only=True,
                                       save_weights_only=False)

In [None]:
#Model definition
#Keyword arguments shared by both convolution layers and by both pooling layers
convargs = dict(filters=128, kernel_size=(3,5), padding='same', data_format='channels_last', activation='relu')
poolargs = dict(padding='same', data_format='channels_last')

model = models.Sequential([
    #Convolution/pooling stages; the first layer takes its input shape from the training data
    layers.Conv2D(input_shape=np.shape(xtrain)[1:], **convargs),
    layers.MaxPooling2D(pool_size=(1,2), **poolargs),
    layers.Conv2D(**convargs),
    layers.MaxPooling2D(pool_size=(2,2), **poolargs),
    #Optional third convolution/pooling stage (currently disabled)
    #layers.Conv2D(**convargs),
    #layers.MaxPooling2D(pool_size=(2,2), **poolargs),

    #Fully connected stages, each followed by dropout
    layers.Flatten(),
    layers.Dense(units=512, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(units=512, activation='relu'),
    layers.Dropout(0.2),
    #Output layer: one softmax unit per column of the one-hot training labels
    layers.Dense(units=np.shape(ytrain)[1], activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#Training
epochs = 25
batchsize = 32
#steps_per_epoch must be a whole number of batches; plain "/" yields a float in Python 3.
#Round up so that a trailing partial batch still counts as a step.
stepsperepoch = int(np.ceil(len(xtrain)/batchsize))
starttime = time.time()
#Training batches come from the augmentation generator and are weighted per class (dtrain);
#validation uses the un-augmented arrays with per-element weights (wvalid).
#NOTE(review): fit_generator is deprecated in newer TensorFlow releases in favor of
#model.fit -- confirm against the installed version before switching.
history = model.fit_generator(datagen.flow(xtrain, ytrain, batch_size=batchsize, shuffle=True),
                              class_weight=dtrain,
                              steps_per_epoch=stepsperepoch,
                              epochs=epochs,
                              verbose=1,
                              validation_data=(xvalid, yvalid, wvalid),
                              callbacks=[checkpoint]
                             )
endtime = time.time()
print('Time: %.2fs' % (endtime-starttime))

In [None]:
#Save model definition (but not weights) to file
modeljson = model.to_json()
#Use a context manager so the file is flushed and closed before anything reads it back
with open('./model.json', 'w') as jsonfile:
    json.dump(modeljson, jsonfile)

In [None]:
#Save model to files (json+hdf5)
#modeljson = model.to_json()
#json.dump(modeljson, open('./model.json', 'w'))
#model.save_weights('./model.hdf5')

In [None]:
#Load model from files (json+hdf5)
#Set overwritemodelvar to True to replace the in-memory model with the one saved on disk
overwritemodelvar = False
if overwritemodelvar:
    #Context manager closes the definition file once it has been read
    with open('model.json', 'r') as jsonfile:
        modeljson = json.load(jsonfile)
    model = models.model_from_json(modeljson)
    model.load_weights('model.hdf5')

In [None]:
#Make predictions
#Run on the validation inputs for now; the same call applies to test data once the model is finalized.
predictions = model.predict(x=xvalid, batch_size=128)

In [None]:
#Validation accuracy: fraction of examples whose highest-scoring predicted class
#matches the class marked in the one-hot labels
corrects = np.argmax(predictions,axis=1)==np.argmax(yvalid,axis=1)
#np.mean averages the boolean array in NumPy rather than looping over it with builtin sum()
correctrate = np.mean(corrects)
print('%.2f%%' % (correctrate*100))