# BirdCLEF TFWriter

The following is an approach to writing TFRecords for BirdCLEF.

# Imports/Setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import tensorflow as tf
import librosa
import tensorflow.keras as keras
import tensorflow_io as tfio

In [2]:
# Input CSVs shipped with the BirdCLEF 2023 competition dataset.
metadata = '/kaggle/input/birdclef-2023/train_metadata.csv'
taxonomy = '/kaggle/input/birdclef-2023/eBird_Taxonomy_v2021.csv'

In [3]:
# Load both CSVs in one pass: taxonomy first, then the per-recording metadata.
dftax, dfmeta = (pd.read_csv(csv_path) for csv_path in (taxonomy, metadata))

# Exclude Some Labels    : (

I have chosen to exclude any label with fewer than 10 instances.

Later on I will import instances from **xeno-canto.org** to ensure every label has the bare minimum.

In [4]:
# Labels that occur in fewer than 10 rows of the metadata; these are dropped
# from the dataset further down.
exclude_lst = (
    dfmeta['primary_label']
    .value_counts()
    .loc[lambda counts: counts < 10]
    .index
    .to_list()
)

In [5]:
def include(row):
    """Return True when the row's label is not on the exclusion list.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a
            'primary_label' entry.

    Returns:
        False if row['primary_label'] appears in the module-level
        `exclude_lst`, True otherwise.
    """
    return row['primary_label'] not in exclude_lst

In [6]:
# Boolean mask that is True for rows whose label is NOT in `exclude_lst`.
# The mask gets its own name so the `include` function is not overwritten by
# a Series; rebinding `include` made this cell fail when run a second time.
# `isin` is the vectorized equivalent of applying `include` row by row.
include_mask = ~dfmeta['primary_label'].isin(exclude_lst)
dfmeta_ex = dfmeta[include_mask]

# Handle the Data

Since we are dealing with such an imbalance across so many classes, the easiest approach is to group the elements by label and then sample the same fraction (80%) of each group for training, leaving the remaining rows for validation.

In [7]:
# Stratified split: draw 80% of the rows of every label for training (fixed
# seed for repeatability); whatever was not drawn becomes the validation set.
dfmeta_ex_g = dfmeta_ex.groupby("primary_label")
dft = dfmeta_ex_g.sample(frac=0.8, random_state=42)
in_train = dfmeta_ex.index.isin(dft.index)
dfv = dfmeta_ex.loc[~in_train]

In [8]:
# Display the number of training rows per label (most frequent first).
dft['primary_label'].value_counts()

thrnig1    400
barswa     400
wlwwar     400
comsan     400
eaywag1    400
          ... 
rostur1      8
whbcan1      8
brcale1      8
gybfis1      8
purgre2      8
Name: primary_label, Length: 217, dtype: int64

In [9]:
# Display the number of validation rows per label (most frequent first).
dfv['primary_label'].value_counts()

thrnig1    100
barswa     100
wlwwar     100
comsan     100
eaywag1    100
          ... 
bltapa1      2
purgre2      2
refbar2      2
yenspu1      2
augbuz1      2
Name: primary_label, Length: 217, dtype: int64

Just to make sure we got what we want...

# One Hot Encoding
Next we need to set up one-hot encoding. Create a TensorFlow `TextVectorization` layer and use it to predict the encoded label.

In [10]:
# Distinct training labels in order of first appearance; this list is used as
# the vectorizer vocabulary, so its order fixes each label's position.
unique_lst = dft['primary_label'].drop_duplicates().tolist()
v_len = len(unique_lst)

In [11]:
# Vectorizer over the known labels; max_tokens leaves one extra slot beyond
# the v_len vocabulary entries.
text_vec = keras.layers.TextVectorization(
    vocabulary=unique_lst,
    output_mode='multi_hot',
    max_tokens=v_len + 1,
)

By default the `TextVectorization` layer reserves index 0 for the out-of-vocabulary token, [UNK]. To drop it from the encoded label we add a Lambda layer that keeps everything after index 0.

In [12]:
# String in -> encoded label out: vectorize the label text, then slice off
# column 0 of the vectorizer output.
model = keras.models.Sequential([
    keras.Input(shape=(1,), dtype=tf.string),
    text_vec,
    keras.layers.Lambda(lambda x: x[:, 1:]),
])

# IO Functions
The function will create a TFExample to write to a record.

In [13]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

def get_example(audio_path, label):
    """Build an `Example` holding one recording and its encoded label.

    Args:
        audio_path: Path of the audio file, loaded with `librosa.load`.
        label: Encoded label; stored via `tf.io.serialize_tensor`.

    Returns:
        An `Example` with three features: 'audio' (MP3-encoded bytes),
        'sr' (the sample rate as int64) and 'label' (serialized tensor bytes).
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    samples, sr = librosa.load(audio_path, sr=None)
    # Add a trailing axis before MP3 encoding
    # (assumes librosa returned a 1-D mono signal -- TODO confirm).
    waveform = tf.expand_dims(tf.convert_to_tensor(samples), axis=1)

    mp3_bytes = tfio.audio.encode_mp3(waveform, sr).numpy()
    label_bytes = tf.io.serialize_tensor(label).numpy()

    feature = {
        'audio': Feature(bytes_list=BytesList(value=[mp3_bytes])),
        'sr': Feature(int64_list=Int64List(value=[sr])),
        'label': Feature(bytes_list=BytesList(value=[label_bytes])),
    }
    return Example(features=Features(feature=feature))

# TFWriter
Finally, we create a set of paths, set up our writers/stack, and iterate through the dataset. At each iteration we pick the shard as the row's index modulo `n_shards` (adding more randomness) and then encode the audio and the one-hot encoded label.

> making sure to set verbose=0 so we can write in peace

In [14]:
from contextlib import ExitStack
from tqdm import tqdm

def write_tfrecords(name, dataset, n_shards=50,
                    audio_dir='/kaggle/input/birdclef-2023/train_audio/'):
    """Write every row of `dataset` into `n_shards` TFRecord files.

    Output files are named '<name>.tfrecord-<index>-of-<n_shards>' and are
    created in the current working directory. Each row goes to the shard
    given by its index label modulo `n_shards`.

    Args:
        name: Prefix of the output files (also used in the final message).
        dataset: DataFrame with 'filename' and 'primary_label' columns and
            an integer index.
        n_shards: Number of TFRecord files to spread the rows over.
        audio_dir: Directory that the 'filename' values are relative to.
    """
    paths = ["{}.tfrecord-{:02d}-of-{:02d}".format(name, index, n_shards) for index in range(n_shards)]

    # The encoded vector depends only on the label string, so run the model
    # once per distinct label instead of once per row.
    label_cache = {}

    with ExitStack() as stack:
        writers = [stack.enter_context(tf.io.TFRecordWriter(path)) for path in paths]

        # iterrows() is a generator with no length; pass total so tqdm can
        # render a real progress bar with an ETA.
        for i, row in tqdm(dataset.iterrows(), total=len(dataset)):
            shard = i % n_shards
            audio_path = os.path.join(audio_dir, row['filename'])

            primary_label = row['primary_label']
            if primary_label not in label_cache:
                label_cache[primary_label] = model.predict([primary_label], verbose=0).tolist()[0]

            example = get_example(audio_path, label_cache[primary_label])
            writers[shard].write(example.SerializeToString())

    print('Done writing ' + name + '.')

In [15]:
# Training rows use the default shard count; the validation split uses 25.
write_tfrecords(name='train', dataset=dft)
write_tfrecords(name='valid', dataset=dfv, n_shards=25)

13357it [1:35:55,  2.32it/s]


Done writing train.


3337it [24:09,  2.30it/s]

Done writing valid.





Sanity check...

# TO BE CONTINUED...