<a href="https://www.kaggle.com/code/davidjohnmillard/birdclef-mp3-tfrecord?scriptVersionId=125936716" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# BirdCLEF TFWriter

The following is an approach to writing TFRecords for BirdCLEF.

# Imports/Setup

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import tensorflow as tf
import librosa
import tensorflow.keras as keras
import tensorflow_io as tfio
import requests

In [2]:
# Kaggle input paths of the two BirdCLEF 2023 CSV files read in the next cell.
taxonomy = '/kaggle/input/birdclef-2023/eBird_Taxonomy_v2021.csv'
metadata = '/kaggle/input/birdclef-2023/train_metadata.csv'

In [3]:
# Load the taxonomy table and the per-recording training metadata, in that order.
dftax, dfmeta = (pd.read_csv(csv_path) for csv_path in (taxonomy, metadata))

In [4]:
dfmeta

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...
16936,yewgre1,[],[''],-1.2502,29.7971,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://xeno-canto.org/703472,yewgre1/XC703472.ogg
16937,yewgre1,[],[''],-1.2489,29.7923,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/703485,yewgre1/XC703485.ogg
16938,yewgre1,[],[''],-1.2433,29.7844,Eurillas latirostris,Yellow-whiskered Greenbul,András Schmidt,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/704433,yewgre1/XC704433.ogg
16939,yewgre1,[],[''],0.0452,36.3699,Eurillas latirostris,Yellow-whiskered Greenbul,Lars Lachmann,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/752974,yewgre1/XC752974.ogg


# Exclude Some Labels    : (

I have chosen to exclude any label with fewer than 50 instances.

Later on I will import instances from **xeno-canto.org** to ensure every label has the bare minimum.

In [5]:
# Labels that occur in fewer than 50 rows; the counts are computed once and
# reused as their own filter.
label_counts = dfmeta['primary_label'].value_counts()
exclude_lst = label_counts.loc[label_counts < 50].index.to_list()

In [6]:
# Common names of the excluded labels, looked up label by label so that
# name_exclude_lst[i] always describes exclude_lst[i]. Running a second,
# independent value_counts() on 'common_name' only lines up with exclude_lst by
# luck: entries with equal counts may come back in a different order for each
# column, and the download loop zips the two lists together.
label_to_name = (
    dfmeta.drop_duplicates(subset='primary_label')
    .set_index('primary_label')['common_name']
)
name_exclude_lst = label_to_name.loc[exclude_lst].to_list()

In [7]:
def include(row):
    """Return True when the row's 'primary_label' is not listed in `exclude_lst`."""
    return row['primary_label'] not in exclude_lst

In [8]:
# Keep the boolean mask under its own name. Rebinding `include` to the resulting
# Series would shadow the helper function defined in the previous cell, so this
# cell could not be re-run without re-running that one first.
include_mask = dfmeta.apply(include, axis=1)
# Metadata restricted to the labels that are NOT in exclude_lst.
dfmeta_ex = dfmeta[include_mask]

# Getting Extra Instances

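We are going to use the **xeno-canto** API to download every recording it lists for each excluded label (those with fewer than 50 instances), hoping to reach the bare minimum of 10 instances per label. Note that nothing in the code enforces this minimum.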

In [9]:
# Collects one [label, recording id, local file path] entry per downloaded recording.
new_arr = []

In [10]:
for name, label in zip(name_exclude_lst, exclude_lst):

    # Create the exact directory the files are written to. exist_ok=True lets the
    # cell be re-run after a partial download instead of raising FileExistsError.
    label_dir = '/kaggle/working/' + label
    os.makedirs(label_dir, exist_ok=True)

    # Hand the species name to requests via `params` so it is URL-encoded, and
    # fail on an HTTP error here rather than on a confusing JSON/KeyError below.
    response = requests.get(
        'https://xeno-canto.org/api/2/recordings',
        params={'query': name},
        timeout=60,
    )
    response.raise_for_status()
    resp_json = response.json()

    print('importing ... {}: {}'.format(name, label))

    # NOTE(review): only the recordings in this single response are used; if the
    # API paginates its results, later pages are not fetched -- confirm.
    for record in resp_json['recordings']:
        bird_id = record['id']
        file_url = record.get('file')
        if not file_url:
            # No download URL for this recording, so there is nothing to fetch.
            continue

        filename = label_dir + '/' + bird_id + '.mp3'

        # Skip files that are already on disk so a re-run does not download twice.
        if not os.path.exists(filename):
            audio_response = requests.get(file_url, timeout=60)
            if not audio_response.ok:
                # Do not save an error page as an .mp3; it would only fail later
                # when the audio is decoded.
                print('skipping {}: HTTP {}'.format(file_url, audio_response.status_code))
                continue
            with open(filename, 'wb') as f:
                f.write(audio_response.content)

        new_arr.append([label, bird_id, filename])

importing ... Crowned Hornbill: crohor1
importing ... White-fronted Bee-eater: wfbeat1
importing ... African Fish-Eagle: affeag1
importing ... Squacco Heron: squher1
importing ... Red-billed Firefinch: rebfir2
importing ... Black-and-white-casqued Hornbill: bawhor2
importing ... Northern Gray-headed Sparrow: gyhspa1
importing ... African Thrush: afrthr1
importing ... Greater Blue-eared Starling: gbesta1
importing ... Silvery-cheeked Hornbill: sichor1
importing ... Quailfinch: quailf1
importing ... Amethyst Sunbird: amesun2
importing ... Cardinal Woodpecker: carwoo1
importing ... Southern Fiscal: soufis1
importing ... Red-faced Crombec: refcro1
importing ... Speckled Mousebird: spemou2
importing ... Lesser Striped Swallow: lessts1
importing ... Mariqua Sunbird: marsun2
importing ... Brown Woodland-Warbler: brwwar1
importing ... Bronze Mannikin: broman1
importing ... Black-fronted Bushshrike: blfbus1
importing ... Speckled Pigeon: spepig1
importing ... Mourning Collared-Dove: afmdov1
imp

In [11]:
# One row per downloaded recording; the column names match dfmeta so the two
# frames can be concatenated later.
df_extra = pd.DataFrame(new_arr, columns=['primary_label', 'id', 'filename'])

In [12]:
def get_id(row):
    """Return the part of the row's 'url' value that follows its last '/'."""
    return row['url'].rsplit('/', 1)[-1]

# Derive the recording id of every metadata row from its URL.
dfmeta['id'] = dfmeta.apply(get_id, axis='columns')

In [13]:
# Append the downloaded recordings to the metadata, then drop rows whose
# (primary_label, id) pair is already present.
dfmeta = pd.concat([dfmeta, df_extra]).drop_duplicates(subset=['primary_label', 'id'])

In [14]:
# Renumber the rows 0..n-1 so every row has a unique integer index label.
dfmeta.index = list(range(len(dfmeta)))

In [15]:
dfmeta

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,id
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg,128013
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg,363501
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg,363502
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg,363503
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg,363504
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19073,crefra2,,,,,,,,,,,/kaggle/working/crefra2/81268.mp3,81268
19074,crefra2,,,,,,,,,,,/kaggle/working/crefra2/453768.mp3,453768
19075,crefra2,,,,,,,,,,,/kaggle/working/crefra2/439377.mp3,439377
19076,crefra2,,,,,,,,,,,/kaggle/working/crefra2/196087.mp3,196087


In [16]:
# Share of all rows that belongs to each label.
dfprob = dfmeta['primary_label'].value_counts() / dfmeta.shape[0]
# Invert that share, so labels with fewer rows get larger values.
# NOTE(review): 264 equals the number of labels reported in this cell's output;
# the origin of 4472424.0 is not shown in this notebook -- confirm what it normalises.
dfprob = 264 / dfprob / 4472424.0
dfprob

comsan     0.002252
thrnig1    0.002252
barswa     0.002252
eaywag1    0.002252
wlwwar     0.002252
             ...   
yebsto1    0.563072
golher1    0.563072
brtcha1    1.126144
whhsaw1    1.126144
lotcor1    1.126144
Name: primary_label, Length: 264, dtype: float64

# Handle the Data

Since we are dealing with such an imbalance across so many classes, the easiest approach is to group the elements by label and then sample the same fraction from each group.

In [17]:
# Per-label 80/20 split: draw 80% of every label's rows for training (with a fixed
# seed); every row that was not drawn goes to the validation frame.
dfmeta_g = dfmeta.groupby(by="primary_label")
dft = dfmeta_g.sample(frac=0.8, random_state=42)
is_train_row = dfmeta.index.isin(dft.index)
dfv = dfmeta[~is_train_row]

In [18]:
dft['primary_label'].value_counts()

comsan     400
thrnig1    400
barswa     400
eaywag1    400
wlwwar     400
          ... 
golher1      2
whctur2      2
brtcha1      1
whhsaw1      1
lotcor1      1
Name: primary_label, Length: 264, dtype: int64

In [19]:
# Save the per-label row counts of the training split to value_counts.csv.
dft['primary_label'].value_counts().to_csv('value_counts.csv')

In [20]:
dfv['primary_label'].value_counts()

barswa     100
wlwwar     100
thrnig1    100
comsan     100
eaywag1    100
          ... 
stusta1      1
brcsta1      1
rehblu1      1
shesta1      1
whctur2      1
Name: primary_label, Length: 259, dtype: int64

In [21]:
# Number of labels that occur in both the training and the validation split.
train_labels = set(dft['primary_label'].dropna().unique())
valid_labels = set(dfv['primary_label'].dropna().unique())
len(train_labels & valid_labels)

259

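Just to make sure we got what we want: 259 of the 264 training labels also appear in the validation split. The remaining 5 labels have so few recordings that none were left over for validation.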

# One Hot Encoding
Next we need to set up one-hot encoding: create a TensorFlow `TextVectorization` layer and use it to predict the encoded label.

In [22]:
# Label vocabulary: the distinct training labels, in value_counts() order.
unique_lst = dft['primary_label'].value_counts().index.tolist()
# Number of labels in the vocabulary.
v_len = len(unique_lst)

In [23]:
# Save the vocabulary, in its current order, to unique_lst.csv.
pd.DataFrame(unique_lst).to_csv('unique_lst.csv')

In [24]:
# Text vectorization layer with a fixed vocabulary and multi-hot output.
# max_tokens is v_len + 1: one slot per label plus one extra slot
# (presumably the [UNK] token that the following markdown cell mentions).
text_vec =  keras.layers.TextVectorization(
    max_tokens=v_len+1,
    output_mode='multi_hot',
    vocabulary=unique_lst
)

By default, TensorFlow's `TextVectorization` layer reserves index 0 for the [UNK] (out-of-vocabulary) token. To drop it, we add a Lambda layer that keeps everything after index 0.

In [25]:
# String input -> text vectorization -> keep every column after column 0.
model = keras.models.Sequential([
    keras.Input(shape=(1,), dtype=tf.string),
    text_vec,
    keras.layers.Lambda(lambda x: x[:, 1:]),
])

# IO Functions
The function will create a TFExample to write to a record.

In [26]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

def get_example(audio_path, label):
    """Build a tf.train.Example holding MP3-encoded audio, its sample rate and its label.

    Args:
        audio_path: Path of the audio file to load with librosa.
        label: Label value; it is stored as a serialized tensor.

    Returns:
        An Example with the features 'audio' (bytes), 'sr' (int64) and
        'label' (bytes).
    """
    # sr=None: load at the file's own sample rate.
    samples, sample_rate = librosa.load(audio_path, sr=None)
    # Add a second axis of size 1 to the sample tensor before encoding.
    waveform = tf.expand_dims(tf.convert_to_tensor(samples), axis=1)

    mp3_bytes = tfio.audio.encode_mp3(waveform, sample_rate).numpy()
    label_bytes = tf.io.serialize_tensor(label).numpy()

    feature_map = {
        'audio': Feature(bytes_list=BytesList(value=[mp3_bytes])),
        'sr': Feature(int64_list=Int64List(value=[sample_rate])),
        'label': Feature(bytes_list=BytesList(value=[label_bytes])),
    }
    return Example(features=Features(feature=feature_map))

# TFWriter
Finally, we create a set of paths, set up our writers/stack, and iterate through the dataset. At each iteration we choose the shard from the row's index modulo n_shards (which spreads the rows across the files and adds more randomness), and then encode the audio and compute the one-hot encoded label.

> making sure to set verbose=0 so we can write in peace

In [27]:
from contextlib import ExitStack
from tqdm import tqdm

def write_tfrecords(name, dataset, n_shards=50,
                    audio_root='/kaggle/input/birdclef-2023/train_audio/'):
    """Write every row of `dataset` into `n_shards` TFRecord files.

    Files are named '<name>.tfrecord-<index>-of-<n_shards>' and are created in the
    current working directory.

    Args:
        name: Prefix of the output files, also used in the final message.
        dataset: DataFrame with 'filename' and 'primary_label' columns. Its index
            labels must be integers, because a row's shard is `index % n_shards`.
        n_shards: Number of TFRecord files to spread the rows over.
        audio_root: Directory that the '.ogg' filenames are relative to. Any other
            filename is used as given.
    """
    paths = ["{}.tfrecord-{:02d}-of-{:02d}".format(name, index, n_shards) for index in range(n_shards)]

    # The encoding depends only on the label text, so run the model once per
    # distinct label instead of once per row.
    label_cache = {}

    with ExitStack() as stack:
        writers = [stack.enter_context(tf.io.TFRecordWriter(path)) for path in paths]

        # iterrows() has no length, so pass total= to give tqdm a real progress bar.
        for i, row in tqdm(dataset.iterrows(), total=len(dataset)):
            shard = i % n_shards
            if row['filename'].endswith('ogg'):
                audio_path = os.path.join(audio_root, row['filename'])
            else:
                audio_path = row['filename']

            primary_label = row['primary_label']
            if primary_label not in label_cache:
                label_cache[primary_label] = model.predict([primary_label], verbose=0).tolist()[0]
            label = label_cache[primary_label]

            example = get_example(audio_path, label)
            writers[shard].write(example.SerializeToString())

    print('Done writing ' + name + '.')

In [28]:
# Training split uses the default of 50 shards; the validation split uses 25.
write_tfrecords('train', dft)
write_tfrecords('valid', dfv, 25)

  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will b

Done writing train.


  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  """
	Deprecated as of librosa version 0.10.0.
	It will b

Done writing valid.





Sanity check...

# Clean Up the Extra Recordings
Iterate through the labels that received extra recordings and delete their download directories from local storage.

In [29]:
def delete_file(filepath):
    """Print a notice, then remove `filepath` from under /kaggle/working/."""
    notice = 'deleting ' + filepath + ' from local'
    print(notice)
    target = '/kaggle/working/' + filepath
    os.remove(target)
    
def clear_all_local(root='/kaggle/working/'):
    """Delete every entry directly under `root` except '.virtual_documents'.

    Directories are removed recursively and everything else is removed as a file.
    Calling os.remove on a directory raises, so directories need their own branch.

    Args:
        root: Directory to clear; defaults to the Kaggle working directory.
    """
    import shutil  # local import: keeps this block independent of other cells

    for k in os.listdir(root):
        if k == '.virtual_documents':
            continue
        path = os.path.join(root, k)
        print('deleting ' + k + ' from local')
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            os.remove(path)

In [30]:
import shutil

for i in exclude_lst:
    extra_dir = '/kaggle/working/' + i
    try:
        shutil.rmtree(extra_dir)
    except FileNotFoundError:
        # The directory does not exist (or is already gone): nothing to clean up.
        pass
    except OSError as err:
        # Still best-effort, but report the failure instead of hiding it.
        print('could not remove {}: {}'.format(extra_dir, err))

# TO BE CONTINUED...