### Import packages

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from pydub import AudioSegment
import os
import tensorflow_io as tfio

### Ingest data

In [None]:
def convert_and_resample(input_file: str, output_file: str):
    """
    Convert an audio file to a 16-bit, 16 kHz WAV file.

    The conversion is best-effort: any error is reported on stdout and
    swallowed so a batch run can carry on with the next file. The channel
    count of the input is left unchanged.

    If ``output_file`` already exists the input is not read at all. Because
    of that skip rule, a failed conversion must not leave a partial output
    behind (it would be treated as "done" on the next run), so any file
    created by a failed attempt is removed again.

    Args:
        input_file (str): Path to the input audio file (e.g., .ogg, .mp3).
        output_file (str): Path to save the converted and resampled .wav file.
    """
    print("start conversion")
    if os.path.exists(output_file):
        print(f"Skipped (already exists): {output_file}")
        return

    try:
        # Load the input audio file
        audio = AudioSegment.from_file(input_file)
        # Set quantisation: 2 bytes per sample = 16 bit
        audio_quant16 = audio.set_sample_width(2)
        # Resample to 16 kHz
        audio_16k = audio_quant16.set_frame_rate(16000)
        # Export as .wav
        audio_16k.export(output_file, format="wav")
    except Exception as e:
        print(f"Error during conversion and resampling: {e}")
        # The output did not exist before this call (checked above), so
        # anything at that path now is a leftover of the failed attempt.
        if os.path.exists(output_file):
            try:
                os.remove(output_file)
            except OSError as cleanup_error:
                print(f"Could not remove partial output {output_file}: {cleanup_error}")
    else:
        print(f"Conversion and resampling successful: {output_file}")

In [None]:
def batch_convert_and_resample(input_root, output_root, convert_and_resample, max_folders=None):
    """
    Convert every audio file below the sub-folders of input_root.

    Only the immediate sub-folders of input_root are entered (files lying
    directly in input_root are ignored); each of them is walked recursively.
    Files whose name ends in .ogg or .wav (case-insensitive) are handed to
    the converter, and the output path mirrors the input's relative path
    under output_root with the extension replaced by ".wav".

    Parameters:
        input_root (str): Root folder whose sub-folders hold the audio files.
        output_root (str): Root folder for the converted .wav files.
        convert_and_resample (func): Callable invoked as (in_path, out_path).
        max_folders (int, optional): If set, only the first N sub-folders
            (in sorted order) are processed.
    """
    # Collect the top-level sub-folders in a stable (sorted) order
    top_level_dirs = []
    for entry in os.listdir(input_root):
        candidate = os.path.join(input_root, entry)
        if os.path.isdir(candidate):
            top_level_dirs.append(candidate)
    top_level_dirs.sort()
    print("start conversion")

    # Optionally restrict the run to the first N sub-folders
    if max_folders is not None:
        top_level_dirs = top_level_dirs[:max_folders]

    audio_suffixes = (".ogg", ".wav")
    for top_dir in top_level_dirs:
        for current_dir, _subdirs, names in os.walk(top_dir):
            for name in names:
                if not name.lower().endswith(audio_suffixes):
                    continue

                source_path = os.path.join(current_dir, name)

                # Mirror the relative location below output_root, as .wav
                relative_stem, _ext = os.path.splitext(
                    os.path.relpath(source_path, input_root)
                )
                target_path = os.path.join(output_root, relative_stem + ".wav")

                # The converter expects the destination folder to exist
                os.makedirs(os.path.dirname(target_path), exist_ok=True)

                convert_and_resample(source_path, target_path)
                print(f"Converted: {source_path} -> {target_path}")

In [None]:
# input_root = "../data/birdclef-2024/train_audio"
# output_root = "../data/birdclef-2024/train_audio_16"

# batch_convert_and_resample(input_root, output_root, convert_and_resample)


In [None]:
# Load the training metadata CSV into a DataFrame for the EDA cells below.
# The path is relative to the notebook's working directory.
train_metadata_path = "../data/birdclef-2024/train_metadata.csv"
train_df = pd.read_csv(train_metadata_path)


### EDA

In [None]:
# Preview the first rows of the training metadata.
train_df.head()

In [None]:
# train_df["rating"].info()
# Summary statistics of the metadata, using pandas' describe() defaults.
train_df.describe()

### Read wav bird data

In [None]:
# Root folder of the dataset; the audio and metadata paths below are built from it.
base_data_path = "../data/birdclef-2024"
# Load the training metadata into bird_df, the working copy that the
# following cells modify and filter.
bird_metadata_path = os.path.join(base_data_path, "train_metadata.csv")
bird_df = pd.read_csv(bird_metadata_path)

# Display the first few rows of the dataframe
bird_df.head()

In [None]:

# Point the metadata at WAV files: rewrite ".ogg" to ".wav" in the filename
# column of bird_df. regex=False makes the dot a literal character; note that
# every occurrence of ".ogg" in a name is replaced, not only a trailing one.
bird_df['filename'] = bird_df['filename'].str.replace('.ogg', '.wav', regex=False)
# Display the first few rows of the dataframe
bird_df.head()

In [None]:
import glob

# Folder holding the converted WAV files. The trailing slash is kept so the
# glob pattern and the final file paths have the same form as before.
train_audio_16_dir = base_data_path + "/train_audio_16/"

# Paths of all WAV files on disk, expressed relative to the audio folder and
# with "/" separators so they can be compared with the metadata's filename
# column regardless of the platform's path separator.
wav_files = [
    os.path.relpath(f, train_audio_16_dir).replace(os.sep, "/")
    for f in glob.glob(train_audio_16_dir + "**/*.wav", recursive=True)
]

# Keep only the metadata rows whose audio file is actually present.
filtered_bird_df = bird_df[bird_df['filename'].isin(wav_files)]

# Sorted so that the class -> id mapping is identical on every run; iterating
# a set of strings would give a different order (and different ids) from one
# interpreter session to the next.
bird_classes = sorted(filtered_bird_df['common_name'].unique())

map_class_to_id = {name: idx for idx, name in enumerate(bird_classes)}

# Integer class id per row, used as the training target.
class_id = filtered_bird_df['common_name'].map(map_class_to_id)
filtered_bird_df = filtered_bird_df.assign(target=class_id)

# Replace the relative filename by the full path to the WAV file.
full_path = filtered_bird_df['filename'].map(
    lambda relative_name: os.path.join(train_audio_16_dir, relative_name)
)
filtered_bird_df = filtered_bird_df.assign(filename=full_path)

filtered_bird_df.head(10)

### Split data into training, validation and test data

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 40 % of the rows, stratified by class id.
# NOTE: stratified splitting needs at least two rows per class in the data
# being split; train_test_split raises ValueError otherwise.
train_df_idx, temp_df_idx = train_test_split(
    filtered_bird_df.index,
    test_size=0.4,
    random_state=42,
    stratify=filtered_bird_df['target'],
)

# Step 2: split the held-out rows in half -> validation and test.
val_df_idx, test_df_idx = train_test_split(
    temp_df_idx,
    test_size=0.5,
    random_state=42,
    stratify=filtered_bird_df.loc[temp_df_idx, 'target'],
)

# Step 3: record the split in a 'fold' column (1 = train, 2 = val, 3 = test).
# The column starts as an integer so it keeps an integer dtype once the fold
# numbers are written, instead of becoming a mixed string/int object column.
filtered_bird_df['fold'] = 0
filtered_bird_df.loc[train_df_idx, 'fold'] = 1
filtered_bird_df.loc[val_df_idx, 'fold'] = 2
filtered_bird_df.loc[test_df_idx, 'fold'] = 3

filtered_bird_df.head(10)

In [None]:
# Overlay one semi-transparent histogram of class ids per fold
# (1 = train, 2 = val, 3 = test), each with one bin per bird class.
for fold_id, fold_label in ((1, 'Train'), (2, 'Val'), (3, 'Test')):
    fold_targets = filtered_bird_df[filtered_bird_df['fold'] == fold_id]['target']
    plt.hist(fold_targets, bins=len(bird_classes), alpha=0.7, label=fold_label)

plt.xlabel('Bird Classes')
plt.ylabel('Count')
plt.title('Distribution of Bird Classes in Train, Val, and Test Sets')
plt.legend()
plt.show()

### Plot one waveform

In [None]:
from pydub import AudioSegment

def load_wav_16k_mono(filename):
    """Load a WAV file as a single-channel float tensor.

    The file is decoded with one channel requested and the channel axis is
    squeezed away, so the result is one-dimensional.

    No resampling happens here: the sample rate stored in the file is
    ignored. Callers must therefore pass files that are already sampled at
    16 kHz, e.g. the output of ``convert_and_resample``.

    Args:
        filename: Path of the WAV file (string or string tensor).

    Returns:
        The decoded waveform as a float tensor with one value per sample.
    """
    file_contents = tf.io.read_file(filename)
    wav, _sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # decode_wav yields (samples, channels); drop the single channel axis.
    return tf.squeeze(wav, axis=-1)

In [None]:

# print(x_train[0])
# wav = load_wav_16k_mono(x_train[0])
# plt.plot(wav)
#plt.plot(testing_wav_data)

### Modelling

In [None]:
# Load the YAMNet model from TensorFlow Hub; it is called below to turn
# waveforms into embeddings.
model = hub.load('https://tfhub.dev/google/yamnet/1')

#### Model inference / Create Embeddings

In [None]:
def load_wav_for_map(filename, label):
    """Adapter for ``Dataset.map``: decode the file, pass the label through."""
    waveform = load_wav_16k_mono(filename)
    return waveform, label


# File paths and class ids per fold (1 = train, 2 = val, 3 = test)
filenames_train = filtered_bird_df.loc[filtered_bird_df['fold'] == 1, 'filename']
targets_train = filtered_bird_df.loc[filtered_bird_df['fold'] == 1, 'target']

filenames_val = filtered_bird_df.loc[filtered_bird_df['fold'] == 2, 'filename']
targets_val = filtered_bird_df.loc[filtered_bird_df['fold'] == 2, 'target']

filenames_test = filtered_bird_df.loc[filtered_bird_df['fold'] == 3, 'filename']
targets_test = filtered_bird_df.loc[filtered_bird_df['fold'] == 3, 'target']

# One (waveform, label) dataset per fold: slice the (path, label) pairs and
# decode each path when the element is read.
train_ds = tf.data.Dataset.from_tensor_slices(
    (filenames_train, targets_train)
).map(load_wav_for_map)
val_ds = tf.data.Dataset.from_tensor_slices(
    (filenames_val, targets_val)
).map(load_wav_for_map)
test_ds = tf.data.Dataset.from_tensor_slices(
    (filenames_test, targets_test)
).map(load_wav_for_map)

In [None]:
def extract_embedding(wav_data, label):
    """Run YAMNet on a waveform and pair every embedding with the label.

    The model returns three outputs; only the second (the embeddings) is
    used. The label is repeated once per embedding row so that both returned
    tensors have the same leading dimension.
    """
    _, frame_embeddings, _ = model(wav_data)
    frame_count = tf.shape(frame_embeddings)[0]
    frame_labels = tf.repeat(label, frame_count)
    return frame_embeddings, frame_labels


In [None]:
# Replace each (waveform, label) element by its YAMNet embeddings and the
# matching repeated labels, then unbatch so that every dataset element is a
# single (embedding, label) pair.
train_ds = train_ds.map(extract_embedding).unbatch()
val_ds = val_ds.map(extract_embedding).unbatch()
test_ds = test_ds.map(extract_embedding).unbatch()
# Show the resulting element structure as a sanity check.
train_ds.element_spec

In [None]:
# Cache the elements, batch them in groups of 32 and prefetch. Only the
# training set is shuffled (buffer of 1000 elements); the shuffle comes after
# cache() so it is applied to the cached elements rather than stored in the cache.
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

#### Model classification layer

In [None]:
# Classification head on top of the YAMNet embeddings: a 1024-value embedding
# goes in, one probability per bird class comes out.
bird_class_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    # One output unit per class. The size follows bird_classes so that every
    # target id produced by the class -> id mapping has a matching output.
    tf.keras.layers.Dense(len(bird_classes), activation='softmax'),
])

bird_class_model.summary()

In [None]:
# The model's last layer already applies softmax, so the loss receives
# probabilities and must not apply softmax again: from_logits=False.
bird_class_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                 optimizer="adam",
                 metrics=['accuracy'])

# Stop after 3 epochs without improvement and roll back to the best weights.
# NOTE(review): this watches the training loss; since fit() is given
# validation data, 'val_loss' may be the better signal. Confirm intent.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)

In [None]:
# Train the classification head for up to 20 epochs. The callbacks argument
# is documented as a list of callbacks, so the early-stopping callback is
# wrapped in one.
history = bird_class_model.fit(train_ds,
                       epochs=20,
                       validation_data=val_ds,
                       callbacks=[callback])

In [None]:
# Evaluate on the test fold; evaluate() returns the loss followed by the
# metrics passed to compile() (here: accuracy).
loss, accuracy = bird_class_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
# Classify one recording end to end, using the first file of the test fold.
wav = load_wav_16k_mono(
    filtered_bird_df.loc[filtered_bird_df['fold'] == 3, 'filename'].iloc[0]
)
scores, embeddings, spectrogram = model(wav)
result = bird_class_model(embeddings).numpy()

# Average the per-embedding outputs over the whole file and pick the class
# with the highest mean score.
inferred_class = bird_classes[np.argmax(np.mean(result, axis=0))]
print(f'The main sound is: {inferred_class}')