# Segments Extraction  

This notebook extracts annotated audio segments from the official recordings of Tovanella and WABAD using the `Bird_tags_Train.mat` file. Since BirdNET analyzes 3-second clips, all extracted segments follow this duration.  

Segments are generated with a 50% overlap, shifting by 1.5 seconds between consecutive clips.  

## Extraction Process:
1. **`species_dict`**: maps scientific names to common names for all species.  
2. **`category_annots.json`** & **`audio_annots.json`**: store segment annotations for each species in every audio file.  
3. **`audio_info.json`**: provides total duration and sampling rate for each recording.  
4. **`true_segments.json`**: lists the species present in each extracted segment.  

Unannotated segments can be included (labeled as `"None"`) by enabling `generate_None`, treating them as a non-species class.  

For WABAD, a different approach was used due to multiple recording sites—only relevant sites containing the study species were processed.


In [None]:
import scipy.io
import numpy as np
import os
from pydub import AudioSegment
import numpy as np
import json
from tqdm import tqdm
import pandas as pd
import csv
import librosa

In [8]:
# DATASET_NAME = 'DATASET_CNN'
# DATASET_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}'
# AUDIO_SOURCE = '/home/giacomoschiavo/Tovanella'

In [None]:
# Configuration variables

# name of the dataset (used to save utils files under its name)
DATASET_NAME = 'NEW_DATASET_1'
# path of the dataset
DATASET_PATH = f'E:/Giacomo/Tovanella/{DATASET_NAME}'
# folder that contains all the audio files
AUDIO_SOURCE = 'E:/Giacomo/Tovanella/Tovanella'

# Species Dict
Create a dictionary to map the scientific name of a species to its common name

In [10]:
from pathlib import Path

# BirdNET label list: one "<scientific name>_<common name>" entry per line
labels_file = Path("utils/BirdNET_GLOBAL_6K_V2.4_Labels_en_uk.txt")
with labels_file.open(encoding="utf-8") as labels_handle:
    all_species = labels_handle.read().splitlines()
all_species[:5]

['Abroscopus albogularis_Rufous-faced Warbler',
 'Abroscopus schisticeps_Black-faced Warbler',
 'Abroscopus superciliaris_Yellow-bellied Warbler',
 'Aburria aburri_Wattled Guan',
 'Acanthagenys rufogularis_Spiny-cheeked Honeyeater']

In [11]:
# maps every scientific name to its common name
species_dict = {}
for label_line in all_species:
    # skip blank or malformed lines instead of failing on tuple unpacking
    if "_" not in label_line:
        continue
    # split on the FIRST underscore only, so an underscore inside the common
    # name cannot break the unpacking
    scientific_name, common_name = label_line.split("_", 1)    # <Abroscopus albogularis>_<Rufous-faced Warbler>
    species_dict[scientific_name] = common_name

species_dict["Abroscopus albogularis"]

'Rufous-faced Warbler'

In [12]:
# export species_dict to json
# with open('utils/species_dict_map.json', 'w') as f:
#     json.dump(species_dict, f)

# Category and Audio Annotation Files: A Deep Dive

These files hold invaluable annotation data, offering distinct perspectives on our dataset.

**`category_annots.json`**: This file provides a species-centric view. For each species identified, it lists *all* corresponding annotations found across *every* audio recording within the Tovanella folder.

**`audio_annots.json`**: In contrast, this file takes an audio-centric approach. For each individual audio file in our collection, it details *all* the annotations present within that specific recording.

In [None]:
# load the annotation table from the given .mat file
bird_tags = scipy.io.loadmat('Bird_tags_Train.mat')["Bird_tags"]
# visualize one example record (entry 12), printing each property with its position
sample_record = bird_tags[12][0][0][0]
for field_index, field_value in enumerate(sample_record):
    print(field_index, field_value)

In [None]:
def get_audio_category_annots(bird_tags_filepath, audio_source_path, species_names=None):
    """Group the annotations of a `Bird_tags` .mat file by label and by audio file.

    Args:
        bird_tags_filepath: path to the .mat file holding the "Bird_tags" variable.
        audio_source_path: folder with the audio files; annotations whose audio
            file is not found there are dropped.
        species_names: optional {scientific name: common name} mapping. Defaults
            to the notebook-level `species_dict`.

    Returns:
        (category_annots, audio_annots):
            category_annots: {label: [{"file_name", "start_time", "duration", "label"}, ...]}
            audio_annots: {file_name: [{"scientific_name", "common_name",
                                        "start_time", "duration", "label"}, ...]}
        Times are plain Python floats and names plain `str`, so both structures
        can be passed to `json.dump` whatever dtype the .mat file uses.
    """
    if species_names is None:
        species_names = species_dict

    bird_tags = scipy.io.loadmat(bird_tags_filepath)["Bird_tags"]
    category_annots = {}      # detections grouped by category
    audio_annots = {}         # detections grouped by audio
    file_exists = {}          # cache: one filesystem check per audio file, not per annotation

    for elem in bird_tags:
        record = elem[0][0][0]

        file_name = str(record[1][0])
        if file_name not in file_exists:
            file_exists[file_name] = os.path.exists(os.path.join(audio_source_path, file_name))
        if not file_exists[file_name]:      # do not store if file does not exist
            continue

        # Fringilla_coelebs -> Fringilla coelebs; strip() removes the trailing
        # blank left by tags that end with an underscore (e.g. "Wind_")
        scientific_name = str(record[0][0]).replace("_", " ").strip()
        common_name = species_names.get(scientific_name, "")     # Fringilla coelebs -> Common Chaffinch
        if common_name:
            label = "_".join([scientific_name, common_name])    # Fringilla coelebs_Common Chaffinch
        else:
            # non-species classes have no common name: the tag itself is the label
            label = scientific_name

        # the last two values of the third field are the start and end time
        start_time, end_time = (float(value) for value in np.array(record[2]).flatten()[-2:])
        duration = end_time - start_time

        category_annots.setdefault(label, []).append(
            {"file_name": file_name, "start_time": start_time, "duration": duration, "label": label}
        )
        audio_annots.setdefault(file_name, []).append(
            {"scientific_name": scientific_name, "common_name": common_name,
             "start_time": start_time, "duration": duration, "label": label}
        )

    return category_annots, audio_annots


In [None]:
category_annots, audio_annots = get_audio_category_annots("Bird_tags_Train.mat", AUDIO_SOURCE)

# persist both views (by label and by audio file) as JSON
for json_path, annots in (
    ("utils/category_annots.json", category_annots),
    ("utils/audio_annots.json", audio_annots),
):
    with open(json_path, "w") as f:
        json.dump(annots, f)

In [None]:
# list of all labels found in the annotations (species and non-species classes)
species_list = [label for label in category_annots]

# Segments Creation
Creates the 3-second audio segments for every annotated recording listed in `audio_annots`.

In [None]:
# store info about duration and sampling rate of the given audio
def generate_audio_info(source_audio_path, audio_annots):
    """Collect duration and sampling rate for every annotated recording.

    Args:
        source_audio_path: folder containing the audio files.
        audio_annots: mapping keyed by audio file name (only the keys are used).

    Returns:
        dict: {file_name: {"duration": ..., "sampling_rate": ...}} with the
        values reported by librosa for each file.
    """
    info_by_file = {}
    for file_name in list(audio_annots):
        # sr=None: load without resampling
        samples, native_sr = librosa.load(os.path.join(source_audio_path, file_name), sr=None)
        info_by_file[file_name] = {
            "duration": librosa.get_duration(y=samples, sr=native_sr),
            "sampling_rate": native_sr,
        }
    return info_by_file

In [None]:
# LOAD (alternative to SAVE below: reuse a previously generated file)
# with open("utils/audio_info.json") as f:
    # audio_info = json.load(f)

# SAVE: compute the info for every annotated recording and store it as JSON
audio_info = generate_audio_info(AUDIO_SOURCE, audio_annots)
with open("utils/audio_info.json", "w") as f:
    f.write(json.dumps(audio_info))

In [None]:
# generates the true segments for each audio file
def generate_true_segments(audio_annots, audio_info, segment_length=3.0, step_size=1.5):
    """List, for every fixed-length window of every audio file, the labels present in it.

    Windows start at 0, `step_size`, 2 * `step_size`, ... and are kept only while
    they fit entirely inside the recording. A label is assigned to every window
    its annotation overlaps, even partially; an annotation that merely touches a
    window boundary does not count.

    Args:
        audio_annots: {file_name: [{"start_time", "duration", "label", ...}, ...]}
        audio_info: {file_name: {"duration": <seconds>, ...}}
        segment_length: window length in seconds (default 3.0).
        step_size: hop between consecutive window starts in seconds (default 1.5,
            i.e. 50% overlap with the default length).

    Returns:
        {file_name: {segm_id: [label, ...]}} where `segm_id` is the window start
        with "." replaced by "_" ("1_5" for 1.5 s), so that
        `float(segm_id.replace("_", "."))` recovers the start time.

    Raises:
        ValueError: if `segment_length` or `step_size` is not positive.
    """
    # local import keeps the function self-contained in the notebook
    from bisect import bisect_left, bisect_right

    if segment_length <= 0 or step_size <= 0:
        raise ValueError("segment_length and step_size must be positive")

    true_segments = {}
    for audio, all_annotations in audio_annots.items():
        audio_duration = audio_info[audio]["duration"]

        # Build all the (still empty) windows. Starts are computed as
        # index * step (no accumulated float drift) and rounded to microseconds
        # so the id stays short and round-trips through float().
        segment_ids, segment_starts, segment_ends = [], [], []
        index = 0
        segment_start = 0.0
        while segment_start + segment_length <= audio_duration:
            segment_ids.append(str(segment_start).replace(".", "_"))
            segment_starts.append(segment_start)
            segment_ends.append(segment_start + segment_length)
            index += 1
            segment_start = round(float(index * step_size), 6)

        labels_by_segment = {segm_id: [] for segm_id in segment_ids}

        # assign the annotations to the corresponding segments
        for annotation in all_annotations:
            start_time = annotation["start_time"]
            annotation_end = start_time + annotation["duration"]
            species = annotation["label"]

            # Overlap test: window_start < annotation_end and window_end > start_time.
            # Starts and ends are both sorted, so the matching windows form one
            # contiguous range that bisect locates without scanning every window.
            first = bisect_right(segment_ends, start_time)
            last = bisect_left(segment_starts, annotation_end)
            for segm_id in segment_ids[first:last]:
                if species not in labels_by_segment[segm_id]:
                    labels_by_segment[segm_id].append(species)

        true_segments[audio] = labels_by_segment
    return true_segments

In [None]:
# label every 3-second window of every annotated recording
true_segments = generate_true_segments(audio_annots=audio_annots, audio_info=audio_info)

In [None]:
# SAVE: store the segment labels under the dataset-specific utils folder
os.makedirs(f'utils/{DATASET_NAME}', exist_ok=True)
with open(f'utils/{DATASET_NAME}/true_segments.json', 'w') as f:
    f.write(json.dumps(true_segments))

In [None]:
# create segment file given a species name
def generate_species_segment(segment_audio, species_name, target_path, basename, segm_id):
    """Export one audio segment as `<target_path>/<species_name>/<basename>_<segm_id>.wav`.

    The species folder is created when missing. If the wav file is already on
    disk, nothing is exported.

    Args:
        segment_audio: object exposing `export(path, format=...)`.
        species_name: name of the sub-folder the segment goes into.
        target_path: root folder of the exported segments.
        basename: audio file name without extension.
        segm_id: segment identifier appended to the file name.
    """
    species_dir = os.path.join(target_path, species_name)
    os.makedirs(species_dir, exist_ok=True)
    wav_path = os.path.join(species_dir, f"{basename}_{segm_id}.wav")
    if not os.path.exists(wav_path):
        segment_audio.export(wav_path, format="wav")

# generate the audio from the true_segments
def generate_segments(audio_source_path, target_path, true_segments, audio_info, generate_None=False, segment_length=3.0):
    """Cut every labelled segment out of its recording and export it per label.

    Each segment is exported once for every label it carries, under
    `<target_path>/<label>/<audio basename>_<segm_id>.wav`.

    Args:
        audio_source_path: folder containing the source audio files.
        target_path: root folder of the exported segments.
        true_segments: { <audio_path>.wav: { <segm_id>: [<species>] } }
        audio_info: { <audio_path>.wav: { "sampling_rate": ..., ... } }
        generate_None: when True, segments with no label are exported under "None".
        segment_length: segment duration in seconds (default 3.0); it should match
            the value used to build `true_segments`.
    """
    segment_ms = int(round(segment_length * 1000))

    for audio_path, segms in true_segments.items():
        # Decoding a recording is the expensive step: skip files that would not
        # produce any export (no labelled segment and no "None" class requested).
        if not generate_None and not any(segms.values()):
            continue

        basename = os.path.splitext(audio_path)[0]      # removes ".wav"
        print(f"Elaborating audio {audio_path}...")
        # loads the audio
        audio = AudioSegment.from_file(
                os.path.join(audio_source_path, audio_path),
                format="wav",
                frame_rate=audio_info[audio_path]["sampling_rate"]
            )

        # the context manager closes the bar even if an export raises
        with tqdm(total=len(segms), colour='red', desc="Processing segments...") as progress_bar:
            for segm_id, species in segms.items():          # <segm_id>: [<species>]
                if species:
                    targets = species
                elif generate_None:
                    targets = ["None"]                      # unlabelled segment -> "None" class
                else:
                    targets = []

                if targets:
                    # pydub slices are expressed in milliseconds
                    start_ms = int(round(float(segm_id.replace("_", ".")) * 1000))
                    segment_audio = audio[start_ms:start_ms + segment_ms]
                    for sp in targets:
                        generate_species_segment(segment_audio, sp, target_path, basename, segm_id)
                progress_bar.update(1)

In [None]:
# it takes almost 30 minutes
# generate_segments(audio_source_path=AUDIO_SOURCE,
#                   target_path=f"{DATASET_PATH}/train",
#                   true_segments=true_segments,
#                   audio_info=audio_info,
#                   generate_None=True)

In [None]:
# count segments by species: one sub-folder per label, one file per segment
target_path = f"{DATASET_PATH}/train"
species_count = {}
for species in os.listdir(target_path):
    species_count[species] = len(os.listdir(os.path.join(target_path, species)))
species_count_df = pd.DataFrame(list(species_count.items()), columns=["Species", "Count"])
species_count_df.sort_values(by="Count", ascending=False).reset_index(drop=True)

Unnamed: 0,Species,Count
0,,11291
1,Fringilla coelebs_Common Chaffinch,6002
2,Turdus philomelos_Song Thrush,4317
3,Sylvia atricapilla_Eurasian Blackcap,3027
4,Regulus ignicapilla_Common Firecrest,2777
5,Phylloscopus collybita_Common Chiffchaff,2014
6,Erithacus rubecula_European Robin,1385
7,Troglodytes troglodytes_Eurasian Wren,1159
8,Regulus regulus_Goldcrest,550
9,Rain,477


# WABAD Segments Extraction: A Tailored Approach

For the WABAD dataset, a segment extraction strategy similar to the previous one is employed, with key adaptations to address its unique characteristics.

Initially, the focus is specifically on the **less represented species**. In this particular analysis, species with **750 or fewer segments** in the Tovanella training set were targeted, while the "non-species" classes were excluded.

Next, the process involves extracting annotations directly from WABAD. This requires referencing a list of specific site datasets to download, located in the `wabad_datasets.txt` file. Once the annotations are gathered, audio segments are created. This follows the **same robust pipeline** used previously: transforming raw category and audio annotations (`category_annots`, `audio_annots`), saving audio details (`audio_info`), and then feeding into the generation of labeled segments (`true_segments`).

Finally, since more than 10,000 unannotated ("None") segments are already available, no "None" segments are extracted from WABAD (`generate_None=False`).

In [18]:
# Collect the species found in the annotations, skipping non-species classes
# (labels without a "_<common name>" part).
# NOTE: despite the variable name, the stored values are the SCIENTIFIC names
# (the part of the label before the underscore).
species_common_name_list = []
for label in category_annots.keys():
    label_parts = label.split("_")
    if len(label_parts) > 1:
        species_common_name_list.append(label_parts[0])

In [20]:
minority_threshold = 750
# Scientific names of the species with at most `minority_threshold` extracted segments.
# The non-species filter runs first, and `.get(..., 0)` treats an annotated
# species with no segment folder as having zero segments instead of raising
# KeyError.
species_to_augment = [
    species.split("_")[0]
    for species in species_list
    if "_" in species and species_count.get(species, 0) <= minority_threshold
]
species_to_augment

['Muscicapa striata',
 'Turdus viscivorus',
 'Glaucidium passerinum',
 'Pyrrhula pyrrhula',
 'Periparus ater',
 'Prunella modularis',
 'Lophophanes cristatus',
 'Regulus regulus',
 'Turdus merula',
 'Certhia familiaris',
 'Loxia curvirostra',
 'Dendrocopos major',
 'Dryocopus martius',
 'Phylloscopus trochilus',
 'Spinus spinus',
 'Poecile palustris']

In [None]:
# Manual preparation required before running the WABAD cells:
# 1. Locate site dataset list: `wabad_datasets.txt` in the `utils` folder.
# 2. Manually download and extract ALL listed datasets.
# 3. Place the extracted datasets into the designated `WABAD` folder.

# The cells below expect each site to be laid out as
#   <WABAD_PATH>/<SITE>/<SITE>/Raven Pro annotations/<recording>.txt
#   <WABAD_PATH>/<SITE>/<SITE>/Recordings/<recording audio file>
# this is an example of the outcome
# E:\Giacomo\Tovanella\WABAD\BAM\BAM\Raven Pro annotations\BAM_20151116_060801.txt
WABAD_PATH = "E:/Giacomo/Tovanella/WABAD"

In [93]:
def extract_wabad_info(folder_path, target_species=None, species_names=None):
    """Read the tab-separated annotation tables of one site and keep the target species.

    Every `.txt` file in `folder_path` is parsed as a tab-separated table with at
    least the columns "Species", "Begin Time (s)" and "End Time (s)". Rows that
    miss one of these values, or whose species is not targeted, are skipped.
    Annotations are attributed to the audio file named `<txt stem>.WAV`.

    Args:
        folder_path: e.g. ".../WABAD/BIAL/BIAL/Raven Pro annotations"
        target_species: scientific names to keep. Defaults to the notebook-level
            `species_to_augment`.
        species_names: {scientific name: common name}. Defaults to the
            notebook-level `species_dict`.

    Returns:
        (audio_info_wabad, category_info_wabad):
            audio_info_wabad: {file_name: [{"scientific_name", "common_name",
                                            "start_time", "duration", "label"}, ...]}
            category_info_wabad: {label: [{"file_name", "start_time",
                                           "duration", "label"}, ...]}
    """
    if target_species is None:
        target_species = species_to_augment
    if species_names is None:
        species_names = species_dict
    wanted = set(target_species)

    audio_info_wabad = {}
    category_info_wabad = {}
    for txt_file in os.listdir(folder_path):
        stem, extension = os.path.splitext(txt_file)
        if extension.lower() != ".txt":     # ignore anything that is not an annotation table
            continue
        file_name = stem + ".WAV"           # only the extension is swapped

        complete_path = os.path.join(folder_path, txt_file)
        with open(complete_path, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t')
            for row in reader:
                # .get(): a missing column or a short row yields None instead of raising
                species = row.get("Species")
                begin = row.get("Begin Time (s)")
                end = row.get("End Time (s)")
                if species not in wanted or not begin or not end:
                    continue

                start_time = float(begin)
                duration = float(end) - start_time
                common_name = species_names[species]
                label = f"{species}_{common_name}"

                audio_info_wabad.setdefault(file_name, []).append({
                    "scientific_name": species,
                    "common_name": common_name,
                    "start_time": start_time,
                    "duration": duration,
                    "label": label
                })
                category_info_wabad.setdefault(label, []).append({
                    "file_name": file_name,
                    "start_time": start_time,
                    "duration": duration,
                    "label": label
                })
    return audio_info_wabad, category_info_wabad

In [None]:
# merge the annotations of every site folder into two global dictionaries
audio_annots_wabad = {}
category_annots_wabad = {}

for site in os.listdir(WABAD_PATH):
    site_dir = os.path.join(WABAD_PATH, site)
    if os.path.isdir(site_dir):
        # <WABAD_PATH>/<site>/<site>/Raven Pro annotations
        annotations_dir = os.path.join(site_dir, site, "Raven Pro annotations")
        site_audio_annots, site_category_annots = extract_wabad_info(annotations_dir)
        for audio_name, entries in site_audio_annots.items():
            audio_annots_wabad.setdefault(audio_name, []).extend(entries)
        for category_name, entries in site_category_annots.items():
            category_annots_wabad.setdefault(category_name, []).extend(entries)

# with open("utils/WABAD/audio_annots_wabad.json", 'w', encoding='utf-8') as jsonfile:
#     json.dump(audio_annots_wabad, jsonfile)

# with open("utils/WABAD/category_annots_wabad.json", 'w', encoding='utf-8') as jsonfile:
#     json.dump(category_annots_wabad, jsonfile)

In [None]:
# show contribution of WABAD for every species (number of WABAD annotations per label)
species_count_wabad = {}
for species_name, annotations_for_species in category_annots_wabad.items():
    species_count_wabad[species_name] = len(annotations_for_species)
species_count_wabad_df = pd.DataFrame(list(species_count_wabad.items()), columns=["Species", "Count WABAD"])
# inner join: only labels present in both tables are shown
merged_df = species_count_df.merge(species_count_wabad_df, on="Species", how="inner")
merged_df.sort_values(by=["Count"], ascending=False)

Unnamed: 0,Species,Count,Count WABAD
2,Phylloscopus collybita_Common Chiffchaff,488,819
8,Regulus regulus_Goldcrest,350,437
0,Troglodytes troglodytes_Eurasian Wren,249,530
5,Periparus ater_Coal Tit,199,768
10,Certhia familiaris_Eurasian Treecreeper,118,96
1,Muscicapa striata_Spotted Flycatcher,76,117
9,Turdus merula_Eurasian Blackbird,53,2308
7,Lophophanes cristatus_Crested Tit,48,132
12,Dendrocopos major_Great Spotted Woodpecker,42,242
11,Loxia curvirostra_Common Crossbill,39,25


In [None]:
# move all WABAD audio in a unique folder
WABAD_PATH = "E:/Giacomo/Tovanella/WABAD"
WABAD_AUDIO_SOURCE = "E:/Giacomo/Tovanella/all_wabad_audio"
# os.rename cannot move a file into a folder that does not exist yet
os.makedirs(WABAD_AUDIO_SOURCE, exist_ok=True)

# Annotation keys look like "<stem>.WAV". Match recordings case-insensitively
# on the WHOLE name (upper-casing only the recording name would also upper-case
# its stem and miss keys with lower-case letters).
annotated_by_lowercase = {name.lower(): name for name in audio_annots_wabad}

for folder in os.listdir(WABAD_PATH):
    if not os.path.isdir(os.path.join(WABAD_PATH, folder)):
        continue
    # ...\BAM\BAM\Recordings
    folder_path = os.path.join(WABAD_PATH, folder, folder, "Recordings")
    for audio in os.listdir(folder_path):
        annotated_name = annotated_by_lowercase.get(audio.lower())
        if annotated_name is None:      # recording without annotations of interest
            continue
        # store the file under the exact name used as annotation key, so later
        # look-ups by key also work on case-sensitive file systems
        os.rename(
            os.path.join(folder_path, audio),
            os.path.join(WABAD_AUDIO_SOURCE, annotated_name)
        )
    

In [None]:
# LOAD (alternative to the computation below: reuse a previously generated file)
# with open("utils/WABAD/audio_info_wabad.json") as f:
#     audio_info_wabad = json.load(f)

# the output folder is not created anywhere else in the notebook
os.makedirs("utils/WABAD", exist_ok=True)
audio_info_wabad = generate_audio_info(WABAD_AUDIO_SOURCE, audio_annots_wabad)
with open("utils/WABAD/audio_info_wabad.json", "w") as f:
    json.dump(audio_info_wabad, f)

In [131]:
# label every window of every annotated WABAD recording and store the result
true_segments_wabad = generate_true_segments(audio_annots_wabad, audio_info_wabad)
# make sure the output folder exists before writing into it
os.makedirs("utils/WABAD", exist_ok=True)
with open("utils/WABAD/true_segments_wabad.json", "w") as f:
    json.dump(true_segments_wabad, f)

In [None]:
# destination folder of the WABAD segments (created if it does not exist yet)
WABAD_SEGMENTS_PATH = "E:/Giacomo/Tovanella/WABAD_segments"
os.makedirs(name=WABAD_SEGMENTS_PATH, exist_ok=True)

In [None]:
# generate_segments(WABAD_AUDIO_SOURCE, WABAD_SEGMENTS_PATH, true_segments_wabad, audio_info_wabad, generate_None=False)

In [144]:
# number of extracted WABAD segment files per label (one sub-folder per label)
species_count_wabad_fr = {
    species: len(os.listdir(os.path.join(WABAD_SEGMENTS_PATH, species)))
    for species in os.listdir(WABAD_SEGMENTS_PATH)
}

species_count_wabad_fr_df = pd.DataFrame(list(species_count_wabad_fr.items()), columns=["Species", "Count WABAD FR"])
# inner join: only labels present in both tables are shown
merged_df = species_count_df.merge(species_count_wabad_fr_df, on="Species", how="inner")
merged_df.sort_values(by=["Count"], ascending=False)

Unnamed: 0,Species,Count,Count WABAD FR
2,Phylloscopus collybita_Common Chiffchaff,488,2970
8,Regulus regulus_Goldcrest,350,1198
0,Troglodytes troglodytes_Eurasian Wren,249,2503
5,Periparus ater_Coal Tit,199,3698
10,Certhia familiaris_Eurasian Treecreeper,118,258
1,Muscicapa striata_Spotted Flycatcher,76,158
9,Turdus merula_Eurasian Blackbird,53,8318
7,Lophophanes cristatus_Crested Tit,48,359
12,Dendrocopos major_Great Spotted Woodpecker,42,638
11,Loxia curvirostra_Common Crossbill,39,125
