# Data Preprocessing

In [34]:
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import math
import os
from pydub import AudioSegment
import numpy as np
import json
from tqdm import tqdm
import pandas as pd
import seaborn as sns

# Species Dict
Create a dictionary to map the scientific name of a species to its common name

In [35]:
from pathlib import Path

# BirdNET label file: one "<scientific name>_<common name>" entry per line.
labels_file = Path("utils/BirdNET_GLOBAL_6K_V2.4_Labels_en_uk.txt")
with labels_file.open(encoding="utf-8") as handle:
    all_species = handle.read().splitlines()

# Preview the first few labels.
all_species[:5]

['Abroscopus albogularis_Rufous-faced Warbler',
 'Abroscopus schisticeps_Black-faced Warbler',
 'Abroscopus superciliaris_Yellow-bellied Warbler',
 'Aburria aburri_Wattled Guan',
 'Acanthagenys rufogularis_Spiny-cheeked Honeyeater']

In [36]:
# Maps every scientific name to its common name.
# Each label looks like <Abroscopus albogularis>_<Rufous-faced Warbler>. Split on the FIRST
# underscore only, so a common name that itself contains "_" cannot break the unpacking,
# and skip lines without a separator (e.g. blank lines) instead of raising ValueError.
species_dict = {}
for specie in all_species:
    scientific_name, separator, common_name = specie.partition("_")
    if not separator:
        continue
    species_dict[scientific_name] = common_name

species_dict["Abroscopus albogularis"]

'Rufous-faced Warbler'

In [37]:
# export species_dict to json
# with open('utils/species_dict_map.json', 'w') as f:
#     json.dump(species_dict, f)

# Category and Audio info files
These files are essential: they hold the annotation information for every category and every audio recording.

In [38]:
# Load the annotation table stored under the "Bird_tags" key of the MATLAB file.
bird_tags = scipy.io.loadmat('Bird_tags_Test.mat')["Bird_tags"]

# Show one example annotation: print each property next to its positional index.
example_record = bird_tags[12][0][0][0]
for i, prop in enumerate(example_record):
    print(i, prop)

0 ['Fringilla_coelebs']
1 ['20190601_030000.WAV']
2 [[ 6.47588832  1.63324873  1.63324873  6.47588832 15.42014847 17.89657331]]
3 [[15.42014847  6.47588832]
 [15.42014847  1.63324873]
 [17.89657331  1.63324873]
 [17.89657331  6.47588832]
 [15.42014847  6.47588832]]
4 [[2]]


+ **Category Info**: groups the annotated segments by category
+ **Audio Info**: groups the annotated segments by the audio file they come from

In [39]:
def get_audio_category_info(bird_tags_filepath, out_category_info="utils/category_info.json", out_audio_info="utils/audio_info.json", audio_path="E:/Giacomo/Tovanella/Tovanella", species_map=None):
    """Group the annotations of a ``Bird_tags`` MATLAB file by category and by audio file.

    Args:
        bird_tags_filepath: Path of the ``.mat`` file; its ``"Bird_tags"`` entry is read.
        out_category_info: Accepted for backward compatibility; not used by this function.
        out_audio_info: Accepted for backward compatibility; not used by this function.
        audio_path: Directory in which the referenced audio files are looked up.
            Annotations whose audio file does not exist there are skipped.
        species_map: Optional mapping ``scientific name -> common name``. When omitted,
            the notebook-level ``species_dict`` is used.

    Returns:
        ``(category_info, audio_info)``:
        ``category_info`` maps each label (``"<scientific name>_<common name>"``) to a list
        of ``{"file_name", "start_time", "duration", "label"}`` dicts;
        ``audio_info`` maps each audio file name to a list of
        ``{"scientific_name", "common_name", "start_time", "duration", "label"}`` dicts.
    """
    names = species_dict if species_map is None else species_map
    bird_tags = scipy.io.loadmat(bird_tags_filepath)["Bird_tags"]
    category_info = {}      # detections grouped by category
    audio_info = {}         # detections grouped by audio
    file_exists = {}        # cache: one filesystem check per audio file, not per annotation
    for elem in bird_tags:
        record = elem[0][0][0]
        # get source file; do not store info if the file does not exist
        file_name = str(record[1][0])
        if file_name not in file_exists:
            file_exists[file_name] = os.path.exists(os.path.join(audio_path, file_name))
        if not file_exists[file_name]:
            continue
        # get <scientific name>_<common name> format
        tag = record[0][0]
        scientific_name = tag.replace("_", " ")             # Fringilla_coelebs -> Fringilla coelebs
        common_name = names.get(scientific_name, "")        # Fringilla coelebs -> Common Chaffinch ("" if unknown)
        label = "_".join([scientific_name, common_name])    # Fringilla coelebs_Common Chaffinch
        # duration calculation: the last two values of the third property are used as
        # start and end time; convert them to native floats
        start_time, end_time = (float(value) for value in np.array(record[2]).flatten()[-2:])
        duration = end_time - start_time
        # save in dictionaries
        category_info.setdefault(label, []).append({
            "file_name": file_name,
            "start_time": start_time,
            "duration": duration,
            "label": label,
        })
        audio_info.setdefault(file_name, []).append({
            "scientific_name": scientific_name,
            "common_name": common_name,
            "start_time": start_time,
            "duration": duration,
            "label": label,
        })
    return category_info, audio_info


In [40]:
# Build the per-category and per-audio annotation dictionaries from Bird_tags_Test.mat.
category_info_test, audio_info_test = get_audio_category_info("Bird_tags_Test.mat")

# with open("utils/category_info_test.json", "w") as f:
#     json.dump(category_info_test, f)
# with open("utils/audio_info_test.json", "w") as f:
#     json.dump(audio_info_test, f)

In [41]:
# Build the per-category and per-audio annotation dictionaries from Bird_tags_Train.mat.
category_info, audio_info = get_audio_category_info("Bird_tags_Train.mat")

# with open("utils/category_info.json", "w") as f:
#     json.dump(category_info, f)
# with open("utils/audio_info.json", "w") as f:
#     json.dump(audio_info, f)

In [42]:
# creates species list
# NOTE: this is a live view of the category_info keys (the category labels), not a list copy.
species_list = category_info.keys()

# Segments Creation
Creates all the segments listed in Category Info

In [43]:
def _plan_segment_windows(start_time, duration):
    """Return ``(start_times, start_segms)`` for one annotation.

    ``start_times`` are the offsets (seconds) of the 3-second clips to cut and
    ``start_segms`` the matching 3-second segment indices. The caller is expected to
    have discarded annotations shorter than 1 second already.
    """
    start_times = []
    start_segms = []
    start_segm = int(start_time // 3)
    if duration < 3:
        # add contextual audio around the detection; clamp to the start of the file
        # because negative slice positions count from the end of a pydub AudioSegment
        padding = (3.0 - duration) / 2
        start_times.append(max(start_time - padding, 0))
        start_segms.append(start_segm)
    elif duration < 4:
        start_times.extend([start_time, max(start_time - 1.5, 0), start_time + 1.5])
        start_segms.extend([start_segm, max(start_segm - 1, 0), start_segm + 1])
    elif duration < 6:
        start_times.extend([start_time, max(start_time - 1.5, 0), start_time + 3])
        start_segms.extend([start_segm, max(start_segm - 1, 0), start_segm + 1])
    elif duration >= 6:
        # one clip of leading context, then back-to-back full clips
        start_times.append(max(start_time - 1.5, 0))
        start_segms.append(max(start_segm - 1, 0))
        num_full_segm = int(duration // 3)
        start_times.extend(start_time + k * 3.0 for k in range(num_full_segm))
        start_segms.extend(start_segm + k for k in range(num_full_segm))
        # keep the tail only if it holds at least one second of the detection
        if duration - 3.0 * num_full_segm >= 1:
            start_times.append(start_time + 3.0 * num_full_segm)
            start_segms.append(start_segm + num_full_segm)
    return start_times, start_segms


def _export_segment_clips(source_file, category_dir, file_name, start_times, start_segms):
    """Write each planned 3-second clip to ``category_dir`` unless its file already exists."""
    audio = None
    for clip_start, segm in zip(start_times, start_segms):
        export_path = os.path.join(category_dir, f"{file_name}_{segm}.wav")
        if os.path.exists(export_path):
            continue
        if audio is None:
            # decode the source lazily: nothing is read when every clip already exists
            audio = AudioSegment.from_file(source_file, format="wav")
            os.makedirs(category_dir, exist_ok=True)
        begin_ms = clip_start * 1000
        audio[begin_ms:begin_ms + 3000].export(export_path, format="wav")


def generate_segments(target_path, audio_source_path, category_info, generate=False):
    """Map every annotation to 3-second segments and optionally export the clips.

    Args:
        target_path: Root directory for exported clips; one sub-directory per category.
        audio_source_path: Root of the source recordings. A recording is read from
            ``<audio_source_path>/<site>/<site>/Recordings/<file_name>``, where ``<site>``
            is the part of the file name before the first underscore.
        category_info: Mapping ``label -> list of annotations``; each annotation provides
            ``"file_name"``, ``"start_time"`` and ``"duration"``. Only the first 500
            annotations of each category are used.
        generate: When true, the clips are written to disk; otherwise only the mapping
            is computed.

    Returns:
        ``{file_name: {segment_index: [label, ...]}}``. One label is appended per
        annotation, so a list may contain the same label more than once.
    """
    true_segments = {}
    categories = list(category_info.keys())
    for j, category in enumerate(categories):
        annotations = category_info[category][:500]  # limit to 500 annotations
        print(f"Loading {category} category... {j}/{len(categories)}")
        for annotation in tqdm(annotations):
            audio_path = annotation["file_name"]  # ex. BIAL_20210420_051500.WAV
            # splitext strips only the extension, so names with extra dots are handled
            file_name = os.path.splitext(audio_path)[0]
            site = file_name.split('_')[0]
            start_time = annotation["start_time"]
            duration = annotation["duration"]
            if duration < 1:    # ignore
                continue
            start_times, start_segms = _plan_segment_windows(start_time, duration)

            segments_of_audio = true_segments.setdefault(audio_path, {})
            for segm in start_segms:
                segments_of_audio.setdefault(segm, []).append(category)

            if generate:
                _export_segment_clips(
                    os.path.join(audio_source_path, site, site, "Recordings", audio_path),
                    os.path.join(target_path, category),
                    file_name,
                    start_times,
                    start_segms,
                )
    return true_segments

In [44]:
# Compute the audio -> segment -> labels mapping for the training annotations.
# generate=False: only the mapping is built, no audio clip is exported.
true_segments = generate_segments(target_path="E:/Giacomo/Tovanella/all_segments_5/train", 
                  audio_source_path="E:/Giacomo/Tovanella/Tovanella", 
                  category_info=category_info, 
                  generate=False)

Loading Wind_ category... 0/31


100%|██████████| 92/92 [00:00<?, ?it/s]


Loading Regulus ignicapilla_Common Firecrest category... 1/31


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Sylvia atricapilla_Eurasian Blackcap category... 2/31


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Fringilla coelebs_Common Chaffinch category... 3/31


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Troglodytes troglodytes_Eurasian Wren category... 4/31


100%|██████████| 249/249 [00:00<00:00, 140923.18it/s]


Loading Muscicapa striata_Spotted Flycatcher category... 5/31


100%|██████████| 76/76 [00:00<?, ?it/s]


Loading Phylloscopus collybita_Common Chiffchaff category... 6/31


100%|██████████| 488/488 [00:00<?, ?it/s]


Loading Turdus viscivorus_Mistle Thrush category... 7/31


100%|██████████| 23/23 [00:00<?, ?it/s]


Loading Glaucidium passerinum_Eurasian Pygmy-Owl category... 8/31


100%|██████████| 6/6 [00:00<?, ?it/s]


Loading Pyrrhula pyrrhula_Eurasian Bullfinch category... 9/31


100%|██████████| 23/23 [00:00<?, ?it/s]


Loading Pecking_ category... 10/31


100%|██████████| 34/34 [00:00<?, ?it/s]


Loading Periparus ater_Coal Tit category... 11/31


100%|██████████| 199/199 [00:00<?, ?it/s]


Loading Prunella modularis_Dunnock category... 12/31


100%|██████████| 1/1 [00:00<?, ?it/s]


Loading Lophophanes cristatus_Crested Tit category... 13/31


100%|██████████| 48/48 [00:00<00:00, 48026.38it/s]


Loading Regulus regulus_Goldcrest category... 14/31


100%|██████████| 350/350 [00:00<00:00, 254024.29it/s]


Loading Insect_ category... 15/31


100%|██████████| 4/4 [00:00<?, ?it/s]


Loading Aeroplane_ category... 16/31


100%|██████████| 3/3 [00:00<?, ?it/s]


Loading Vegetation_ category... 17/31


100%|██████████| 62/62 [00:00<?, ?it/s]


Loading Rain_ category... 18/31


100%|██████████| 34/34 [00:00<?, ?it/s]


Loading Turdus merula_Eurasian Blackbird category... 19/31


100%|██████████| 53/53 [00:00<?, ?it/s]


Loading Certhia familiaris_Eurasian Treecreeper category... 20/31


100%|██████████| 118/118 [00:00<?, ?it/s]


Loading Erithacus rubecula_European Robin category... 21/31


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Turdus philomelos_Song Thrush category... 22/31


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Bat_ category... 23/31


100%|██████████| 1/1 [00:00<?, ?it/s]


Loading Loxia curvirostra_Common Crossbill category... 24/31


100%|██████████| 39/39 [00:00<00:00, 38947.11it/s]


Loading Dendrocopos major_Great Spotted Woodpecker category... 25/31


100%|██████████| 42/42 [00:00<?, ?it/s]


Loading Dryocopus martius_Black Woodpecker category... 26/31


100%|██████████| 28/28 [00:00<?, ?it/s]


Loading Phylloscopus trochilus_Willow Warbler category... 27/31


100%|██████████| 3/3 [00:00<?, ?it/s]


Loading Spinus spinus_Eurasian Siskin category... 28/31


100%|██████████| 5/5 [00:00<?, ?it/s]


Loading Poecile palustris_Marsh Tit category... 29/31


100%|██████████| 1/1 [00:00<?, ?it/s]


Loading unknown_ category... 30/31


100%|██████████| 3/3 [00:00<?, ?it/s]


In [45]:
# remove duplicate labels in segments
# dict.fromkeys keeps the first occurrence of every label, so the resulting order is the
# same on every run (list(set(...)) orders strings differently from one process to the next)
for audio in true_segments:
    for segm in true_segments[audio]:
        true_segments[audio][segm] = list(dict.fromkeys(true_segments[audio][segm]))

In [None]:
# with open('utils/true_segments.json', 'w') as f:
#     json.dump(true_segments, f)

# WABAD Dataset Integration
Same procedure as above, but for another dataset whose annotations come in a different format and therefore need different handling.

In [47]:
# Load the species_dict_map mapping from its JSON file.
with open('utils/species_dict_map.json') as handle:
    species_dict_map = json.loads(handle.read())

In [48]:
# Keep only the part of each label that precedes the first underscore.
allowed_species_name = [label.partition("_")[0] for label in species_list]

In [49]:
import csv
def convert_to_json(input_file, allowed_species=None, species_map=None):
    """Convert a directory of tab-separated annotation tables into info dictionaries.

    Every ``*.txt`` file in ``input_file`` is read as a tab-delimited table with the
    columns ``"Species"``, ``"Begin Time (s)"`` and ``"End Time (s)"``. The audio file
    name is the annotation file name with its extension replaced by ``.WAV``.

    Args:
        input_file: Directory that contains the annotation ``.txt`` files.
        allowed_species: Optional iterable of species names to keep. When omitted, the
            notebook-level ``allowed_species_name`` is used.
        species_map: Optional mapping ``scientific name -> common name``. When omitted,
            the notebook-level ``species_dict_map`` is used. Species missing from the
            mapping get an empty common name.

    Returns:
        ``(audio_info, category_info)`` -- note the order.
        ``audio_info`` maps each audio file name to a list of
        ``{"scientific_name", "common_name", "start_time", "duration", "label"}`` dicts;
        ``category_info`` maps each label to a list of
        ``{"file_name", "start_time", "duration", "label"}`` dicts.
    """
    # a set makes the per-row membership test O(1)
    allowed = set(allowed_species_name if allowed_species is None else allowed_species)
    names = species_dict_map if species_map is None else species_map
    audio_info = {}
    category_info = {}
    # os.listdir returns entries in arbitrary order; sort so the result is reproducible
    for txt_file in sorted(os.listdir(input_file)):
        stem, extension = os.path.splitext(txt_file)
        if extension.lower() != ".txt":
            continue    # not an annotation table
        file_name = stem + ".WAV"
        complete_path = os.path.join(input_file, txt_file)
        with open(complete_path, newline='', encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile, delimiter='\t'):
                species = row.get("Species")
                begin = row.get("Begin Time (s)")
                end = row.get("End Time (s)")
                # skip species that are not requested and rows without usable times
                if species not in allowed or not begin or not end:
                    continue
                start_time = float(begin)
                duration = float(end) - start_time
                common_name = names.get(species, "")
                label = f"{species}_{common_name}"

                audio_info.setdefault(file_name, []).append({
                    "scientific_name": species,
                    "common_name": common_name,
                    "start_time": start_time,
                    "duration": duration,
                    "label": label,
                })
                category_info.setdefault(label, []).append({
                    "file_name": file_name,
                    "start_time": start_time,
                    "duration": duration,
                    "label": label,
                })
    return audio_info, category_info

In [50]:
wabad_path = "E:/Giacomo/Tovanella/WABAD"
audio_info_ext = {}
category_info_ext = {}
# Merge the annotations of every site folder into one pair of dictionaries.
# os.listdir returns entries in arbitrary order; sorting makes the merged lists identical
# on every run, which matters because generate_segments only uses the first 500
# annotations of each category.
for folder in sorted(os.listdir(wabad_path)):
    if not os.path.isdir(os.path.join(wabad_path, folder)):
        continue
    # annotations live in <wabad_path>/<site>/<site>/Raven Pro annotations
    annotations = os.path.join(wabad_path, folder, folder, "Raven Pro annotations")
    audio_info_update, category_info_update = convert_to_json(annotations)
    for audio, entries in audio_info_update.items():
        audio_info_ext.setdefault(audio, []).extend(entries)
    for category, entries in category_info_update.items():
        category_info_ext.setdefault(category, []).extend(entries)

# with open("utils/audio_info_ext.json", 'w', encoding='utf-8') as jsonfile:
#     json.dump(audio_info_ext, jsonfile)

# with open("utils/category_info_ext.json", 'w', encoding='utf-8') as jsonfile:
#     json.dump(category_info_ext, jsonfile)

In [51]:
# Compute the audio -> segment -> labels mapping for the WABAD annotations.
# generate=False: only the mapping is built, no audio clip is exported.
# NOTE(review): generate_segments appends one label per annotation, so a segment's list can
# repeat a label; no de-duplication of true_segments_ext is visible here -- confirm whether
# it is needed before exporting.
true_segments_ext = generate_segments(target_path="E:/Giacomo/Tovanella/wabad_segments", 
                  audio_source_path="E:/Giacomo/Tovanella/WABAD/", 
                  category_info=category_info_ext, 
                  generate=False)

Loading Phylloscopus collybita_Common Chiffchaff category... 0/22


100%|██████████| 500/500 [00:00<00:00, 47968.89it/s]


Loading Phylloscopus trochilus_Willow Warbler category... 1/22


100%|██████████| 418/418 [00:00<00:00, 139243.83it/s]


Loading Dendrocopos major_Great Spotted Woodpecker category... 2/22


100%|██████████| 242/242 [00:00<?, ?it/s]


Loading Dryocopus martius_Black Woodpecker category... 3/22


100%|██████████| 32/32 [00:00<00:00, 31926.20it/s]


Loading Erithacus rubecula_European Robin category... 4/22


100%|██████████| 500/500 [00:00<00:00, 163100.95it/s]


Loading Turdus philomelos_Song Thrush category... 5/22


100%|██████████| 500/500 [00:00<?, ?it/s]


Loading Certhia familiaris_Eurasian Treecreeper category... 6/22


100%|██████████| 96/96 [00:00<00:00, 35215.43it/s]


Loading Fringilla coelebs_Common Chaffinch category... 7/22


100%|██████████| 500/500 [00:00<00:00, 124986.71it/s]


Loading Turdus merula_Eurasian Blackbird category... 8/22


100%|██████████| 500/500 [00:00<00:00, 500036.24it/s]


Loading Troglodytes troglodytes_Eurasian Wren category... 9/22


100%|██████████| 500/500 [00:00<00:00, 166705.25it/s]


Loading Prunella modularis_Dunnock category... 10/22


100%|██████████| 75/75 [00:00<00:00, 75005.44it/s]


Loading Regulus regulus_Goldcrest category... 11/22


100%|██████████| 437/437 [00:00<00:00, 436823.37it/s]


Loading Periparus ater_Coal Tit category... 12/22


100%|██████████| 500/500 [00:00<00:00, 166731.75it/s]


Loading Regulus ignicapilla_Common Firecrest category... 13/22


100%|██████████| 280/280 [00:00<00:00, 280287.62it/s]


Loading Sylvia atricapilla_Eurasian Blackcap category... 14/22


100%|██████████| 500/500 [00:00<00:00, 99959.58it/s]


Loading Pyrrhula pyrrhula_Eurasian Bullfinch category... 15/22


100%|██████████| 8/8 [00:00<?, ?it/s]


Loading Spinus spinus_Eurasian Siskin category... 16/22


100%|██████████| 102/102 [00:00<?, ?it/s]


Loading Poecile palustris_Marsh Tit category... 17/22


100%|██████████| 12/12 [00:00<?, ?it/s]


Loading Lophophanes cristatus_Crested Tit category... 18/22


100%|██████████| 132/132 [00:00<?, ?it/s]


Loading Turdus viscivorus_Mistle Thrush category... 19/22


100%|██████████| 229/229 [00:00<?, ?it/s]


Loading Loxia curvirostra_Common Crossbill category... 20/22


100%|██████████| 25/25 [00:00<?, ?it/s]


Loading Muscicapa striata_Spotted Flycatcher category... 21/22


100%|██████████| 117/117 [00:00<?, ?it/s]


In [None]:
# with open("utils/true_segments_ext.json", "w") as f:
#     json.dump(true_segments_ext, f)