# Data Preprocessing

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import json
import librosa
import soundfile as sf
import random
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm
from birdlib import utils

In [6]:
# Name of the dataset folder; it is interpolated into the segment paths and
# into the "utils/<name>/" folder that stores the split JSON files.
DATASET_NAME = "dataset"
# NOTE(review): not referenced by the cells visible here; presumably the root
# of the original recordings - confirm before relying on it.
AUDIO_SOURCE = "/home/giacomoschiavo/Tovanella"

In [7]:
# Root folder of the segmented dataset; every split lives directly below it.
DATASET_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}'
# One folder per split, each containing one sub-folder per species.
TRAIN_PATH = f'{DATASET_PATH}/train'
VALID_PATH = f'{DATASET_PATH}/valid'
TEST_PATH = f'{DATASET_PATH}/test'

In [8]:
# Keep only the species folders that exist in both the test and the train split.
species_list = set(os.listdir(TEST_PATH)) & set(os.listdir(TRAIN_PATH))
# Exclude these two labels; in-place set difference ignores labels that are absent.
species_list -= {'Engine_Engine', 'Cuculus canorus_Common Cuckoo'}

In [9]:
def print_dataset_count_table(dataset_path, species=None):
    """Build a per-species table of entry counts for the train/valid/test splits.

    Despite its name, this function prints nothing: it returns the table so
    that a notebook cell can display it as its last expression.

    Args:
        dataset_path: Dataset root containing the ``train``, ``valid`` and
            ``test`` folders, each holding one sub-folder per species.
        species: Optional collection of species folder names to report. When
            omitted, the notebook-level ``species_list`` is used.

    Returns:
        pandas.DataFrame indexed by species (index name ``Species``) with the
        columns ``train``, ``valid`` and ``test``, sorted by ``train`` in
        descending order. Only names found in the test folder are listed; a
        species folder missing from a split counts as 0. When no species
        qualifies the frame is empty but still carries the three columns.

    Raises:
        FileNotFoundError: If ``<dataset_path>/test`` does not exist.
    """
    splits = ("train", "valid", "test")
    wanted = species_list if species is None else species

    def count_entries(folder):
        # A species may be absent from a split: report 0 instead of failing.
        return len(os.listdir(folder)) if os.path.isdir(folder) else 0

    dataset_count = {}
    for name in os.listdir(f"{dataset_path}/test"):
        if name not in wanted:
            continue
        dataset_count[name] = {
            split: count_entries(os.path.join(f"{dataset_path}/{split}", name))
            for split in splits
        }

    table = pd.DataFrame.from_dict(dataset_count, orient='index')
    # Guarantee the three columns even when nothing matched; without this the
    # sort below raises KeyError('train') on an empty frame.
    table = table.reindex(columns=list(splits))
    table.index.name = 'Species'
    return table.sort_values(by=["train"], ascending=False)


In [26]:
# Per-species counts for each split, largest training folders first.
print_dataset_count_table(dataset_path=DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13860,0,4683
Fringilla coelebs_Common Chaffinch,8034,0,1666
Sylvia atricapilla_Eurasian Blackcap,3742,0,540
Regulus ignicapilla_Common Firecrest,2857,0,599
Phylloscopus collybita_Common Chiffchaff,2172,0,674
Erithacus rubecula_European Robin,1726,0,556
Troglodytes troglodytes_Eurasian Wren,1227,0,278
Periparus ater_Coal Tit,957,0,232
Turdus merula_Eurasian Blackbird,730,0,315
Regulus regulus_Goldcrest,725,0,168


In [29]:
# Per-species counts for each split, largest training folders first.
print_dataset_count_table(dataset_path=DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13800,1413,4683
Fringilla coelebs_Common Chaffinch,8995,807,1666
Sylvia atricapilla_Eurasian Blackcap,3540,373,540
Regulus ignicapilla_Common Firecrest,2817,285,599
Phylloscopus collybita_Common Chiffchaff,2129,218,674
Erithacus rubecula_European Robin,1702,171,556
Troglodytes troglodytes_Eurasian Wren,1160,123,278
Regulus regulus_Goldcrest,1019,72,168
Periparus ater_Coal Tit,1000,91,232
Wind,838,46,194


In [12]:
# Date information for the training split, later indexed as dates_count[species]
# and iterated to obtain the date prefixes of the segment filenames.
# NOTE(review): the exact return structure is defined by
# birdlib.utils.get_date_count - confirm there.
dates_count = utils.get_date_count(TRAIN_PATH, species_list)

# Test Integration

In [13]:
# NOTE(review): presumably decides which recording dates stay in train and
# which are moved to test (20% requested via test_ratio) - confirm against
# birdlib.utils.split_dataset, including whether the split is seeded.
train_integration, test_integration = utils.split_dataset(dates_count, TEST_PATH, test_ratio=0.2)

# Persist both parts of the split so they can be reloaded instead of recomputed.
integration_dir = f"utils/{DATASET_NAME}"
# open(..., 'w') does not create missing parent folders, so create them first.
os.makedirs(integration_dir, exist_ok=True)
for integration_name, integration in (
    ("train_integration", train_integration),
    ("test_integration", test_integration),
):
    with open(f"{integration_dir}/{integration_name}.json", 'w') as f:
        json.dump(integration, f)

In [58]:
# with open(f"utils/{DATASET_NAME}/train_integration.json") as f:
#     train_integration = json.load(f)

# with open(f"utils/{DATASET_NAME}/test_integration.json") as f:
#     test_integration = json.load(f)

In [None]:
# NOTE(review): presumably moves the segments belonging to the dates listed in
# test_integration from TRAIN_PATH to TEST_PATH - confirm against
# birdlib.utils.move_by_date. If it moves files on disk, re-running this cell
# is not a no-op.
utils.move_by_date(test_integration, TRAIN_PATH, TEST_PATH)

# Validation creation

In [27]:
# Move the last VALID_FRACTION of the training segments of every date (at least
# one per date) into the validation split. Within a date, segments are ordered
# by the number encoded in the two trailing filename fields (32_5 -> 32.5).
VALID_FRACTION = 0.1


def segment_position(filename):
    """Return the number encoded in the last two '_'-separated filename fields."""
    fields = filename.split('_')
    return float(fields[-2] + '.' + fields[-1].split('.')[0])


for species in species_list:
    train_species_path = os.path.join(TRAIN_PATH, species)
    valid_species_path = os.path.join(VALID_PATH, species)
    os.makedirs(valid_species_path, exist_ok=True)
    # Pre-sorting by name keeps a deterministic order for segments whose
    # numeric position ties (the sort below is stable).
    train_files = sorted(os.listdir(train_species_path))
    for date in dates_count[species]:
        date_files = sorted(
            (f for f in train_files if f.startswith(date)),
            key=segment_position,
        )
        if not date_files:
            # This date has no segment left in train for this species.
            continue
        num_samples_to_move = max(1, int(len(date_files) * VALID_FRACTION))
        for sample in date_files[-num_samples_to_move:]:
            os.rename(
                os.path.join(train_species_path, sample),
                os.path.join(valid_species_path, sample),
            )


In [8]:
# Per-species counts for each split, largest training folders first.
print_dataset_count_table(dataset_path=DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,10098,0,1370
,7082,0,4683
Sylvia atricapilla_Eurasian Blackcap,3960,0,493
Regulus ignicapilla_Common Firecrest,3463,0,238
Phylloscopus collybita_Common Chiffchaff,2347,0,674
Erithacus rubecula_European Robin,1873,0,556
Loxia curvirostra_Common Crossbill,1462,0,11
Troglodytes troglodytes_Eurasian Wren,1450,0,111
Periparus ater_Coal Tit,1294,0,29
Regulus regulus_Goldcrest,1243,0,16


In [20]:
# Number of entries (species folders) in train, test and valid, in that order.
tuple(len(os.listdir(split_path)) for split_path in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(55, 23, 20)

## Remove unused species

In [21]:
# Park every training species that has no validation folder under
# <dataset>/removed/train, so that train only keeps species present in valid.
valid_species = os.listdir(VALID_PATH)
REMOVED_PATH = f'{DATASET_PATH}/removed'
REMOVED_TRAIN_PATH = f'{REMOVED_PATH}/train'
os.makedirs(REMOVED_TRAIN_PATH, exist_ok=True)  # also creates REMOVED_PATH
for species in os.listdir(TRAIN_PATH):
    if species in valid_species:
        continue
    # Do not create the destination beforehand: os.rename moves the whole
    # species folder, and an already existing destination makes it fail on
    # Windows (and on any platform when that folder is not empty).
    os.rename(
        os.path.join(TRAIN_PATH, species),
        os.path.join(REMOVED_TRAIN_PATH, species)
    )

In [22]:
# Park every test species that has no validation folder under
# <dataset>/removed/test, so that test only keeps species present in valid.
valid_species = os.listdir(VALID_PATH)
REMOVED_TEST_PATH = f'{REMOVED_PATH}/test'
os.makedirs(REMOVED_TEST_PATH, exist_ok=True)
for species in os.listdir(TEST_PATH):
    if species in valid_species:
        continue
    # Do not create the destination beforehand: os.rename moves the whole
    # species folder, and an already existing destination makes it fail on
    # Windows (and on any platform when that folder is not empty).
    os.rename(
        os.path.join(TEST_PATH, species),
        os.path.join(REMOVED_TEST_PATH, species)
    )

In [23]:
# Number of entries (species folders) in train, test and valid, in that order.
tuple(len(os.listdir(split_path)) for split_path in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(20, 20, 20)

In [24]:
# Per-species counts for each split, largest training folders first.
print_dataset_count_table(dataset_path=DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13131,729,4683
Fringilla coelebs_Common Chaffinch,7623,411,1666
Sylvia atricapilla_Eurasian Blackcap,3556,186,540
Regulus ignicapilla_Common Firecrest,2715,142,599
Phylloscopus collybita_Common Chiffchaff,2060,112,674
Erithacus rubecula_European Robin,1633,93,556
Troglodytes troglodytes_Eurasian Wren,1162,65,278
Periparus ater_Coal Tit,909,48,232
Regulus regulus_Goldcrest,690,35,168
Turdus merula_Eurasian Blackbird,689,41,315


## Undo Valid

In [None]:
# # MOVE VALID TO TRAIN
# for species in tqdm(os.listdir(VALID_PATH)):
#     if species not in species_list:
#         continue
#     species_train_path = os.path.join(TRAIN_PATH, species)
#     species_valid_path = os.path.join(VALID_PATH, species)
#     if not os.path.exists(species_train_path):
#         os.makedirs(species_train_path)

#     for file in os.listdir(species_valid_path):
#         src = os.path.join(species_valid_path, file)
#         dst = os.path.join(species_train_path, file)
#         if not os.path.exists(dst):
#             os.rename(src, dst)
#         else:
#             print(f"File {dst} already exists, skipping {src}")