# Data Preprocessing

In [None]:
import os
import random
import pandas as pd
from tqdm import tqdm

In [2]:
DATASET_NAME = 'dataset'
AUDIO_SOURCE = '/home/giacomoschiavo/Tovanella'

In [3]:
DATASET_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}'
TRAIN_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}/train'
VALID_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}/valid'
TEST_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}/test'

In [4]:
species_list = set(os.listdir(TEST_PATH)).intersection(set(os.listdir(TRAIN_PATH)))
if 'Engine_Engine' in species_list:
    species_list.remove('Engine_Engine')
if 'Cuculus canorus_Common Cuckoo' in species_list:
    species_list.remove('Cuculus canorus_Common Cuckoo')

In [5]:
def print_dataset_count_table(dataset_path):
    train_folder = f"{dataset_path}/train"
    valid_folder = f"{dataset_path}/valid"
    test_folder = f"{dataset_path}/test"

    dataset_count = {}
    for species in os.listdir(test_folder):
        if species not in species_list:
            continue
        dataset_count[species] = {
            "train": len(os.listdir(os.path.join(train_folder, species))) if os.path.exists(os.path.join(train_folder, species)) else 0,
            "valid": len(os.listdir(os.path.join(valid_folder, species))) if os.path.exists(os.path.join(valid_folder, species)) else 0,
            "test": len(os.listdir(os.path.join(test_folder, species))) if os.path.exists(os.path.join(test_folder, species)) else 0
        }

    dataset_species_count_df = pd.DataFrame.from_dict(dataset_count, orient='index')
    dataset_species_count_df.index.name = 'Species'
    return dataset_species_count_df.sort_values(by=["train"], ascending=False)


In [6]:
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13800,1413,4683
Fringilla coelebs_Common Chaffinch,8995,807,1666
Sylvia atricapilla_Eurasian Blackcap,3540,373,540
Regulus ignicapilla_Common Firecrest,2817,285,599
Phylloscopus collybita_Common Chiffchaff,2129,218,674
Erithacus rubecula_European Robin,1702,171,556
Troglodytes troglodytes_Eurasian Wren,1160,123,278
Regulus regulus_Goldcrest,1019,72,168
Periparus ater_Coal Tit,1000,91,232
Wind,838,46,194


In [7]:
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13800,1413,4683
Fringilla coelebs_Common Chaffinch,8995,807,1666
Sylvia atricapilla_Eurasian Blackcap,3540,373,540
Regulus ignicapilla_Common Firecrest,2817,285,599
Phylloscopus collybita_Common Chiffchaff,2129,218,674
Erithacus rubecula_European Robin,1702,171,556
Troglodytes troglodytes_Eurasian Wren,1160,123,278
Regulus regulus_Goldcrest,1019,72,168
Periparus ater_Coal Tit,1000,91,232
Wind,838,46,194


# Sample Removal

In [8]:
train_folder = f"{DATASET_PATH}/train"
train_removed = f"{DATASET_PATH}/train_removed"

In [9]:
os.makedirs(train_removed, exist_ok=True)
for species in os.listdir(train_removed):
    all_audio = os.listdir(os.path.join(train_removed, species))
    for audio in all_audio:
        os.rename(
            os.path.join(train_removed, species, audio),
            os.path.join(train_folder, species, audio)
        )

In [10]:

for species in os.listdir(train_folder):
    if species != 'None':
        continue
    all_audio = os.listdir(os.path.join(train_folder, species))
    if len(all_audio) <= 100:
        continue
    random.shuffle(all_audio)
    os.makedirs(os.path.join(train_removed, species), exist_ok=True)
    chosen_audio = all_audio[:5000]
    for audio in all_audio:
        if audio in chosen_audio:
            continue
        print(species, audio)
        os.rename(
            os.path.join(train_folder, species, audio),
            os.path.join(train_removed, species, audio)
        )

None 20190621_020000_480_0.wav
None 20190603_150000_117_0.wav
None 20190608_040000_207_0.wav
None 20200215_070000_354_0.wav
None 20190603_110000_325_5.wav
None 20200216_110000_331_5.wav
None 20200216_060000_484_5.wav
None 20190603_230000_237_0.wav
None 20200217_060000_57_0.wav
None 20190621_020000_165_0.wav
None 20200217_060000_420_0.wav
None 20200217_160000_1_5.wav
None 20190621_110000_114_0.wav
None 20190621_110000_55_5.wav
None 20200217_090000_313_5.wav
None 20190608_110000_519_0.wav
None 20190608_080000_333_0.wav
None 20200217_060000_240_0.wav
None XC696997_0_81_0.wav
None 20190621_140000_4_5.wav
None 20200217_110000_360_0.wav
None 20190608_120000_57_0.wav
None 20200215_090000_244_5.wav
None 20200216_080000_93_0.wav
None 20190603_230000_442_5.wav
None 20190603_150000_202_5.wav
None 20190603_140000_73_5.wav
None 20200217_140000_73_5.wav
None 20190603_040000_520_5.wav
None 20190608_170000_157_5.wav
None 20190621_210000_39_0.wav
None 20190608_060000_513_0.wav
None 20190621_010000_495_

# Augmentation

In [11]:
REMOVED_PATH = f'{DATASET_PATH}/augm_removed'
species_to_augment = ['Coccothraustes coccothraustes_Hawfinch',
 'Lophophanes cristatus_Crested Tit',
 'Loxia curvirostra_Common Crossbill',
 'Parus major_Great Tit']

In [12]:
# MOVE ALREADY CALCULATED AUGMENTATIONS
if os.path.exists(REMOVED_PATH):
    for species in species_to_augment:
        for audio in os.listdir(os.path.join(REMOVED_PATH, species)):
            if "aug" in audio:
                os.rename(
                    os.path.join(REMOVED_PATH, species, audio),
                    os.path.join(TRAIN_PATH, species, audio)
                )

In [13]:
import librosa
import soundfile as sf
from audiomentations import Compose, PitchShift, TimeStretch, AddBackgroundNoise, Gain

In [14]:
def load_audio(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    return audio, sr

def save_audio(file_path, audio, sr):
    sf.write(file_path, audio, sr)

In [15]:

bg_noise_path = f"{DATASET_PATH}/train/None"
bg_noises = os.listdir(bg_noise_path)
augmentations = {
    "psA": Compose([PitchShift(min_semitones=-3, max_semitones=-1, p=0.75)]),
    "psB": Compose([PitchShift(min_semitones=1, max_semitones=3, p=0.75)]),
    "ts": Compose([TimeStretch(min_rate=0.9, max_rate=1.1, p=0.75)]),
    "gain": Compose([Gain(min_gain_db=-5, max_gain_db=5, p=0.5)]),
    "bn": Compose([
        AddBackgroundNoise(sounds_path=os.path.join(bg_noise_path, random.choice(bg_noises)), p=0.8),
        Gain(min_gain_db=-5, max_gain_db=5, p=0.5) 
    ]),
}


def apply_augmentations(file_path, output_dir):
    audio, sr = load_audio(file_path)
    
    for aug_name, aug in augmentations.items():
        file_name = os.path.splitext(file_path)[0].split('/')[-1]
        if "aug" in file_name:
            continue
        save_audio_path = f"{output_dir}/{file_name}_aug_{aug_name}.wav"
        if os.path.exists(save_audio_path):
            continue
        if "spec" in save_audio_path:
            augmented_audio, _, _ = aug(audio, sr)
        else:
            augmented_audio = aug(audio, sr)
        save_audio(save_audio_path, augmented_audio, sr)

In [16]:
# # pick samples where validation samples == 0
# train_folder = f'{DATASET_PATH}/train'
# species_to_augment = []
# threshold = 250
# for species in os.listdir(train_folder):
#     if len(species.split("_")) <= 1:
#         print(species, "skipped")
#         continue
#     if len(os.listdir(os.path.join(train_folder, species))) >= threshold:
#         continue
#     species_to_augment.append(species)

# species_to_augment

In [17]:
for species in species_to_augment:
    files = os.listdir(os.path.join(train_folder, species))
    print("Augmenting ", species)
    for i in tqdm(range(len(files)), colour="blue"):
        audio = files[i]
        apply_augmentations(os.path.join(train_folder, species, audio), os.path.join(train_folder, species))

Augmenting  Coccothraustes coccothraustes_Hawfinch


100%|[34m██████████[0m| 81/81 [00:11<00:00,  7.10it/s]


Augmenting  Lophophanes cristatus_Crested Tit


100%|[34m██████████[0m| 140/140 [00:19<00:00,  7.04it/s]


Augmenting  Loxia curvirostra_Common Crossbill


100%|[34m██████████[0m| 193/193 [00:26<00:00,  7.24it/s]


Augmenting  Parus major_Great Tit


100%|[34m██████████[0m| 118/118 [00:16<00:00,  7.32it/s]


In [18]:
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,8995,807,1666
,5000,1413,4683
Sylvia atricapilla_Eurasian Blackcap,3540,373,540
Regulus ignicapilla_Common Firecrest,2817,285,599
Phylloscopus collybita_Common Chiffchaff,2129,218,674
Erithacus rubecula_European Robin,1702,171,556
Troglodytes troglodytes_Eurasian Wren,1160,123,278
Loxia curvirostra_Common Crossbill,1158,19,46
Regulus regulus_Goldcrest,1019,72,168
Periparus ater_Coal Tit,1000,91,232


## Augmentation removal

In [19]:
# train_path = f"{DATASET_PATH}/train"
# target_path = f"{DATASET_PATH}/augm_removed"
# os.makedirs(target_path, exist_ok=True)
# for species in os.listdir(train_path):
#     for audio in os.listdir(os.path.join(train_path, species)):
#         os.makedirs(os.path.join(target_path, species), exist_ok=True)
#         if "aug" in audio:
#             print(audio)
#             os.rename(
#                 os.path.join(train_path, species, audio),
#                 os.path.join(target_path, species, audio),
#             )

In [20]:
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,8995,807,1666
,5000,1413,4683
Sylvia atricapilla_Eurasian Blackcap,3540,373,540
Regulus ignicapilla_Common Firecrest,2817,285,599
Phylloscopus collybita_Common Chiffchaff,2129,218,674
Erithacus rubecula_European Robin,1702,171,556
Troglodytes troglodytes_Eurasian Wren,1160,123,278
Loxia curvirostra_Common Crossbill,1158,19,46
Regulus regulus_Goldcrest,1019,72,168
Periparus ater_Coal Tit,1000,91,232
