# Data augmentation

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import json
import librosa
import soundfile as sf
import random
import pandas as pd
from tqdm import tqdm
from audiomentations import Compose, PitchShift, TimeStretch, AddBackgroundNoise, Gain

In [10]:
def load_audio(file_path):
    """Read an audio file with librosa without resampling it.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The ``(audio, sr)`` pair produced by ``librosa.load``: the decoded
        samples and the sampling rate they were read at.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    return librosa.load(file_path, sr=None)

def save_audio(file_path, audio, sr):
    """Write audio samples to disk through ``soundfile.write``.

    Args:
        file_path: Destination path of the audio file.
        audio: Samples to write.
        sr: Sampling rate stored with the samples.
    """
    sf.write(file=file_path, data=audio, samplerate=sr)

In [11]:
# Folder with the soundscape recordings available as background noise.
bg_noise_path = "e:/Giacomo/Tovanella/soundscapes"
# Names of the entries found in that folder.
bg_noises = os.listdir(bg_noise_path)

# Augmentation pipelines, keyed by the suffix that apply_augmentations appends
# to the output file name. Only pitch shifting ("ps") is enabled here; the
# time-stretch and background-noise variants are kept disabled for reference:
#   "ts": Compose([TimeStretch(min_rate=0.95, max_rate=1.05, p=1.0)])
#   "bn": Compose([AddBackgroundNoise(sounds_path=os.path.join(bg_noise_path, random.choice(bg_noises)), p=1.0)])
augmentations = dict(
    ps=Compose([PitchShift(min_semitones=-1, max_semitones=1, p=1.0)]),
)

def apply_augmentations(file_path, output_dir):
    """Write one augmented copy of ``file_path`` per entry of ``augmentations``.

    Every copy is saved as ``{output_dir}/{stem}_{aug_name}.wav``, where
    ``stem`` is the input file name without directory and extension and
    ``aug_name`` is the key of the pipeline in the module-level
    ``augmentations`` dict.

    Nothing is written when:
      * the stem has four or more ``_``-separated parts (such names are
        treated as outputs of a previous augmentation run), or
      * the output file for a given pipeline already exists.

    The audio is decoded only if at least one output still has to be
    produced, so re-running over an already processed folder is cheap.

    Args:
        file_path: Path of the source audio file. Forward and backward
            slashes are both accepted as directory separators.
        output_dir: Directory that receives the augmented files. It is not
            created here.

    Returns:
        None.
    """
    # Normalise separators so the stem is extracted correctly regardless of
    # whether the path was built with "/" or "\\" (the original split only on
    # backslashes, which breaks for forward-slash paths).
    normalised = os.fspath(file_path).replace("\\", "/")
    file_name = os.path.splitext(os.path.basename(normalised))[0]

    # Already-augmented inputs are skipped before any audio is decoded.
    if len(file_name.split("_")) >= 4:
        return

    # Collect the pipelines whose output is still missing.
    pending = {}
    for aug_name, aug in augmentations.items():
        out_path = f"{output_dir}/{file_name}_{aug_name}.wav"
        if not os.path.exists(out_path):
            pending[out_path] = aug
    if not pending:
        return

    audio, sr = load_audio(file_path)
    for out_path, aug in pending.items():
        augmented_audio = aug(samples=audio, sample_rate=sr)
        save_audio(out_path, augmented_audio, sr)

### ripristina training folder spostando tutto da subset

In [12]:
subset_folder = "E:/Giacomo/Tovanella/DATASET/subset_training"
training_folder = "E:/Giacomo/Tovanella/orig_segments/train"

# Move files from the subset folder back into the training folder, species by
# species. Only names with exactly three "_"-separated parts are moved
# (presumably the original, non-augmented segments - confirm against the
# naming scheme); everything else stays in the subset folder.
for specie in os.listdir(subset_folder):
    # os.rename does not create missing parent folders, so make sure the
    # species folder exists on the training side before moving into it.
    os.makedirs(os.path.join(training_folder, specie), exist_ok=True)
    for audio in os.listdir(os.path.join(subset_folder, specie)):
        if len(audio.split("_")) == 3:
            os.rename(
                os.path.join(subset_folder, specie, audio),
                os.path.join(training_folder, specie, audio)
            )

In [13]:
# Split into subsets: pick at most `n_samples` random files per species from
# the training folder and prepare the matching species folder in the subset.
# Both folders are defined here so the cell does not rely on variables left
# over from an earlier cell.
subset_folder = "E:/Giacomo/Tovanella/DATASET/subset_training"
training_folder = "E:/Giacomo/Tovanella/orig_segments/train"
n_samples = 100  # maximum number of files selected per species

for specie in os.listdir(training_folder):
    os.makedirs(os.path.join(subset_folder, specie), exist_ok=True)
    all_audios = os.listdir(os.path.join(training_folder, specie))
    if len(all_audios) <= n_samples:
        # Not enough files to sample from: take them all.
        selected_sample = all_audios
    else:
        selected_sample = np.random.choice(all_audios, n_samples, replace=False)
    # The actual move is currently disabled; re-enable the lines below to
    # transfer the selected files into the subset folder.
    # for audio in selected_sample:
        # os.rename(
        #     os.path.join(training_folder, specie, audio),
        #     os.path.join(subset_folder, specie, audio)
        # )

KeyboardInterrupt: 

### Move augmented files from the training folder to the `altro` (other) folder

In [None]:
# Move every augmented file out of the training folder into `other_folder`,
# keeping the per-species layout. A file is treated as augmented when its name
# has more than three "_"-separated parts. Files are moved, not deleted.
training_folder = "E:/Giacomo/Tovanella/orig_segments/train"
other_folder = "E:/Giacomo/Tovanella/orig_segments/altro"
os.makedirs(other_folder, exist_ok=True)
for specie in os.listdir(training_folder):
    # Create the destination once per species rather than once per file.
    os.makedirs(os.path.join(other_folder, specie), exist_ok=True)
    for audio in os.listdir(os.path.join(training_folder, specie)):
        if len(audio.split("_")) > 3:
            os.rename(
                os.path.join(training_folder, specie, audio),
                os.path.join(other_folder, specie, audio)
            )

### applica augmentation su tutti i file in subset

In [None]:
# Augmentation pipelines, keyed by the suffix appended to the output file name.
#
# AddBackgroundNoise receives the whole noise folder rather than a single file
# chosen with random.choice: that choice was evaluated only once, when this
# dict was built, so every augmented sample ended up mixed with the same
# recording. Given a folder, the transform picks a noise file on each call.
augmentations = {
    # Pitch shift of up to +/- 3 semitones.
    "ps": Compose([PitchShift(min_semitones=-3, max_semitones=3, p=1.0)]),
    # Time stretch between 0.9x and 1.1x.
    "ts": Compose([TimeStretch(min_rate=0.9, max_rate=1.1, p=1.0)]),
    # Background noise, followed half of the time by a gain change.
    "bn": Compose([
        AddBackgroundNoise(sounds_path=bg_noise_path, p=1.0),
        # Compose runs its transforms in order, so this gain is applied after
        # the noise has been mixed in, i.e. to the whole signal and not only
        # to the noise.
        Gain(min_gain_db=-10, max_gain_db=5, p=0.5)
    ]),
    # Each of the three transforms is applied independently with p=0.5.
    "combo": Compose([
        PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
        TimeStretch(min_rate=0.9, max_rate=1.1, p=0.5),
        AddBackgroundNoise(sounds_path=bg_noise_path, p=0.5)
    ]),
}

# Report the folders under `augm_path` that hold at most 200 files; these are
# the ones that would be augmented.
# NOTE(review): other cells use ".../orig_segments/train" as the folder that
# contains the species sub-folders, while this path stops one level above it.
# Confirm that `augm_path` points at the intended level before re-enabling the
# augmentation loop below.
augm_path = "E:/Giacomo/Tovanella/orig_segments/"
for specie in os.listdir(augm_path):
    files = os.listdir(os.path.join(augm_path, specie))
    if len(files) <= 200:
        print(f"Augmenting {specie}, {len(files)} audios")
        # The augmentation itself is currently disabled:
        # for i in tqdm(range(len(files))):
        #     audio = files[i]
        #     apply_augmentations(os.path.join(augm_path, specie, audio), os.path.join(augm_path, specie))

Augmenting Certhia familiaris_Eurasian Treecreeper, 37 audios


100%|██████████| 37/37 [00:06<00:00,  5.52it/s]


Augmenting Dendrocopos major_Great Spotted Woodpecker, 12 audios


100%|██████████| 12/12 [00:02<00:00,  5.68it/s]


Augmenting Dryocopus martius_Black Woodpecker, 24 audios


100%|██████████| 24/24 [00:04<00:00,  5.42it/s]


Augmenting Lophophanes cristatus_Crested Tit, 42 audios


100%|██████████| 42/42 [00:07<00:00,  5.56it/s]


Augmenting Loxia curvirostra_Common Crossbill, 77 audios


100%|██████████| 77/77 [00:13<00:00,  5.53it/s]


Augmenting Muscicapa striata_Spotted Flycatcher, 129 audios


100%|██████████| 129/129 [00:22<00:00,  5.82it/s]


Augmenting Pecking_, 87 audios


100%|██████████| 87/87 [00:16<00:00,  5.35it/s]


Augmenting Periparus ater_Coal Tit, 88 audios


100%|██████████| 88/88 [00:16<00:00,  5.45it/s]


Augmenting Regulus regulus_Goldcrest, 175 audios


100%|██████████| 175/175 [00:31<00:00,  5.50it/s]


Augmenting Turdus merula_Eurasian Blackbird, 58 audios


100%|██████████| 58/58 [00:10<00:00,  5.38it/s]


Augmenting Vegetation_, 46 audios


100%|██████████| 46/46 [00:08<00:00,  5.22it/s]


Augmenting Wind_, 158 audios


100%|██████████| 158/158 [00:26<00:00,  6.01it/s]


In [None]:
# valid_folder = "E:/Giacomo/Tovanella/DATASET/validation"
# test_folder = "E:/Giacomo/Tovanella/DATASET/test"
# for specie in os.listdir(valid_folder):
#     for audio in os.listdir(os.path.join(valid_folder, specie)):
#         os.rename(
#             os.path.join(valid_folder, specie, audio),
#             os.path.join(test_folder, specie, audio)
#         )

In [None]:
# Draw, for every species in the test folder, a random quarter of its files as
# validation candidates and make sure the matching validation folder exists.
valid_folder = "E:/Giacomo/Tovanella/DATASET/validation"
test_folder = "E:/Giacomo/Tovanella/DATASET/test"
for specie in os.listdir(test_folder):
    audios = os.listdir(os.path.join(test_folder, specie))
    os.makedirs(os.path.join(valid_folder, specie), exist_ok=True)
    # Integer division: species with fewer than four files contribute nothing.
    n_valid_samples = len(audios) // 4
    valid_sample = np.random.choice(audios, size=n_valid_samples, replace=False)
    # The move into the validation folder is currently disabled:
    # for audio in valid_sample:
    #     os.rename(
    #         os.path.join(test_folder, specie, audio),
    #         os.path.join(valid_folder, specie, audio)
    #     )

In [None]:
# Count the files available per species in the training and test folders and
# show the table ordered by training size. `test_folder` must already be
# defined by an earlier cell.
training_folder = "E:/Giacomo/Tovanella/orig_segments/train"
species_folders = os.listdir(training_folder)

species_count = {}
for folder in species_folders:
    species_count[folder] = {
        "train": len(os.listdir(os.path.join(training_folder, folder))),
        # "valid": len(os.listdir(os.path.join(valid_folder, folder))),
        "test": len(os.listdir(os.path.join(test_folder, folder))),
    }

# One row per species, one column per split.
info = pd.DataFrame(species_count).transpose()
info.sort_values(by="train", ascending=False)

Unnamed: 0,train,test
Sylvia atricapilla_Eurasian Blackcap,2702,169
Fringilla coelebs_Common Chaffinch,2632,580
Turdus philomelos_Song Thrush,2360,21
Troglodytes troglodytes_Eurasian Wren,1828,20
Periparus ater_Coal Tit,1399,33
Erithacus rubecula_European Robin,1259,18
Regulus ignicapilla_Common Firecrest,1237,91
Turdus merula_Eurasian Blackbird,1162,3
Phylloscopus collybita_Common Chiffchaff,1042,773
Phylloscopus trochilus_Willow Warbler,670,4
