# Data augmentation

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import json
import librosa
import soundfile as sf
import random
import pandas as pd
from tqdm import tqdm

In [3]:
def load_audio(file_path):
    """Decode an audio file without resampling.

    Args:
        file_path: Location of the audio file to read.

    Returns:
        A ``(samples, sample_rate)`` pair as produced by ``librosa.load``;
        ``sr=None`` keeps the sampling rate stored in the file.
    """
    loaded = librosa.load(file_path, sr=None)
    samples, sample_rate = loaded
    return samples, sample_rate

def save_audio(file_path, audio, sr):
    """Write audio samples to disk through ``soundfile``.

    Args:
        file_path: Destination path of the audio file.
        audio: Samples to write.
        sr: Sampling rate stored with the samples.
    """
    sf.write(file=file_path, data=audio, samplerate=sr)

In [4]:
from audiomentations import Compose, PitchShift, TimeStretch, AddBackgroundNoise

bg_noise_path = "e:/Giacomo/Tovanella/soundscapes"
# Directory listing kept as a module-level name for any cell that wants to
# inspect the available background recordings.
bg_noises = os.listdir(bg_noise_path)

# One single-transform pipeline per augmentation. The dictionary key is the
# suffix appended to the augmented file name by apply_augmentations.
augmentations = {
    "ps": Compose([PitchShift(min_semitones=-1, max_semitones=1, p=1.0)]),
    "ts": Compose([TimeStretch(min_rate=0.95, max_rate=1.05, p=1.0)]),
    # Hand the whole folder to AddBackgroundNoise so that it draws a noise file
    # for every call. Picking one file with random.choice here would freeze a
    # single background recording for every sample augmented in this session
    # (and could select a non-audio directory entry).
    "bn": Compose([AddBackgroundNoise(sounds_path=bg_noise_path, p=1.0)]),
}

def apply_augmentations(file_path, output_dir):
    """Write one augmented copy of an audio file per entry in ``augmentations``.

    Each output is saved as ``{output_dir}/{stem}_{aug_name}.wav``, where
    ``stem`` is the input file name without its directory and extension.
    Outputs that already exist are left untouched.

    Files whose stem has four or more underscore-separated parts are skipped
    entirely. NOTE(review): presumably these are outputs of a previous
    augmentation run living in the same folder; confirm against the naming
    scheme of the original segments.

    Args:
        file_path: Path of the source audio file (str or path-like). Forward
            slashes and backslashes are both accepted as directory separators.
        output_dir: Directory that receives the augmented files.

    Returns:
        None.
    """
    # Normalise separators before taking the base name so the stem never
    # contains directory components, whichever separator style (or OS) the
    # caller used. Splitting on backslashes alone breaks on "a/b/c.wav".
    normalized_path = os.fspath(file_path).replace("\\", "/")
    file_name = os.path.splitext(os.path.basename(normalized_path))[0]

    # The skip rule depends only on the file name, so decide once, before
    # paying for audio decoding.
    if len(file_name.split("_")) >= 4:
        return

    audio = None
    sr = None
    for aug_name, aug in augmentations.items():
        output_path = f"{output_dir}/{file_name}_{aug_name}.wav"
        if os.path.exists(output_path):
            continue
        if audio is None:
            # Decode lazily: nothing is read when every output already exists.
            audio, sr = load_audio(file_path)
        augmented_audio = aug(samples=audio, sample_rate=sr)
        save_audio(output_path, augmented_audio, sr)

In [5]:
# Split the training data into a subset.
# `training_folder` is the source read by the disabled loop below (one
# sub-folder per species); `subset_folder` is its destination and is the
# folder later cells count as the "train" split.
subset_folder = "E:/Giacomo/Tovanella/DATASET/subset_training"
training_folder = "E:/Giacomo/Tovanella/orig_segments/train"

# Disabled one-off step. It is destructive: os.rename MOVES the selected files
# out of training_folder. When enabled it creates one folder per species under
# subset_folder and moves up to 10 randomly chosen recordings per species into
# it (all of them when a species has fewer than 10).
# for specie in os.listdir(training_folder):
#     os.makedirs(os.path.join(subset_folder, specie), exist_ok=True)
#     # select 10 random audio for specie in training folder
#     all_audios = os.listdir(os.path.join(training_folder, specie))
#     if len(all_audios) < 10:
#         selected_sample = all_audios
#     else:
#         selected_sample = np.random.choice(all_audios, 10, replace=False)
#     for audio in selected_sample:
#         os.rename(
#             os.path.join(training_folder, specie, audio),
#             os.path.join(subset_folder, specie, audio)
#         )

In [6]:
# Disabled one-off step. When enabled it runs apply_augmentations on every
# file of every species folder under subset_folder; the species folder is
# passed as both input location and output directory, so the augmented copies
# are written next to the originals.
# NOTE(review): augm_path is assigned but not used by the loop below.
# augm_path = "E:/Giacomo/Tovanella/orig_segments/train"
# for specie in os.listdir(subset_folder):
#     files = os.listdir(os.path.join(subset_folder, specie))
#     for i in tqdm(range(len(files))):
#         audio = files[i]
#         apply_augmentations(os.path.join(subset_folder, specie, audio), os.path.join(subset_folder, specie))

In [None]:
valid_folder = "E:/Giacomo/Tovanella/DATASET/validation"
test_folder = "E:/Giacomo/Tovanella/DATASET/test"

# Fixed seed and a dedicated generator so the choice of validation files is
# the same on every run and does not depend on the global NumPy random state.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Carve a validation split out of the test split: for every species present in
# the training subset, select a quarter (rounded down) of its test recordings.
for specie in sorted(os.listdir(subset_folder)):
    test_species_dir = os.path.join(test_folder, specie)
    if not os.path.isdir(test_species_dir):
        # A stray file in subset_folder, or a species with no test folder,
        # would otherwise abort the whole loop in os.listdir.
        print(f"Skipping {specie}: no folder in {test_folder}")
        continue
    # os.listdir returns entries in arbitrary order; sort them so the seeded
    # draw always sees the same sequence.
    audios = sorted(os.listdir(test_species_dir))
    n_valid_samples = len(audios) // 4
    valid_sample = split_rng.choice(audios, n_valid_samples, replace=False)
    os.makedirs(os.path.join(valid_folder, specie), exist_ok=True)
    # The move is kept disabled: it is destructive, and every additional run
    # would move another quarter of the remaining test files.
    # for audio in valid_sample:
    #     os.rename(
    #         os.path.join(test_folder, specie, audio),
    #         os.path.join(valid_folder, specie, audio)
    #     )

In [14]:
species_folders = os.listdir(valid_folder)

# Root directory of each split. "train" counts the files in the training
# subset folder, not in the original training folder.
split_roots = {
    "train": subset_folder,
    "valid": valid_folder,
    "test": test_folder,
}

# Number of files per species (outer key) and per split (inner key).
species_count = {
    species_name: {
        split_name: len(os.listdir(os.path.join(split_root, species_name)))
        for split_name, split_root in split_roots.items()
    }
    for species_name in species_folders
}

# Transpose so that each species is a row and each split a column.
info = pd.DataFrame(species_count).T
info

Unnamed: 0,train,valid,test
Certhia familiaris_Eurasian Treecreeper,80,2,7
Dendrocopos major_Great Spotted Woodpecker,48,2,7
Dryocopus martius_Black Woodpecker,80,2,9
Erithacus rubecula_European Robin,80,6,18
Fringilla coelebs_Common Chaffinch,80,193,580
Lophophanes cristatus_Crested Tit,80,2,8
Loxia curvirostra_Common Crossbill,80,6,18
Muscicapa striata_Spotted Flycatcher,80,0,1
Pecking_,80,1,3
Periparus ater_Coal Tit,80,11,33
