In [None]:
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# Module name       : training of the ResNext model for audio
# Description       : Read Functions
# Author            : Francisco Gómez <fagomezj@unal.edu.co>, Freddy Hernández <fohernandezr@unal.edu.co>
# Creation date     : 2025
# -----------------------------------------------------------------------------

# Note: This notebook was run locally on an NVIDIA RTX 4090 GPU (24 GB)

# Create the DB with reduced data and train the diffusion model

Create a database with subsamples of the original database

In [3]:
import os
import shutil
import random
from pathlib import Path

def create_variable_sample_datasets(
    original_dataset_path,
    output_base_path,
    samples_config,
    dataset_names=("dataset1", "dataset2", "dataset3"),
    seed=42
):
    """
    Creates non-overlapping datasets from a class-based folder structure,
    allowing a different number of samples per class for each dataset.

    The source is expected to contain one sub-folder per class holding ``*.wav``
    files. For every class the files are sorted by path, shuffled with a private
    random generator seeded with ``seed``, and consecutive slices of the shuffled
    list are copied to ``output_base_path/<dataset_name>/<class_name>``.

    Parameters:
        original_dataset_path (str or Path): Path to the original dataset.
        output_base_path (str or Path): Path where the new sampled datasets will be saved.
        samples_config (dict): Dictionary with class names as keys and sequences of
                               ints as values. The i-th value is the number of
                               samples for the i-th entry of ``dataset_names``.
                               Example: {"dogs": (3, 2, 5), "cats": (4, 4, 2)}
        dataset_names (tuple): Names of the output datasets
                               (default: ("dataset1", "dataset2", "dataset3")).
        seed (int): Random seed for reproducibility.

    Raises:
        ValueError: If a class provides fewer counts than there are dataset names.
                    This is checked for every class before any file is copied.

    Notes:
        * Classes whose folder is missing, or that hold fewer ``*.wav`` files than
          the sum of their counts, are reported and skipped.
        * Counts beyond ``len(dataset_names)`` are not written anywhere but still
          count toward the "enough files" check.
        * The global ``random`` state is left untouched.
    """
    original_dataset_path = Path(original_dataset_path)
    output_base_path = Path(output_base_path)
    dataset_names = tuple(dataset_names)

    # Validate the whole configuration up front so a bad entry cannot leave a
    # half-written output tree behind.
    counts_per_class = {}
    for class_name, counts in samples_config.items():
        counts = tuple(counts)
        if len(counts) < len(dataset_names):
            raise ValueError(
                f"Class '{class_name}' has {len(counts)} sample counts but "
                f"{len(dataset_names)} dataset names were given."
            )
        counts_per_class[class_name] = counts

    # A private generator keeps the split reproducible without reseeding the
    # process-wide `random` module.
    rng = random.Random(seed)

    for class_name, counts in counts_per_class.items():
        class_folder = original_dataset_path / class_name
        if not class_folder.exists():
            print(f"[!] Class '{class_name}' not found. Skipping.")
            continue

        # Directory listing order is filesystem-dependent; sorting first makes
        # the seeded shuffle select the same files on every machine.
        audio_files = sorted(class_folder.glob("*.wav"))
        total_needed = sum(counts)

        if len(audio_files) < total_needed:
            print(f"[!] Not enough files in '{class_name}': need {total_needed}, found {len(audio_files)}. Skipping.")
            continue

        rng.shuffle(audio_files)
        index = 0

        # Consecutive, disjoint slices of the shuffled list -> no file is shared
        # between two datasets.
        for dataset_name, sample_count in zip(dataset_names, counts):
            selected_files = audio_files[index:index + sample_count]
            index += sample_count

            dest_folder = output_base_path / dataset_name / class_name
            dest_folder.mkdir(parents=True, exist_ok=True)

            for f in selected_files:
                shutil.copy(f, dest_folder / f.name)

    print(f"✅ Datasets created under: {output_base_path}")

# Number of files drawn per class, in the order (train, validation, test).
samples_config = {
    "s1": (20, 15, 175),
    "s2": (20, 15, 255),
    "s3": (20, 15, 135),
    "s4": (20, 15, 21),
    "s5": (20, 15, 47),
    "s6": (20, 15, 20),
    "s7": (20, 15, 55),
    "s8": (20, 15, 220),
    "s9": (20, 15, 152),
}

# Build the train / validation / test folders from the full original database.
create_variable_sample_datasets(
    # path with the original data
    original_dataset_path="C:/Users/user/Documents/backup2/soundDB/fullDB_original",
    # path where the reduced train/validation/test splits are written
    output_base_path="C:/Users/user/Documents/soundDBAugmented/train_val_test_fulldb",
    samples_config=samples_config,
    # dataset names, matching the tuple positions in samples_config
    dataset_names=("train", "validation", "test"),
)

✅ Datasets created under: C:\Users\user\Documents\soundDBAugmented\train_val_test_fulldb


Create replicated copies of the training files for training the diffusion model

In [5]:
import os
import shutil
from pathlib import Path

def copy_files_until_n(source_folder, target_folder, n):
    """
    Copies files from a source folder to a target folder until reaching a total of n copies.

    The regular files directly inside ``source_folder`` are sorted by path and
    cycled through round-robin; copy number ``i`` (1-based) is written as
    ``<stem>_copy_<i><suffix>``. Sub-directories in the source folder are ignored.

    Parameters:
        source_folder (str or Path): Path to the folder with the original files.
        target_folder (str or Path): Path where the copied files will be saved.
                                     Created if it does not exist.
        n (int): Total number of copied files desired.

    Returns:
        None. Prints a summary line, or a notice when the source holds no files.
    """
    source_folder = Path(source_folder)
    target_folder = Path(target_folder)
    target_folder.mkdir(parents=True, exist_ok=True)

    # Keep regular files only: shutil.copy cannot copy a directory and would
    # raise part-way through. Sorting makes the copy-index -> source mapping
    # independent of the filesystem's listing order.
    files = sorted(p for p in source_folder.glob("*") if p.is_file())

    if not files:
        print("No files found in the source folder.")
        return

    total_files = len(files)

    for i in range(n):
        # Wrap around the source list so n may exceed the number of originals.
        original_file = files[i % total_files]
        new_name = f"{original_file.stem}_copy_{i+1}{original_file.suffix}"
        shutil.copy(original_file, target_folder / new_name)

    print(f"{n} files copied successfully to {target_folder}")

# Replicate every training class folder (s1 .. s9) up to 13000 files each so
# the diffusion training set has the same size for every class.
for iclass in range(1, 10):
    idclass = f"s{iclass}"
    copy_files_until_n(
        "C:/Users/user/Documents/soundDBAugmented/train_val_test_fulldb/train/" + idclass,
        "C:/Users/user/Documents/soundDBAugmented/train_copies_diff/" + idclass,
        13000,
    )


13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s1
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s2
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s3
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s4
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s5
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s6
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s7
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s8
13000 files copied successfully to C:\Users\user\Documents\soundDBAugmented\train_copies_diff\s9


# Data augmentation with diffusion

In [7]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('C:/Users/user/Documents/DataScience/AnuraSoundProject/Classification')
sys.path.append('C:/Users/user/Documents/DataScience/AnuraSoundProject/DataAugmentation/diffusion')
sys.path.append("C:/Users/user/Documents/DataScience/AnuraSoundProject/DataAugmentation/diffusion/model_utils")


########
import os
import shutil
import random
from pathlib import Path
########
from audio_classification import configureModel,saveModel
from transformers import AutoFeatureExtractor

########
import argparse
import json
import models
import training_loss
import torch
import dataset_manager
from dataset_manager import GenerativeAIDataset
from torch.utils.data import DataLoader
from transformation.data_preprocessing import get_data_composing
from tqdm import tqdm

from torch.utils.data import Dataset
import os
''' Load parameters like we did in the training script '''
from collections import namedtuple
from dataset_manager import GenerativeAIDataset


#########
import argparse
import json
import models
import training_loss
import torch
import dataset_manager
from dataset_manager import GenerativeAIDataset
from torch.utils.data import DataLoader
from transformation.data_preprocessing import get_data_composing
from tqdm import tqdm

from torch.utils.data import Dataset
import os
from collections import namedtuple

''' Now we can generate samples using the model '''
from sampling.sampling_utils import get_diffusion_sample
import sys
from model_utils.diffusion_utils import calc_diffusion_hyperparams
from dataset_manager import GenerativeAIDataset
import numpy as np
import librosa
import soundfile as sf
import os


def trainDiffusion(epoch,model,train_dataloader,device,optimizer,training_loss_function):
    '''
    Run one optimisation pass over ``train_dataloader``.

    Args:
        epoch (int): Current epoch number; only used to label the progress bar.
        model: Model to optimise; it is switched to training mode.
        train_dataloader: Iterable of batches exposing ``batch_size`` and
            ``len()``. Each batch is indexed with the key ``'samples'``.
        device: Device the batch samples are moved to before the loss is computed.
        optimizer: Optimiser whose gradients are reset and stepped for every batch.
        training_loss_function: Callable invoked as ``f(model, samples)``; its
            result must support ``.backward()`` and ``.item()``.

    Returns:
        float: Sum of the per-batch losses divided by ``len(train_dataloader)``.
    '''

    # Training mode is kept explicit so the function stays correct if a
    # validation pass (eval mode) is interleaved between epochs.
    model.train()

    progress = tqdm(
        train_dataloader,
        unit="audios",
        unit_scale=train_dataloader.batch_size,
        desc=f'Epoca {epoch}',
    )

    loss_sum = 0.0
    for batches_done, batch in enumerate(progress, start=1):
        samples = batch['samples'].to(device)

        optimizer.zero_grad()
        batch_loss = training_loss_function(model, samples)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        # Show the running mean of the loss over the batches seen so far.
        progress.set_postfix({'loss': "%.05f" % (loss_sum / batches_done)})

    return loss_sum / len(train_dataloader)

# Create sub-sampled copies of the training set for the data-augmentation experiments
def sample_dataset(original_root, new_root, nper=0.2, seed=42):
    """
    Sample a fraction of the audio files from each class (subfolder) in the original
    dataset and save them to a new dataset folder with the same structure.

    Class folders and their ``*.wav`` files are processed in sorted order and
    drawn with a private random generator seeded with ``seed``, so the same
    inputs always yield the same selection.

    Parameters:
        original_root (str or Path): Path to the original dataset root folder.
        new_root (str or Path): Path where the new sampled dataset will be created.
        nper (float): Fraction of files to sample from each class (between 0 and 1).
                      At least one file is taken from every non-empty class, and
                      never more than the class contains.
        seed (int): Random seed for reproducibility.

    Notes:
        * Entries of ``original_root`` that are not directories are ignored.
        * Class folders without ``*.wav`` files are reported and skipped; no
          output folder is created for them.
        * The global ``random`` state is left untouched.
    """
    rng = random.Random(seed)
    original_root = Path(original_root)
    new_root = Path(new_root)
    new_root.mkdir(parents=True, exist_ok=True)

    # iterdir()/glob() order depends on the filesystem; sort so the seeded draw
    # is reproducible across machines.
    for class_dir in sorted(original_root.iterdir()):
        if not class_dir.is_dir():
            continue

        files = sorted(class_dir.glob("*.wav"))  # You can change the extension if needed
        if not files:
            # Sampling from an empty list would raise ValueError.
            print(f"[!] No .wav files in {class_dir}. Skipping.")
            continue

        # At least one file per class, but never more than are available.
        n_samples = min(len(files), max(1, int(len(files) * nper)))
        sampled_files = rng.sample(files, n_samples)

        new_class_dir = new_root / class_dir.name
        new_class_dir.mkdir(parents=True, exist_ok=True)

        for file_path in sampled_files:
            shutil.copy(file_path, new_class_dir / file_path.name)

        print(f"Copied {len(sampled_files)} files to {new_class_dir}")


#####################################
def runModelSave(nameModel,pathTrainData,pathValidationData,epochs,batch_size,pathModelSaved):
    """
    Configure the classification model, fit it, and save the model with its history.

    Parameters:
        nameModel (str): Name appended to ``pathModelSaved`` to build the save location.
        pathTrainData: Training data location, forwarded to ``configureModel``.
        pathValidationData: Validation data location, forwarded to ``configureModel``.
        epochs (int): Number of epochs, used for both configuration and fitting.
        batch_size (int): Batch size, used for both configuration and fitting.
        pathModelSaved (str): Folder prefix under which the model is saved.

    Returns:
        None.

    NOTE(review): ``configureModel`` is assumed to return a model with a
    Keras-style ``fit`` plus two dict-like splits keyed by feature name and
    containing a ``"class_label"`` entry - confirm against audio_classification.
    """
    target_key = "class_label"

    # Build the model together with the training and validation splits.
    model, train_split, val_split = configureModel(epochs, batch_size, pathTrainData, pathValidationData)

    # The label column is passed separately as the target, so strip it from the inputs.
    train_inputs = {name: values for name, values in train_split.items() if name != target_key}
    val_inputs = {name: values for name, values in val_split.items() if name != target_key}

    fit_history = model.fit(
        train_inputs,
        train_split[target_key],
        validation_data=(val_inputs, val_split[target_key]),
        batch_size=batch_size,
        epochs=epochs,
    )

    # Persist the fitted model together with its training history.
    saveModel(model, fit_history, pathModelSaved+'/'+nameModel)

def saveAudiosToFolder(audios,output_dir,sample_rate,nameFile):
    """
    Write each audio clip to ``output_dir`` as ``<nameFile>_audio_<k>.wav`` (k starts at 1).

    Parameters:
        audios: Iterable of clips. Each clip must expose ``squeeze()``, ``dtype``
            and ``astype`` (assumes NumPy arrays - TODO confirm against callers).
        output_dir (str): Destination folder; created when missing.
        sample_rate (int): Sample rate passed to ``soundfile.write``.
        nameFile (str): Prefix for the generated file names.

    Returns:
        None. Prints a confirmation once all clips are written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for clip_number, clip in enumerate(audios, start=1):
        waveform = clip.squeeze()

        # Written as float32; anything else is cast (this is a dtype check, the
        # sample values themselves are not range-checked).
        waveform = waveform if waveform.dtype == np.float32 else waveform.astype(np.float32)

        wav_path = os.path.join(output_dir, nameFile+f"_audio_{clip_number}.wav")
        sf.write(wav_path, waveform, sample_rate)

    print("Generated Audios Saved")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
######################################################
# CONFIGURATION FOR TRAINING DIFFUSION
######################################################
# Folder holding one sub-folder of replicated training audio per class.
pathTraindata = 'C:/Users/user/Documents/soundDBAugmented/train_copies_diff/'
# Root folder where the per-class diffusion checkpoints are written.
pathDiffusionModels = 'C:/Users/user/Documents/soundDBAugmented/ModelsExperiments/diffusion_model/'
# Epochs per class (use a small value such as 3 for a quick smoke test).
epochs_diffusion = 50
# Classes for which a diffusion model is trained.
# NOTE(review): 's4' is not in this list although the other cells handle s1..s9 -
# confirm the omission is intentional.
labelsClasses = ['s1','s2','s3','s5','s6','s7','s8','s9']

######################################################
# CONFIGURATION FOR GENERATING SAMPLES USING DIFFUSION
######################################################
''' Now we can generate samples using the model '''
# Lightweight stand-in for the command-line arguments of the training script.
Args = namedtuple('Args', 'batch_size learning_rate max_epochs model config')
#############################
# NOTE(review): max_epochs is not read by any of the visible cells; the number
# of training epochs comes from epochs_diffusion above.
args = Args(
    batch_size=8,
    learning_rate=1e-3,
    max_epochs=350,
    model='DIFFUSION',
    config='./config.json',
)

In [13]:
import dataset_manager

def trainDiffusionModel(pathGendata,config,classId,pathModels,epochs_diffusion):
    """
    Train a generative model on the audio of a single class and checkpoint it.

    Batch size, model name and learning rate are read from the module-level
    ``args`` object, so the configuration cell must have been run first.

    Parameters:
        pathGendata (str): Root folder of the training data, passed to
            ``GenerativeAIDataset``.
        config (dict): Model configuration, passed to ``get_data_composing``,
            ``models.create_model`` and ``training_loss.get_loss_train``.
        classId (str): Class selector, passed to the dataset as ``class_c``.
        pathModels (str): Path prefix for checkpoints (no extension).
        epochs_diffusion (int): Number of training epochs.

    Files written (whole model objects via ``torch.save``):
        * ``<pathModels>_<epoch>.pth`` after every even-numbered epoch.
        * ``<pathModels>.pth``: the latest model, refreshed on every even epoch
          and once more after the last epoch. When the model name is ``'VAE'``
          this file is ``./models_trained/VAE/VAE_model.pth`` instead.
        * ``<model_name>_model.pth`` in the current working directory after the
          last epoch. This name does not include the class, so successive calls
          overwrite it.

    Returns:
        None.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    BATCH_SIZE = args.batch_size
    model_name = args.model
    lr = args.learning_rate

    # Load the data
    data_processing = get_data_composing(model_name, config)
    train_dataset = GenerativeAIDataset(pathGendata,
                                data_processing,
                                class_c=classId,
                                expand_data=1)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

    # Build the model, its loss and the optimiser
    model = models.create_model(model_name, config=config).to(device)
    training_loss_function = training_loss.get_loss_train(model_name, general_config=config)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Cut the learning rate by 10x when the epoch loss has not improved for 5 epochs.
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.1)

    # Destination of the "latest" checkpoint; it does not change between epochs.
    if model_name == 'VAE':
        latest_path = f'./models_trained/VAE/{model_name}_model.pth'
    else:
        latest_path = pathModels + '.pth'

    print(f'Training {model_name}')
    for epoch in range(epochs_diffusion):
        loss_train = trainDiffusion(epoch,model,train_dataloader,device,optimizer,training_loss_function)

        lr_scheduler.step(metrics=loss_train)
        if epoch % 2 == 0:
            torch.save(model, latest_path)
            torch.save(model, pathModels + "_" + str(epoch) + '.pth')

    # The loop only checkpoints on even epochs, so with an even epoch count the
    # last epoch's weights would otherwise exist only in the shared
    # working-directory file below, which the next class overwrites.
    torch.save(model, latest_path)

    # Kept for compatibility with code that loads the model from the working directory.
    torch.save(model, f'{model_name}_model.pth')
    print('Model was saved')



'''Load the model config file'''
with open(args.config, 'r') as f:
    config = json.load(f)

# Train one diffusion model per class, each in its own sub-folder of
# pathDiffusionModels.
for classAugmentationV in labelsClasses:
    print('classAugmentationV', classAugmentationV, sep='\n')

    pathDiffModel = pathDiffusionModels+"/"+classAugmentationV
    pathdm = Path(pathDiffModel)
    pathdm.mkdir(parents=True, exist_ok=True)

    trainDiffusionModel(
        pathGendata=pathTraindata,
        config=config,
        classId=classAugmentationV,
        pathModels=pathDiffModel+'/modeldiff_'+classAugmentationV,
        epochs_diffusion=epochs_diffusion,
    )
    torch.cuda.empty_cache()  # Release cached GPU memory that is no longer in use


  WeightNorm.apply(module, name, dim)


classAugmentationV
s1
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:18<00:00, 40.88audios/s, loss=0.03968]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.02030]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.01288]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.01163]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.93audios/s, loss=0.01097]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.93audios/s, loss=0.01063]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.00998]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.93audios/s, loss=0.01004]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s2
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.94audios/s, loss=0.05175]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.02575]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.02313]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.02050]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.01900]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.04audios/s, loss=0.01778]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.01645]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:16<00:00, 41.03audios/s, loss=0.01814]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s3
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.94audios/s, loss=0.04817]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.99audios/s, loss=0.02236]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.99audios/s, loss=0.02172]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.98audios/s, loss=0.01871]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.98audios/s, loss=0.01814]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01681]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01660]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01546]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s5
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.04994]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.98audios/s, loss=0.02778]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.02405]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.02117]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01969]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01916]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01774]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01700]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s6
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.91audios/s, loss=0.02971]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.00653]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.00467]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.00345]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.97audios/s, loss=0.01097]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.96audios/s, loss=0.00403]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.96audios/s, loss=0.00457]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.96audios/s, loss=0.00255]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s7
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.90audios/s, loss=0.03372]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.01156]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00988]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.01027]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00885]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00928]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00726]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00734]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s8
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.89audios/s, loss=0.03721]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.94audios/s, loss=0.01245]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.01113]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00909]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00831]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.94audios/s, loss=0.00791]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00731]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.95audios/s, loss=0.00659]
Epoca 8: 100%|██████████████████████████

Model was saved
classAugmentationV
s9
last version schedule
Training DIFFUSION


Epoca 0: 100%|█████████████████████████████████████████████████| 13000/13000 [05:18<00:00, 40.85audios/s, loss=0.07614]
Epoca 1: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.91audios/s, loss=0.04477]
Epoca 2: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.03766]
Epoca 3: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.03447]
Epoca 4: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.03162]
Epoca 5: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.02956]
Epoca 6: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.02871]
Epoca 7: 100%|█████████████████████████████████████████████████| 13000/13000 [05:17<00:00, 40.92audios/s, loss=0.02713]
Epoca 8: 100%|██████████████████████████

Model was saved



