In [1]:
import os
import pandas as pd

# Path to the training dataset
dataset_path = './IRMAS/IRMAS-TrainingData'

# List to store file paths and labels
data = []

# Iterate over each subfolder (instrument class) in the dataset
for instrument_folder in os.listdir(dataset_path):
    instrument_path = os.path.join(dataset_path, instrument_folder)
    if not os.path.isdir(instrument_path):
        continue  # Skip if not a directory
    
    # Extract instrument name from folder name
    instrument_name = instrument_folder.split('(')[0]
    # Iterate over files in the instrument folder
    for file_name in os.listdir(instrument_path):
        if file_name.endswith('.wav'):
            file_path = os.path.join(instrument_path, file_name)
            data.append({'file_path': file_path, 'label': instrument_name})

# Create a DataFrame from the data list
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

                                           file_path label
0  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
1  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
2  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
3  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel
4  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel


In [4]:
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
import os
import random
from tqdm import tqdm
import time

def audio_mixing_augmentation(df, output_dir, seed=42, sample_rate=16000, min_class_sample=1, max_augmented_size=None):
    """
    Perform audio mixing data augmentation on a given DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing 'label' and 'file_path' columns.
        output_dir (str): Directory where the augmented audio files will be saved.
        seed (int): Random seed for reproducibility.
        sample_rate (int): Sample rate for audio files.
        min_class_sample (int): Minimum number of samples per class for balancing.
        max_augmented_size (int or None): Maximum size of the augmented dataset.

    Returns:
        pd.DataFrame: DataFrame with new augmented data.
    """
    random.seed(seed)
    np.random.seed(seed)
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def mix_audios(audio1, audio2):
        max_length = max(len(audio1), len(audio2))
        padded_audio1 = np.pad(audio1, (0, max_length - len(audio1)), 'constant')
        padded_audio2 = np.pad(audio2, (0, max_length - len(audio2)), 'constant')
        mixed_audio = (padded_audio1 + padded_audio2) / 2
        return mixed_audio

    class_audio_dict = {label: df[df['label'] == label]['file_path'].tolist() for label in df['label'].unique()}
    class_counts = df['label'].value_counts()

    total_pairs_to_create = min(len(df), max_augmented_size) if max_augmented_size is not None else len(df)

    augmented_data = []
    pairs_created = {}
    
    # Minimum probability for any class (adjustable)
    min_probability = 0.001
    
    # Calculate a weight based on class frequency (more frequent, lower weight)
    class_weights = 1 / (class_counts/ len(df))
    
    # Normalize weights to sum to 1
    total_weight = class_weights.sum()
    normalized_weights = class_weights / total_weight
    
    # Amplify the effect for minority classes
    # You can adjust the power factor (e.g., 1.5) to further increase the imbalance
    amplified_weights = normalized_weights ** 0.5
    
    # Normalize amplified weights to sum to 1
    amplified_weights /= amplified_weights.sum()
    
    # Calculate probabilities using amplified weights and minimum probability
    class_probabilities = {label: max(min_probability, weight) for label, weight in zip(class_counts.index, amplified_weights)}

    print(class_probabilities)

    progress_bar = tqdm(total=total_pairs_to_create, desc="Creating augmented dataset", unit=" pairs")

    start_time = time.time()
    last_mb_printed = 0

    # Balance the dataset by augmenting minor classes more frequently
    while len(augmented_data) < total_pairs_to_create:
        class1, class2 = np.random.choice(list(class_audio_dict.keys()), size=2, replace=False, p=list(class_probabilities.values()))

        audio_paths1 = random.choices(class_audio_dict[class1], k=1)
        audio_paths2 = random.choices(class_audio_dict[class2], k=1)

        key = tuple(sorted((class1, class2)))
        if key not in pairs_created:
            pairs_created[key] = 0

        audio1, sr1 = librosa.load(audio_paths1[0], sr=sample_rate)
        audio2, sr2 = librosa.load(audio_paths2[0], sr=sample_rate)

        mixed_audio = mix_audios(audio1, audio2)
        output_file_name = f"{class1}{class2}{os.path.basename(audio_paths1[0]).split('.')[0]}_{os.path.basename(audio_paths2[0]).split('.')[0]}.wav"
        output_file_path = os.path.join(output_dir, output_file_name)
        sf.write(output_file_path, mixed_audio, sample_rate)

        augmented_data.append({'label': f"{class1},{class2}", 'file_path': output_file_path})
        pairs_created[key] += 1

        # Print progress and dataset size information
        current_mb = sum(os.path.getsize(file['file_path']) for file in augmented_data) / (1024 * 1024)  # Size in MB
        if int(current_mb) > last_mb_printed:
            elapsed_time = time.time() - start_time
            print(f"Dataset size: {int(current_mb)} MB | Elapsed time: {elapsed_time:.2f} seconds")
            last_mb_printed = int(current_mb)

        progress_bar.update(1)

    progress_bar.close()

    augmented_df = pd.DataFrame(augmented_data)
    combined_df = pd.concat([df, augmented_df]).reset_index(drop=True)

    # Print distribution metrics
    print("\nDistribution of labels before augmentation:")
    print(df['label'].value_counts(), df['label'].value_counts().mean(), df['label'].value_counts().std())
    print("\nDistribution of labels after augmentation:")
    labels = combined_df.label.apply(lambda x:x.split(","))
    flattened_labels = [label for sublist in labels for label in sublist]
    
    # Calculate value counts
    value_counts = pd.Series(flattened_labels).value_counts()
    print(value_counts, value_counts.mean(),  value_counts.std())
    print("gain by class", value_counts - df['label'].value_counts())

    return combined_df, df['label'].value_counts(), value_counts

In [5]:
output_dir = './augdata'
augmented_df, labels_before, labels_after = audio_mixing_augmentation(df, output_dir, seed=42, max_augmented_size=1000)
print(augmented_df)

{'voi': 0.07918536221601388, 'gel': 0.08011759610640823, 'pia': 0.08225590288596382, 'org': 0.08457509977612815, 'gac': 0.08751147219219138, 'sax': 0.08827699468079771, 'vio': 0.0917108480446586, 'tru': 0.09194895536372252, 'cla': 0.09828540171872736, 'flu': 0.1040031361080633, 'cel': 0.11212923090732502}


Creating augmented dataset:   2%|▏         | 21/1000 [00:02<01:15, 13.00 pairs/s]

Dataset size: 1 MB | Elapsed time: 2.49 seconds
Dataset size: 2 MB | Elapsed time: 2.60 seconds


Creating augmented dataset:   5%|▌         | 52/1000 [00:02<00:22, 41.48 pairs/s]

Dataset size: 3 MB | Elapsed time: 2.71 seconds
Dataset size: 4 MB | Elapsed time: 2.83 seconds


Creating augmented dataset:   7%|▋         | 72/1000 [00:03<00:15, 60.39 pairs/s]

Dataset size: 5 MB | Elapsed time: 2.94 seconds
Dataset size: 6 MB | Elapsed time: 3.05 seconds


Creating augmented dataset:   9%|▉         | 92/1000 [00:03<00:11, 75.85 pairs/s]

Dataset size: 7 MB | Elapsed time: 3.16 seconds
Dataset size: 8 MB | Elapsed time: 3.28 seconds


Creating augmented dataset:  11%|█         | 112/1000 [00:03<00:10, 85.70 pairs/s]

Dataset size: 9 MB | Elapsed time: 3.39 seconds
Dataset size: 10 MB | Elapsed time: 3.50 seconds


Creating augmented dataset:  13%|█▎        | 132/1000 [00:03<00:09, 91.30 pairs/s]

Dataset size: 11 MB | Elapsed time: 3.61 seconds
Dataset size: 12 MB | Elapsed time: 3.73 seconds


Creating augmented dataset:  15%|█▌        | 152/1000 [00:03<00:08, 94.49 pairs/s]

Dataset size: 13 MB | Elapsed time: 3.83 seconds
Dataset size: 14 MB | Elapsed time: 3.94 seconds


Creating augmented dataset:  18%|█▊        | 182/1000 [00:04<00:08, 95.56 pairs/s]

Dataset size: 15 MB | Elapsed time: 4.05 seconds
Dataset size: 16 MB | Elapsed time: 4.17 seconds


Creating augmented dataset:  20%|██        | 202/1000 [00:04<00:08, 95.48 pairs/s]

Dataset size: 17 MB | Elapsed time: 4.29 seconds
Dataset size: 18 MB | Elapsed time: 4.40 seconds


Creating augmented dataset:  22%|██▏       | 222/1000 [00:04<00:08, 94.63 pairs/s]

Dataset size: 19 MB | Elapsed time: 4.52 seconds
Dataset size: 20 MB | Elapsed time: 4.63 seconds


Creating augmented dataset:  24%|██▍       | 242/1000 [00:04<00:07, 95.05 pairs/s]

Dataset size: 21 MB | Elapsed time: 4.75 seconds
Dataset size: 22 MB | Elapsed time: 4.87 seconds


Creating augmented dataset:  26%|██▌       | 262/1000 [00:05<00:07, 94.41 pairs/s]

Dataset size: 23 MB | Elapsed time: 4.98 seconds
Dataset size: 24 MB | Elapsed time: 5.10 seconds


Creating augmented dataset:  29%|██▉       | 292/1000 [00:05<00:07, 95.25 pairs/s]

Dataset size: 25 MB | Elapsed time: 5.21 seconds
Dataset size: 26 MB | Elapsed time: 5.32 seconds


Creating augmented dataset:  31%|███       | 312/1000 [00:05<00:07, 94.24 pairs/s]

Dataset size: 27 MB | Elapsed time: 5.44 seconds
Dataset size: 28 MB | Elapsed time: 5.56 seconds


Creating augmented dataset:  33%|███▎      | 332/1000 [00:05<00:07, 94.49 pairs/s]

Dataset size: 29 MB | Elapsed time: 5.67 seconds
Dataset size: 30 MB | Elapsed time: 5.79 seconds


Creating augmented dataset:  35%|███▌      | 352/1000 [00:06<00:06, 93.44 pairs/s]

Dataset size: 31 MB | Elapsed time: 5.90 seconds
Dataset size: 32 MB | Elapsed time: 6.03 seconds


Creating augmented dataset:  37%|███▋      | 372/1000 [00:06<00:06, 94.09 pairs/s]

Dataset size: 33 MB | Elapsed time: 6.14 seconds
Dataset size: 34 MB | Elapsed time: 6.26 seconds


Creating augmented dataset:  39%|███▉      | 392/1000 [00:06<00:06, 94.09 pairs/s]

Dataset size: 35 MB | Elapsed time: 6.38 seconds
Dataset size: 36 MB | Elapsed time: 6.49 seconds


Creating augmented dataset:  42%|████▏     | 422/1000 [00:06<00:06, 93.12 pairs/s]

Dataset size: 37 MB | Elapsed time: 6.60 seconds
Dataset size: 38 MB | Elapsed time: 6.72 seconds


Creating augmented dataset:  44%|████▍     | 442/1000 [00:07<00:05, 93.63 pairs/s]

Dataset size: 39 MB | Elapsed time: 6.84 seconds
Dataset size: 40 MB | Elapsed time: 6.95 seconds


Creating augmented dataset:  46%|████▌     | 462/1000 [00:07<00:05, 93.55 pairs/s]

Dataset size: 41 MB | Elapsed time: 7.07 seconds
Dataset size: 42 MB | Elapsed time: 7.19 seconds


Creating augmented dataset:  48%|████▊     | 482/1000 [00:07<00:05, 93.50 pairs/s]

Dataset size: 43 MB | Elapsed time: 7.31 seconds
Dataset size: 44 MB | Elapsed time: 7.42 seconds


Creating augmented dataset:  50%|█████     | 502/1000 [00:07<00:05, 93.41 pairs/s]

Dataset size: 45 MB | Elapsed time: 7.54 seconds
Dataset size: 46 MB | Elapsed time: 7.66 seconds


Creating augmented dataset:  53%|█████▎    | 532/1000 [00:07<00:05, 92.04 pairs/s]

Dataset size: 47 MB | Elapsed time: 7.78 seconds
Dataset size: 48 MB | Elapsed time: 7.90 seconds


Creating augmented dataset:  55%|█████▌    | 552/1000 [00:08<00:04, 92.17 pairs/s]

Dataset size: 49 MB | Elapsed time: 8.01 seconds
Dataset size: 50 MB | Elapsed time: 8.13 seconds


Creating augmented dataset:  57%|█████▋    | 572/1000 [00:08<00:04, 92.23 pairs/s]

Dataset size: 51 MB | Elapsed time: 8.25 seconds
Dataset size: 52 MB | Elapsed time: 8.37 seconds


Creating augmented dataset:  59%|█████▉    | 592/1000 [00:08<00:04, 91.51 pairs/s]

Dataset size: 53 MB | Elapsed time: 8.49 seconds
Dataset size: 54 MB | Elapsed time: 8.61 seconds


Creating augmented dataset:  61%|██████    | 612/1000 [00:08<00:04, 91.37 pairs/s]

Dataset size: 55 MB | Elapsed time: 8.73 seconds
Dataset size: 56 MB | Elapsed time: 8.85 seconds


Creating augmented dataset:  63%|██████▎   | 632/1000 [00:09<00:04, 90.90 pairs/s]

Dataset size: 57 MB | Elapsed time: 8.97 seconds
Dataset size: 58 MB | Elapsed time: 9.09 seconds


Creating augmented dataset:  66%|██████▌   | 662/1000 [00:09<00:03, 91.14 pairs/s]

Dataset size: 59 MB | Elapsed time: 9.22 seconds
Dataset size: 60 MB | Elapsed time: 9.34 seconds


Creating augmented dataset:  68%|██████▊   | 682/1000 [00:09<00:03, 91.14 pairs/s]

Dataset size: 61 MB | Elapsed time: 9.44 seconds
Dataset size: 62 MB | Elapsed time: 9.57 seconds


Creating augmented dataset:  70%|███████   | 701/1000 [00:09<00:03, 89.15 pairs/s]

Dataset size: 63 MB | Elapsed time: 9.69 seconds
Dataset size: 64 MB | Elapsed time: 9.82 seconds


Creating augmented dataset:  72%|███████▏  | 720/1000 [00:10<00:03, 89.56 pairs/s]

Dataset size: 65 MB | Elapsed time: 9.94 seconds
Dataset size: 66 MB | Elapsed time: 10.06 seconds


Creating augmented dataset:  75%|███████▌  | 750/1000 [00:10<00:02, 90.39 pairs/s]

Dataset size: 67 MB | Elapsed time: 10.18 seconds
Dataset size: 68 MB | Elapsed time: 10.30 seconds


Creating augmented dataset:  77%|███████▋  | 770/1000 [00:10<00:02, 89.70 pairs/s]

Dataset size: 69 MB | Elapsed time: 10.43 seconds
Dataset size: 70 MB | Elapsed time: 10.55 seconds


Creating augmented dataset:  79%|███████▉  | 789/1000 [00:10<00:02, 89.54 pairs/s]

Dataset size: 71 MB | Elapsed time: 10.67 seconds
Dataset size: 72 MB | Elapsed time: 10.80 seconds


Creating augmented dataset:  81%|████████  | 807/1000 [00:11<00:02, 88.10 pairs/s]

Dataset size: 73 MB | Elapsed time: 10.91 seconds
Dataset size: 74 MB | Elapsed time: 11.04 seconds


Creating augmented dataset:  83%|████████▎ | 834/1000 [00:11<00:01, 83.49 pairs/s]

Dataset size: 75 MB | Elapsed time: 11.19 seconds
Dataset size: 76 MB | Elapsed time: 11.31 seconds


Creating augmented dataset:  85%|████████▌ | 852/1000 [00:11<00:01, 85.83 pairs/s]

Dataset size: 77 MB | Elapsed time: 11.44 seconds
Dataset size: 78 MB | Elapsed time: 11.56 seconds


Creating augmented dataset:  88%|████████▊ | 879/1000 [00:11<00:01, 86.61 pairs/s]

Dataset size: 79 MB | Elapsed time: 11.69 seconds
Dataset size: 80 MB | Elapsed time: 11.82 seconds


Creating augmented dataset:  90%|████████▉ | 897/1000 [00:12<00:01, 87.50 pairs/s]

Dataset size: 81 MB | Elapsed time: 11.94 seconds
Dataset size: 82 MB | Elapsed time: 12.06 seconds


Creating augmented dataset:  92%|█████████▏| 924/1000 [00:12<00:00, 86.96 pairs/s]

Dataset size: 83 MB | Elapsed time: 12.19 seconds
Dataset size: 84 MB | Elapsed time: 12.32 seconds


Creating augmented dataset:  94%|█████████▍| 942/1000 [00:12<00:00, 87.50 pairs/s]

Dataset size: 85 MB | Elapsed time: 12.45 seconds
Dataset size: 86 MB | Elapsed time: 12.56 seconds


Creating augmented dataset:  96%|█████████▌| 960/1000 [00:12<00:00, 87.32 pairs/s]

Dataset size: 87 MB | Elapsed time: 12.68 seconds
Dataset size: 88 MB | Elapsed time: 12.81 seconds


Creating augmented dataset:  99%|█████████▊| 987/1000 [00:13<00:00, 87.41 pairs/s]

Dataset size: 89 MB | Elapsed time: 12.94 seconds
Dataset size: 90 MB | Elapsed time: 13.06 seconds


Creating augmented dataset: 100%|██████████| 1000/1000 [00:13<00:00, 75.43 pairs/s]

Dataset size: 91 MB | Elapsed time: 13.19 seconds

Distribution of labels before augmentation:
voi    778
gel    760
pia    721
org    682
gac    637
sax    626
vio    580
tru    577
cla    505
flu    451
cel    388
Name: label, dtype: int64 609.5454545454545 125.21610410515385

Distribution of labels after augmentation:
voi    998
gel    923
pia    909
org    852
sax    819
gac    792
vio    791
tru    771
cla    685
flu    612
cel    553
dtype: int64 791.3636363636364 133.79930696926104
gain by class cel    165
cla    180
flu    161
gac    155
gel    163
org    170
pia    188
sax    193
tru    194
vio    211
voi    220
dtype: int64
                                              file_path    label
0     ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...      cel
1     ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...      cel
2     ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...      cel
3     ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...      cel
4     ./IRMAS/IRMAS-TrainingData/




In [70]:
output_csv_path = './augmented_dataset_all.csv'

# Salve o DataFrame como um arquivo CSV
augmented_df.to_csv(output_csv_path, index=False)

  values = values.astype(str)
