In [59]:
import os
import pandas as pd

# Root of the IRMAS training set; each sub-folder holds the clips of one instrument class
dataset_path = './IRMAS/IRMAS-TrainingData'

# One record per audio clip: {'file_path': ..., 'label': ...}
data = []

# os.listdir returns entries in arbitrary order, so both listings are sorted to keep
# the row order of the resulting table identical from one run/machine to the next.
for instrument_folder in sorted(os.listdir(dataset_path)):
    instrument_path = os.path.join(dataset_path, instrument_folder)
    if not os.path.isdir(instrument_path):
        continue  # Stray files next to the class folders are ignored

    # The label is the folder name up to the first '(' (the whole name when there is none)
    instrument_name = instrument_folder.split('(')[0]
    for file_name in sorted(os.listdir(instrument_path)):
        if file_name.endswith('.wav'):
            file_path = os.path.join(instrument_path, file_name)
            data.append({'file_path': file_path, 'label': instrument_name})

# Columns are given explicitly so the frame keeps its schema even when no clip was found
df = pd.DataFrame(data, columns=['file_path', 'label'])

# Display the first rows as a sanity check
print(df.head())

Creating augmented dataset:   0%|          | 0/1000 [15:20<?, ? pairs/s]
Creating augmented dataset:   0%|          | 0/1000 [15:05<?, ? pairs/s]

                                           file_path label
0  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
1  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
2  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
3  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel
4  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel





In [60]:
# Load the YouTube clip index and list the raw class names it contains
df_youtube = pd.read_csv('dataset_youtube.csv')
df_youtube.classe.unique()

array(['Fluete', 'Cello', 'Piano', 'Clarinet', 'Organ ', 'Saxophone',
       'Trumpet', 'Violin'], dtype=object)

In [61]:
# Load the SoundCloud clip index and list the raw class names it contains
df_soundcloud = pd.read_csv('dataset_soundcloud.csv')
df_soundcloud.classe.unique()

array(['piano', 'organ', 'acoustic_guitar', 'electric_guitar', 'cello',
       'clarinet', 'flute', 'saxophone', 'trumpet', 'violin'],
      dtype=object)

In [62]:
# Translates the raw class names of the scraped indexes into three-letter instrument codes.
# Keys are copied verbatim from the data, including the 'Fluete' spelling and the trailing
# space in 'Organ ', because they must match the CSV values exactly.
mapping_dict = {
    # Capitalised names
    **{
        'Fluete': 'flu',
        'Cello': 'cel',
        'Piano': 'pia',
        'Clarinet': 'cla',
        'Organ ': 'org',
        'Saxophone': 'sax',
        'Trumpet': 'tru',
        'Violin': 'vio',
    },
    # Lower-case names
    **{
        'piano': 'pia',
        'organ': 'org',
        'acoustic_guitar': 'gac',
        'electric_guitar': 'gel',
        'cello': 'cel',
        'clarinet': 'cla',
        'flute': 'flu',
        'saxophone': 'sax',
        'trumpet': 'tru',
        'violin': 'vio',
    },
}

In [63]:
def _normalize_scraped(df_src, path_prefix='./content/'):
    """
    Bring a scraped index ('arquivo' / 'classe' columns) to the 'file_path' / 'label' schema.

    Args:
        df_src (pd.DataFrame): Frame with a 'classe' column holding raw class names and an
            'arquivo' column holding file paths.
        path_prefix (str): Text prepended to every file path.

    Returns:
        pd.DataFrame: New frame with 'classe' replaced by the mapped 'label' column and
        'arquivo' renamed to 'file_path'. The input frame is not modified. A frame that is
        already in the target schema is returned unchanged, so re-running the cell is safe.

    Raises:
        ValueError: If a class name has no entry in ``mapping_dict``.
    """
    columns = set(df_src.columns)
    if 'classe' not in columns and {'file_path', 'label'} <= columns:
        return df_src  # Already normalized by an earlier run of this cell

    labels = df_src['classe'].map(mapping_dict)
    unmapped = labels.isna()
    if unmapped.any():
        # .map leaves NaN for unknown names; fail here rather than carry NaN labels forward
        unknown = df_src.loc[unmapped, 'classe'].unique().tolist()
        raise ValueError(f"Class names without an entry in mapping_dict: {unknown}")

    return (
        df_src
        .assign(label=labels)
        .drop(columns=['classe'])
        .rename(columns={'arquivo': 'file_path'})
        .assign(file_path=lambda frame: path_prefix + frame['file_path'])
    )


# Apply the mapping and path adjustment to the YouTube and SoundCloud datasets
df_youtube = _normalize_scraped(df_youtube)
df_soundcloud = _normalize_scraped(df_soundcloud)

# Concatenate the dataframes
df_combined = pd.concat([df, df_youtube, df_soundcloud], ignore_index=True)

# Display the combined DataFrame
print("Combined dataset:")
print(df_combined.head())

Combined dataset:
                                           file_path label
0  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
1  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
2  ./IRMAS/IRMAS-TrainingData/cel/008__[cel][nod]...   cel
3  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel
4  ./IRMAS/IRMAS-TrainingData/cel/012__[cel][nod]...   cel


In [64]:
!pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [65]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Balance the dataset so that the labels are evenly represented
def balancear_dataset(df, estrategia='under', random_state=42):
    """
    Resample a file index so that its labels are balanced.

    Args:
        df (pd.DataFrame): Table with 'file_path' and 'label' columns.
        estrategia (str): 'under' to use RandomUnderSampler, 'over' to use RandomOverSampler.
        random_state (int or None): Seed handed to the sampler so the selected rows are the
            same on every run. Pass None to get a different draw on each call.

    Returns:
        pd.DataFrame: New table with 'file_path' and 'label' columns holding the resampled rows.

    Raises:
        ValueError: If ``estrategia`` is neither 'under' nor 'over'.
    """
    # Validate first so a typo fails before any data is touched
    if estrategia not in ('under', 'over'):
        raise ValueError("Estratégia deve ser 'under' ou 'over'")

    if estrategia == 'under':
        sampler = RandomUnderSampler(random_state=random_state)
    else:
        sampler = RandomOverSampler(random_state=random_state)

    # The samplers expect a 2-D feature matrix, so the path column becomes a single-column array
    X = df['file_path'].values.reshape(-1, 1)
    y = df['label']

    X_res, y_res = sampler.fit_resample(X, y)

    df_balanced = pd.DataFrame({
        'file_path': X_res.flatten(),
        'label': y_res
    })

    return df_balanced
# Balance the combined table by random undersampling
df_balanced = balancear_dataset(df=df_combined, estrategia='under')

In [66]:
# Number of rows per label in the balanced table
quantidade_por_classe = df_balanced.label.value_counts()
print(f"Quantidades por classe no dataset balanceado:\n{quantidade_por_classe}")

Quantidades por classe no dataset balanceado:
cel    778
cla    778
flu    778
gac    778
gel    778
org    778
pia    778
sax    778
tru    778
vio    778
voi    778
Name: label, dtype: int64


In [67]:
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
import os
import random
from tqdm import tqdm
import time

def audio_mixing_augmentation(df, output_dir, seed=42, sample_rate=16000, min_class_sample=1, max_augmented_size=None):
    """
    Create MixUp-style augmented clips by blending pairs of recordings from two different classes.

    Every new clip is ``alpha * clip1 + (1 - alpha) * clip2`` with ``alpha`` drawn from
    Beta(0.4, 0.4); the shorter clip is zero-padded to the length of the longer one. Classes
    are drawn with probabilities that favour labels with fewer rows in ``df``. Each mix is
    written to ``output_dir`` as a WAV file and labelled ``"<class1>,<class2>"``.

    Args:
        df (pd.DataFrame): DataFrame containing 'label' and 'file_path' columns.
        output_dir (str): Directory where the augmented audio files are saved (created if missing).
        seed (int): Seed applied to the global ``random`` and ``numpy.random`` generators.
        sample_rate (int): Rate passed to ``librosa.load`` and used when writing the mixes.
        min_class_sample (int): Currently unused; kept so existing calls keep working.
        max_augmented_size (int or None): Upper bound on the number of mixes. The number created
            is ``min(len(df), max_augmented_size)``, or ``len(df)`` when None.

    Returns:
        tuple: ``(combined_df, labels_before, labels_after)`` where ``combined_df`` is ``df``
        followed by the new rows (which carry an extra 'alpha' column), ``labels_before`` is
        ``df['label'].value_counts()`` and ``labels_after`` counts every label of
        ``combined_df`` after splitting the mixed labels on ','.

    Raises:
        ValueError: If mixes have to be created but ``df`` has fewer than two distinct labels.
    """
    random.seed(seed)
    np.random.seed(seed)

    os.makedirs(output_dir, exist_ok=True)

    def mix_audios(audio1, audio2, alpha):
        """Zero-pad both signals to the longer length and blend them with weight ``alpha``."""
        max_length = max(len(audio1), len(audio2))
        padded_audio1 = np.pad(audio1, (0, max_length - len(audio1)), 'constant')
        padded_audio2 = np.pad(audio2, (0, max_length - len(audio2)), 'constant')
        return alpha * padded_audio1 + (1 - alpha) * padded_audio2

    class_audio_dict = {label: df[df['label'] == label]['file_path'].tolist() for label in df['label'].unique()}
    class_labels = list(class_audio_dict)
    class_counts = df['label'].value_counts()

    total_pairs_to_create = min(len(df), max_augmented_size) if max_augmented_size is not None else len(df)

    if total_pairs_to_create > 0 and len(class_labels) < 2:
        raise ValueError("audio_mixing_augmentation needs at least two distinct labels to build pairs")

    # Minimum probability for any class (adjustable)
    min_probability = 0.001

    # Weight each class by the inverse of its frequency, normalize, square to favour the
    # minority classes more strongly, then normalize again.
    class_weights = 1 / class_counts
    normalized_weights = class_weights / class_weights.sum()
    amplified_weights = normalized_weights ** 2
    amplified_weights = amplified_weights / amplified_weights.sum()

    # value_counts() orders by frequency while class_labels follows first appearance, so the
    # weights are re-aligned to class_labels before they are passed as `p`. The floor can push
    # the total above 1, hence the final renormalization (np.random.choice requires sum == 1).
    class_probabilities = amplified_weights.reindex(class_labels).clip(lower=min_probability)
    class_probabilities = (class_probabilities / class_probabilities.sum()).to_numpy()

    augmented_data = []
    total_bytes = 0  # Running size of the files written so far
    last_mb_printed = 0
    start_time = time.time()

    # The context manager closes the bar even when loading or writing a clip fails
    with tqdm(total=total_pairs_to_create, desc="Creating augmented dataset", unit=" pairs") as progress_bar:
        while len(augmented_data) < total_pairs_to_create:
            class1, class2 = np.random.choice(class_labels, size=2, replace=False, p=class_probabilities)

            audio_path1 = random.choices(class_audio_dict[class1], k=1)[0]
            audio_path2 = random.choices(class_audio_dict[class2], k=1)[0]

            audio1, _ = librosa.load(audio_path1, sr=sample_rate)
            audio2, _ = librosa.load(audio_path2, sr=sample_rate)

            alpha = np.random.beta(0.4, 0.4)
            mixed_audio = mix_audios(audio1, audio2, alpha)

            stem1 = os.path.basename(audio_path1).split('.')[0]
            stem2 = os.path.basename(audio_path2).split('.')[0]
            # The running index keeps names unique: without it, drawing the same two files
            # again would overwrite the earlier mix and leave a row with a stale alpha.
            pair_index = len(augmented_data)
            output_file_name = f"{class1}{class2}{stem1}_{stem2}_{pair_index:05d}.wav"
            output_file_path = os.path.join(output_dir, output_file_name)
            sf.write(output_file_path, mixed_audio, sample_rate)

            augmented_data.append({'label': f"{class1},{class2}", 'file_path': output_file_path, 'alpha': alpha})

            # Report the dataset size each time it crosses another whole megabyte. The size is
            # accumulated instead of re-reading the size of every file on each iteration.
            total_bytes += os.path.getsize(output_file_path)
            current_mb = int(total_bytes / (1024 * 1024))
            if current_mb > last_mb_printed:
                elapsed_time = time.time() - start_time
                # tqdm.write prints the message without breaking the progress bar
                tqdm.write(f"Dataset size: {current_mb} MB | Elapsed time: {elapsed_time:.2f} seconds")
                last_mb_printed = current_mb

            progress_bar.update(1)

    augmented_df = pd.DataFrame(augmented_data)
    combined_df = pd.concat([df, augmented_df]).reset_index(drop=True)

    # Print distribution metrics
    print("\nDistribution of labels before augmentation:")
    print(class_counts)
    print("\nDistribution of labels after augmentation:")
    # A mixed row counts once for each of its two classes
    flattened_labels = [part for label in combined_df['label'] for part in label.split(",")]
    value_counts = pd.Series(flattened_labels).value_counts()
    print(value_counts)

    return combined_df, class_counts, value_counts

# Example usage: the function takes a frame shaped like
#     pd.DataFrame({
#         'label': ['class1', 'class2', 'class1', 'class3'],
#         'file_path': ['path/to/audio1.wav', 'path/to/audio2.wav', 'path/to/audio3.wav', 'path/to/audio4.wav']
#     })
# Here it is run on the balanced table, capped at 500 mixed clips.
output_dir = './augdata_all'
augmented_df, labels_before, labels_after = audio_mixing_augmentation(
    df=df_balanced,
    output_dir=output_dir,
    seed=42,
    max_augmented_size=500,
)
print(augmented_df)

Creating augmented dataset:   2%|▏         | 12/500 [00:00<00:04, 113.61 pairs/s]

Dataset size: 1 MB | Elapsed time: 0.10 seconds


Creating augmented dataset:   5%|▍         | 24/500 [00:00<00:04, 100.32 pairs/s]

Dataset size: 2 MB | Elapsed time: 0.21 seconds


Creating augmented dataset:   7%|▋         | 35/500 [00:00<00:04, 98.41 pairs/s] 

Dataset size: 3 MB | Elapsed time: 0.33 seconds


Creating augmented dataset:   9%|▉         | 45/500 [00:00<00:04, 97.07 pairs/s]

Dataset size: 4 MB | Elapsed time: 0.45 seconds


Creating augmented dataset:  11%|█         | 55/500 [00:00<00:04, 95.15 pairs/s]

Dataset size: 5 MB | Elapsed time: 0.56 seconds


Creating augmented dataset:  13%|█▎        | 65/500 [00:00<00:04, 96.08 pairs/s]

Dataset size: 6 MB | Elapsed time: 0.68 seconds


Creating augmented dataset:  17%|█▋        | 86/500 [00:00<00:04, 99.12 pairs/s]

Dataset size: 7 MB | Elapsed time: 0.79 seconds


Creating augmented dataset:  19%|█▉        | 96/500 [00:00<00:04, 97.35 pairs/s]

Dataset size: 8 MB | Elapsed time: 0.90 seconds


Creating augmented dataset:  21%|██        | 106/500 [00:01<00:04, 97.57 pairs/s]

Dataset size: 9 MB | Elapsed time: 1.01 seconds


Creating augmented dataset:  23%|██▎       | 116/500 [00:01<00:03, 97.37 pairs/s]

Dataset size: 10 MB | Elapsed time: 1.13 seconds


Creating augmented dataset:  25%|██▌       | 126/500 [00:01<00:03, 95.77 pairs/s]

Dataset size: 11 MB | Elapsed time: 1.25 seconds


Creating augmented dataset:  27%|██▋       | 137/500 [00:01<00:03, 98.21 pairs/s]

Dataset size: 12 MB | Elapsed time: 1.36 seconds
Dataset size: 13 MB | Elapsed time: 1.45 seconds


Creating augmented dataset:  33%|███▎      | 167/500 [00:01<00:03, 95.12 pairs/s]

Dataset size: 14 MB | Elapsed time: 1.58 seconds
Dataset size: 15 MB | Elapsed time: 1.70 seconds


Creating augmented dataset:  37%|███▋      | 187/500 [00:01<00:03, 87.57 pairs/s]

Dataset size: 16 MB | Elapsed time: 1.82 seconds
Dataset size: 17 MB | Elapsed time: 1.96 seconds


Creating augmented dataset:  42%|████▏     | 208/500 [00:02<00:03, 92.69 pairs/s]

Dataset size: 18 MB | Elapsed time: 2.08 seconds
Dataset size: 19 MB | Elapsed time: 2.19 seconds


Creating augmented dataset:  46%|████▌     | 229/500 [00:02<00:02, 96.93 pairs/s]

Dataset size: 20 MB | Elapsed time: 2.30 seconds
Dataset size: 21 MB | Elapsed time: 2.41 seconds


Creating augmented dataset:  52%|█████▏    | 259/500 [00:02<00:02, 93.54 pairs/s]

Dataset size: 22 MB | Elapsed time: 2.53 seconds
Dataset size: 23 MB | Elapsed time: 2.66 seconds


Creating augmented dataset:  56%|█████▌    | 280/500 [00:02<00:02, 94.53 pairs/s]

Dataset size: 24 MB | Elapsed time: 2.76 seconds
Dataset size: 25 MB | Elapsed time: 2.87 seconds


Creating augmented dataset:  60%|██████    | 300/500 [00:03<00:02, 95.16 pairs/s]

Dataset size: 26 MB | Elapsed time: 2.99 seconds
Dataset size: 27 MB | Elapsed time: 3.10 seconds


Creating augmented dataset:  64%|██████▍   | 320/500 [00:03<00:02, 89.87 pairs/s]

Dataset size: 28 MB | Elapsed time: 3.23 seconds
Dataset size: 29 MB | Elapsed time: 3.36 seconds


Creating augmented dataset:  68%|██████▊   | 340/500 [00:03<00:01, 91.46 pairs/s]

Dataset size: 30 MB | Elapsed time: 3.48 seconds
Dataset size: 31 MB | Elapsed time: 3.60 seconds


Creating augmented dataset:  72%|███████▏  | 360/500 [00:03<00:01, 89.38 pairs/s]

Dataset size: 32 MB | Elapsed time: 3.74 seconds
Dataset size: 33 MB | Elapsed time: 3.85 seconds


Creating augmented dataset:  78%|███████▊  | 390/500 [00:04<00:01, 89.59 pairs/s]

Dataset size: 34 MB | Elapsed time: 3.97 seconds
Dataset size: 35 MB | Elapsed time: 4.10 seconds


Creating augmented dataset:  82%|████████▏ | 409/500 [00:04<00:01, 89.43 pairs/s]

Dataset size: 36 MB | Elapsed time: 4.22 seconds
Dataset size: 37 MB | Elapsed time: 4.33 seconds


Creating augmented dataset:  86%|████████▌ | 430/500 [00:04<00:00, 93.92 pairs/s]

Dataset size: 38 MB | Elapsed time: 4.46 seconds
Dataset size: 39 MB | Elapsed time: 4.57 seconds


Creating augmented dataset:  90%|█████████ | 450/500 [00:04<00:00, 90.14 pairs/s]

Dataset size: 40 MB | Elapsed time: 4.68 seconds
Dataset size: 41 MB | Elapsed time: 4.81 seconds


Creating augmented dataset:  94%|█████████▍| 469/500 [00:05<00:00, 85.92 pairs/s]

Dataset size: 42 MB | Elapsed time: 4.94 seconds
Dataset size: 43 MB | Elapsed time: 5.07 seconds


Creating augmented dataset:  99%|█████████▉| 497/500 [00:05<00:00, 88.39 pairs/s]

Dataset size: 44 MB | Elapsed time: 5.21 seconds
Dataset size: 45 MB | Elapsed time: 5.32 seconds


Creating augmented dataset: 100%|██████████| 500/500 [00:05<00:00, 92.33 pairs/s]


Distribution of labels before augmentation:
cel    778
cla    778
flu    778
gac    778
gel    778
org    778
pia    778
sax    778
tru    778
vio    778
voi    778
Name: label, dtype: int64

Distribution of labels after augmentation:
cel    880
cla    872
org    872
pia    872
voi    871
vio    870
tru    869
sax    866
flu    863
gac    862
gel    861
dtype: int64
                                              file_path    label     alpha
0     ./IRMAS/IRMAS-TrainingData/cel/[cel][cla]0081_...      cel       NaN
1     ./content/output_wav/Cello_Mischa Maisky plays...      cel       NaN
2     ./content/output_wav/cello_Bach Suite for Cell...      cel       NaN
3     ./IRMAS/IRMAS-TrainingData/cel/[cel][jaz_blu]0...      cel       NaN
4     ./IRMAS/IRMAS-TrainingData/cel/074__[cel][nod]...      cel       NaN
...                                                 ...      ...       ...
9053  ./augdata_all/piagel[pia][pop_roc]1281__1_[gel...  pia,gel  0.995693
9054  ./augdata_all/gelcelelec




In [68]:
# Distinct labels in the combined table: single classes plus the "class1,class2" mixes
augmented_df.label.unique()

array(['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru',
       'vio', 'voi', 'gel,voi', 'cla,cel', 'sax,cel', 'flu,gac',
       'gac,pia', 'gel,org', 'org,pia', 'cla,sax', 'org,cel', 'sax,gac',
       'flu,voi', 'flu,cel', 'flu,vio', 'org,cla', 'voi,tru', 'tru,sax',
       'cel,gac', 'pia,gac', 'gac,tru', 'sax,tru', 'org,gel', 'cel,sax',
       'vio,flu', 'vio,org', 'flu,gel', 'gel,flu', 'voi,gac', 'gac,voi',
       'org,gac', 'pia,org', 'voi,flu', 'flu,tru', 'gac,flu', 'sax,cla',
       'voi,cla', 'voi,vio', 'tru,pia', 'cla,vio', 'sax,vio', 'pia,sax',
       'sax,flu', 'cla,gel', 'voi,gel', 'gac,cel', 'gel,vio', 'pia,voi',
       'cla,pia', 'org,vio', 'tru,vio', 'org,tru', 'cel,vio', 'cel,pia',
       'sax,voi', 'tru,flu', 'cel,voi', 'cla,flu', 'cel,cla', 'vio,sax',
       'pia,cel', 'voi,pia', 'tru,gac', 'pia,vio', 'tru,voi', 'tru,cla',
       'vio,gac', 'voi,org', 'gac,sax', 'voi,sax', 'cla,org', 'gac,vio',
       'org,voi', 'gel,pia', 'gel,tru', 'pia,gel', 'gel,gac', 

In [69]:
# Show the combined table; 'alpha' is only filled for the rows created by the mixing step
augmented_df

Unnamed: 0,file_path,label,alpha
0,./IRMAS/IRMAS-TrainingData/cel/[cel][cla]0081_...,cel,
1,./content/output_wav/Cello_Mischa Maisky plays...,cel,
2,./content/output_wav/cello_Bach Suite for Cell...,cel,
3,./IRMAS/IRMAS-TrainingData/cel/[cel][jaz_blu]0...,cel,
4,./IRMAS/IRMAS-TrainingData/cel/074__[cel][nod]...,cel,
...,...,...,...
9053,./augdata_all/piagel[pia][pop_roc]1281__1_[gel...,"pia,gel",0.995693
9054,./augdata_all/gelcelelectric_guitar_Práctica ...,"gel,cel",0.057313
9055,./augdata_all/gacvoiacoustic_guitar_Acoustic G...,"gac,voi",0.000048
9056,./augdata_all/saxgac[sax][cla]1725__2_[gac][cl...,"sax,gac",0.937099


In [70]:
# Save the combined table as a CSV file, leaving the row index out of the file
output_csv_path = './augmented_dataset_all.csv'
augmented_df.to_csv(path_or_buf=output_csv_path, index=False)

  values = values.astype(str)
