In [17]:
from pydub import AudioSegment
import os
import glob
import random
import librosa
import soundfile as sf
from tqdm.notebook import tqdm

os.makedirs("mixed_up_data_speak/speak", exist_ok = True)

VOICE_DIR = os.path.abspath("datasets/mathurinache/the-lj-speech-dataset/versions/1/LJSpeech-1.1/wavs")
BG_DIR = os.path.abspath("datasets/background_simplified")
OUT_DIR = os.path.abspath("mixed_up_data_speak/speak")
AMPLIFICATION = [10, 15, 20]

voice_files = glob.glob(os.path.join(VOICE_DIR, "*.wav"))
bg_noise_type = ["cafeteria_noises", "metro_noises", "park_noises", "station_noises", "traffic_noises"]

def mix_it_up(idx):
    for type in bg_noise_type:
        
        bg_files    = glob.glob(os.path.join(BG_DIR, type, "*.wav"))

        voice_path = random.choice(voice_files)
        bg_path = random.choice(bg_files)

        voice = AudioSegment.from_file(voice_path)
        background = AudioSegment.from_file(bg_path)

        background += random.choice(AMPLIFICATION)

        combined = voice.overlay(background)

        out_path = os.path.join(OUT_DIR, f"mixed_noisy_{type}_{idx}.wav")
        combined.export(out_path, format = 'wav')

In [18]:
for i in tqdm(range(20), desc="Creazione Speak Set - Parziale"): mix_it_up(i)

Creazione Speak Set - Parziale:   0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
os.makedirs("mixed_up_data_no_talk/noisy", exist_ok = True)
os.makedirs("mixed_up_data_no_talk/music", exist_ok=True)

BG_DIR = os.path.abspath("datasets/background_simplified")
OUT_DIR_MUSIC = os.path.abspath("mixed_up_data_no_talk/music")
OUT_DIR_NOISE = os.path.abspath("mixed_up_data_no_talk/noisy")

def augment_file(input_path, output_dir, sample_rate, stretch_factors, pitch_steps, amplification):
    
    y, sr = librosa.load(input_path, sr=sample_rate)
    basename = os.path.splitext(os.path.basename(input_path))[0]
    
    ops = ['stretch', 'pitch', 'gain', 'none']
    choice = random.choice(ops)
    
    if choice == 'stretch':
        factor = random.choice(stretch_factors)
        y_stretch = librosa.effects.time_stretch(y, rate=factor)
        out_name = f"{basename}_stretch{factor:.2f}.wav"
        sf.write(os.path.join(output_dir, out_name), y_stretch, sr)

    if choice == 'pitch':
        steps = random.choice(pitch_steps)
        y_shift = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=steps)
        out_name = f"{basename}_pitch{steps:+d}.wav"
        sf.write(os.path.join(output_dir, out_name), y_shift, sr)
    
    if choice == 'gain':
        gain = random.choice(amplification)
        y_amplify = y * gain
        out_name = f"{basename}_amplified_{gain:.1f}x.wav"
        sf.write(os.path.join(output_dir, out_name), y_amplify, sr)
    
    if ops == 'none': sf.write(os.path.join(output_dir, basename), y, sr)

def batch_augment_music(input_dir, output_dir,
                  sample_rate=44100,
                  stretch_factors=(0.9, 1.1),
                  pitch_steps=(-2, 2), amplification = 0):
    
    for f in tqdm(os.listdir(input_dir), desc="Augumenting Music"):
        base, ext = os.path.splitext(f)
        
        inp_path = os.path.join(input_dir, f)
        augment_file(inp_path, output_dir, sample_rate, stretch_factors, pitch_steps, amplification)
        
def batch_augment_noise(input_dir, output_dir,
                  sample_rate=44100,
                  stretch_factors=(0.9, 1.1),
                  pitch_steps=(-2, 2), amplification = 0):
    
    bg_noise_type = ["cafeteria_noises", "metro_noises", "park_noises", "station_noises", "traffic_noises"]       
    
    for i in tqdm(bg_noise_type, desc="Augumenting Noise"):
        dir_path = os.path.join(input_dir,i)
        for f in os.listdir(dir_path):                        
            inp_path = os.path.join(dir_path, f)
            augment_file(inp_path, output_dir, sample_rate, stretch_factors, pitch_steps, amplification)

In [20]:
os.makedirs("mixed_up_data_speak/no_speak", exist_ok = True)

OUT_DIR_SPEAK = os.path.abspath("mixed_up_data_speak/speak")
OUT_DIR_NO_SPEAK = os.path.abspath("mixed_up_data_speak/no_speak")

music_path = os.path.abspath("datasets/music_set_wav/complete_song")
vocals_path = os.path.abspath("datasets/music_set_wav/vocals")
instrumentals_path = os.path.abspath("datasets/music_set_wav/instrumentals")
noise_path = os.path.abspath("datasets/background_simplified")

sr       = 44100
stretches = [0.8, 1.2]
pitches  = [-3, 3]
amplification = [-5, 1, +5]

batch_augment_music(instrumentals_path, OUT_DIR_NO_SPEAK, sr, stretches, pitches, amplification)
batch_augment_noise(noise_path, OUT_DIR_NO_SPEAK, sr, stretches, pitches, amplification)
batch_augment_music(music_path, OUT_DIR_SPEAK, sr, stretches, pitches, amplification)
batch_augment_music(vocals_path, OUT_DIR_SPEAK, sr, stretches, pitches, amplification)

print("Augmentazione completata!") 

Augumenting Music:   0%|          | 0/93 [00:00<?, ?it/s]

Augumenting Noise:   0%|          | 0/5 [00:00<?, ?it/s]

Augumenting Music:   0%|          | 0/93 [00:00<?, ?it/s]

Augumenting Music:   0%|          | 0/100 [00:00<?, ?it/s]

Augmentazione completata!


In [30]:
from math import floor
import os
import random
from pydub import AudioSegment

def segment_file(in_path: str, out_dir: str, window_s: float, segments: int) -> None:
    
    audio = AudioSegment.from_file(in_path)
    dur_ms = len(audio)
    window_ms = int(window_s * 1000)
    base, _ = os.path.splitext(os.path.basename(in_path))

    if dur_ms < window_ms:
        out_name = f"{base}segment{1:03d}.wav"
        audio.export(os.path.join(out_dir, out_name), format="wav")
        return
        
    for i in range(segments):
        start = random.randint(0, dur_ms - window_ms)
        end = start + window_ms
        segment = audio[start:end]

        out_name = f"{base}segment{i:03d}.wav"
        segment.export(os.path.join(out_dir, out_name), format="wav")

In [50]:
win_s = 3

os.makedirs("mixed_up_data_speak_segmented", exist_ok=True)
os.makedirs("mixed_up_data_speak_segmented/speak", exist_ok=True)
os.makedirs("mixed_up_data_speak_segmented/no_speak", exist_ok=True)

output_dir_speak = os.path.abspath("mixed_up_data_speak_segmented/speak")
output_dir_no_speak = os.path.abspath("mixed_up_data_speak_segmented/no_speak")

speak_dir = os.path.abspath("mixed_up_data_speak/speak")
no_speak_dir = os.path.abspath("mixed_up_data_speak/no_speak")
# ripetiamo per chiarezza

music_files = [f for f in os.listdir(speak_dir)]
noisy_files = [f for f in os.listdir(no_speak_dir)]

In [49]:
for fname in tqdm(music_files, desc="Finestratura Audio Speak"):
    in_path = os.path.join(speak_dir,fname)
    try:
        segment_file(in_path, output_dir_speak, win_s, 12)
    except Exception as e:
            print(f"Errore con {fname}: {e}")

music_files = [f for f in os.listdir(output_dir_speak)]
print(len(music_files))

Finestratura Audio Speak:   0%|          | 0/242 [00:00<?, ?it/s]

2838


In [51]:
for fname in noisy_files:
    in_path = os.path.join(no_speak_dir,fname)
    try:
        segment_file(in_path, output_dir_no_speak, win_s, random.choice([20,23]))
    except Exception as e:
            print(f"Errore con {fname}: {e}")
noisy_files = [f for f in os.listdir(output_dir_no_speak)]
print(len(noisy_files))

2620


In [52]:
import pandas as pd

music_dir = os.path.abspath("mixed_up_data_speak_segmented/speak")
noisy_dir = os.path.abspath("mixed_up_data_speak_segmented/no_speak")

music_files = [f for f in os.listdir(music_dir)]
noisy_files = [f for f in os.listdir(noisy_dir)]
# ripetiamo per chiarezza

records = []
for f in music_files:
    records.append({f"filepath": os.path.join(music_dir,f), "label": "music"})
for f in noisy_files:
    records.append({"filepath": os.path.join(noisy_dir,f), "label": "noisy"})

df = pd.DataFrame.from_records(records)

df.to_csv("datasets/labels_mfcc_speak_segmented.csv", index=False)
print("Dataset salvato in datasets/labels_mfcc_speak_segmented.csv")

Dataset salvato in datasets/labels_mfcc_speak_segmented.csv
