In [82]:
import librosa
import numpy as np
import random

import os
import soundfile as sf

In [71]:
data_dir = "/Users/glebmokeev/Rask/data"

In [107]:
def synthesize_audio(speech_path, music_path, output_path, reverse_music=False, gain_db=0):
    speech, sr_speech = librosa.load(speech_path)
    music, sr_music = librosa.load(music_path)
    

    if sr_speech != sr_music:
        raise ValueError("Speech and music sample rates must be equal.")
    
      # Optional: Reverse music
    if reverse_music:
        music = music[::-1]

    speech = speech * (10**(gain_db / 20))  # Convert gain_db to linear gain
    
      # Normalize music to avoid clipping
    music_rms = librosa.feature.rms(y=music)
    scaling_factor = 0.3  # Adjust this value to control volume reduction
    music = music * scaling_factor

    music = music / np.max(np.abs(music_rms))

    if len(speech) != len(music):
        return
      # Combine speech and music (assuming mono audio)
    combined_audio = speech + music
    
      # Save the synthesized audio
    sf.write(output_path, combined_audio, sr_music)


In [108]:
os.makedirs(os.path.join(data_dir, "synths"), exist_ok=True)

In [109]:
speech_path = os.path.join(data_dir, "vocals/mixed/audio/frag")
music_path = os.path.join(data_dir, "music/clean/frag/")
synth_path =os.path.join(data_dir, "synths")

In [111]:
number_of_combinations = 100

speech_files = [f for f in os.listdir(speech_path) if f.endswith(".wav")]
music_files = [f for f in os.listdir(music_path) if f.endswith(".wav")]

for i in range(number_of_combinations):
    gain_db = random.uniform(0, 20)
    is_reversed = random.random() < 0.05
    
    chosen_speech = random.choice(speech_files)
    chosen_music = random.choice(music_files)
    synthesize_audio(os.path.join(speech_path, chosen_speech), os.path.join(music_path, chosen_music), os.path.join(synth_path, f"comb_{i+1}.wav"), reverse_music=is_reversed, gain_db=gain_db)

  music = music / np.max(np.abs(music_rms))
