In [1]:
import os
import pandas as pd
import librosa
import numpy as np
import random
import soundfile as sf
import shutil

In [2]:
def process_erroneous_files(error_file_path):
    """Read the error listing and return the bare filenames it mentions.

    Every non-blank line is split on tabs and its first column is taken as a
    file path. A leading ``audio/`` prefix is removed so that only the
    filename itself remains.

    Args:
        error_file_path: Path to the tab-separated error listing.

    Returns:
        list[str]: One filename per non-blank line, in file order. A file
        listed on several lines appears several times in the result.
    """
    prefix = "audio/"
    erroneous_files = []
    with open(error_file_path, 'r') as file:
        for line in file:
            filename = line.strip().split('\t')[0]  # Extract the filename part
            if not filename:
                # Blank / whitespace-only line: there is no filename to record.
                continue
            if filename.startswith(prefix):
                filename = filename[len(prefix):]  # Remove 'audio/' part
            erroneous_files.append(filename)
    return erroneous_files

In [3]:
def copy_normal_files(source_directory, dest_directory, excluded_files):
    """Copy every ``.wav`` file that is not excluded into ``dest_directory``.

    Args:
        source_directory: Directory whose direct entries are scanned for
            names ending in ``.wav``.
        dest_directory: Directory that receives the copies; created
            (including parents) when it does not exist yet.
        excluded_files: Iterable of bare filenames that must not be copied.
            Duplicates are harmless.

    Prints one ``Copied ... to ...`` line per copied file.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(dest_directory, exist_ok=True)
    # Build the lookup once: O(1) membership per file, and safe for one-shot iterables.
    excluded = set(excluded_files)
    for filename in os.listdir(source_directory):
        if not filename.endswith(".wav") or filename in excluded:
            continue
        source_path = os.path.join(source_directory, filename)
        dest_path = os.path.join(dest_directory, filename)
        shutil.copyfile(source_path, dest_path)
        print(f"Copied {source_path} to {dest_path}")

In [4]:
def load_audio_files(directory):
    """Load all ``.wav`` files found directly inside ``directory``.

    Each file is read with ``librosa.load(..., sr=None)``. A file that raises
    while loading is reported on stdout and skipped, so one bad file does not
    abort the whole scan.

    Args:
        directory: Directory to scan (entries are taken from ``os.listdir``).

    Returns:
        list: ``(samples, sample_rate)`` tuples, one per successfully loaded file.
    """
    loaded = []
    wav_names = [name for name in os.listdir(directory) if name.endswith(".wav")]
    for wav_name in wav_names:
        file_path = os.path.join(directory, wav_name)
        try:
            # sr=None: keep whatever rate librosa reports for the file itself.
            loaded.append(librosa.load(file_path, sr=None))
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue
    # librosa.load yields a (y, sr) pair; normalise each entry to a plain tuple.
    return [(pair[0], pair[1]) for pair in loaded]

In [5]:
def add_silence(duration, sr):
    """Return ``duration`` seconds of silence for sample rate ``sr``.

    The sample count is ``duration * sr`` truncated by ``int``; the result is
    a 1-D array of zeros with numpy's default float dtype.
    """
    n_samples = int(duration * sr)
    silence = np.zeros(n_samples)
    return silence

In [6]:
def _close_chunk(pieces, n_samples, target_samples):
    """Join ``pieces`` and zero-pad the result up to ``target_samples`` samples."""
    missing = target_samples - n_samples
    if missing > 0:
        pieces = pieces + [np.zeros(missing, dtype=pieces[0].dtype)]
    return np.concatenate(pieces)


def concatenate_audios(audio_files, target_duration=30, min_duration=23,
                       gap_range=(0.3, 0.5)):
    """Join short clips into chunks padded to ``target_duration`` seconds.

    Clips are appended one after another, each followed by a random silent
    gap. As soon as the running chunk is at least ``min_duration`` seconds
    long it is zero-padded up to ``target_duration`` seconds and emitted. Any
    leftover clips at the end form a final chunk that is padded the same way.

    All lengths are tracked in samples rather than seconds, so a padded chunk
    is exactly ``round(target_duration * sr)`` samples long (converting the
    missing time back and forth through floats can lose one sample).

    Args:
        audio_files: Iterable of ``(samples, sample_rate)`` pairs with 1-D
            sample arrays. May be empty.
        target_duration: Length in seconds that short chunks are padded to.
        min_duration: Length in seconds at which a chunk is closed.
        gap_range: ``(low, high)`` bounds in seconds for the silent gap drawn
            with ``random.uniform`` after every clip. Call ``random.seed``
            beforehand for reproducible gaps.

    Returns:
        list: ``(samples, sample_rate)`` tuples. A chunk that already exceeds
        ``target_duration`` when it is closed is returned as-is (not trimmed).
        The sample dtype follows the input clips instead of being forced to
        float64.

    Note:
        The sample rate of the clip that closes a chunk is reported for the
        whole chunk; clips are assumed to share one sample rate.
    """
    chunks = []
    pieces = []      # clips and gaps of the chunk currently being assembled
    n_samples = 0    # running length of that chunk, in samples
    sr = None

    for y, sr in audio_files:
        y = np.asarray(y)
        gap = np.zeros(int(random.uniform(*gap_range) * sr), dtype=y.dtype)
        # Collect pieces and join once per chunk instead of re-copying the
        # growing array after every clip.
        pieces.extend((y, gap))
        n_samples += len(y) + len(gap)

        if n_samples >= min_duration * sr:
            target_samples = int(round(target_duration * sr))
            chunks.append((_close_chunk(pieces, n_samples, target_samples), sr))
            pieces, n_samples = [], 0

    if n_samples > 0:
        # Leftover clips that never reached min_duration still form a chunk.
        target_samples = int(round(target_duration * sr))
        chunks.append((_close_chunk(pieces, n_samples, target_samples), sr))

    return chunks


In [7]:
def save_audio(audio, sr, output_directory, filename):
    """Write ``audio`` to ``output_directory/filename`` and report the path.

    Args:
        audio: Sample array handed to ``soundfile.write``.
        sr: Sample rate in Hz handed to ``soundfile.write``.
        output_directory: Target directory; created (including parents) when
            it does not exist yet.
        filename: Name of the file to create inside ``output_directory``.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, filename)
    sf.write(output_file, audio, sr)
    print(f"Audio saved to {output_file}")

In [8]:
def normalize_audio(y):
    """Return ``y`` scaled by ``librosa.util.normalize`` with its default settings.

    NOTE(review): with librosa's defaults this is presumably peak
    normalization (largest absolute sample becomes 1) — confirm against the
    installed librosa version.
    """
    normalized = librosa.util.normalize(y)
    return normalized

In [26]:
def mix_audios(audio, voice):
    """Overlay ``voice`` on ``audio`` by sample-wise addition.

    The shorter signal is first extended by ``pad_to_same_length`` so both
    have equal length. No gain is applied and the sum is not rescaled, so the
    result can exceed the amplitude range of either input.
    """
    background, speech = pad_to_same_length(audio, voice)
    mixture = background + speech
    return mixture

In [28]:
def pad_to_30_seconds(y, sr):
    """Zero-pad ``y`` at the end so that it lasts at least 30 seconds.

    The missing length is computed directly in samples. Converting the
    missing time to seconds and back can truncate to one sample short of
    ``30 * sr``.

    Args:
        y: 1-D sample array.
        sr: Sample rate in Hz.

    Returns:
        ``y`` itself when it already holds 30 seconds or more; otherwise a new
        array of exactly ``round(30 * sr)`` samples with the same dtype as
        ``y``.
    """
    target_samples = int(round(30 * sr))
    missing = target_samples - len(y)
    if missing > 0:
        # np.pad appends zeros by default and keeps the input dtype.
        y = np.pad(y, (0, missing))
    return y

In [25]:
def pad_to_same_length(audio1, audio2):
    """Zero-pad the shorter of two signals at its end to match the longer one.

    Returns:
        tuple: ``(audio1, audio2)`` with equal lengths. A signal that needs
        no padding is returned unchanged (same object).
    """
    surplus = len(audio1) - len(audio2)
    if surplus < 0:
        # audio1 is the shorter one: extend it by the missing samples.
        audio1 = np.pad(audio1, (0, -surplus))
    elif surplus > 0:
        # audio2 is the shorter one.
        audio2 = np.pad(audio2, (0, surplus))
    return audio1, audio2

In [10]:
voice1_directory = r"C:\data\Voice\kss\1"  # Voice clip folder 1 (the folder loaded into voice_files)
voice2_directory = r"C:\data\Voice\kss\2"  # Voice clip folder 2 (not referenced by the cells shown here)
voice3_directory = r"C:\data\Voice\kss\3"  # Voice clip folder 3 (not referenced by the cells shown here)
voice4_directory = r"C:\data\Voice\kss\4"  # Voice clip folder 4 (not referenced by the cells shown here)

voice_concat_directory = r"C:\data\Voice\kss\concat"  # Output folder for the concatenated voice chunks


In [23]:
# Source folder of background .wav files (input to copy_normal_files).
audio_directory = r"C:\data\Voice\TUT-acoustic-scenes-2016\audio"
# Tab-separated listing parsed by process_erroneous_files.
error_file_path = r"C:\data\Voice\TUT-acoustic-scenes-2016\error.txt"
# Destination for the background files that are not in the error listing.
normal_audio_directory = r"C:\data\Voice\TUT-acoustic-scenes-2016\normal"
# Output folder for the background + voice mixes.
output_directory = r"C:\data\Voice\mix"

In [12]:
# Parse the error listing into the filenames to leave out of the background set.
erroneous_files = process_erroneous_files(error_file_path)

In [13]:
# Inspect the parsed names; a file listed on several lines shows up repeatedly.
erroneous_files

['a040_0_30.wav',
 'a040_30_60.wav',
 'a040_60_90.wav',
 'a040_90_120.wav',
 'a054_120_150.wav',
 'a066_0_30.wav',
 'a066_60_90.wav',
 'a069_0_30.wav',
 'a069_30_60.wav',
 'a069_30_60.wav',
 'a069_30_60.wav',
 'a069_60_90.wav',
 'a069_90_120.wav',
 'a075_0_30.wav',
 'a075_120_150.wav',
 'a075_120_150.wav',
 'a075_120_150.wav',
 'a075_150_180.wav',
 'a075_180_210.wav',
 'a075_180_210.wav',
 'a075_240_270.wav',
 'a075_90_120.wav',
 'a075_90_120.wav',
 'a092_120_150.wav',
 'a092_180_210.wav',
 'a092_180_210.wav',
 'a097_120_150.wav',
 'a097_120_150.wav',
 'a097_180_210.wav',
 'a097_210_240.wav',
 'a097_240_270.wav',
 'a097_240_270.wav',
 'a112_0_30.wav',
 'a112_120_150.wav',
 'a156_0_30.wav',
 'a156_120_150.wav',
 'a156_240_270.wav',
 'a156_240_270.wav',
 'a156_60_90.wav',
 'a156_90_120.wav',
 'a157_0_30.wav',
 'a157_90_120.wav',
 'b067_0_30.wav',
 'b067_150_180.wav',
 'b078_0_30.wav',
 'b078_120_150.wav',
 'b079_150_180.wav',
 'b079_150_180.wav',
 'b079_180_210.wav',
 'b079_60_90.wav',
 

In [14]:
# Number of parsed entries (repeated filenames are counted each time).
len(erroneous_files)

62

In [22]:
# Copy every background .wav that is not in the error listing into normal_audio_directory.
copy_normal_files(audio_directory, normal_audio_directory, erroneous_files)

Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_0_30.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_0_30.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_120_150.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_120_150.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_150_180.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_150_180.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_30_60.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_30_60.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_60_90.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_60_90.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a001_90_120.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a001_90_120.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a003_0_30.wav to C:\data\Voice\TUT-acoustic-scenes-2016\normal\a003_0_30.wav
Copied C:\data\Voice\TUT-acoustic-scenes-2016\audio\a003_120_150.wav to C:\da

In [15]:
# Load the voice clips; only voice1_directory is read here.
voice_files = load_audio_files(voice1_directory)

In [14]:
# NOTE(review): this displays the repr of every loaded (samples, sample_rate) tuple;
# a summary such as len(voice_files) would keep the notebook output small.
voice_files

[(array([ 0.0000000e+00,  3.0517578e-05, -4.5776367e-05, ...,
          0.0000000e+00, -3.0517578e-05, -3.0517578e-05], dtype=float32),
  44100),
 (array([ 3.0517578e-04,  6.1035156e-05, -3.6621094e-04, ...,
          6.7138672e-04,  7.0190430e-04,  7.3242188e-04], dtype=float32),
  44100),
 (array([ 0.00125122,  0.00109863,  0.00094604, ..., -0.00088501,
         -0.00045776, -0.00045776], dtype=float32),
  44100),
 (array([-0.00115967, -0.0012207 , -0.00131226, ...,  0.00027466,
          0.00094604,  0.00106812], dtype=float32),
  44100),
 (array([ 0.00045776,  0.00073242,  0.00134277, ..., -0.00033569,
         -0.00054932, -0.00045776], dtype=float32),
  44100),
 (array([-0.00039673, -0.00082397, -0.0007019 , ...,  0.00027466,
          0.00030518,  0.00012207], dtype=float32),
  44100),
 (array([-0.0045166 , -0.00476074, -0.00448608, ...,  0.00054932,
          0.00079346,  0.00079346], dtype=float32),
  44100),
 (array([-0.00100708, -0.00076294, -0.00061035, ...,  0.00076294,
  

In [17]:
# Group the voice clips into padded chunks. Gap lengths come from random.uniform
# and no seed is set, so the chunks differ between runs.
concatenated_audios = concatenate_audios(voice_files)

In [18]:
# Number of voice chunks produced.
len(concatenated_audios)

2047

In [19]:
# Write each voice chunk to voice_concat_directory; file numbering starts at 1.
for i, (audio, sr) in enumerate(concatenated_audios):
    save_audio(audio, sr, voice_concat_directory, f"concatenated_voice_{i+1}.wav")

Audio saved to C:\data\Voice\kss\concat\concatenated_voice_1.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_2.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_3.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_4.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_5.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_6.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_7.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_8.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_9.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_10.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_11.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_12.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_13.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_14.wav
Audio saved to C:\data\Voice\kss\concat\concatenated_voice_15.wav
Audio saved to C:\d

In [20]:
# Load the background clips that were copied into normal_audio_directory.
audio_files = load_audio_files(normal_audio_directory)

In [21]:
# Number of background clips that loaded successfully.
len(audio_files)

1121

In [22]:
# Run normalize_audio over every background clip and every voice chunk before mixing;
# both lists keep the (samples, sample_rate) tuple layout of their inputs.
normalized_audio_files = [(normalize_audio(y), sr) for y, sr in audio_files]
normalized_voice_files = [(normalize_audio(y), sr) for y, sr in concatenated_audios]

In [29]:
# Mix every background clip with one voice chunk and save the result.
# Voice chunks are reused cyclically when there are more backgrounds than chunks.
for i, (audio, sr) in enumerate(normalized_audio_files):
    normalized_voice, voice_sr = normalized_voice_files[i % len(normalized_voice_files)]  # Cycle through concatenated voices
    if voice_sr != sr:
        # Adding sample arrays recorded at different rates would silently
        # change the speed/pitch of one of them in the saved file.
        raise ValueError(
            f"Sample-rate mismatch for mix {i+1}: background is {sr} Hz, voice is {voice_sr} Hz"
        )
    mixed_audio = mix_audios(audio, normalized_voice)
    # Both inputs were normalized separately, so their sum can leave [-1, 1];
    # scale it back only when needed so the written file is not clipped.
    peak = np.max(np.abs(mixed_audio)) if len(mixed_audio) else 0.0
    if peak > 1.0:
        mixed_audio = mixed_audio / peak
    mixed_audio = pad_to_30_seconds(mixed_audio, sr)
    save_audio(mixed_audio, sr, output_directory, f"mixed_output_{i+1}.wav")

Audio saved to C:\data\Voice\mix\mixed_output_1.wav
Audio saved to C:\data\Voice\mix\mixed_output_2.wav
Audio saved to C:\data\Voice\mix\mixed_output_3.wav
Audio saved to C:\data\Voice\mix\mixed_output_4.wav
Audio saved to C:\data\Voice\mix\mixed_output_5.wav
Audio saved to C:\data\Voice\mix\mixed_output_6.wav
Audio saved to C:\data\Voice\mix\mixed_output_7.wav
Audio saved to C:\data\Voice\mix\mixed_output_8.wav
Audio saved to C:\data\Voice\mix\mixed_output_9.wav
Audio saved to C:\data\Voice\mix\mixed_output_10.wav
Audio saved to C:\data\Voice\mix\mixed_output_11.wav
Audio saved to C:\data\Voice\mix\mixed_output_12.wav
Audio saved to C:\data\Voice\mix\mixed_output_13.wav
Audio saved to C:\data\Voice\mix\mixed_output_14.wav
Audio saved to C:\data\Voice\mix\mixed_output_15.wav
Audio saved to C:\data\Voice\mix\mixed_output_16.wav
Audio saved to C:\data\Voice\mix\mixed_output_17.wav
Audio saved to C:\data\Voice\mix\mixed_output_18.wav
Audio saved to C:\data\Voice\mix\mixed_output_19.wav
Au