This notebook creates one long file in which each clip's Fourier-transform (FT) magnitudes are transposed and concatenated into a single long series of samples.

It is designed for a consumer that just grabs a window from this series, instead of using prepared frames (since preparing frames creates massive data duplication).

In [1]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
from IPython.display import Audio, display
import numpy as np

import os
import time
import cmath

In [2]:
# Constants and settings
DTYPE = 'float32'  # element type of the memory-mapped frame/target arrays

# Number of all zero samples between clips
# TODO - not using pads at all, on assumption that the samples have a bit of buffer at start and end
# PAD_SIZE = 1 

HOP_LENGTH = 256 # Required for good cqt results

# stft values
N_FFT = 1024 # 512 recommended for speech, music typically 2048

# cqt values
BINS_PER_OCTAVE = 12 * 8
FMIN = librosa.note_to_hz('C1')
OCTAVES = 8

# Number of frequency bins per frame. Derived from the cqt settings (instead
# of the hard-coded 768) so it cannot drift from the n_bins passed to
# librosa.cqt in get_ft. Currently 8 * 96 = 768.
# NOTE(review): if get_ft is switched to the stft variant, this presumably
# needs to become N_FFT // 2 + 1 - confirm before switching.
FFT_BINS = OCTAVES * BINS_PER_OCTAVE

In [3]:
# Idea here is that we operate on magnitude, and will just use phase from the original noisy sample

def rebuild_fft(output, original_fft):
    """Combine magnitudes with the phase of an existing transform.

    Parameters
    ----------
    output : array-like of real numbers
        Magnitudes laid out transposed relative to ``original_fft``
        (``output.T`` must broadcast against ``original_fft``).
    original_fft : array-like of complex numbers
        Transform whose phase is reused.

    Returns
    -------
    numpy.ndarray (complex128)
        ``output.T * exp(1j * phase(original_fft))``, element-wise.
    """
    # Native NumPy ops replace np.vectorize(cmath.phase / cmath.rect), which
    # ran a Python-level call per element and raised on size-0 inputs.
    # Work in double precision so the result stays complex128, as before.
    phase = np.angle(np.asarray(original_fft, dtype=np.complex128))
    mag = np.asarray(output, dtype=np.float64).T
    # Polar -> rectangular: r * (cos(phi) + i*sin(phi)) == r * exp(i*phi)
    return mag * np.exp(1j * phase)
    
# May not actually use this - may want to just pass a reduced view and then add this back to get right shape
def filter(cqt):
    """Zero the first BINS_PER_OCTAVE rows of ``cqt`` in place and return it.

    The input array is modified; the returned object is the same array.
    NOTE: this function name shadows the built-in ``filter`` in this module.
    """
    first_octave_rows = slice(0, BINS_PER_OCTAVE)
    cqt[first_octave_rows, :] = 0
    return cqt

def get_ft(wav):
    """Return the constant-Q transform of the waveform ``wav``.

    Uses the module-level HOP_LENGTH, FMIN, OCTAVES and BINS_PER_OCTAVE
    settings; the result has OCTAVES * BINS_PER_OCTAVE bins.
    """
    # stft alternative, kept for reference:
    #   librosa.stft(wav, hop_length=HOP_LENGTH, n_fft=N_FFT)
    total_bins = OCTAVES * BINS_PER_OCTAVE
    return librosa.cqt(
        wav,
        hop_length=HOP_LENGTH,
        fmin=FMIN,
        n_bins=total_bins,
        bins_per_octave=BINS_PER_OCTAVE,
    )

def inv_ft(ft):
    """Invert a constant-Q transform ``ft`` with ``librosa.icqt``.

    Uses the same HOP_LENGTH, FMIN and BINS_PER_OCTAVE settings as get_ft.
    """
    # istft alternative, kept for reference:
    #   librosa.istft(ft, hop_length=HOP_LENGTH)
    audio = librosa.icqt(
        ft,
        hop_length=HOP_LENGTH,
        fmin=FMIN,
        bins_per_octave=BINS_PER_OCTAVE,
    )
    return audio
    

In [4]:
# some test data to hack around with
# Loads a single clean test clip, transforms it with get_ft and prints the
# transform's shape as a quick sanity check. Leaves test_file, wav, rate and
# fft defined as notebook globals.
test_file = "Assets\\DataShareArchive\\Test\\Clean\\p232_010.wav"
wav, rate = librosa.core.load(test_file)  # waveform and its sample rate
fft = get_ft(wav)  # organized as (bins, frames)
print(fft.shape)

(768, 239)


In [5]:
# Creating data from clip wave files for adding to long data arrays

# Sample output is (samples, bins) all converted to magnitude
def get_samples(file):
    """Load the audio file ``file`` and return its transform magnitudes.

    The transform from get_ft is organized as (bins, frames); it is
    transposed so the result is organized as (frames, bins).
    """
    audio, _rate = librosa.core.load(file)
    spectrum = get_ft(audio)
    return np.abs(spectrum.T)

In [6]:
def frames_file(data_path, max_samples):
    """Return the path of the frames data file for ``max_samples`` samples.

    The name is ``data_path`` followed by a backslash separator and
    ``fsamples-<max_samples>``.
    """
    leaf = "\\fsamples-" + str(max_samples)
    return os.fsdecode(data_path + leaf)

def targets_file(data_path, max_samples):
    """Return the path of the targets data file for ``max_samples`` samples.

    The name is ``data_path`` followed by a backslash separator and
    ``ftargets-<max_samples>``.
    """
    leaf = "\\ftargets-" + str(max_samples)
    return os.fsdecode(data_path + leaf)

# Iterate over clean & noisy folders to create frames and targets
def create_data(wav_root, data_path, max_samples = 10000):
    """Build memory-mapped noisy-frame and clean-target arrays from wav files.

    For every file name found in ``<wav_root>\\Clean\\`` the file of the same
    name in ``<wav_root>\\Noisy\\`` is loaded as well; the magnitude frames of
    both are appended to two long arrays until ``max_samples`` rows are
    filled or the files run out.

    Parameters
    ----------
    wav_root : str
        Directory containing the ``Clean`` and ``Noisy`` sub-directories.
    data_path : str
        Directory in which the two memmap files are created (see
        frames_file / targets_file for the file names).
    max_samples : int
        Number of rows allocated in each output array.

    Returns
    -------
    (frames, targets)
        ``frames`` is a np.memmap of shape (max_samples, FFT_BINS, 1) holding
        the noisy magnitudes; ``targets`` is a np.memmap of shape
        (max_samples, FFT_BINS) holding the clean magnitudes. Rows past the
        reported sample count are left as created by the memmap.
    """
    clean_dir = wav_root + "\\Clean\\"
    noisy_dir = wav_root + "\\Noisy\\"
    frames = np.memmap(frames_file(data_path, max_samples), mode='w+', dtype=DTYPE, shape=(max_samples, FFT_BINS, 1))
    targets = np.memmap(targets_file(data_path, max_samples), mode='w+', dtype=DTYPE, shape=(max_samples, FFT_BINS))

    sample_index = 0
    # os.listdir returns names in arbitrary order; sort so that the layout of
    # the generated data is reproducible between runs.
    for file in sorted(os.listdir(clean_dir)):
        if sample_index >= max_samples:
            break
        new_frames = get_samples(noisy_dir + file)
        new_targets = get_samples(clean_dir + file)
        # Use the frame count common to both clips (and the space left) so a
        # noisy/clean pair of different length can neither raise a shape
        # error nor leave the two arrays misaligned.
        step = min(new_frames.shape[0], new_targets.shape[0], max_samples - sample_index)
        end = sample_index + step
        frames[sample_index:end, :, 0] = new_frames[:step, :]
        targets[sample_index:end, :] = new_targets[:step, :]
        # Advance by what was actually written to both arrays.
        sample_index = end

    # Make sure everything written so far reaches the backing files.
    frames.flush()
    targets.flush()
    print("Reached sample # " + str(sample_index))
    return frames, targets


In [7]:
# Build the memory-mapped frames (f) and targets (t) for the training set.
# Training data comes in at 357K samples total (at hop length 128)
# NOTE(review): HOP_LENGTH is set to 256 in the constants cell, so the 357K
# figure above may be out of date - TODO confirm.
# Candidate wav roots:
# small test data "Assets\\DataShareArchive\\Test"
# 28K "F:\\Audiodata\\Train28Spk"
f, t = create_data("F:\\Audiodata\\Train28Spk", "f:\\Audiodata", max_samples=300000)

Reached sample # 300000
