In [None]:
import warnings
warnings.filterwarnings(action='ignore') 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
import torch.optim as optim
import torchvision.transforms as transforms
from torch.nn.parameter import Parameter
from torch.utils.data import Dataset, DataLoader

import os
import sys
import math
import time
import random
import pydub
import numpy as np
import pandas as pd
import hashlib
import librosa
import librosa.display
import matplotlib.pyplot as plt
import msaf
import pickle
import seaborn as sns
import IPython.display # IPython.display for audio output

In [None]:
# Settings shared by the preprocessing functions defined in this notebook.
config = {
    # dataset root: preprocess() reads '<path>/audio' and writes below '<path>/preprocessed'
    'path': './datasets/karaoke_ky',
    # sampling rate used when loading audio and when converting seconds to frames
    'sr': 44100,
    # STFT window size and hop size (in samples)
    'n_fft': 2048,
    'hop_length': 512,
    # feature / algorithm identifiers handed to msaf.process()
    'msaf_feature': 'mfcc',
    'msaf_algorithm': 'cnmf',
    # msaf_process() asks for num_segment + 1 boundaries
    'num_segment': 63,
    # forwarded as `center` to librosa.stft in load()
    'stft_center': True,
    # masking modes applied by masking_spectrogram(); each key enables one mode
    'masking': {
        'Segment': {
            'ratio': 0.15,
        },
        'Frequency': {
            'ratio': 0.15,
            'count': 3,
        },
    },
    # number of mel bands produced by mel_filterbank()
    'n_mels': 128,
    # not read by any function in the visible cells
    'batch_size': 16,
}

In [None]:
# plot wave 
def plot_wave(y):
    """
    Draw a waveform with matplotlib and show the figure.

    y : sequence of amplitude values; the x axis is the sample index
        (the axis is labelled 'Time' although no conversion to seconds is done)
    """
    sample_positions = np.arange(len(y))
    plt.plot(sample_positions, y)
    plt.xlabel('Time')
    plt.ylabel('Amplitude')
    plt.show()

# plot spectrogram
def plot_spectrogram(s):
    """
    Display a spectrogram on a dB scale and show the figure.

    s : spectrogram passed to librosa.amplitude_to_db; the dB values are
        computed relative to the maximum of `s` (ref=np.max)
    """
    decibels = librosa.amplitude_to_db(s, ref=np.max)
    librosa.display.specshow(decibels, x_axis='time', y_axis='linear')
    plt.title('power Spectrogram')
    plt.colorbar(format="%+2.0f db")
    plt.tight_layout()
    plt.show()

In [None]:
def merge_state(state0, state1):
    """
    Merge two {label: value} dictionaries, adding up the values of shared keys.

    state0 : <dict> first mapping
    state1 : <dict> second mapping
    return : <dict> a new dictionary with every key of both inputs; the input
             dictionaries themselves are left untouched
    """
    result = dict(state0)
    for k, v in state1.items():
        if k in result:
            result[k] += v
        else:
            result[k] = v
    return result

def msaf_postprocessing(boundaries, labels, desired_num_boundaries):
    """
    Grow or shrink a segmentation until it has exactly `desired_num_boundaries` boundaries.

    boundaries : <numpy.array> boundary detection result of msaf.process(), in ascending order
    labels : <list> segmentation grouping result of msaf.process(), one label per segment
             (i.e. len(boundaries) - 1 entries); the caller's list is not modified
    desired_num_boundaries : <int> desired number of boundaries after postprocessing (>= 2)

    return : (boundaries, labels, number_of_segments)
             boundaries is a float numpy array, labels a new list

    Too few boundaries: the longest segment is repeatedly split at its midpoint
    and both halves keep the label of the split segment.
    Too many boundaries: the interior boundary whose two neighbouring segments
    are shortest on average is repeatedly removed; the merged segment keeps the
    label that covers the longest total duration inside it.

    raises ValueError if fewer than 2 boundaries are requested or supplied when
    the segmentation has to be changed.
    """
    if desired_num_boundaries < 2:
        raise ValueError("desired_num_boundaries must be at least 2")

    # Work on float copies so midpoints are not truncated for integer input
    # and the caller's label list is never mutated.
    boundaries = np.asarray(boundaries, dtype=float)
    labels = list(labels)

    if len(boundaries) == desired_num_boundaries:
        return boundaries, labels, len(labels)
    if len(boundaries) < 2:
        raise ValueError("at least 2 boundaries are required to resize a segmentation")

    # --- too few boundaries: split the longest segment in half ---
    while len(boundaries) < desired_num_boundaries:
        diff = np.diff(boundaries)
        max_idx = int(np.argmax(diff))
        midpoint = (boundaries[max_idx] + boundaries[max_idx + 1]) / 2
        # inserting between two neighbours keeps the array sorted
        boundaries = np.insert(boundaries, max_idx + 1, midpoint)
        labels.insert(max_idx, labels[max_idx])

    if len(boundaries) == desired_num_boundaries:
        return boundaries, labels, len(labels)

    # --- too many boundaries: merge the shortest neighbourhoods ---
    # One {label: duration} dict per segment. It is built ONCE and then kept in
    # step with `boundaries`, so durations accumulated by earlier merges survive.
    state = [{label: duration} for label, duration in zip(labels, np.diff(boundaries).tolist())]

    while len(boundaries) > desired_num_boundaries:
        diff = np.diff(boundaries)
        # average length of the segments left and right of every boundary
        # (the first / last boundary only has one neighbour, which is counted twice)
        avg_bi_diff = (np.append(diff, diff[-1]) + np.insert(diff, 0, diff[0])) / 2

        removal_idx = int(np.argmin(avg_bi_diff))

        # shift removal idx because can't remove first & last boundary
        if removal_idx == 0:
            removal_idx += 1
        if removal_idx == len(boundaries) - 1:
            removal_idx -= 1

        boundaries = np.delete(boundaries, removal_idx, 0)
        # the segments on both sides of the removed boundary become one
        state[removal_idx - 1: removal_idx + 1] = [merge_state(state[removal_idx - 1], state[removal_idx])]

    # Straighten out state to labels: dominant (longest) label of each merged segment
    labels = [max(x, key=x.get) for x in state]

    return boundaries, labels, len(labels)

def time_synchronize(boundaries, random_wave_crop_idx, sr, wave_length):
    """
    Move boundary times onto the time axis of the cropped wave.

    boundaries : <numpy.array> boundary times in seconds (not modified)
    random_wave_crop_idx : <int> number of samples cropped from the start of the wave
    sr : <int> sampling rate
    wave_length : <int> length in samples used to place the final boundary

    return : new array shifted by random_wave_crop_idx / sr, whose first entry
             is forced to 0 and whose last entry is forced to wave_length / sr
    """
    crop_offset_sec = random_wave_crop_idx / sr
    shifted = boundaries - crop_offset_sec
    # pin both ends regardless of where the shift moved them
    shifted[0], shifted[-1] = 0, wave_length / sr
    return shifted
    
def load(file_path, config):
    """
    Load an audio file, crop it to a multiple of the hop length and compute its spectrograms.

    file_path : <str> audio file to read with librosa.load
    config : <dict> uses 'sr', 'hop_length', 'n_fft', 'stft_center' and 'n_mels'

    return : (wave, spectrogram, mel_spectrogram, random_wave_crop_idx, original_length)
             wave is the cropped signal, spectrogram its STFT, mel_spectrogram
             the result of mel_filterbank(), random_wave_crop_idx the number of
             samples dropped at the start and original_length len() of the
             uncropped signal
    """
    y, sr = librosa.load(file_path, sr=config['sr'])
    hop = config['hop_length']

    # Samples that do not fill a whole hop; the crop start is drawn from
    # [0, remainder] so the kept part always fits inside y.
    remainder = len(y) % hop
    random_wave_crop_idx = random.randint(0, remainder)
    kept_length = len(y) - remainder
    wave = y[random_wave_crop_idx:random_wave_crop_idx + kept_length]

    # With center=True a wave of hop_length * k samples yields k + 1 frames;
    # with center=False it yields (k - n_fft / hop_length) + 1 frames.
    spectrogram = librosa.stft(
        wave,
        n_fft=config['n_fft'],
        hop_length=hop,
        center=config['stft_center'],
    )
    mel_spectrogram = mel_filterbank(
        spectrogram,
        sr=config['sr'],
        n_fft=config['n_fft'],
        n_mels=config['n_mels'],
    )
    return wave, spectrogram, mel_spectrogram, random_wave_crop_idx, len(y)

def msaf_process(file_path, config, crop_idx, origin_wave_length):
    """
    Segment an audio file with msaf and adapt the result to the cropped wave.

    file_path : <str> audio file handed to msaf.process
    config : <dict> uses 'msaf_feature', 'msaf_algorithm' (for both boundaries
             and labels), 'num_segment' and 'sr'
    crop_idx : <int> crop offset in samples, forwarded to time_synchronize
    origin_wave_length : <int> wave length in samples, forwarded to time_synchronize

    return : (boundaries, labels, seg_num) after msaf_postprocessing (targeting
             num_segment + 1 boundaries) and time_synchronize
    """
    feature_id = config['msaf_feature']
    algorithm_id = config['msaf_algorithm']
    raw_boundaries, raw_labels = msaf.process(
        file_path,
        feature=feature_id,
        boundaries_id=algorithm_id,
        labels_id=algorithm_id,
    )

    # a copy of the labels is handed over so msaf's own result stays intact
    resized_boundaries, labels, seg_num = msaf_postprocessing(
        raw_boundaries, raw_labels.copy(), config['num_segment'] + 1
    )
    synced_boundaries = time_synchronize(resized_boundaries, crop_idx, config['sr'], origin_wave_length)
    return synced_boundaries, labels, seg_num

In [None]:
# masking
def mel_filterbank(spectrogram, sr, n_fft, n_mels):
    """
    Apply a mel filter bank to the squared magnitude of a spectrogram.

    spectrogram : array whose squared absolute value is projected
    sr, n_fft, n_mels : forwarded to librosa.filters.mel to build the filter matrix

    return : filter matrix @ |spectrogram|**2
    """
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    power = np.abs(spectrogram) ** 2
    return mel_basis @ power

def masking_spectrogram(spectrogram, config, boundaries, labels):
    """
    Zero out parts of a spectrogram according to config['masking'].

    spectrogram : 2-D array indexed [frequency bin, frame]; it is not modified
    config : <dict> uses 'masking', and for the 'Segment' mode 'sr' and 'hop_length'
    boundaries : segment boundary times in seconds (used by the 'Segment' mode)
    labels : one label per segment (used by the 'Segment' mode)

    Every key present in config['masking'] enables one mode:
      'Segment'          {'ratio'}          picks int(ratio * number of distinct labels)
                                            labels at random and masks every segment
                                            carrying one of them
      'ConsecutiveFrame' {'ratio', 'count'} masks `count` random runs of
                                            int(ratio * frames) consecutive frames
      'Frequency'        {'ratio', 'count'} masks `count` random bands of
                                            int(ratio * bins) consecutive frequency bins

    return : (masked_spectrogram, masking_index)
             masking_index lists one [start_frame, end_frame] pair (end exclusive)
             per time mask; frequency masks are not recorded in it
    """
    mode = config['masking']
    masked_spectrogram = spectrogram.copy()
    freqs, frames = spectrogram.shape[0], spectrogram.shape[1]
    masking_index = []

    if 'Segment' in mode:
        seg_masking_ratio = mode['Segment']['ratio']
        # random.sample() rejects sets (TypeError since Python 3.11). An
        # order-preserving list of distinct labels also keeps seeded runs
        # reproducible, which set iteration order does not guarantee.
        unique_labels = list(dict.fromkeys(labels))
        masking_labels = random.sample(unique_labels, k=int(seg_masking_ratio * len(unique_labels)))
        label_array = np.array(labels)
        for masking_label in masking_labels:
            for masking_idx in np.flatnonzero(label_array == masking_label):
                start, end = boundaries[masking_idx], boundaries[masking_idx + 1]
                # seconds -> frame index, rounded outwards so the whole segment is covered
                start = math.floor(start * config['sr'] / config['hop_length'])
                end = math.ceil(end * config['sr'] / config['hop_length'])
                # keep the recorded range inside the spectrogram (a negative
                # start would otherwise be read as an index from the end)
                start = min(max(start, 0), frames)
                end = min(max(end, start), frames)
                masked_spectrogram[:, start:end] = 0
                masking_index.append([start, end])

    if 'ConsecutiveFrame' in mode:
        cons_frame_masking_ratio = mode['ConsecutiveFrame']['ratio']
        cons_frame_masking_count = mode['ConsecutiveFrame']['count']
        masking_range = int(frames * cons_frame_masking_ratio)
        for _ in range(cons_frame_masking_count):
            masked_idx = random.randint(0, int(frames - masking_range))
            masked_spectrogram[:, masked_idx:masked_idx + masking_range] = 0
            # one [start, end] entry per mask, same format as the 'Segment' mode
            masking_index.append([masked_idx, masked_idx + masking_range])

    if 'Frequency' in mode:
        freq_masking_ratio, freq_masking_count = mode['Frequency']['ratio'], mode['Frequency']['count']
        masking_range = int(freqs * freq_masking_ratio)
        for _ in range(freq_masking_count):
            masked_freq = random.randint(0, int(freqs - masking_range))
            masked_spectrogram[masked_freq:masked_freq + masking_range] = 0

    return masked_spectrogram, masking_index

def masking(spectrogram, config, boundaries, labels):
    """
    Mask a spectrogram and derive the matching mel spectrogram and waveform.

    spectrogram : STFT of the cropped wave (as returned by load())
    config : <dict> uses 'masking', 'sr', 'n_fft', 'n_mels', 'hop_length' and 'stft_center'
    boundaries, labels : segmentation forwarded to masking_spectrogram()

    return : (masked_wave, masked_mel_spectrogram, masking_index)
    """
    masked_spectrogram, masking_index = masking_spectrogram(spectrogram, config, boundaries, labels)
    masked_mel_spectrogram = mel_filterbank(
        masked_spectrogram, sr=config['sr'], n_fft=config['n_fft'], n_mels=config['n_mels']
    )
    # The inverse transform must use the same `center` setting as the forward
    # STFT in load(); otherwise the reconstructed wave has a different length
    # than the cropped input whenever config['stft_center'] is False.
    masked_wave = librosa.istft(
        masked_spectrogram,
        hop_length=config['hop_length'],
        win_length=config['n_fft'],
        center=config.get('stft_center', True),
    )
    return masked_wave, masked_mel_spectrogram, masking_index

In [None]:
def store_mp3(f, x, sr=44100, normalized=False):
    """
    numpy array to MP3
    reference : https://stackoverflow.com/questions/53633177/how-to-read-a-mp3-audio-file-into-a-numpy-array-save-a-numpy-array-to-mp3

    f : <str> output path WITHOUT extension; '.mp3' is appended
    x : <numpy.array> mono samples
    sr : <int> frame rate written to the file
    normalized : <bool> True if `x` holds floats in [-1, 1) that have to be
                 scaled to the 16-bit range, False if `x` is already on that scale

    Samples outside the 16-bit range are clipped instead of wrapping around
    (e.g. a normalized sample of exactly 1.0 would otherwise turn into -32768).
    """
    samples = np.asarray(x, dtype=np.float64)
    if normalized:  # normalized array - each item should be a float in [-1, 1)
        samples = samples * 2 ** 15
    # saturate at the int16 limits before the (truncating) integer cast
    y = np.clip(samples, -2 ** 15, 2 ** 15 - 1).astype(np.int16)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=1)
    song.export(f + ".mp3", format="mp3", bitrate="320k")

def store_obj(f, obj):
    """
    Pickle `obj` into the file at path `f` (the file is overwritten).

    f : <str> destination path
    obj : any picklable object
    """
    with open(f, 'wb') as sink:
        pickle.dump(obj, sink)

def filename_hashing(filename):
    """
    Return the SHA-1 hex digest of a file name.

    filename : <str> name to hash
    return : <str> 40-character hexadecimal digest

    The name is encoded as CP949 first, so every name that CP949 can represent
    keeps the hash it always had. Names containing characters outside CP949
    used to raise UnicodeEncodeError; they are now hashed from their UTF-8
    bytes instead.
    """
    try:
        encoded = bytes(filename, encoding='cp949')
    except UnicodeEncodeError:
        # 'surrogatepass' also covers names with undecodable bytes that the OS
        # layer returned as lone surrogates
        encoded = filename.encode('utf-8', errors='surrogatepass')
    return hashlib.sha1(encoded).hexdigest()


def preprocess(config, features, algorithms, exts=('mp3', 'wav')):
    """
    Run the whole preprocessing pipeline over every audio file of the dataset.

    config : <dict> notebook configuration; it is NOT modified (each
             feature/algorithm run works on its own copy)
    features : iterable of msaf feature ids
    algorithms : iterable of msaf algorithm ids (used for boundaries and labels)
    exts : lower-case file extensions (without dot) that are processed

    Reads  '<config path>/audio/*'.
    Writes to '<config path>/preprocessed/num_seg_<num_segment + 1>/':
      cropped_<hash>.mp3                      cropped wave
      melspectrogram_<hash>                   pickled mel spectrogram
      masked_<feature>_<algorithm>_<hash>.mp3 masked wave
      others_<feature>_<algorithm>_<hash>     pickled [masked mel spectrogram,
                                              boundaries, labels, seg_num, masking_index]
      meta.csv                                one row per audio file

    return : <list> the meta.csv rows. Every row has the same length: 4 fixed
             entries followed by two entries per feature/algorithm combination.
             A combination that failed for a file is reported on stdout and
             its two entries are empty strings.

    raises Exception if the dataset directory does not exist.
    """
    dataset_path = os.path.join(config['path'], 'audio')
    preprocessed_path = os.path.join(config['path'], 'preprocessed', "num_seg_" + str(config['num_segment'] + 1))

    if not os.path.isdir(dataset_path):
        raise Exception("dataset path is invalid!")
    # makedirs also creates the intermediate 'preprocessed' directory
    os.makedirs(preprocessed_path, exist_ok=True)

    # materialised once so the same combinations (and column order) are used for every file
    combinations = [(feature, algorithm) for feature in features for algorithm in algorithms]

    meta_result = []
    # sorted() makes the processing / row order independent of the file system
    for filename in sorted(os.listdir(dataset_path)):
        stem, ext = os.path.splitext(filename)
        if ext[1:].lower() not in exts:  # check file extension
            continue

        source_path = os.path.join(dataset_path, filename)
        hashed_filename = filename_hashing(stem)

        # crop with hop_length multiple & Short Time Fourier Transform for spectrogram
        wave, spectrogram, mel_spectrogram, crop_idx, origin_wave_length = load(source_path, config)

        # store cropped wave(mp3) & spectrogram(pickle)
        cropped_wave_file_name = os.path.join(preprocessed_path, "cropped_" + hashed_filename)
        melspectrogram_file_name = os.path.join(preprocessed_path, 'melspectrogram_' + hashed_filename)
        store_mp3(cropped_wave_file_name, wave, config['sr'], normalized=True)
        store_obj(melspectrogram_file_name, mel_spectrogram)

        # meta information buffer for total meta.csv
        meta_buffer = [source_path, hashed_filename, cropped_wave_file_name + ".mp3", melspectrogram_file_name]

        for feature, algorithm in combinations:
            # per-run copy: the caller's config keeps its own msaf settings
            run_config = dict(config, msaf_feature=feature, msaf_algorithm=algorithm)
            try:
                # msaf & masking
                boundaries, labels, seg_num = msaf_process(source_path, run_config, crop_idx, origin_wave_length)
                masked_wave, masked_mel_spectrogram, masking_index = masking(spectrogram, run_config, boundaries, labels)

                # check between origin data shape and return of masking process shape
                # (explicit raise instead of assert so the check survives `python -O`)
                if wave.shape != masked_wave.shape:
                    raise ValueError("masked wave shape {} differs from wave shape {}".format(masked_wave.shape, wave.shape))
                if mel_spectrogram.shape != masked_mel_spectrogram.shape:
                    raise ValueError("masked mel spectrogram shape {} differs from mel spectrogram shape {}".format(
                        masked_mel_spectrogram.shape, mel_spectrogram.shape))

                # store masked wave(mp3) & others(pickle)
                masked_wave_file_name = os.path.join(preprocessed_path, "masked_{}_{}_".format(feature, algorithm) + hashed_filename)
                others_file_name = os.path.join(preprocessed_path, "others_{}_{}_".format(feature, algorithm) + hashed_filename)
                store_mp3(masked_wave_file_name, masked_wave, config['sr'], normalized=True)
                store_obj(others_file_name, [masked_mel_spectrogram, boundaries, labels, seg_num, masking_index])
                meta_buffer.extend([masked_wave_file_name + '.mp3', others_file_name])
            except Exception as ex:
                # TODO : error handling, logging
                # best effort: report the failure and keep the row aligned with the header
                print(feature, algorithm, ex)
                meta_buffer.extend(["", ""])

        # append buffer of meta info in result
        meta_result.append(meta_buffer)

    # store result to csv
    columns = ["origin_file_path", "hashed_filename", "cropped_file_path", "mel_file_path"]
    for feature, algorithm in combinations:
        columns.append("masked_wave_{}_{}".format(feature, algorithm))
        columns.append("masked_other_{}_{}".format(feature, algorithm))
    try:
        np.savetxt(os.path.join(preprocessed_path, "meta.csv"), np.array(meta_result, dtype=str),
                   delimiter=",", fmt="%s", header=",".join(columns))
    except Exception as ex:
        print(ex)
    return meta_result

In [None]:
# Preprocess the dataset with the 'mfcc' feature and the 'cnmf' algorithm;
# `mr` holds the meta rows returned by preprocess().
mr = preprocess(config, features=['mfcc'], algorithms=['cnmf'])

  hop_length=hop_length))
  hop_length=hop_length))
  hop_length=hop_length))
