In [2]:
import wave
import numpy as np
import librosa
from IPython import embed
import os
from sklearn import preprocessing

In [3]:
def CreateFolder( fd ):
    if not os.path.exists(fd):
        os.makedirs(fd)

In [4]:
def load_audio(filename, mono=True, fs=44100):
    """Load audio file into numpy array
    Supports 24-bit wav-format
    
    Taken from TUT-SED system: https://github.com/TUT-ARG/DCASE2016-baseline-system-python
    
    Parameters
    ----------
    filename:  str
        Path to audio file
    mono : bool
        In case of multi-channel audio, channels are averaged into single channel.
        (Default value=True)
    fs : int > 0 [scalar]
        Target sample rate, if input audio does not fulfil this, audio is resampled.
        (Default value=44100)
    Returns
    -------
    audio_data : numpy.ndarray [shape=(signal_length, channel)]
        Audio
    sample_rate : integer
        Sample rate
    """

    file_base, file_extension = os.path.splitext(filename)
    if file_extension == '.wav':
        _audio_file = wave.open(filename)

        # Audio info
        sample_rate = _audio_file.getframerate()
        sample_width = _audio_file.getsampwidth()
        number_of_channels = _audio_file.getnchannels()
        number_of_frames = _audio_file.getnframes()

        # Read raw bytes
        data = _audio_file.readframes(number_of_frames)
        _audio_file.close()

        # Convert bytes based on sample_width
        num_samples, remainder = divmod(len(data), sample_width * number_of_channels)
        if remainder > 0:
            raise ValueError('The length of data is not a multiple of sample size * number of channels.')
        if sample_width > 4:
            raise ValueError('Sample size cannot be bigger than 4 bytes.')

        if sample_width == 3:
            # 24 bit audio
            a = np.empty((num_samples, number_of_channels, 4), dtype=np.uint8)
            raw_bytes = np.fromstring(data, dtype=np.uint8)
            a[:, :, :sample_width] = raw_bytes.reshape(-1, number_of_channels, sample_width)
            a[:, :, sample_width:] = (a[:, :, sample_width - 1:sample_width] >> 7) * 255
            audio_data = a.view('<i4').reshape(a.shape[:-1]).T
        else:
            # 8 bit samples are stored as unsigned ints; others as signed ints.
            dt_char = 'u' if sample_width == 1 else 'i'
            a = np.fromstring(data, dtype='<%s%d' % (dt_char, sample_width))
            audio_data = a.reshape(-1, number_of_channels).T

        if mono:
            # Down-mix audio
            audio_data = np.mean(audio_data, axis=0)

        # Convert int values into float
        audio_data = audio_data / float(2 ** (sample_width * 8 - 1) + 1)

        # Resample
        if fs != sample_rate:
            audio_data = librosa.core.resample(audio_data, sample_rate, fs)
            sample_rate = fs

        return audio_data, sample_rate
    return None, None


In [5]:
def load_desc_file(_desc_file):
    _desc_dict = dict()
    for line in open(_desc_file):
        words = line.strip().split('\t')
        name = words[0].split('/')[-1]
        if name not in _desc_dict:
            _desc_dict[name] = list()
        _desc_dict[name].append([float(words[2]), float(words[3]), __class_labels[words[-1]]])
    return _desc_dict


In [6]:
def extract_mbe(_y, _sr, _nfft, _nb_mel):
    spec, n_fft = librosa.core.spectrum._spectrogram(y=_y, n_fft=_nfft, hop_length=_nfft/2, power=1)
    mel_basis = librosa.filters.mel(sr=_sr, n_fft=_nfft, n_mels=_nb_mel)
    return np.log(np.dot(mel_basis, spec))

In [31]:
# ###################################################################
#              Main script starts here
# ###################################################################

is_mono = False

#__class_labels = {
 #   'brakes squeaking': 0,
 #   'car': 1,
  #  'children': 2,
  #  'large vehicle': 3,
  #  'people speaking': 4,
  #  'people walking': 5
#}
__class_labels = {
    '(object) rustling':0, 
    '(object) snapping':1,
    'cupboard':2,
    'cutlery':3, 'dishes':4,
    'drawer':5,
    'glass jingling':6,
    'object impact':7, 
    'people walking':8, 
    'washing dishes':9, 
    'water tap running':10 
  }

In [32]:
# location of data.
folds_list = [1, 2, 3, 4]
evaluation_setup_folder = '/media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/TUT-sound-events-2016-development/TUT-sound-events-2016-development-meta/evaluation_setup'
audio_folder = '/media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/TUT-sound-events-2016-development/audio/home'

In [33]:
# Output
feat_folder = '/media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/DCASE2017/feat/'
CreateFolder(feat_folder)

In [34]:
# User set parameters
nfft = 2048
win_len = nfft
hop_len = win_len / 2
nb_mel_bands = 40
sr = 44100


In [35]:
# -----------------------------------------------------------------------
# Feature extraction and label generation
# -----------------------------------------------------------------------
# Load labels
train_file = os.path.join(evaluation_setup_folder, 'home_fold{}_train.txt'.format(1))
evaluate_file = os.path.join(evaluation_setup_folder, 'home_fold{}_evaluate.txt'.format(1))

In [36]:
desc_dict = load_desc_file(train_file)
desc_dict.update(load_desc_file(evaluate_file)) # contains labels for all the audio in the dataset
desc_dict

{'a030.wav': [[5.940996, 6.282603, 1],
  [10.159102, 10.693792, 7],
  [12.127057, 12.490943, 7],
  [14.956456, 15.3649, 7],
  [17.837839, 18.261135, 1],
  [18.751267, 19.189416, 7],
  [23.310981, 23.697146, 7],
  [28.761845, 29.185141, 1],
  [31.932851, 32.43041, 1],
  [33.447805, 33.819117, 1],
  [33.945364, 35.556859, 5],
  [49.733559, 50.149429, 7],
  [51.902023, 52.392155, 7],
  [52.607516, 52.926845, 1],
  [52.993681, 53.394698, 5],
  [53.520944, 55.971605, 7],
  [56.513721, 56.959295, 1],
  [57.687067, 58.333151, 5],
  [58.444544, 58.904971, 7],
  [58.934676, 59.736711, 5],
  [59.825826, 60.353089, 1],
  [61.088287, 61.459599, 1],
  [113.071999, 113.695803, 7],
  [126.624895, 127.211568, 5],
  [127.909635, 128.659686, 7],
  [131.073215, 131.87525, 7],
  [132.268841, 132.877793, 7],
  [168.04106, 168.583176, 7],
  [172.704742, 173.49935, 7],
  [173.699859, 179.88592, 7]],
 'a031.wav': [[0.879372, 1.522274, 7],
  [2.261242, 4.411638, 8],
  [4.685056, 5.305789, 7],
  [5.379686, 7.14

In [42]:
# Extract features for all audio files, and save it along with labels
for audio_filename in os.listdir(audio_folder):
    audio_file = os.path.join(audio_folder, audio_filename)
    print('Extracting features and label for : {}'.format(audio_file))
    y, sr = load_audio(audio_file, mono=is_mono, fs=sr)
    mbe = None
    if is_mono:
        mbe = extract_mbe(y, sr, nfft, nb_mel_bands).T
        print mbe.shape
    else:
        for ch in range(y.shape[0]):
            mbe_ch = extract_mbe(y[ch, :], sr, nfft, nb_mel_bands).T
            print mbe_ch.shape
            if mbe is None:
                mbe = mbe_ch
            else:
                mbe = np.concatenate((mbe, mbe_ch), 1)
    label = np.zeros((mbe.shape[0], len(__class_labels)))
    tmp_data = np.array(desc_dict[audio_filename])
    frame_start = np.floor(tmp_data[:, 0] * sr / hop_len).astype(int)
    frame_end = np.ceil(tmp_data[:, 1] * sr / hop_len).astype(int)
    print(frame_start,frame_end)
    se_class = tmp_data[:, 2].astype(int)
    for ind, val in enumerate(se_class):
        label[frame_start[ind]:frame_end[ind], val] = 1
    tmp_feat_file = os.path.join(feat_folder, '{}_{}.npz'.format(audio_filename, 'mon' if is_mono else 'bin'))
    np.savez(tmp_feat_file, mbe, label)


Extracting features and label for : /media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/TUT-sound-events-2016-development/audio/home/a030.wav
(7820, 40)
(7820, 40)
(array([ 255,  437,  522,  644,  768,  807, 1003, 1238, 1375, 1440, 1461,
       2141, 2235, 2265, 2282, 2304, 2433, 2484, 2516, 2538, 2576, 2630,
       4869, 5453, 5508, 5644, 5696, 7236, 7437, 7480]), array([ 271,  461,  538,  662,  787,  827, 1021, 1257, 1397, 1457, 1532,
       2160, 2257, 2280, 2300, 2411, 2454, 2513, 2537, 2573, 2600, 2647,
       4897, 5479, 5541, 5680, 5723, 7261, 7472, 7748]))
Extracting features and label for : /media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/TUT-sound-events-2016-development/audio/home/a031.wav
(7781, 40)
(7781, 40)
(array([  37,   97,  201,  231,  280,  362,  545,  881,  963, 1007, 1065,
       1092, 1152, 1217, 1452, 1539, 1596, 1807, 1816, 1850, 2164, 2174,
       2291, 2356, 2815, 2993, 3085, 3356, 3490, 4208, 4282, 4500, 4622,
       5200, 5668, 5753, 5832, 6033, 6122, 6133, 

(10465, 40)
(10465, 40)
(array([  126,   721,   748,  1051,  1082,  1161,  1257,  1642,  1696,
        1879,  1918,  2033,  2111,  2412,  2503,  2697,  2790,  2897,
        2960,  3055,  3103,  3263,  3330,  3415,  3436,  3502,  3583,
        3740,  3917,  4018,  4178,  4227,  4281,  4331,  4450,  4487,
        4618,  4657,  5037,  5190,  5398,  5652,  5767,  5942,  6015,
        6141,  6252,  6584,  6962,  7007,  7815,  7865,  8258,  8323,
        8377,  8768,  8885,  8998,  9224,  9298,  9354,  9445,  9532,
        9651,  9701,  9850,  9961, 10370]), array([  712,   733,   924,  1083,  1115,  1258,  1427,  1668,  1713,
        1896,  1978,  2056,  2165,  2423,  2545,  2727,  2835,  2904,
        3017,  3070,  3122,  3272,  3403,  3422,  3470,  3515,  3603,
        3769,  4019,  4147,  4199,  4247,  4295,  4414,  4459,  4571,
        4629,  5027,  5191,  5347,  5426,  5698,  5798,  5956,  6077,
        6164,  6289,  6594,  6973,  7104,  7856,  8035,  8319,  8349,
        8413,  8782, 

In [53]:
# -----------------------------------------------------------------------
# Feature Normalization
# -----------------------------------------------------------------------

for fold in folds_list:
    train_file = os.path.join(evaluation_setup_folder, 'home_fold{}_train.txt'.format(1))
    evaluate_file = os.path.join(evaluation_setup_folder, 'home_fold{}_evaluate.txt'.format(1))
    train_dict = load_desc_file(train_file)
    test_dict = load_desc_file(evaluate_file)
    #print train_dict
    X_train, Y_train, X_test, Y_test = None, None, None, None
    for key in train_dict.keys():
        #print key
        tmp_feat_file = os.path.join(feat_folder, '{}_{}.npz'.format(key, 'mon' if is_mono else 'bin'))
        dmp = np.load(tmp_feat_file)
        tmp_mbe, tmp_label = dmp['arr_0'], dmp['arr_1']
        if X_train is None:
            X_train, Y_train = tmp_mbe, tmp_label
        else:
            X_train, Y_train = np.concatenate((X_train, tmp_mbe), 0), np.concatenate((Y_train, tmp_label), 0)
            print(X_train.shape,Y_train.shape)
            
    for key in test_dict.keys():
        tmp_feat_file = os.path.join(feat_folder, '{}_{}.npz'.format(key, 'mon' if is_mono else 'bin'))
        dmp = np.load(tmp_feat_file)
        tmp_mbe, tmp_label = dmp['arr_0'], dmp['arr_1']
        if X_test is None:
            X_test, Y_test = tmp_mbe, tmp_label
        else:
            X_test, Y_test = np.concatenate((X_test, tmp_mbe), 0), np.concatenate((Y_test, tmp_label), 0)
     # Normalize the training data, and scale the testing data using the training data weights
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    normalized_feat_file = os.path.join(feat_folder, 'mbe_{}_fold{}.npz'.format('mon' if is_mono else 'bin', fold))
    np.savez(normalized_feat_file, X_train, Y_train, X_test, Y_test)
    print('normalized_feat_file : {}'.format(normalized_feat_file))


((15653, 80), (15653, 11))
((28620, 80), (28620, 11))
((36440, 80), (36440, 11))
((44221, 80), (44221, 11))
((54662, 80), (54662, 11))
((65127, 80), (65127, 11))
normalized_feat_file : /media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/DCASE2017/feat/mbe_bin_fold1.npz
((15653, 80), (15653, 11))
((28620, 80), (28620, 11))
((36440, 80), (36440, 11))
((44221, 80), (44221, 11))
((54662, 80), (54662, 11))
((65127, 80), (65127, 11))
normalized_feat_file : /media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/DCASE2017/feat/mbe_bin_fold2.npz
((15653, 80), (15653, 11))
((28620, 80), (28620, 11))
((36440, 80), (36440, 11))
((44221, 80), (44221, 11))
((54662, 80), (54662, 11))
((65127, 80), (65127, 11))
normalized_feat_file : /media/manjunath/BCE0E709E0E6C8AA/Evaluation-data/DCASE2017/feat/mbe_bin_fold3.npz
((15653, 80), (15653, 11))
((28620, 80), (28620, 11))
((36440, 80), (36440, 11))
((44221, 80), (44221, 11))
((54662, 80), (54662, 11))
((65127, 80), (65127, 11))
normalized_feat_file : /media/manj