In [1]:
import os
import pickle
from collections import defaultdict

import librosa
import numpy as np
import pandas as pd
from librosa.feature import melspectrogram
from python_speech_features import mfcc
from scipy.io import wavfile
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [2]:
def check_test_data():
    """Load the cached test-feature object, if one has been pickled.

    Reads the module-level ``config``: ``config.test_p_path`` is the pickle
    location and ``config.mode`` is only used in the log message.

    Returns:
        The unpickled object when ``config.test_p_path`` is an existing file,
        otherwise ``None``.
    """
    cache_path = config.test_p_path
    if not os.path.isfile(cache_path):
        return None

    print('Loading existing data for {} model'.format(config.mode))
    # NOTE: pickle.load executes arbitrary code; only point this at trusted files.
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [11]:
def build_test_feat(audio_dir):
    """Cut every wav file in ``audio_dir`` into consecutive windows and featurise them.

    If ``check_test_data()`` returns a cached object, its ``.data`` attribute is
    returned unchanged and no audio is read.

    Uses the module-level ``config`` (``step``, ``feature_type``, ``min``,
    ``max``, ``mode`` and the feature parameters).

    Args:
        audio_dir: Directory whose entries are all read with ``wavfile.read``.

    Returns:
        A mapping ``file name -> list of feature arrays`` with one array per
        non-overlapping window of ``config.step`` samples. Clips shorter than
        one window map to an empty list.

    Raises:
        ValueError: if ``config.feature_type`` is not ``'mfccs'``, ``'mels'``
            or ``'raw'``.
    """
    cached = check_test_data()
    if cached:
        return cached.data  # a dictionary of arrays
    fsplits = defaultdict(list)
    print('Extracting features from audio')
    for fn in tqdm(os.listdir(audio_dir)):
        rate, wav = wavfile.read(os.path.join(audio_dir, fn))
        fsplits[fn] = []

        # "+ 1" keeps the last full window when the clip length is an exact
        # multiple of config.step (a plain "len - step" bound drops it).
        for start in range(0, wav.shape[0] - config.step + 1, config.step):
            sample = wav[start:start + config.step]
            # Same feature branches as build_rand_feat so both stay in sync.
            # NOTE(review): librosa expects floating-point audio while
            # wavfile.read returns the stored dtype - confirm the clips are float.
            if config.feature_type == 'mfccs':
                x = mfcc(sample, rate, numcep=config.nfeat,
                         nfilt=config.nfilt, nfft=config.nfft)
            elif config.feature_type == 'mels':
                # Keywords work on both old and keyword-only librosa releases.
                x = melspectrogram(y=sample, sr=rate, n_mels=config.n_mels,
                                   n_fft=config.nfft)
                x = librosa.power_to_db(x)
            elif config.feature_type == 'raw':
                x = sample
            else:
                # Previously `x` was left unbound (or stale from the prior window).
                raise ValueError(
                    'Unsupported feature_type: {!r}'.format(config.feature_type))

            # Min-max scale with the statistics stored on config.
            x = (x - config.min) / (config.max - config.min)
            if config.mode == 'conv':
                # Add a leading batch axis and a trailing channel axis.
                x = x.reshape(1, x.shape[0], x.shape[1], 1)
            elif config.mode == 'time':
                # Add only the leading batch axis.
                x = np.expand_dims(x, axis=0)
            fsplits[fn].append(x)
    return fsplits

In [21]:
def build_predictions(audio_dir):
    """Run the model over every window of every file in ``audio_dir``.

    Uses the module-level ``model``, ``classes`` and ``fn2class``; features
    come from ``build_test_feat(audio_dir)``.

    Args:
        audio_dir: Directory whose file names are looked up in ``fn2class``.

    Returns:
        A tuple ``(y_true, y_pred, fn_prob)``:
        ``y_true`` / ``y_pred`` are flat lists with one class index per window;
        ``fn_prob`` maps each file name to the model output averaged over that
        file's windows. A file with no windows gets a NaN vector of length
        ``len(classes)`` and contributes nothing to ``y_true`` / ``y_pred``.

    Raises:
        KeyError: if a file in ``audio_dir`` has no entry in ``fn2class``.
    """
    y_true = []
    y_pred = []
    fn_prob = {}
    fsplits = build_test_feat(audio_dir)

    print("Making predictions")
    for fn in tqdm(os.listdir(audio_dir)):
        label = fn2class[fn]
        c = classes.index(label)
        splits = fsplits[fn]
        if len(splits) == 0:
            # Clip shorter than one window: nothing to predict. Keep the key
            # with a consistently shaped placeholder instead of np.mean([]).
            fn_prob[fn] = np.full(len(classes), np.nan)
            continue

        # Each window carries a leading batch axis of size 1 (see the 'conv'
        # and 'time' branches of build_test_feat), so stacking along axis 0
        # gives one batch per file and a single predict call instead of one
        # call per window.
        batch = np.concatenate(splits, axis=0)
        probs = np.asarray(model.predict(batch))
        probs = probs.reshape(probs.shape[0], -1)

        y_pred.extend(np.argmax(probs, axis=1).tolist())
        y_true.extend([c] * probs.shape[0])
        fn_prob[fn] = probs.mean(axis=0)
    return y_true, y_pred, fn_prob

In [12]:
# Test-set label table; the CSV's first column becomes the index.
df = pd.read_csv('data/test/roadsound_labels.csv', index_col=0)

# Sorted unique label names; a label's position in this list is its class id.
classes = list(np.unique(df['labels']))

# Lookup from file name to label name.
fn2class = {fname: label for fname, label in zip(df['fname'], df['labels'])}

# Pickled configuration object used by the feature/prediction helpers.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
# NOTE(review): unpickling presumably needs the original Config class to be
# importable in this kernel; it is not defined in this notebook - confirm.
p_path = os.path.join('pickles', 'conv.p') ### configuration file
with open(p_path, 'rb') as handle:
    config = pickle.load(handle)

In [20]:
# Load the saved Keras model; build_predictions reads it as the global `model`.
model = load_model('models/10epochs_20200218.h5')

In [22]:
# Featurise and classify every file in the test directory.
# The recorded run below needed ~30 minutes for 194 files.
y_true, y_pred, fn_prob = build_predictions('audio/test_roadsound')

  0%|          | 0/194 [00:00<?, ?it/s]

Extracting features from audio


100%|██████████| 194/194 [00:25<00:00,  7.55it/s]
  0%|          | 0/194 [00:00<?, ?it/s]

Making predictions


100%|██████████| 194/194 [30:21<00:00,  9.39s/it]   


In [23]:
# Window-level accuracy: build_predictions emits one y_true/y_pred entry per
# audio window, so this is not a per-file score.
acc_score = accuracy_score(y_true=y_true, y_pred=y_pred)

In [30]:
# Training label table; the CSV's first column becomes the index.
cur_df = pd.read_csv('data/train/roadsound_labels.csv', index_col=0)

In [31]:
# Label table for the train_noisy set; the CSV's first column becomes the index.
noisy_df = pd.read_csv('data/train_noisy/roadsound_labels.csv', index_col=0)

In [33]:
# Stack both training label tables and index the result by file name.
# The two frames have different columns; sort=True keeps the alphabetical
# column order of the recorded output and silences the pandas FutureWarning
# about the changing default.
# NOTE: this rebinds `df`, which earlier held the test-set label table.
df = pd.concat([cur_df, noisy_df], sort=True).set_index('fname')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [34]:
df

Unnamed: 0_level_0,flickr_video_URL,labels,license
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40d9de68.wav,,Bicycle_bell,
e66309d9.wav,,Bicycle_bell,
d1fb51da.wav,,Bicycle_bell,
074a72f0.wav,,Bicycle_bell,
35badf6d.wav,,Bicycle_bell,
...,...,...,...
fdda6e7d.wav,http://www.flickr.com/videos/92996378@N00/7566...,Bicycle_bell,CC BY 2.0
fdeb319b.wav,http://www.flickr.com/videos/9749756@N06/56971...,Bus,CC BY 2.0
fe28bb6d.wav,http://www.flickr.com/videos/25921029@N05/4805...,Bicycle_bell,CC BY 2.0
fee6710e.wav,http://www.flickr.com/videos/53051966@N00/2470...,Bus,CC BY-SA 2.0


In [39]:
def check_data():
    """Load the cached training-feature object, if one has been pickled.

    Reads the module-level ``config``: ``config.p_path`` is the pickle location
    and ``config.mode`` is only used in the log message.

    Returns:
        The unpickled object when ``config.p_path`` is an existing file,
        otherwise ``None``.
    """
    pickle_path = config.p_path
    if not os.path.isfile(pickle_path):
        return None

    print('Loading existing data for {} model'.format(config.mode))
    # NOTE: pickle.load executes arbitrary code; only point this at trusted files.
    with open(pickle_path, 'rb') as stored:
        return pickle.load(stored)

In [40]:
def build_rand_feat(df, split, audio_dir='clean'):
    """Build (or load from cache) a randomly sampled feature set for one split.

    Draws ``n_samples`` random windows of ``config.step`` samples: a class is
    picked according to ``prob_dist``, then a random file of that class, then a
    random offset inside that file. Features are min-max scaled with the
    extrema observed during this build, which are also stored on
    ``config.min`` / ``config.max``. The updated ``config`` is pickled to
    ``config.p_path``.

    Uses the module-level ``config``, ``classes``, ``prob_dist`` and
    ``n_samples``.
    NOTE(review): ``n_samples`` and ``prob_dist`` are not defined anywhere in
    the visible notebook - they must exist in the kernel before calling this.
    NOTE(review): ``config.min`` / ``config.max`` are overwritten on every
    build, including ``split='test'``; build_test_feat normalises with these
    values - confirm that is intended.

    Args:
        df: Frame indexed by file name with a ``labels`` column.
        split: ``'train'`` or ``'test'``; selects which cache slots are used
            (``data[0:2]`` for train, ``data[2:4]`` for test).
        audio_dir: Directory containing the wav files (defaults to ``'clean'``,
            the location that used to be hard-coded).

    Returns:
        ``(X, y)``: the scaled features with a trailing channel axis and the
        one-hot encoded class indices; or the cached pair when present.

    Raises:
        ValueError: if a selected clip is shorter than ``config.step`` samples
            or ``config.feature_type`` is unsupported.
    """
    cached = check_data()
    # Work on a copy of whatever is cached; no Config instance is needed for
    # the "nothing cached" case.
    cached_data = list(cached.data) if cached else [None, None, None, None]
    if split == 'train' and cached_data[0] is not None:
        return cached_data[0], cached_data[1]
    elif split == 'test' and cached_data[2] is not None:
        return cached_data[2], cached_data[3]

    # Start from the cached slots so that building one split does not wipe the
    # other split out of the pickle when config is dumped below.
    config.data = cached_data

    X = []
    y = []
    _min, _max = float('inf'), -float('inf')
    print('Building features for '+split)
    for _ in tqdm(range(n_samples)):
        rand_class = np.random.choice(classes, p=prob_dist)
        fname = np.random.choice(df[df.labels == rand_class].index)
        rate, wav = wavfile.read(os.path.join(audio_dir, fname))
        if wav.shape[0] < config.step:
            raise ValueError(
                '{} has {} samples, fewer than config.step={}'.format(
                    fname, wav.shape[0], config.step))
        # randint's upper bound is exclusive; "+ 1" makes the last valid start
        # reachable and allows clips that are exactly one window long.
        rand_index = np.random.randint(0, wav.shape[0] - config.step + 1)
        sample = wav[rand_index:rand_index + config.step]

        if config.feature_type == 'mfccs':
            X_sample = mfcc(sample, rate, numcep=config.nfeat,
                            nfilt=config.nfilt, nfft=config.nfft)
        elif config.feature_type == 'mels':
            # Keywords work on both old and keyword-only librosa releases.
            X_sample = melspectrogram(y=sample, sr=rate, n_mels=config.n_mels,
                                      n_fft=config.nfft)
            X_sample = librosa.power_to_db(X_sample)
        elif config.feature_type == 'raw':
            X_sample = sample
        else:
            raise ValueError(
                'Unsupported feature_type: {!r}'.format(config.feature_type))

        _min = min(np.amin(X_sample), _min)
        _max = max(np.amax(X_sample), _max)
        X.append(X_sample)
        y.append(classes.index(rand_class))  # integer class id

    config.min = _min
    config.max = _max
    X, y = np.array(X), np.array(y)
    X = (X - _min) / (_max - _min)
    # Append a channel axis; requires 2-D features per sample.
    X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
    y = to_categorical(y)
    if split == 'train':
        config.data[0], config.data[1] = (X, y)
    elif split == 'test':
        config.data[2], config.data[3] = (X, y)

    with open(config.p_path, 'wb') as handle:
        pickle.dump(config, handle, protocol=2)
    return X, y

In [47]:
# Reload both training label tables and combine them, indexed by file name.
# sort=True orders the non-shared columns alphabetically in the combined frame.
cur_df = pd.read_csv('data/train/roadsound_labels.csv', index_col=0)
noisy_df = pd.read_csv('data/train_noisy/roadsound_labels.csv', index_col=0)
df = pd.concat([cur_df, noisy_df], sort=True).set_index('fname')
# Disabled: per-clip duration (seconds) for every file in the combined table.
# for f in df.index:
#     rate, signal = wavfile.read('clean/'+f)
#     df.at[f, 'length'] = signal.shape[0]/rate

In [48]:
df

Unnamed: 0_level_0,flickr_video_URL,labels,license
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40d9de68.wav,,Bicycle_bell,
e66309d9.wav,,Bicycle_bell,
d1fb51da.wav,,Bicycle_bell,
074a72f0.wav,,Bicycle_bell,
35badf6d.wav,,Bicycle_bell,
...,...,...,...
fdda6e7d.wav,http://www.flickr.com/videos/92996378@N00/7566...,Bicycle_bell,CC BY 2.0
fdeb319b.wav,http://www.flickr.com/videos/9749756@N06/56971...,Bus,CC BY 2.0
fe28bb6d.wav,http://www.flickr.com/videos/25921029@N05/4805...,Bicycle_bell,CC BY 2.0
fee6710e.wav,http://www.flickr.com/videos/53051966@N00/2470...,Bus,CC BY-SA 2.0


In [58]:
# Scratch file name; `f` is reassigned by the `for f in cur_df.index` loop in a later cell.
f = '13eb02c3.wav'

In [45]:
# NOTE(review): the recorded output is a FileNotFoundError - this file (listed
# with a flickr URL in the combined table) is not present under 'clean/'.
# `rate` and `wav` therefore keep whatever an earlier execution left behind.
rate, wav = wavfile.read('clean/'+'fdda6e7d.wav')

FileNotFoundError: [Errno 2] No such file or directory: 'clean/fdda6e7d.wav'

In [53]:
# Random window start, same expression as in build_rand_feat. The upper bound
# is exclusive and this raises ValueError when the clip has <= config.step samples.
# Relies on `wav` from an earlier successful wavfile.read (hidden kernel state).
rand_index = np.random.randint(0, wav.shape[0] - config.step)

In [54]:
rand_index

30392

In [60]:
# Index by file name once. The guard keeps this cell re-runnable: a second
# set_index('fname') would raise KeyError because the column is already gone.
if 'fname' in cur_df.columns:
    cur_df = cur_df.set_index('fname')

# Clip duration in seconds = number of samples / sample rate.
# `rate` is deliberately left at module level; later cells read it.
for f in cur_df.index:
    rate, signal = wavfile.read('clean/' + f)
    cur_df.at[f, 'length'] = signal.shape[0] / rate

In [62]:
config.step

1600

In [64]:
# Window length in seconds. `rate` is whatever the last wavfile.read in the
# kernel assigned (e.g. the final file of the clip-length loop).
config.step / rate

0.1

In [63]:
# Clips strictly longer than one window (config.step samples at `rate`).
cur_df[cur_df.length > config.step / rate]

Unnamed: 0_level_0,labels,length
fname,Unnamed: 1_level_1,Unnamed: 2_level_1
40d9de68.wav,Bicycle_bell,6.895187
e66309d9.wav,Bicycle_bell,2.144875
d1fb51da.wav,Bicycle_bell,1.519000
074a72f0.wav,Bicycle_bell,3.329125
35badf6d.wav,Bicycle_bell,3.276438
...,...,...
8f40eba8.wav,Bus,7.031688
f95969ec.wav,Bus,7.439750
f82399ff.wav,Bus,7.585750
9ad34e6b.wav,Bus,7.220937
