In [1]:

from tensorflow.keras.models import load_model
#from clean import downsample_mono, envelope
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
from sklearn.preprocessing import LabelEncoder
import numpy as np
from glob import glob
import argparse
import os
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.io import wavfile
from librosa.core import resample, to_mono
import re
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

In [2]:
def Conv2D(N_CLASSES=6, SR=16000, DT=4.0):
    """Build the 2D-convolution classifier that operates on raw waveforms.

    The input waveform is turned into a decibel-scaled mel spectrogram by a
    kapre layer, normalised, and passed through five convolution layers (the
    first four each followed by 2x2 max pooling) and a dense softmax head.

    Args:
        N_CLASSES: number of units in the final softmax layer.
        SR: sample rate handed to the mel-spectrogram layer (fmax is SR/2).
        DT: clip length in seconds; the input holds int(SR*DT) samples.

    Returns:
        A Keras ``Model`` named '2d_convolution'. It is not compiled here.
    """
    kernel_init = tf.keras.initializers.TruncatedNormal(seed=None)

    # (filters, kernel size, activation, layer name) of every convolution
    # that is followed by a 2x2 max-pooling layer.
    pooled_stages = [
        (8, (7, 7), 'tanh', 'conv2d_tanh'),
        (16, (5, 5), 'relu', 'conv2d_relu_1'),
        (16, (3, 3), 'relu', 'conv2d_relu_2'),
        (32, (3, 3), 'relu', 'conv2d_relu_3'),
    ]

    waveform = layers.Input(shape=(1, int(SR*DT)), name='input')
    net = Melspectrogram(n_dft=512, n_hop=160,
                         padding='same', sr=SR, n_mels=128,
                         fmin=0.0, fmax=SR/2, power_melgram=2.0,
                         return_decibel_melgram=True, trainable_fb=False,
                         trainable_kernel=False,
                         name='melbands')(waveform)
    net = Normalization2D(str_axis='batch', name='batch_norm')(net)

    for stage_no, (filters, ksize, act, conv_name) in enumerate(pooled_stages, start=1):
        net = layers.Conv2D(filters, kernel_size=ksize, activation=act,
                            kernel_initializer=kernel_init, padding='same',
                            name=conv_name)(net)
        net = layers.MaxPooling2D(pool_size=(2, 2), padding='same',
                                  name='max_pool_2d_{}'.format(stage_no))(net)

    # The last convolution feeds the classifier head directly (no pooling).
    net = layers.Conv2D(32, kernel_size=(3, 3), activation='relu',
                        kernel_initializer=kernel_init, padding='same',
                        name='conv2d_relu_4')(net)
    net = layers.Flatten(name='flatten')(net)
    net = layers.Dropout(rate=0.5, name='dropout')(net)
    net = layers.Dense(64, activation='relu', kernel_initializer=kernel_init,
                       activity_regularizer=l2(0.001), name='dense')(net)
    probabilities = layers.Dense(N_CLASSES, kernel_initializer=kernel_init,
                                 activation='softmax', name='softmax')(net)

    return Model(inputs=waveform, outputs=probabilities, name='2d_convolution')

In [3]:
def envelope(y, rate, threshold):
    """Compute a rolling-maximum amplitude envelope and threshold it.

    Args:
        y: 1-D sequence of audio samples.
        rate: sample rate of ``y``; the rolling window spans ``rate/20``
            samples (at least one sample).
        threshold: magnitude a sample's envelope must exceed to be kept.
            Numeric strings are accepted and converted with ``float``.

    Returns:
        (mask, y_mean): ``mask`` is a list with one bool per sample, True
        where the envelope is strictly greater than ``threshold``;
        ``y_mean`` is the pandas Series holding the centred rolling maximum
        of ``|y|``.
    """
    # argparse may deliver the threshold as text; compare numerically.
    threshold = float(threshold)

    # Take the absolute value in float64: in int16, abs(-32768) overflows
    # and stays negative, which would mark the loudest sample as silent.
    magnitude = pd.Series(y).astype(np.float64).abs()

    # rolling() rejects min_periods=1 with a zero-length window.
    window = max(1, int(rate/20))
    y_mean = magnitude.rolling(window=window,
                               min_periods=1,
                               center=True).max()

    mask = (y_mean > threshold).tolist()
    return mask, y_mean

def downsample_mono(path, sr):
    """Read a WAV file, mix it down to mono and resample it to ``sr``.

    Args:
        path: path of the WAV file to read.
        sr: target sample rate.

    Returns:
        (sr, wav): the target sample rate and the resampled signal cast to
        ``np.int16``.
    """
    rate, wav = wavfile.read(path)
    wav = wav.astype(np.float32, order='F')

    # scipy returns multi-channel audio as (n_samples, n_channels); only
    # that case needs a mix-down. An explicit check replaces the former bare
    # ``except`` so real errors in to_mono are no longer hidden.
    if wav.ndim > 1:
        wav = to_mono(wav.T)

    # Pass the rates by keyword: newer librosa releases no longer accept
    # them positionally, and the keyword names work on older ones too.
    wav = resample(wav, orig_sr=rate, target_sr=sr)
    wav = wav.astype(np.int16)
    return sr, wav

def make_prediction(args):
    """Classify every WAV file under ``args.src_dir`` with non-overlapping windows.

    Each file is resampled to ``args.sr``; samples whose envelope does not
    exceed ``args.threshold`` are dropped; the remainder is cut into
    consecutive windows of ``int(args.sr * args.dt)`` samples, each
    zero-padded to full length. The softmax outputs of all windows are
    averaged and the arg-max class index is recorded for the file.

    Args:
        args: namespace providing ``src_dir``, ``sr``, ``dt`` and
            ``threshold``.

    Returns:
        (file_name, results): base names of the processed files and the
        predicted class index of each, both in sorted-path order.
    """
    # The model is built with Conv2D's default SR/DT, so the window length
    # int(args.sr * args.dt) has to match that input size.
    model = Conv2D()
    model.load_weights('models/weights.h5')

    wav_paths = glob('{}/**'.format(args.src_dir), recursive=True)
    # Match on the extension; a plain substring test would also pick up
    # paths that merely contain '.wav' somewhere.
    wav_paths = sorted(x.replace(os.sep, '/') for x in wav_paths if x.endswith('.wav'))

    step = int(args.sr*args.dt)
    results = []
    file_name = []

    for wav_fn in tqdm(wav_paths, total=len(wav_paths)):
        rate, wav = downsample_mono(wav_fn, args.sr)
        mask, _ = envelope(wav, rate, threshold=args.threshold)
        clean_wav = wav[mask]
        if clean_wav.shape[0] == 0:
            # The threshold removed every sample; fall back to the unfiltered
            # signal so the file still receives a prediction.
            clean_wav = wav

        batch = []
        # max(..., 1) guarantees one (silent) window even for an empty file,
        # so model.predict never receives an empty batch.
        for start in range(0, max(clean_wav.shape[0], 1), step):
            chunk = clean_wav[start:start+step]
            window = np.zeros(shape=(1, step), dtype=np.int16)
            window[0, :chunk.shape[0]] = chunk
            batch.append(window)

        X_batch = np.array(batch)
        y_pred = model.predict(X_batch)
        # Average the per-window class probabilities before choosing a class.
        y_mean = np.mean(y_pred, axis=0)
        y_pred = np.argmax(y_mean)
        file_name.append(os.path.split(wav_fn)[1])
        results.append(y_pred)

    print(file_name)
    print(results)
    return file_name, results

def make_prediction_baby(args, baby_step=32000):
    """Classify every WAV file under ``args.src_dir`` with strided windows.

    Works like a windowed prediction where windows of
    ``int(args.sr * args.dt)`` samples start every ``baby_step`` samples, so
    consecutive windows overlap whenever ``baby_step`` is smaller than the
    window length. Samples whose envelope does not exceed ``args.threshold``
    are dropped first; every window is zero-padded to full length; the
    softmax outputs of all windows are averaged and the arg-max class index
    is recorded for the file.

    Args:
        args: namespace providing ``src_dir``, ``sr``, ``dt`` and
            ``threshold``.
        baby_step: hop between window starts, in samples. Defaults to 32000.

    Returns:
        (file_name, results): base names of the processed files and the
        predicted class index of each, both in sorted-path order.
    """
    # The model is built with Conv2D's default SR/DT, so the window length
    # int(args.sr * args.dt) has to match that input size.
    model = Conv2D()
    model.load_weights('models/weights.h5')

    wav_paths = glob('{}/**'.format(args.src_dir), recursive=True)
    # Match on the extension; a plain substring test would also pick up
    # paths that merely contain '.wav' somewhere.
    wav_paths = sorted(x.replace(os.sep, '/') for x in wav_paths if x.endswith('.wav'))

    step = int(args.sr*args.dt)
    results = []
    file_name = []

    for wav_fn in tqdm(wav_paths, total=len(wav_paths)):
        rate, wav = downsample_mono(wav_fn, args.sr)
        mask, _ = envelope(wav, rate, threshold=args.threshold)
        clean_wav = wav[mask]
        if clean_wav.shape[0] == 0:
            # The threshold removed every sample; fall back to the unfiltered
            # signal so the file still receives a prediction.
            clean_wav = wav

        batch = []
        # max(..., 1) guarantees one (silent) window even for an empty file,
        # so model.predict never receives an empty batch.
        for start in range(0, max(clean_wav.shape[0], 1), baby_step):
            chunk = clean_wav[start:start+step]
            window = np.zeros(shape=(1, step), dtype=np.int16)
            window[0, :chunk.shape[0]] = chunk
            batch.append(window)

        X_batch = np.array(batch)
        y_pred = model.predict(X_batch)
        # Average the per-window class probabilities before choosing a class.
        y_mean = np.mean(y_pred, axis=0)
        y_pred = np.argmax(y_mean)
        file_name.append(os.path.split(wav_fn)[1])
        results.append(y_pred)

    print(file_name)
    print(results)
    return file_name, results

def make_prediction_simple(args):
    """Classify every WAV file under ``args.src_dir`` from one random window.

    Each file is resampled to ``args.sr``. A file at least
    ``int(args.sr * args.dt)`` samples long contributes one randomly placed
    crop of that length; a shorter file is zero-padded to that length at a
    random offset. The arg-max of the model output for that single window is
    recorded. This variant does not apply the envelope mask: the window is
    drawn from the full resampled signal, so ``args.threshold`` is not used.

    Args:
        args: namespace providing ``src_dir``, ``sr`` and ``dt``.

    Returns:
        (file_name, results): base names of the processed files and the
        predicted class index of each, both in sorted-path order.
    """
    # The model is built with Conv2D's default SR/DT, so the window length
    # int(args.sr * args.dt) has to match that input size.
    model = Conv2D()
    model.load_weights('models/weights.h5')

    wav_paths = glob('{}/**'.format(args.src_dir), recursive=True)
    # Match on the extension; a plain substring test would also pick up
    # paths that merely contain '.wav' somewhere.
    wav_paths = sorted(x.replace(os.sep, '/') for x in wav_paths if x.endswith('.wav'))

    step = int(args.sr*args.dt)
    results = []
    file_name = []

    for wav_fn in tqdm(wav_paths, total=len(wav_paths)):
        _, wav = downsample_mono(wav_fn, args.sr)
        n_samples = wav.shape[0]

        if n_samples >= step:
            # randint's upper bound is exclusive; "+ 1" makes the last valid
            # start reachable and covers n_samples == step (offset 0).
            offset = np.random.randint(0, n_samples - step + 1)
            sample = wav[offset:offset+step]
        else:
            # Place the short clip at a random position inside a silent window.
            offset = np.random.randint(0, step - n_samples + 1)
            sample = np.pad(wav, (offset, step - n_samples - offset),
                            'constant', constant_values=0)

        X = np.empty((1, 1, step), dtype=np.int16)
        X[0,] = sample.reshape(1, -1)
        y_ = model.predict(X)
        y_pred = np.argmax(y_)
        file_name.append(os.path.split(wav_fn)[1])
        results.append(y_pred)

    print(file_name)
    print(results)
    return file_name, results

In [54]:
if __name__ == '__main__':

    # Settings for the prediction run, expressed as command-line options.
    parser = argparse.ArgumentParser(description='Audio Classification Training')
    parser.add_argument('--model_fn', type=str, default='models/conv2d.h5',
                        help='model file to make predictions')
    parser.add_argument('--pred_fn', type=str, default='y_pred',
                        help='fn to write predictions in logs dir')
    parser.add_argument('--src_dir', type=str, default='data/test',
                        help='directory containing wavfiles to predict')
    parser.add_argument('--dt', type=float, default=4.0,
                        help='time in seconds to sample audio')
    parser.add_argument('--sr', type=int, default=16000,
                        help='sample rate of clean audio')
    # Parsed as a number: the threshold is compared against sample
    # magnitudes, which fails for a string value.
    parser.add_argument('--threshold', type=float, default=20,
                        help='threshold magnitude for np.int16 dtype')
    # parse_known_args returns unrecognised arguments instead of exiting,
    # so extra entries in argv do not abort the run.
    args, _ = parser.parse_known_args()

    fname, label = make_prediction_baby(args)


int_axis=-1 passed but is ignored, str_axis is used instead.


100%|██████████| 228/228 [00:35<00:00,  6.43it/s]

['test_0.wav', 'test_1.wav', 'test_10.wav', 'test_100.wav', 'test_101.wav', 'test_102.wav', 'test_103.wav', 'test_104.wav', 'test_105.wav', 'test_106.wav', 'test_107.wav', 'test_108.wav', 'test_109.wav', 'test_11.wav', 'test_110.wav', 'test_111.wav', 'test_112.wav', 'test_113.wav', 'test_114.wav', 'test_115.wav', 'test_116.wav', 'test_117.wav', 'test_118.wav', 'test_119.wav', 'test_12.wav', 'test_120.wav', 'test_121.wav', 'test_122.wav', 'test_123.wav', 'test_124.wav', 'test_125.wav', 'test_126.wav', 'test_127.wav', 'test_128.wav', 'test_129.wav', 'test_13.wav', 'test_130.wav', 'test_131.wav', 'test_132.wav', 'test_133.wav', 'test_134.wav', 'test_135.wav', 'test_136.wav', 'test_137.wav', 'test_138.wav', 'test_139.wav', 'test_14.wav', 'test_140.wav', 'test_141.wav', 'test_142.wav', 'test_143.wav', 'test_144.wav', 'test_145.wav', 'test_146.wav', 'test_147.wav', 'test_148.wav', 'test_149.wav', 'test_15.wav', 'test_150.wav', 'test_151.wav', 'test_152.wav', 'test_153.wav', 'test_154.wav', '




In [23]:
if __name__ == '__main__':

    # Settings for the prediction run, expressed as command-line options.
    parser = argparse.ArgumentParser(description='Audio Classification Training')
    parser.add_argument('--model_fn', type=str, default='models/conv2d.h5',
                        help='model file to make predictions')
    parser.add_argument('--pred_fn', type=str, default='y_pred',
                        help='fn to write predictions in logs dir')
    parser.add_argument('--src_dir', type=str, default='data/test',
                        help='directory containing wavfiles to predict')
    parser.add_argument('--dt', type=float, default=4.0,
                        help='time in seconds to sample audio')
    parser.add_argument('--sr', type=int, default=16000,
                        help='sample rate of clean audio')
    # Parsed as a number: the threshold is compared against sample
    # magnitudes, which fails for a string value.
    parser.add_argument('--threshold', type=float, default=20,
                        help='threshold magnitude for np.int16 dtype')
    # parse_known_args returns unrecognised arguments instead of exiting,
    # so extra entries in argv do not abort the run.
    args, _ = parser.parse_known_args()

    fname, label = make_prediction_baby(args)


  0%|          | 0/228 [00:00<?, ?it/s]

int_axis=-1 passed but is ignored, str_axis is used instead.


100%|██████████| 228/228 [00:36<00:00,  6.30it/s]

['test_0.wav', 'test_1.wav', 'test_10.wav', 'test_100.wav', 'test_101.wav', 'test_102.wav', 'test_103.wav', 'test_104.wav', 'test_105.wav', 'test_106.wav', 'test_107.wav', 'test_108.wav', 'test_109.wav', 'test_11.wav', 'test_110.wav', 'test_111.wav', 'test_112.wav', 'test_113.wav', 'test_114.wav', 'test_115.wav', 'test_116.wav', 'test_117.wav', 'test_118.wav', 'test_119.wav', 'test_12.wav', 'test_120.wav', 'test_121.wav', 'test_122.wav', 'test_123.wav', 'test_124.wav', 'test_125.wav', 'test_126.wav', 'test_127.wav', 'test_128.wav', 'test_129.wav', 'test_13.wav', 'test_130.wav', 'test_131.wav', 'test_132.wav', 'test_133.wav', 'test_134.wav', 'test_135.wav', 'test_136.wav', 'test_137.wav', 'test_138.wav', 'test_139.wav', 'test_14.wav', 'test_140.wav', 'test_141.wav', 'test_142.wav', 'test_143.wav', 'test_144.wav', 'test_145.wav', 'test_146.wav', 'test_147.wav', 'test_148.wav', 'test_149.wav', 'test_15.wav', 'test_150.wav', 'test_151.wav', 'test_152.wav', 'test_153.wav', 'test_154.wav', '




In [12]:
if __name__ == '__main__':

    # Settings for the prediction run, expressed as command-line options.
    parser = argparse.ArgumentParser(description='Audio Classification Training')
    parser.add_argument('--model_fn', type=str, default='models/conv2d.h5',
                        help='model file to make predictions')
    parser.add_argument('--pred_fn', type=str, default='y_pred',
                        help='fn to write predictions in logs dir')
    parser.add_argument('--src_dir', type=str, default='data/test',
                        help='directory containing wavfiles to predict')
    parser.add_argument('--dt', type=float, default=4.0,
                        help='time in seconds to sample audio')
    parser.add_argument('--sr', type=int, default=16000,
                        help='sample rate of clean audio')
    # Parsed as a number: the threshold is compared against sample
    # magnitudes, which fails for a string value.
    parser.add_argument('--threshold', type=float, default=20,
                        help='threshold magnitude for np.int16 dtype')
    # parse_known_args returns unrecognised arguments instead of exiting,
    # so extra entries in argv do not abort the run.
    args, _ = parser.parse_known_args()

    fname, label = make_prediction_simple(args)

100%|██████████| 228/228 [00:33<00:00,  6.73it/s]

['test_0.wav', 'test_1.wav', 'test_10.wav', 'test_100.wav', 'test_101.wav', 'test_102.wav', 'test_103.wav', 'test_104.wav', 'test_105.wav', 'test_106.wav', 'test_107.wav', 'test_108.wav', 'test_109.wav', 'test_11.wav', 'test_110.wav', 'test_111.wav', 'test_112.wav', 'test_113.wav', 'test_114.wav', 'test_115.wav', 'test_116.wav', 'test_117.wav', 'test_118.wav', 'test_119.wav', 'test_12.wav', 'test_120.wav', 'test_121.wav', 'test_122.wav', 'test_123.wav', 'test_124.wav', 'test_125.wav', 'test_126.wav', 'test_127.wav', 'test_128.wav', 'test_129.wav', 'test_13.wav', 'test_130.wav', 'test_131.wav', 'test_132.wav', 'test_133.wav', 'test_134.wav', 'test_135.wav', 'test_136.wav', 'test_137.wav', 'test_138.wav', 'test_139.wav', 'test_14.wav', 'test_140.wav', 'test_141.wav', 'test_142.wav', 'test_143.wav', 'test_144.wav', 'test_145.wav', 'test_146.wav', 'test_147.wav', 'test_148.wav', 'test_149.wav', 'test_15.wav', 'test_150.wav', 'test_151.wav', 'test_152.wav', 'test_153.wav', 'test_154.wav', '




In [13]:
# Start the results table with one row per predicted file. The mapping is
# passed inline so the builtin ``dict`` is not shadowed for later cells.
df = pd.DataFrame({'id': fname})
df

Unnamed: 0,id
0,test_0.wav
1,test_1.wav
2,test_10.wav
3,test_100.wav
4,test_101.wav
...,...
223,test_95.wav
224,test_96.wav
225,test_97.wav
226,test_98.wav


In [14]:
# Pull the integer out of names shaped like 'test_12.wav': take the part
# after the first underscore, then everything before the first dot.
# The dot pattern is a raw string; "\." in a plain string is an invalid
# escape sequence.
n = [int(re.split(r"\.", re.split("_", name)[1])[0]) for name in fname]
print(n)

[0, 1, 10, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 11, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 12, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 13, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 14, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 15, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 16, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 17, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 19, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 2, 20, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 21, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 22, 220, 221, 222, 223, 224, 225, 226, 227, 23, 24, 25, 26, 27, 28, 29, 3, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 4, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 5, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 6, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 7, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 8, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 9, 90, 91, 9

In [15]:
classes = ['awake', 'diaper', 'hug', 'hungry', 'sleepy', 'uncomfortable']
# Translate each predicted class index into its class name.
label_txt = [classes[idx] for idx in label]

# Build the table from fname / label_txt / n in one step instead of adding
# columns to, and sorting, the existing frame in place. Positional column
# assignment onto an already sorted frame would pair labels with the wrong
# ids if this cell were run a second time.
df = (
    pd.DataFrame({'id': fname, 'label': label_txt, 'n': n})
    .astype({'n': int})
    .sort_values("n", axis=0, ascending=True, na_position='last')
)

df

Unnamed: 0,id,label,n
0,test_0.wav,hungry,0
1,test_1.wav,hug,1
112,test_2.wav,hug,2
151,test_3.wav,uncomfortable,3
162,test_4.wav,uncomfortable,4
...,...,...,...
139,test_223.wav,sleepy,223
140,test_224.wav,awake,224
141,test_225.wav,awake,225
142,test_226.wav,uncomfortable,226


In [16]:
# Keep only the columns that are written to the result file.
final = df.loc[:, ['id', 'label']]
final

Unnamed: 0,id,label
0,test_0.wav,hungry
1,test_1.wav,hug
112,test_2.wav,hug
151,test_3.wav,uncomfortable
162,test_4.wav,uncomfortable
...,...,...
139,test_223.wav,sleepy
140,test_224.wav,awake
141,test_225.wav,awake
142,test_226.wav,uncomfortable


In [17]:
# Write the id/label table to disk without the index column.
final.to_csv('Result3_3_voulumaugmented_noise_200.csv', index=False)