In [1]:
import numpy as np
import python_speech_features
import scipy.io.wavfile as wav


class FeaturesExtractor:
    """ Turn WAV files into one padded batch of log filterbank features.

    Keyword arguments given to the constructor are stored in `self.params` and
    forwarded to `python_speech_features.fbank` for every file. The only
    rewrite is that the string ``winfunc='hamming'`` is replaced by the
    callable `np.hamming`.

    Note: despite the wording of earlier docstrings, no MFCCs are computed
    here; the features are the natural log of the `fbank` output.
    """

    def __init__(self, **kwargs):
        # Accept the string alias 'hamming' and swap in the numpy window function.
        if kwargs.get('winfunc') == 'hamming':
            kwargs['winfunc'] = np.hamming
        self.params = kwargs

    def get_features(self, files: list) -> np.ndarray:
        """ Extract log filterbank features from every file and pad them into one batch.

        :param files: paths of the WAV files to process; must not be empty
        :return: float64 array, one padded feature matrix per file (see `align`)
        :raises ValueError: if `files` is empty
        """
        per_file = [self.make_features(file, **self.params) for file in files]
        return self.align(per_file)

    @staticmethod
    def make_features(file_path: str, **kwargs) -> np.ndarray:
        """ Use `python_speech_features` lib to extract log filterbank features from the audio file.

        :param file_path: path of the WAV file, read with `scipy.io.wavfile.read`
        :param kwargs: extra keyword arguments forwarded to `python_speech_features.fbank`
        :return: natural log of the filterbank features returned by `fbank`
        """
        fs, audio = wav.read(file_path)
        # `fbank` returns a pair; the second element (energy) is not needed here.
        feat, _ = python_speech_features.fbank(audio, samplerate=fs, **kwargs)
        return np.log(feat)

    @staticmethod
    def align(arrays: list, default=0) -> np.ndarray:
        """ Pad arrays along time dimensions. Return the single array (batch_size, time, features).

        Every array is copied into the start of its slot; the rest of the slot
        keeps the `default` fill value. The slot size is the shape of the
        longest array (longest along the first axis).

        :param arrays: non-empty list of arrays to stack
        :param default: value used for padding
        :return: float64 array of shape (len(arrays), *longest.shape)
        :raises ValueError: if `arrays` is empty
        """
        if len(arrays) == 0:
            # Without this guard `max()` fails with an unrelated-looking message.
            raise ValueError('align() needs at least one array, got an empty list')
        longest = max(arrays, key=len)
        X = np.full(shape=[len(arrays), *longest.shape], fill_value=default, dtype=np.float64)
        for index, array in enumerate(arrays):
            X[index, :len(array)] = array
        return X


In [5]:
# Step-by-step feature extraction for a single recording, using the library defaults.
# NOTE(review): absolute, machine-specific path — this cell will not run on another machine as written.
file_path = "/home/gautam-admin/EEG/deepspeech_from_brain/data/wav/austin1.wav"
# Read the WAV file; the two return values are kept as `fs` and `audio`.
fs, audio = wav.read(file_path)
# Only `samplerate` is passed, so every other `fbank` option keeps its default.
# `energy` is unpacked but not used again in this cell.
feat, energy = python_speech_features.fbank(audio, samplerate=fs)
# Natural log of the filterbank output.
features = np.log(feat)

In [7]:
# Display the shape of the `features` array; the saved output for this cell is (25697, 26).
features.shape

(25697, 26)

In [8]:
from keras import layers, optimizers, models
import tcn
import keras
from keras import backend as K

In [10]:
# Greedy CTC decoding of `out`: every batch item gets the same input length,
# namely the size of the second axis of `out`. `[0][0]` picks the first element
# of the first value returned by `K.ctc_decode`, which is then evaluated.
# NOTE(review): `out` is not assigned anywhere in the visible notebook and the
# saved output of this cell is a NameError — define `out` first or remove the cell.
K.get_value(K.ctc_decode(out, input_length=np.ones(out.shape[0])*out.shape[1],
                         greedy=True)[0][0])

NameError: name 'out' is not defined

In [9]:
# Build a Conv2D -> GRU network that outputs a softmax over `unique_tokens`
# classes for every remaining time step.

# Input parameters
img_w = 128
img_h = 64
# Network parameters
conv_filters = 16
kernel_size = (3, 3)
pool_size = 2
time_dense_size = 32
rnn_size = 512
# NOTE(review): `minibatch_size` is not referenced anywhere else in the visible notebook.
minibatch_size = 32
unique_tokens = 28

# Single-channel input; the channel axis position follows the backend setting.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_w, img_h)
else:
    input_shape = (img_w, img_h, 1)

act = 'relu'
input_data = layers.Input(name='the_input', shape=input_shape, dtype='float32')
# Two Conv2D + MaxPooling2D stages; each pooling uses a (pool_size, pool_size) window.
inner = layers.Conv2D(conv_filters, kernel_size, padding='same',
               activation=act, kernel_initializer='he_normal',
               name='conv1')(input_data)
inner = layers.MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
inner = layers.Conv2D(conv_filters, kernel_size, padding='same',
               activation=act, kernel_initializer='he_normal',
               name='conv2')(inner)
inner = layers.MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)

# Reshape target: (img_w // pool_size**2, (img_h // pool_size**2) * conv_filters),
# i.e. (32, 256) with the values above.
# NOTE(review): this target shape looks like it assumes channels_last ordering —
# confirm it is still meaningful when the backend is channels_first.
conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
inner = layers.Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)

# cuts down input size going into RNN:
inner = layers.Dense(time_dense_size, activation=act, name='dense1')(inner)

# Two layers of bidirectional GRUs, each built by hand from a forward GRU and a
# `go_backwards=True` GRU. The first pair is merged by addition, the second by
# concatenation (below).
# GRU seems to work as well, if not better than LSTM:
gru_1 = layers.GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
gru_1b = layers.GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
gru1_merged = layers.add([gru_1, gru_1b])
gru_2 = layers.GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
gru_2b = layers.GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)

# transforms RNN output to character activations:
inner = layers.Dense(unique_tokens, kernel_initializer='he_normal',
              name='dense2')(layers.concatenate([gru_2, gru_2b]))
y_pred = layers.Activation('softmax', name='softmax')(inner)
# NOTE(review): the model is constructed but not bound to a name; the cell only
# displays its repr, so later cells cannot reference this model.
keras.Model(inputs=input_data, outputs=y_pred)

<keras.engine.training.Model at 0x7f2c6ff68510>

In [12]:
import h5py
# Open the saved weights file read-only and print its top-level HDF5 keys.
# NOTE(review): absolute, machine-specific path. The handle is never closed in the
# visible cells; a later cell still reads from `f`, so it is left open here.
f = h5py.File('/home/gautam-admin/EEG/deepspeech_from_brain/models/model_1_100_True.hdf5', 'r')
print(list(f.keys()))


['activation_1', 'activation_10', 'activation_11', 'activation_12', 'activation_13', 'activation_14', 'activation_15', 'activation_16', 'activation_17', 'activation_2', 'activation_3', 'activation_4', 'activation_5', 'activation_6', 'activation_7', 'activation_8', 'activation_9', 'add_1', 'add_2', 'add_3', 'add_4', 'add_5', 'add_7', 'conv1d_1', 'conv1d_10', 'conv1d_11', 'conv1d_12', 'conv1d_13', 'conv1d_14', 'conv1d_15', 'conv1d_16', 'conv1d_17', 'conv1d_18', 'conv1d_2', 'conv1d_3', 'conv1d_4', 'conv1d_5', 'conv1d_6', 'conv1d_7', 'conv1d_8', 'conv1d_9', 'input_1', 'spatial_dropout1d_1', 'spatial_dropout1d_10', 'spatial_dropout1d_11', 'spatial_dropout1d_12', 'spatial_dropout1d_2', 'spatial_dropout1d_3', 'spatial_dropout1d_4', 'spatial_dropout1d_5', 'spatial_dropout1d_6', 'spatial_dropout1d_7', 'spatial_dropout1d_8', 'spatial_dropout1d_9', 'time_distributed_1']


In [13]:
def ctc_lambda_func(args):
    """Return `K.ctc_batch_cost` for a packed argument sequence.

    `args` must unpack into exactly four items, in this order:
    predictions, labels, input lengths, label lengths.
    The predictions are trimmed before the cost is computed (see below).
    """
    predictions, targets, pred_lengths, target_lengths = args
    # Dropping the first two time steps is essential: the earliest RNN outputs
    # tend to be garbage.
    trimmed = predictions[:, 2:, :]
    return K.ctc_batch_cost(targets, trimmed, pred_lengths, target_lengths)

<HDF5 file "model_1_100_True.hdf5" (mode r)>

In [14]:
def reset_weights(model):
    """Re-run the initializer of every weight of every layer in `model`, in place.

    The previous version only handled layers exposing `kernel_initializer` and
    only re-ran `layer.kernel.initializer`. Biases, recurrent kernels and any
    weight not called `kernel` kept their trained values, and a layer with a
    `kernel_initializer` attribute but no direct `kernel` attribute would
    raise `AttributeError`. Iterating over `layer.weights` covers all of them.

    :param model: object exposing `layers`; each layer exposes `weights`
    :return: None; the variables are re-initialized in the backend session
    """
    session = K.get_session()
    for layer in model.layers:
        for weight in layer.weights:
            # Skip anything that carries no initializer op instead of crashing.
            init_op = getattr(weight, 'initializer', None)
            if init_op is not None:
                init_op.run(session=session)

Help on package h5py:

NAME
    h5py

DESCRIPTION
    This is the h5py package, a Python interface to the HDF5
    scientific data format.

PACKAGE CONTENTS
    _conv
    _errors
    _hl (package)
    _objects
    _proxy
    defs
    h5
    h5a
    h5ac
    h5d
    h5ds
    h5f
    h5fd
    h5g
    h5i
    h5l
    h5o
    h5p
    h5r
    h5s
    h5t
    h5z
    highlevel
    ipy_completer
    tests (package)
    utils
    version

SUBMODULES
    filters

FUNCTIONS
    enable_ipython_completer()
        Call this from an interactive IPython session to enable tab-completion
        of group and attribute names.
    
    get_config(...)
        () => H5PYConfig
        
        Get a reference to the global library configuration object.
    
    get_enum = py_get_enum(...)
        (DTYPE dt_in) => DICT
        
        Deprecated; use check_dtype() instead.
    
    get_vlen = py_get_vlen(...)
        (OBJECT dt_in) => TYPE
        
        Deprecated; use check_dtype() instead.
    
    

In [41]:
import tcn
from tcn import TCN
# Input with `batch_shape=(None, None, 90)`: the first two sizes are left
# unspecified, the last axis has 90 values.
# NOTE(review): presumably (batch, time, features) — confirm against the training data.
i = layers.Input(batch_shape=(None, None, 90))

# `return_sequences=True` keeps an output for every step of the sequence.
o = TCN(return_sequences=True)(i)  # The TCN layers are here.
# Linear Dense layer with 13 outputs, applied through TimeDistributed.
o = layers.TimeDistributed(layers.Dense(13, activation='linear'))(o)
# The resulting model is bound to the name `regressor`.
regressor = keras.Model(inputs=[i], outputs=[o])

In [40]:
# Load weights from the saved HDF5 file into `model`.
# NOTE(review): no variable named `model` is assigned in the visible cells (the
# TCN network is bound to `regressor`, and the CNN/GRU model is never bound to a
# name) — confirm which model this call is meant to target. The path is also
# absolute and machine-specific.
model.load_weights('/home/gautam-admin/EEG/deepspeech_from_brain/models/model_1_100_True.hdf5')

In [34]:
# Show the first top-level entry of the opened weights file `f`; the saved
# output is the group "/activation_1" with 0 members.
list(f.values())[0]

<HDF5 group "/activation_1" (0 members)>