In [29]:
"""
Simple example using LSTM recurrent neural network to classify IMDB
sentiment dataset.
References:
    - Long Short Term Memory, Sepp Hochreiter & Jurgen Schmidhuber, Neural
    Computation 9(8): 1735-1780, 1997.
    - Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng,
    and Christopher Potts. (2011). Learning Word Vectors for Sentiment
    Analysis. The 49th Annual Meeting of the Association for Computational
    Linguistics (ACL 2011).
Links:
    - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
    - http://ai.stanford.edu/~amaas/data/sentiment/
"""
from __future__ import division, print_function, absolute_import
import os
import numpy as np
import sklearn.preprocessing as prep
import librosa
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
#train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
#                                valid_portion=0.1)

print("Imports doned")
def wav2mfcc(file_path, max_pad_len=300):
    print("wav2mfcc start",file_path)
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    print("file read")
    print(wave.shape,sr)
    wave=prep.maxabs_scale(wave) #Normalize amplitude to +-1
    mfcc = librosa.feature.mfcc(wave, sr=sr).transpose()
    print(mfcc.shape)
    print(mfcc[1,:])
    dif=max_pad_len-mfcc.shape[0]
    #padding sample
    if dif>0:
        #fill holes
        print("filling holes")
        for i in range(dif):
            mfcc=np.vstack([mfcc,np.zeros(mfcc.shape[1])])
    else:
        #take away excess
        print("excess")
        mfcc=mfcc[:max_pad_len]
    print(mfcc.shape)
    print(mfcc[199,:])
    return mfcc

class EasyASR(object):
    '''
    classdocs
    '''

    def __init__(self, filePath="/home/francisco/voz"):
        '''
        Constructor
        '''
        basepath=filePath
        self.mfccs={}
        mfccAll=[]
        labels=os.listdir(basepath)
        print(len(labels))
        for i,nn in enumerate(labels):
            n=nn.split(".")[0]#take away extension .wav
            self.mfccs[n] = wav2mfcc(basepath+'/{}.wav'.format(n))
ea=EasyASR()

Imports doned
2
wav2mfcc start /home/francisco/voz/recording.wav
file read
(102350,) 16000
(200, 20)
[ -1.74761046e+02   3.25462522e+01  -1.28745401e+01   2.07543217e+01
   8.96673142e+00   2.17993289e+01   1.51516180e+00   1.65216094e+01
  -3.47102561e-01   1.11789198e+01   5.11714193e+00   1.60621764e+01
   3.92731608e+00   1.34172782e+01   2.18723316e+00   1.05491680e+01
   1.30924433e+01   6.29194202e+00  -9.30889407e-02   4.55653326e+00]
filling holes
(300, 20)
[-213.71106211   61.51051599  -47.66622437    1.79295504   14.38711338
   38.0233543     9.56786974   22.73441325   14.6864944    17.19112725
   -2.69500166   15.10978483   16.09496528   19.61873191   -6.28172785
   -0.48229283    7.3208475    17.82623085   11.93250591    5.19791082]
wav2mfcc start /home/francisco/voz/nada.wav
file read
(21632,) 16000
(43, 20)
[-154.04626133  121.17766382  -24.79870298  -11.80892048  -30.26455887
  -18.6659575     2.7918911     8.70540668   -2.57157187   -9.93394808
   -7.79782125   -7.1828

In [2]:
trainX, trainY = train
testX, testY = test
print (trainX[0],trainY[0])

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY,2)
testY = to_categorical(testY,2)
print (trainX.shape,trainY.shape)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8, return_seq=True)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4] 0
(22500, 100) (22500, 2)


In [3]:
# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
batch_size=32)

Training Step: 7039  | total loss: [1m[32m0.12234[0m[0m | time: 58.578s
| Adam | epoch: 010 | loss: 0.12234 - acc: 0.9629 -- iter: 22496/22500
Training Step: 7040  | total loss: [1m[32m0.12032[0m[0m | time: 60.111s
| Adam | epoch: 010 | loss: 0.12032 - acc: 0.9635 | val_loss: 0.57867 - val_acc: 0.7936 -- iter: 22500/22500
--
