In [None]:
import os
from collections import Counter

import librosa
import numpy as np
from keras.datasets import imdb
from keras.layers import Dense
from keras.layers import GRU
from keras.layers import Input
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Download the training archive from Google Drive as train.zip.
# The inner wget stores the session cookies and extracts the `confirm=` token from the
# response; the outer wget replays the request with that token and the saved cookies,
# writes the result to train.zip, and the cookie file is removed afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1wSop6SO3zFNR5nQXpvFqaNfAnJWXjGoV' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1wSop6SO3zFNR5nQXpvFqaNfAnJWXjGoV" -O train.zip && rm -rf /tmp/cookies.txt

--2020-04-13 09:29:02--  https://docs.google.com/uc?export=download&confirm=Gcoe&id=1wSop6SO3zFNR5nQXpvFqaNfAnJWXjGoV
Resolving docs.google.com (docs.google.com)... 74.125.197.102, 74.125.197.113, 74.125.197.100, ...
Connecting to docs.google.com (docs.google.com)|74.125.197.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-c0-docs.googleusercontent.com/docs/securesc/1a9gk0rjsle29ekdh6gmte2v54dlc9n9/b4ads9qudffgoepch6h2158colu0hqqm/1586770125000/00907147178380347415/06243619793914361784Z/1wSop6SO3zFNR5nQXpvFqaNfAnJWXjGoV?e=download [following]
--2020-04-13 09:29:02--  https://doc-08-c0-docs.googleusercontent.com/docs/securesc/1a9gk0rjsle29ekdh6gmte2v54dlc9n9/b4ads9qudffgoepch6h2158colu0hqqm/1586770125000/00907147178380347415/06243619793914361784Z/1wSop6SO3zFNR5nQXpvFqaNfAnJWXjGoV?e=download
Resolving doc-08-c0-docs.googleusercontent.com (doc-08-c0-docs.googleusercontent.com)... 74.125.20.132, 2607:f8b0:400e:c07::84
Connecting 

In [None]:
# Extract train.zip with full paths into the working directory
# (the feature-extraction code below reads clips from train/train_<lang>/).
!7z x train.zip


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,4 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 4761108398 bytes (4541 MiB)

Extracting archive: train.zip

ERRORS:
Headers Error

--
Path = train.zip
Type = zip
ERRORS:
Headers Error
Physical Size = 4761108398
64-bit = +

  0%      0% 1 - train/train_english/english_0001.wav                                               0% 2 - train/train_english/english_0002.wav                                               0% 3 - train/train_english/english_0003.wav                                             

In [None]:
def get_features(path):
  '''
  Compute the MFCC matrix of a single audio file.

  path: location of the audio file; it is loaded with librosa's default settings.
  Returns the array produced by librosa.feature.mfcc with 64 coefficients,
  a 25 ms analysis window and a 10 ms hop (both converted to whole samples).
  '''
  signal, sample_rate = librosa.load(path)
  window = int(sample_rate*0.025)   # FFT window length in samples
  hop = int(sample_rate*0.010)      # step between consecutive frames in samples
  return librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=64, n_fft=window, hop_length=hop)

In [None]:
def get_mfcc_lang(lang):
  '''
  Returns the MFCC frames of every clip of one language stacked into a single 2-D array.

  lang: language name; clips are read from train/train_<lang>/ and the result is
        cached in <lang>.npy in the working directory.

  Returns an array with one row per MFCC frame and one column per coefficient,
  i.e. the transposed get_features() outputs of all clips stacked in file order.
  The (sorted) file order is also written to file_sequence_<lang>.txt.

  Raises ValueError if the clip directory contains no files.

  The first call takes quite some time; later calls load the cached .npy file.
  '''
  cache_path = f"{lang}.npy"
  if os.path.exists(cache_path):  # Second run: read the cached result from disk instead of recomputing.
    return np.load(cache_path)

  clip_dir = f"train/train_{lang}/"
  # os.listdir returns entries in arbitrary order; sort so the stacked frames
  # (and the sequences later cut from them) are the same on every machine.
  files = sorted(os.listdir(clip_dir))
  if not files:
    # Fail before anything is written so a bogus result is never cached.
    raise ValueError(f"no audio files found in {clip_dir}")
  with open(f"file_sequence_{lang}.txt", 'w') as f:
    f.write(str(files))

  # Collect the per-clip matrices and stack once at the end; stacking inside the
  # loop would re-copy everything gathered so far on every iteration.
  per_clip = []
  for i, file in enumerate(files):
    mat = get_features(os.path.join(clip_dir, file))  # (n_mfcc, n_frames)
    per_clip.append(np.transpose(mat))                # (n_frames, n_mfcc)
    if i%10==0:
      print(f"{i} done")
  # Same result as concatenating all clips of the language frame by frame.
  features = np.vstack(per_clip)
  np.save(cache_path, features)
  print(f"***********{lang} done**********")
  return features

In [None]:
# Step 3: compute (or load from the .npy cache) the stacked MFCC frames of each language.
train_english, train_hindi, train_mandarin = [
    get_mfcc_lang(language) for language in ("english", "hindi", "mandarin")
]

0 done
10 done
20 done
30 done
40 done
50 done
60 done
70 done
80 done
90 done
100 done
110 done
120 done
***********english done**********
0 done
10 done
20 done
30 done
***********hindi done**********
0 done
10 done
20 done
30 done
40 done
50 done
60 done
70 done
80 done
***********mandarin done**********


In [None]:
def get_size_per_sec(path):
  '''
  Estimate how many MFCC frames correspond to one second of audio.

  The training tensor has shape (N, seq_length, 64), so seq_length has to be chosen
  in frames. With the 10 ms hop used here one second is roughly 100 frames; the
  notebook uses seq_length = 500 (about 5 s) because sequences should last between
  3 and 10 seconds, i.e. roughly 300 to 1000 frames. Feel free to experiment.

  path: audio file used for the estimate.
  Returns the number of MFCC frames divided by the clip duration in seconds.
  '''
  signal, sample_rate = librosa.load(path)
  mfcc = librosa.feature.mfcc(
      y=signal,
      sr=sample_rate,
      n_mfcc=64,
      n_fft=int(sample_rate*0.025),
      hop_length=int(sample_rate*0.010),
  )
  n_frames = mfcc.shape[1]
  seconds = librosa.get_duration(y=signal, sr=sample_rate)
  return n_frames/seconds

In [None]:
# Sanity check on a real clip from the extracted archive (the original placeholder
# path could not be loaded): expect roughly 100 MFCC frames per second.
get_size_per_sec("train/train_english/english_0001.wav")

In [None]:
def reshape_training_data(ar, seq_length=500):
  '''
  Cut a 2-D frame array into fixed-length sequences.

  ar: array of shape (N, n_features), e.g. the stacked MFCC frames of one language.
  seq_length: number of consecutive frames per sequence (default 500).

  Returns an array of shape (N // seq_length, seq_length, n_features). Trailing
  frames that do not fill a complete sequence are dropped.

  Raises ValueError if seq_length is not positive.
  '''
  if seq_length <= 0:
    raise ValueError("seq_length must be a positive integer")
  n_seqs = ar.shape[0] // seq_length
  # Keep only whole sequences. The cut-off must use seq_length itself: a
  # hard-coded 500 here breaks the reshape for every other sequence length.
  usable = n_seqs * seq_length
  return ar[:usable].reshape((n_seqs, seq_length, ar.shape[1]))

In [None]:
# Cut each language's frame array into (n_sequences, 500, 64) blocks.
train_english, train_hindi, train_mandarin = [
    reshape_training_data(frames)
    for frames in (train_english, train_hindi, train_mandarin)
]

In [None]:
def create_training_data():
  '''
  Merge the per-language sequence arrays into one training set.

  Reads the notebook-level arrays train_english, train_hindi and train_mandarin,
  stacks them along axis 0 (in that order) and returns (X_train, y_train), where
  y_train holds one integer label per sequence: 0 = english, 1 = hindi, 2 = mandarin.
  '''
  parts = (train_english, train_hindi, train_mandarin)
  # One vstack over all three arrays avoids the throw-away english+hindi copy
  # that two chained vstack calls would allocate.
  X_train = np.vstack(parts)
  # Label k is repeated once for every sequence of the k-th language.
  y_train = np.repeat(np.arange(len(parts)), [part.shape[0] for part in parts])
  return X_train, y_train

In [None]:
# Step 4: X holds the stacked sequences of all three languages, y the matching
# integer labels (0 = english, 1 = hindi, 2 = mandarin).
X, y = create_training_data()

In [None]:
# X was built with np.vstack, which copies its inputs, so the per-language arrays can be released.
del train_mandarin, train_hindi, train_english

In [None]:
# LSTM MODEL STARTER

In [None]:
# Shuffle sequences and labels together; the fixed seed (same value as the splits
# below) makes the resulting train/val/test partition reproducible across runs.
X, y = shuffle(X, y, random_state=42)

In [None]:
# Hold out 30% of the sequences, stratified by language; the held-out part is
# divided into test and validation sets in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Halve the held-out data into a test set and a validation set (stratified by language).
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=42,
)

In [None]:
# The full arrays are no longer needed once the three splits exist.
del X, y

In [None]:
# One-hot encode the integer labels. The width is fixed to the number of languages
# instead of being derived from this split's own maximum label, so it always matches
# the 3-unit softmax output even if a split happened to miss the highest class.
num_classes = 3  # english = 0, hindi = 1, mandarin = 2
y_train = np.eye(num_classes)[y_train]

In [None]:
# One-hot encode the integer labels. The width is fixed to the number of languages
# instead of being derived from this split's own maximum label, so it always matches
# the 3-unit softmax output even if a split happened to miss the highest class.
num_classes = 3  # english = 0, hindi = 1, mandarin = 2
y_test = np.eye(num_classes)[y_test]

In [None]:
# One-hot encode the integer labels. The width is fixed to the number of languages
# instead of being derived from this split's own maximum label, so it always matches
# the 3-unit softmax output even if a split happened to miss the highest class.
num_classes = 3  # english = 0, hindi = 1, mandarin = 2
y_val = np.eye(num_classes)[y_val]

In [None]:
# Sequence classifier: an LSTM reads each MFCC sequence, followed by a small dense
# head ending in a 3-way softmax (one unit per language).
model = Sequential([
    LSTM(100),
    Dense(50, activation='relu'),
    Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train for three epochs and monitor the validation split after each one.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=64,
)

Train on 21501 samples, validate on 4608 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7fe938edcda0>

In [None]:
# Persist the trained weights; the stateful streaming model defined in the
# following cells loads them from this file.
model.save_weights('weights.hd5', overwrite=True)

In [None]:
# Same layer stack as `model`, but with a stateful LSTM and a fixed batch shape of
# one 500-frame, 64-coefficient sequence per call.
streaming_model = Sequential([
    LSTM(100, batch_input_shape=(1, 500, 64), stateful=True),
    Dense(50, activation='relu'),
    Dense(3, activation='softmax'),
])
streaming_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Copy the weights saved from the trained `model` into the stateful streaming model.
streaming_model.load_weights("weights.hd5")

In [None]:
feature_dim = 64  # MFCC coefficients per frame (n_mfcc in get_features)
##### demo the behavior
print('\n\n******the streaming-inference model can replicate the sequence-based trained model:\n')
# Features of one English clip, transposed to (n_frames, 64) and cut into 500-frame sequences.
demo_frames = np.transpose(get_features("train/train_english/english_0001.wav"))
x = reshape_training_data(demo_frames)
num_seqs = x.shape[0]



******the streaming-inference model can replicate the sequence-based trained model:



In [None]:
train_seq_length = 500
predictions = []
single_batch_shape = (1, train_seq_length, feature_dim)
# Classify every 500-frame sequence of the demo clip with the trained model and
# remember the most probable class index of each one.
for s in range(num_seqs):
    print(f'\n\nRunning Sequence {s} with STATE ACTIVE:\n')
    in_seq = np.reshape(x[s], single_batch_shape)
    seq_pred = model.predict(in_seq)
    print(seq_pred)
    predictions.append(np.argmax(seq_pred))




Running Sequence 0 with STATE ACTIVE:

[[0.93286824 0.01206412 0.05506762]]


Running Sequence 1 with STATE ACTIVE:

[[0.9450689  0.00507285 0.04985823]]


Running Sequence 2 with STATE ACTIVE:

[[0.8414836  0.02157838 0.13693807]]


Running Sequence 3 with STATE ACTIVE:

[[0.8135546  0.00860169 0.17784375]]


Running Sequence 4 with STATE ACTIVE:

[[0.8136229  0.04277909 0.14359796]]


Running Sequence 5 with STATE ACTIVE:

[[0.75282913 0.04691087 0.20026007]]


Running Sequence 6 with STATE ACTIVE:

[[0.8649642  0.00659776 0.12843806]]


Running Sequence 7 with STATE ACTIVE:

[[0.8965097  0.06503945 0.03845086]]


Running Sequence 8 with STATE ACTIVE:

[[0.7848736  0.02130924 0.19381717]]


Running Sequence 9 with STATE ACTIVE:

[[0.7090966  0.09248041 0.19842292]]


Running Sequence 10 with STATE ACTIVE:

[[0.7265454  0.10639045 0.16706412]]


Running Sequence 11 with STATE ACTIVE:

[[0.85876614 0.00693236 0.13430153]]


Running Sequence 12 with STATE ACTIVE:

[[0.17048603 0.80993

In [None]:
# Predicted class index for each sequence of the demo clip
# (labels follow create_training_data: 0 = english, 1 = hindi, 2 = mandarin).
predictions

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:

# Majority vote over the per-sequence predictions: the winning class and its vote count.
vote_counts = Counter(predictions)
most_common, num_most_common = vote_counts.most_common(1)[0]

In [None]:
# Class index predicted for the clip as a whole (the most frequent per-sequence prediction).
most_common

0

In [None]:

# Compare the trained model with the stateful streaming model on consecutive
# 500-frame sequences of the demo clip. Both models output one vector of 3 class
# probabilities per sequence, so the comparison is done per class.
print('\n\n******streaming-inference state needs reset between sequences to replicate sequence-based trained model:\n')
streaming_model.reset_states()  # start from a clean state so sequence 0 is comparable
for s in range(num_seqs):
    print(f'\n\nRunning Sequence {s} with NO STATE RESET:\n')
    in_seq = x[s].reshape( (1, train_seq_length, feature_dim) )
    # `model` is the trained network; the name `training_model` was never defined.
    seq_pred = model.predict(in_seq)[0]
    # A stateful model must be fed the batch size it was built with (1), so pass it explicitly.
    stream_pred = streaming_model.predict(in_seq, batch_size=1)[0]
    for n, (seq_p, stream_p) in enumerate(zip(seq_pred, stream_pred)):
        seq_p, stream_p = float(seq_p), float(stream_p)
        print(f'Seq-model Prediction, Streaming-Model Prediction, difference [{n}]: {seq_p : 3.2f}, {stream_p : 3.2f}, {seq_p - stream_p: 3.2f}')
    #### NO STATE RESET HERE: streaming model will treat multiple sequences as one long sequence,
    #### so after the first sequence the streaming output will differ from the sequence-based model

streaming_model.reset_states()
for s in range(2):
    N = np.random.randint(1, 10)
    print(f'\n\n******streaming-inference can work on an sequences of indefinite length -- running length {N}:\n')
    for n in range(N):
        # The streaming model was built for blocks of train_seq_length frames, so an
        # arbitrarily long input is streamed as N such blocks with the state carried over.
        x_sample = np.random.randint(0, high=2, size=(1, train_seq_length, feature_dim))
        x_sample = np.sign(x_sample - 0.5)
        single_pred = streaming_model.predict(x_sample, batch_size=1)[0]
        print(f'Streaming-Model Prediction[{n}]:  {np.round(single_pred, 2)}')
    streaming_model.reset_states()



******the streaming-inference model can replicate the sequence-based trained model:



NameError: ignored

In [None]:
##### define the streaming-inference model
# Functional-API variant with a variable-length time axis. Input, GRU and Model come
# from the notebook's import cell.
# NOTE: this rebinds `streaming_model` to a new, untrained GRU network with a single
# linear output; the LSTM classifier loaded from weights.hd5 is no longer reachable
# under this name once the cell has run.
feature_dim = 64
streaming_in = Input(batch_shape=(1, None, feature_dim))  ## stateful ==> needs batch_shape specified

gru_out = GRU(4, return_sequences=False, stateful=True)(streaming_in)
streaming_pred = Dense(1)(gru_out)
streaming_model = Model(inputs=streaming_in, outputs=streaming_pred)

streaming_model.compile(loss='mean_squared_error', optimizer='adam')
streaming_model.summary()

In [None]:
# Evaluate on the held-out test split.
pred = model.predict(X_test)           # (n_samples, 3) class probabilities
# The sklearn metrics expect class indices, not one-hot rows / probability vectors.
y_test_labels = np.argmax(y_test, axis=1)
pred_labels = np.argmax(pred, axis=1)
print(accuracy_score(y_test_labels, pred_labels))
# Three classes: precision/recall need an explicit averaging mode (macro = unweighted mean over classes).
print(precision_score(y_test_labels, pred_labels, average='macro'))
print(recall_score(y_test_labels, pred_labels, average='macro'))


ValueError: ignored