# Training the model used for Yes/No recognition (real-time prediction)

In [0]:
# feature extraction and data preprocessing
import librosa
import glob
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import keras
import warnings
warnings.filterwarnings('ignore')
from keras import models
from keras import layers
from scipy.fftpack import fft
 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [0]:
def extract_feature(file_name):
    """Load an audio file and summarise it as five fixed-length feature vectors.

    Each librosa feature matrix is transposed and averaged over its first
    axis (the frames of the clip), so every clip yields vectors of the same
    length regardless of its duration.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mfccs, chroma, mel, contrast, tonnetz)``. ``mfccs`` has 40 entries
        (``n_mfcc=40``); the other lengths come from librosa's defaults and
        are expected to bring the concatenated total to the 193 columns used
        by the parsing helpers in this notebook.
    """
    X, sample_rate = librosa.load(file_name)
    # Magnitude of the short-time Fourier transform, shared by chroma/contrast.
    stft = np.abs(librosa.stft(X))
    # Mel-frequency cepstral coefficients.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # The signal must be passed as `y=`: librosa >= 0.10 rejects it positionally.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    # Tonnetz is computed on the harmonic component only.
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                              sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
 
 
def parse_audio_files(parent_dir, sub_dirs, file_ext="*.wav"):
    """Build the feature matrix and label vector for a set of sub-directories.

    Every file matching ``parent_dir/<sub_dir>/<file_ext>`` is passed through
    ``extract_feature`` and its five vectors are concatenated into one row.
    The label is derived from the file name: names starting with ``'n'``
    (e.g. ``no0.wav``) are class 0, everything else is class 1.

    Parameters
    ----------
    parent_dir : str
        Root directory of the dataset.
    sub_dirs : iterable of str
        Sub-directories of ``parent_dir`` to scan.
    file_ext : str, optional
        Glob pattern selecting the audio files (default ``"*.wav"``).

    Returns
    -------
    features : numpy.ndarray
        Float array of shape ``(n_files, 193)``; ``(0, 193)`` when nothing
        matches.
    labels : numpy.ndarray
        Integer array of shape ``(n_files,)`` holding 0/1 class ids.
    """
    rows, targets = [], []
    for sub_dir in sub_dirs:
        pattern = os.path.join(parent_dir, sub_dir, file_ext)
        # Sorted so the row order does not depend on the file system.
        for fn in sorted(glob.glob(pattern)):
            rows.append(np.hstack(extract_feature(fn)))
            # basename() works with any path separator and directory depth,
            # unlike splitting on a literal backslash.
            targets.append(0 if os.path.basename(fn).startswith('n') else 1)

    # Stacking onto the empty (0, 193) seed keeps the float64 dtype and the
    # width check, while avoiding re-copying the matrix for every file.
    features = np.vstack([np.empty((0, 193))] + rows)
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    return features, np.array(targets, dtype=int)

In [0]:
# Dataset root and the sub-directory holding each split.
parent_dir = 'db'
tr_sub_dirs, ts_sub_dirs = ["train"], ["test"]

# Feature matrix and label vector for the training and the test split.
tr_features, tr_labels = parse_audio_files(parent_dir, tr_sub_dirs)
ts_features, ts_labels = parse_audio_files(parent_dir, ts_sub_dirs)

In [0]:
# Fully connected classifier: four ReLU layers of shrinking width followed by
# a 2-way softmax. The input width is taken from the training feature matrix.
model = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(tr_features.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(2, activation='softmax'),
])

In [0]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Single pass over the training set, updating after every sample.
model.fit(x=tr_features, y=tr_labels, batch_size=1, epochs=1)

# Evaluate the compiled loss and metrics on the held-out split.
results = model.evaluate(ts_features, ts_labels)

In [0]:
# The architecture is stored as JSON text ...
model_json = model.to_json()
with open("model.json", mode="w") as json_file:
    json_file.write(model_json)

# ... and the trained weights separately as HDF5.
model.save_weights("model.h5")

print("Saved model to disk")

# Prediction


> Here we use the saved model to predict the user's voice input in a `while` loop.


In [0]:
import librosa
import glob
import numpy as np
import os
import warnings

In [0]:
def audio_input():
    """Record a short clip from the microphone into ``db/test1/test.wav``.

    The target folder is created if needed and emptied of regular files
    first, so that it contains only the new recording afterwards. Audio is
    captured as 16-bit, 2-channel, 44.1 kHz PCM for ``RECORD_SECONDS``
    seconds and written as a WAV file.

    Returns
    -------
    None
    """
    # Imported locally: none of the notebook's import cells provide these.
    import wave
    import pyaudio

    folder = 'db/test1/'
    os.makedirs(folder, exist_ok=True)
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as e:
            # Best effort: report the failure and keep going.
            print(e)

    # audio specifications
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 2  # length of the recorded clip, in seconds
    WAVE_OUTPUT_FILENAME = "test.wav"

    audio = pyaudio.PyAudio()
    try:
        sample_width = audio.get_sample_size(FORMAT)

        # start Recording
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)
        try:
            print("recording...")
            n_chunks = int(RATE / CHUNK * RECORD_SECONDS)
            frames = [stream.read(CHUNK) for _ in range(n_chunks)]
            print("finished recording")
        finally:
            # stop Recording, even if a read failed
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # writing the wav file into db/test1
    with wave.open(os.path.join(folder, WAVE_OUTPUT_FILENAME), 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))

In [0]:
def extract_feature(file_name):
    """Turn one audio file into the five feature vectors the model consumes.

    Each librosa feature matrix is transposed and averaged over its first
    axis, giving one fixed-length vector per feature type.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mfccs, chroma, mel, contrast, tonnetz)``; ``mfccs`` has 40 entries
        (``n_mfcc=40``), the remaining lengths follow librosa's defaults.
    """
    X, sample_rate = librosa.load(file_name)
    # Magnitude spectrogram reused by the chroma and contrast features.
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # `y=` is required: librosa >= 0.10 no longer accepts the signal positionally.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                              sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
 
 
def parse_audio_files_pred(parent_dir, sub_dirs, file_ext="*.wav"):
    """Collect one 193-column feature row per matching audio file.

    Unlabelled counterpart of the training parser: every file matching
    ``parent_dir/<sub_dir>/<file_ext>`` is run through ``extract_feature``
    and its five vectors are concatenated into a single row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_files, 193)``; ``(0, 193)`` when nothing matches.
    """
    stacked = np.empty((0, 193))
    for folder in sub_dirs:
        pattern = os.path.join(parent_dir, folder, file_ext)
        for wav_path in glob.glob(pattern):
            mfccs, chroma, mel, contrast, tonnetz = extract_feature(wav_path)
            row = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
            stacked = np.vstack([stacked, row])
    return np.array(stacked)
  
# Prediction input location: the db/test1 folder that audio_input() writes to.
pred_sub_dirs = ["test1"]
parent_dir = 'db'

In [0]:
 
# `model_from_json` is not provided by any import cell of this notebook, so
# bring it into scope here; without it this cell fails with a NameError.
from keras.models import model_from_json

# load json and create model (the context manager closes the file for us)
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [0]:
# Record -> extract features -> classify, repeated until the kernel is
# interrupted.
while True:
    audio_input()
    pred_features = parse_audio_files_pred(parent_dir, pred_sub_dirs)
    predictions = loaded_model.predict(pred_features)
    # One row of class probabilities per recorded file; audio_input() leaves
    # a single file behind, so classify the first row rather than the
    # flattened batch.
    result = int(np.argmax(predictions, axis=1)[0])
    # Class ids follow the training labels: 0 = "no", 1 = "yes". The index
    # is an integer, so comparing it with the string '1' never matched.
    if result == 1:
        print('Yes')
    else:
        print('No')

# Plotting the features used as input to this model

In [0]:
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Example clips to visualise, one per class. The paths are relative to the
# notebook's working directory and point into the same `db` folder used for
# training; adjust them if your dataset lives elsewhere.
fn_no = 'db/test/no0.wav'
fn_yes = 'db/test/yes20.wav'

In [0]:
def extract_feature(file_name):
    """Compute the five averaged feature vectors of one audio file.

    Every librosa feature matrix is transposed and averaged over its first
    axis, which collapses the clip into one vector per feature type.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mfccs, chroma, mel, contrast, tonnetz)``; ``mfccs`` has 40 entries
        (``n_mfcc=40``), the remaining lengths follow librosa's defaults.
    """
    X, sample_rate = librosa.load(file_name)
    # short time fourier transform (magnitude only)
    stft = np.abs(librosa.stft(X))
    # Mel frequency cepstral coefficients
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # Pass the signal as `y=`; librosa >= 0.10 made this argument keyword-only.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                              sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
 

In [0]:
# Averaged feature vectors of one "no" clip and one "yes" clip.
mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn_no)
mfccs2, chroma2, mel2, contrast2, tonnetz2 = extract_feature(fn_yes)

# (title prefix, "no" vector, "yes" vector) -- one grid row per feature type.
feature_pairs = [
    ("mfcc", mfccs, mfccs2),
    ("chroma", chroma, chroma2),
    ("mel", mel, mel2),
    ("contrast", contrast, contrast2),
    ("tonnetz", tonnetz, tonnetz2),
]

# Left column shows the "no" clip, right column the "yes" clip.
fig, axes = plt.subplots(len(feature_pairs), 2, figsize=(10, 12))
for (ax_no, ax_yes), (name, no_values, yes_values) in zip(axes, feature_pairs):
    ax_no.set_title(name + "-no")
    ax_no.plot(no_values)
    ax_yes.set_title(name + "-yes")
    ax_yes.plot(yes_values)

# Without this the titles of a 5x2 grid overlap the axes above them.
fig.tight_layout()
plt.show()