In [None]:
import os
def clear():
    """Clear the terminal the kernel is attached to.

    Uses ``cls`` on Windows and ``clear`` everywhere else.

    Returns:
        The exit status reported by ``os.system``.
    """
    return os.system('cls' if os.name == 'nt' else 'clear')


clear()
import time
import numpy as np
import pyaudio
import wave
import plotly.graph_objs as go
import plotly.io as pio
import librosa
import librosa.display
import tensorflow as tf
from pathlib import Path
from IPython.display import display, Audio
import pvleopard as pv
import config

In [None]:
# --- Where the evaluation audio lives ---------------------------------
# One sub-folder per speaker is expected under DATASET_AUDIO_PATH
# (the scan further down lists it and reads the *.wav files inside).
DATASET_ROOT = "../test_ds"
AUDIO_SUBFOLDER = "leaders_ds"
DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER)

# File the microphone recording is written to and that the plotting and
# speech-to-text cells read back.
filename = "../test_ds/leaders_ds/Julia_Gillard/data.wav"

# --- Audio / batching parameters --------------------------------------
SAMPLING_RATE = 16000  # also passed to decode_wav as the number of samples to keep
BATCH_SIZE = 128       # batch size used when building the prediction dataset
SHUFFLE_SEED = 43      # seed for the dataset shuffle in predict()
SCALE = 0.5            # not referenced by any of the cells visible here

Recording

In [None]:
print("\033[31m[*]\033[0m You will be asked to speak for few seconds for the recognition of the speaker.")
print("\033[31m[*]\033[0m Get Ready!")
time.sleep(5)

# --- Take the voice input from the default input device ---
chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2
fs = SAMPLING_RATE  # Record at the same rate the rest of the notebook uses
seconds = 6

# Create the destination folder up front so a missing directory fails
# (or is fixed) before the user has spoken, not after.
os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)

p = pyaudio.PyAudio()  # Create an interface to PortAudio
frames = []  # Raw byte chunks read from the stream
try:
    # Query the sample width while the PortAudio interface is still alive.
    sample_width = p.get_sample_size(sample_format)

    print("\033[31m[*]\033[0m Recording")

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)
    try:
        # Read whole chunks until `seconds` seconds of audio are captured
        # (the trailing partial chunk is dropped by int()).
        for _ in range(int(fs / chunk * seconds)):
            frames.append(stream.read(chunk))
    finally:
        # Stop and close the stream even if a read failed
        stream.stop_stream()
        stream.close()
finally:
    # Terminate the PortAudio interface
    p.terminate()

print("\033[31m[*]\033[0m Finished recording")

# Save the recorded data as a WAV file; the context manager closes the
# file even if writing raises.
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))


Waveforms and Spectrogram

In [None]:
# Load at the file's native sampling rate (sr=None) so the plots describe
# the audio exactly as it was recorded instead of a resampled copy.
y, sr = librosa.load(filename, sr=None)

# Plot waveform: sample index / sr gives the time of each sample in seconds
waveform_fig = go.Figure()
waveform_fig.add_trace(go.Scatter(x=np.arange(len(y)) / sr, y=y))
waveform_fig.update_layout(title="Waveform", xaxis_title="Time (seconds)", yaxis_title="Amplitude")
pio.show(waveform_fig)

# Plot Spectrogram
# STFT parameters are spelled out (they equal librosa's defaults) because
# the axis conversions below must use the same values.
n_fft = 2048
hop_length = 512
D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
DB = librosa.amplitude_to_db(np.abs(D))

# Columns of DB are STFT frames and rows are FFT bins; convert both to
# physical units so the axis titles are truthful.
frame_times = librosa.frames_to_time(np.arange(DB.shape[1]), sr=sr, hop_length=hop_length)
bin_frequencies = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

spectrogram_fig = go.Figure()
spectrogram_fig.add_trace(go.Heatmap(x=frame_times, y=bin_frequencies, z=DB))
spectrogram_fig.update_layout(title="Spectrogram", xaxis_title="Time (seconds)", yaxis_title="Frequency (Hz)")
pio.show(spectrogram_fig)

Speech-To-Text

In [None]:
# Transcribe the recording with Picovoice Leopard. The access key is read
# from the local `config` module.
leopard = pv.create(access_key=config.access_key)
try:
    # `words` holds per-word metadata (word, start_sec, end_sec, confidence);
    # only the full transcript is shown here.
    transcript, words = leopard.process_file(filename)
finally:
    # Release the engine's resources even if transcription fails.
    leopard.delete()

print(transcript)

Predicting

In [None]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Pair every audio file with its label in a single ``tf.data.Dataset``.

    Args:
        audio_paths: Sequence of WAV file paths.
        labels: Sequence of labels, one per entry of ``audio_paths``.

    Returns:
        A dataset whose elements are ``(audio, label)`` tuples, where
        ``audio`` is whatever ``path_to_audio`` returns for the path.
    """
    decoded_audio = tf.data.Dataset.from_tensor_slices(audio_paths).map(path_to_audio)
    label_stream = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((decoded_audio, label_stream))


def path_to_audio(path):
    """Load one WAV file from disk and decode it.

    The file is decoded with ``desired_channels=1`` and
    ``desired_samples=SAMPLING_RATE``; per the TensorFlow documentation
    this keeps a single channel and crops or zero-pads the signal to
    ``SAMPLING_RATE`` samples.

    Args:
        path: Path of the WAV file (string or string tensor).

    Returns:
        The decoded audio tensor; the sample rate reported by the decoder
        is discarded.
    """
    raw_bytes = tf.io.read_file(path)
    waveform, _sample_rate = tf.audio.decode_wav(
        raw_bytes, desired_channels=1, desired_samples=SAMPLING_RATE
    )
    return waveform


def audio_to_fft(audio):
    """Turn a batch of waveforms into magnitude spectra.

    Args:
        audio: Batch of decoded audio with a trailing size-1 axis
            (presumably ``(batch, samples, 1)`` — the last axis is
            squeezed away and axis 1 is used as the signal length).

    Returns:
        The magnitude of the first half of each signal's FFT (the
        positive frequencies), with the trailing size-1 axis restored.
    """
    signal = tf.squeeze(audio, axis=-1)

    # tf.signal.fft needs complex input: real part = signal, imaginary = 0.
    complex_signal = tf.cast(
        tf.complex(real=signal, imag=tf.zeros_like(signal)), tf.complex64
    )
    spectrum = tf.expand_dims(tf.signal.fft(complex_signal), axis=-1)

    # Keep only the first half of the bins, i.e. the positive frequencies.
    positive_bins = signal.shape[1] // 2
    return tf.math.abs(spectrum[:, :positive_bins, :])


def predict(path, labels):
    """Run the speaker model on the given files and report each prediction.

    For every file this prints the predicted speaker, says whether it
    matches the true speaker, and shows an audio player for the clip.

    Relies on module-level names that must exist when it is called:
    ``model`` (the loaded Keras model), ``trained_class_names`` (maps a
    model output index to a speaker name) and ``class_names`` (maps a
    label in ``labels`` to a speaker name).

    Args:
        path: Sequence of WAV file paths.
        labels: Sequence of integer labels indexing ``class_names``, one
            per entry of ``path``.
    """
    test = paths_and_labels_to_dataset(path, labels)
    test = test.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)
    test = test.prefetch(tf.data.experimental.AUTOTUNE)

    # Walk every batch so files beyond the first BATCH_SIZE are not
    # silently skipped.
    for audios, batch_labels in test:
        # Get the signal FFT
        ffts = audio_to_fft(audios)
        # Predict: index of the highest-scoring class for every clip
        y_pred = np.argmax(model.predict(ffts), axis=-1)

        audios = audios.numpy()
        batch_labels = batch_labels.numpy()

        for index in range(len(batch_labels)):
            predicted_name = trained_class_names[y_pred[index]]
            actual_name = class_names[batch_labels[index]]

            print("\033[31m[*]\033[0m Model's prediction:\33[92m {}\33[0m".format(predicted_name))

            # Compare speaker *names*: predictions index trained_class_names
            # while labels index class_names, and those two listings need
            # not contain the same entries in the same order.
            if predicted_name == actual_name:
                print("\033[31m[*]\033[0m Correct! Belongs to:\33[92m {}\33[0m".format(actual_name))
            else:
                print("\033[31m[*]\033[0m Wrong! Belongs to:\33[31m {}\33[0m".format(actual_name))

            display(Audio(audios[index, :, :].squeeze(), rate=SAMPLING_RATE))

audio_paths = []
labels = []

# Speaker names used to translate the model's output index into a name.
# NOTE(review): this assumes the listing order matches the class order
# used when the model was trained — confirm against the training notebook.
trained_class_names = os.listdir("../dataset/leaders_ds/audio")

# One class per speaker folder. Plain files that may sit next to the
# folders (e.g. .DS_Store) are skipped: listing them as directories below
# would raise.
class_names = [
    name
    for name in os.listdir(DATASET_AUDIO_PATH)
    if os.path.isdir(os.path.join(DATASET_AUDIO_PATH, name))
]
for label, name in enumerate(class_names):
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} speakers.".format(len(audio_paths), len(np.unique(labels)))
)

# Fail with a clear message instead of an opaque TensorFlow error when
# there is nothing to predict on.
if not audio_paths:
    raise FileNotFoundError(
        "No .wav files found under {!r}; nothing to predict.".format(DATASET_AUDIO_PATH)
    )

# Predict
model = tf.keras.models.load_model('../models/leaders/v2/model.h5')
predict(audio_paths, labels)