### Libraries

In [None]:
# Install pydub; record() uses its AudioSegment class to decode the captured audio.
!pip -q install pydub

In [None]:
# all imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from io import BytesIO
import soundfile as sf
from scipy.io import wavfile as wav
import numpy as np
import librosa
import joblib
import librosa.display
import keras
from keras.models import load_model
from pydub import AudioSegment
import IPython.display as ipd 
import time

In [None]:
# Mount Google Drive at /content/drive. Later cells load the normalisation
# metrics and the trained model from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Optional cleanup, left disabled: uncomment to flush and unmount Drive when done.
#drive.flush_and_unmount()

In [None]:
# JavaScript source that record() injects into the notebook output. It defines
# a global `record(time)` which captures `time` milliseconds of microphone
# audio and resolves to the recording encoded as a base64 data URL.
#
# Compared with a bare `new Promise(async ...)` wrapper, the async function
# below lets failures (e.g. a denied microphone permission) reject the promise
# instead of leaving output.eval_js waiting forever, keeps all working state in
# function-local constants, and always releases the microphone when finished.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise((resolve, reject) => {
  const reader = new FileReader()
  reader.onload = () => resolve(reader.result)
  reader.onerror = () => reject(reader.error)
  reader.readAsDataURL(blob)
})
// Declared with `var` so the name stays visible to the later eval_js call.
var record = async time => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  try {
    const recorder = new MediaRecorder(stream)
    const chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    const stopped = new Promise(resolve => { recorder.onstop = resolve })
    recorder.start()
    await sleep(time)
    recorder.stop()
    await stopped
    return await b2text(new Blob(chunks))
  } finally {
    // Release the microphone so the browser stops capturing between takes.
    stream.getTracks().forEach(track => track.stop())
  }
}
"""

def record(sec=3):
  """Record from the browser microphone and return ``(rate, samples)``.

  Injects the RECORD JavaScript into the cell output, asks it to record for
  ``sec`` seconds (passed to the browser in milliseconds), then decodes the
  returned base64 data URL with pydub.

  Parameters
  ----------
  sec : number
      Recording duration in seconds.

  Returns
  -------
  tuple
      ``(frame_rate, samples)`` where ``samples`` is a NumPy array built from
      ``AudioSegment.get_array_of_samples()``.

  NOTE(review): the samples are used exactly as pydub returns them; if the
  capture were multi-channel they may be interleaved — confirm the stream is
  mono.
  """
  display(Javascript(RECORD))
  duration_ms = sec*1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  # A data URL is "<header>,<base64 payload>"; only the payload is decoded.
  encoded = data_url.split(',')[1]
  segment = AudioSegment.from_file(BytesIO(b64decode(encoded)))
  return segment.frame_rate, np.array(segment.get_array_of_samples())

## Recording test tracks

In [None]:
# Name of the person being recorded. recognition() reads this global and
# prints it as 'Speaker: <name>' alongside its speaker-identification result.
Speaker = "Anna"
print(Speaker)

Anna


In [None]:
# Capture two takes using record()'s default duration and keep the samples of
# each one in `tracks`. `rate` is overwritten on every iteration, so after the
# loop it holds the frame rate of the last take.
tracks = []
for i in range(2):
  print('Parla')
  rate, audio = record()
  tracks.append(audio)
  print('Pausa di 3 secondi')
  time.sleep(3) # Sleep for 3 seconds

Parla


<IPython.core.display.Javascript object>

Pausa di 3 secondi
Parla


<IPython.core.display.Javascript object>

Pausa di 3 secondi


In [None]:
# Render an audio player for the first take, using the frame rate left over
# from the recording loop.
ipd.Audio(tracks[0], rate=rate)

## Classifier

In [None]:
# Training metrics
metrics = np.load('/content/drive/MyDrive/Progetto DIGITAL/01_Classification - 1D/metrics_audio_classification.npz')
mean = metrics['Mean']
stdev = metrics['Sd']

In [None]:
# Load the trained Keras model from Drive. recognition() indexes the result of
# model.predict() as a sequence of outputs (prediction[3], prediction[:-1]), so
# this is presumably a multi-output model — confirm against the training notebook.
model=load_model('/content/drive/MyDrive/Progetto DIGITAL/01_Classification - 1D/Modelli/NN_BinaryClassifierCombo_1D.h5')

In [None]:
# Features functions 

def sdev(input):
    """Standard deviation over all elements of ``input``.

    ``keepdims=True`` keeps the reduced axes, so a 1-D track yields a
    one-element array that can be concatenated with the other features.
    """
    spread = np.std(input, axis=None, keepdims=True)
    return spread

def aavg(input):
    """Mean of the absolute values of ``input``.

    The reduced axes are kept (``keepdims=True``), so a 1-D track yields a
    one-element array.
    """
    magnitudes = np.abs(input)
    return np.mean(magnitudes, keepdims=True)

def energy(input):
    """Sum of squared samples of ``input``.

    The samples are multiplied by 1.0 first so that integer audio is squared
    in floating point. The reduced axes are kept (``keepdims=True``).
    """
    as_float = input * 1.0
    return np.sum(np.square(as_float), keepdims=True)

def zcr(input):
    """Count how often consecutive samples switch between ``> 0`` and ``<= 0``.

    Returns the count wrapped in a one-element array so it can be
    concatenated with the other features.
    """
    is_positive = input > 0
    # Differencing a boolean mask marks every position where the flag flips.
    flips = np.diff(is_positive)
    return np.array([np.count_nonzero(flips)])

def combo(input):
    """Concatenate the four scalar features of one track.

    The order is fixed: aavg, sdev, energy, zcr.
    """
    extractors = (aavg, sdev, energy, zcr)
    pieces = [extract(input) for extract in extractors]
    return np.concatenate(pieces)

# MFCC features function
def mfcc(input, rate=48000, min_len=300, sampling=1):
    """Return a fixed-length, flattened MFCC feature vector for one track.

    Parameters
    ----------
    input : numpy.ndarray
        Audio samples of a single track; converted to float before analysis.
    rate : int
        Sampling rate of ``input`` in Hz.
    min_len : int
        Number of MFCC frames kept in the output. Results with fewer frames
        are zero-padded on the right; results with more frames are truncated,
        so the returned vector always has the same length.
    sampling : int
        Decimation step: only every ``sampling``-th sample is analysed and the
        sampling rate handed to librosa is divided accordingly.

    Returns
    -------
    numpy.ndarray
        1-D array holding the coefficient rows, each ``min_len`` frames long,
        laid out one after another.
    """
    signal = input[::sampling]
    # The audio must be passed as `y=`: librosa's feature functions take
    # keyword-only arguments in current releases.
    coeffs = librosa.feature.mfcc(y=signal * 1.0, sr=int(rate / sampling))
    n_frames = coeffs.shape[1]
    if n_frames < min_len:
        # Add the zeroes needed to reach the required number of frames.
        coeffs = np.pad(coeffs, pad_width=((0, 0), (0, min_len - n_frames)), mode='constant')
    else:
        # A longer track would need a negative pad width, which np.pad
        # rejects; cut it down to the required number of frames instead.
        coeffs = coeffs[:, :min_len]
    return coeffs.flatten()

def combo_mfcc(input):
    """Concatenate the scalar features of one track with its MFCC vector.

    The order is fixed: aavg, sdev, energy, zcr, then mfcc with its default
    arguments.
    """
    extractors = (aavg, sdev, energy, zcr, mfcc)
    pieces = [extract(input) for extract in extractors]
    return np.concatenate(pieces)


def identity(input):
    """Return ``input`` unchanged.

    Serves as the default ``feature_extractor`` of standard_features(), i.e.
    "use the raw track as its own feature vector".
    """
    return input

def standard_features(input, feature_extractor=identity):
    """Extract and standardise features for a collection of tracks.

    Parameters
    ----------
    input : iterable
        Tracks to process; ``feature_extractor`` is applied to each in turn.
    feature_extractor : callable
        Maps one track to its feature vector. Defaults to ``identity``.

    Returns
    -------
    numpy.ndarray
        One row per track, scaled as
        ``(features - mean + eps) / (stdev + eps)`` with ``eps = 0.001``.

    Relies on the module-level ``mean`` and ``stdev`` values loaded from the
    training metrics file.
    """
    eps = 0.001
    extracted = np.array([feature_extractor(track) for track in input])
    scaled = (extracted - mean + eps) / (stdev + eps)
    # Rebuild from the individual rows, as before, so the result is a fresh array.
    return np.array(list(scaled))

In [None]:
def recognition(model, voice, call_bbot=False):
    """Run the model on one feature vector and print the outcome.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called with ``voice`` expanded to a batch of one. Its result is
        indexed as a sequence of outputs.
    voice : numpy.ndarray
        Feature vector of a single recording.
    call_bbot : bool
        When true, only the output at index 3 is inspected: it is rounded and
        the B-Bot greeting is printed if its first element equals 0, otherwise
        the "command not recognised" message is printed.

    When ``call_bbot`` is false, every output except the last is reduced with
    ``argmax`` on its first row. The sum of those indices decides the message:
    more than 1 means the identity could not be determined, 0 means unknown
    speaker, and otherwise the name at the position of the first index equal
    to 1 is printed. The module-level ``Speaker`` is printed first in the last
    two cases.

    Always returns ``None``; results are reported through ``print``.
    """
    outputs = model.predict(np.expand_dims(voice, axis=0))

    if call_bbot:
        flag = outputs[3].round()[0][0]
        if flag == 0:
            print("Ciao Sono B-Bot")
        else:
            print("Comando assistente vocale non riconosciuto")
        return None

    votes = [np.argmax(head[0]) for head in outputs[:-1]]
    hits = sum(votes)

    if hits > 1:
        print("Non è stato possibile determinare l'identità dello speaker")
        return None

    if hits == 0:
        print('Speaker: '+Speaker)
        print("Lo speaker è sconosciuto")
        return None

    known_speakers = ['Anna', 'Beatrice', 'Guglielmo']
    who = known_speakers[votes.index(1)]
    print('Speaker: '+Speaker)
    print("È stata riconosciuta la voce di "+ str(who))
    return None

### Test

In [None]:
# Build one standardised feature row per recorded take, using combo_mfcc
# (scalar features followed by the MFCC vector) as the extractor.
features = standard_features(tracks, feature_extractor=combo_mfcc)

In [None]:
# Run the B-Bot command check (call_bbot=True) on every recorded take.
# Iterating over `features` keeps this cell in step with however many takes
# were recorded, instead of repeating the count used by the recording loop.
for i, track_features in enumerate(features):
    print('Numero registrazione: '+ str(i+1))
    recognition(model, track_features, call_bbot=True)
    print('\n')

Numero registrazione: 1
Comando assistente vocale non riconosciuto


Numero registrazione: 2
Ciao Sono B-Bot


