<a href="https://colab.research.google.com/github/diffunity/kpmg-corona-blue/blob/audio/audio_model/voice_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Content
- Analyse emotions for sample files
- Test it with your own voice recording!

## 1. Analyse emotions for sample files

In [1]:
# Download Files
import gdown

url_1 = 'https://drive.google.com/uc?id=1m2NDz9MBvKIh5E26i_4ke_tgPRd9ZmZc'
output_1 = 'features.csv'
gdown.download(url_1, output_1, quiet=False)


url_2 = 'https://drive.google.com/uc?id=1WLFFDq5VbZkKxQAn3UQy-VdERF71BO3B'
output_2 = 'sample.zip'
gdown.download(url_2, output_2, quiet=False)
!unzip sample.zip

url_3 = 'https://drive.google.com/uc?id=1gpl29-tpGIXGlVyjKSbJ_arVmJTYeRI9'
output_3 = 'saved_model.h5'
gdown.download(url_3, output_3, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1m2NDz9MBvKIh5E26i_4ke_tgPRd9ZmZc
To: /content/features.csv
270MB [00:02, 130MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WLFFDq5VbZkKxQAn3UQy-VdERF71BO3B
To: /content/sample.zip
10.8MB [00:00, 37.7MB/s]


Archive:  sample.zip
  inflating: Happy, delighted.wav    
  inflating: Neutral.wav             
  inflating: Sad.wav                 
  inflating: Shocked, suprised, anger, disgust.wav  
  inflating: Tired, sad.wav          


Downloading...
From: https://drive.google.com/uc?id=1gpl29-tpGIXGlVyjKSbJ_arVmJTYeRI9
To: /content/saved_model.h5
6.75MB [00:00, 39.6MB/s]


'saved_model.h5'

In [2]:
# preprocess.py
!pip install pydub
import IPython
from tqdm import tqdm
from IPython.display import Audio
from base64 import b64encode, b64decode
import librosa
import numpy as np
import soundfile as sf
from pydub import AudioSegment, effects
from pydub.silence import split_on_silence
import urllib3
import json
import base64
import ast
import os
import shutil

############################################################
# audio segmentation
############################################################
def split_audio(record, out_dir="./chunk", min_silence_len=500, silence_thresh=-40):
  """Split a WAV file on silence and write each piece as its own WAV file.

  Args:
    record: Path to (or file object of) the input WAV recording.
    out_dir: Directory that receives the chunks. It is created if missing.
    min_silence_len: Forwarded to pydub's `split_on_silence`
      (minimum silence length; milliseconds per pydub's documentation).
    silence_thresh: Forwarded to pydub's `split_on_silence`
      (silence threshold; dBFS per pydub's documentation).

  Chunks are named by their zero-padded index: 0000.wav, 0001.wav, ...
  """
  # Make the function usable on its own: exporting into a missing
  # directory would otherwise fail.
  os.makedirs(out_dir, exist_ok=True)

  sound_file = AudioSegment.from_wav(record)
  audio_chunks = split_on_silence(sound_file,
                                  min_silence_len=min_silence_len,
                                  silence_thresh=silence_thresh)

  for i, chunk in enumerate(audio_chunks):
      out_file = os.path.join(out_dir, "{}.wav".format(str(i).zfill(4)))
      chunk.export(out_file, format="wav")


############################################################
# volume normalize
############################################################
def normalize_audio(wav):
  """Volume-normalize a WAV file in place.

  The file at `wav` is loaded, passed through pydub's `effects.normalize`,
  and written back to the same path in WAV format.
  """
  loaded = AudioSegment.from_file(wav, "wav")
  effects.normalize(loaded).export(wav, format="wav")


############################################################
# feature extraction
############################################################
def extract_features(data, sr=16000):
  """Build one flat feature vector from an audio signal.

  Each librosa feature is averaged over time (mean along the frame axis)
  and the results are concatenated in this fixed order:
  zero-crossing rate, chroma STFT, MFCC, RMS energy, mel spectrogram.

  Args:
    data: 1-D audio signal, as returned by `librosa.load`.
    sr: Sample rate of `data` in Hz, passed to the sample-rate-dependent
      features. Defaults to 16000, the rate used throughout this notebook.

  Returns:
    1-D float64 numpy array holding the concatenated features.
  """
  # ZCR
  zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

  # Chroma_stft (computed from the magnitude spectrogram)
  stft = np.abs(librosa.stft(data))
  chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)

  # MFCC
  mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)

  # Root Mean Square Value
  rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

  # MelSpectogram
  mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)

  # Single horizontal stack; cast keeps the float64 result dtype explicit.
  return np.hstack((zcr, chroma_stft, mfcc, rms, mel)).astype(np.float64)

def get_features(path):
  """Load an audio file and return its feature vector wrapped in a list.

  The file is read at 16 kHz, skipping the first 0.6 s and keeping at most
  2.5 s, then handed to `extract_features`. The return value is a
  one-element list containing that feature array.
  """
  signal, _ = librosa.load(path, duration=2.5, offset=0.6, sr=16000)
  return [np.array(extract_features(signal))]

Collecting pydub
  Downloading https://files.pythonhosted.org/packages/7b/d1/fbfa79371a8cd9bb15c2e3c480d7e6e340ed5cc55005174e16f48418333a/pydub-0.24.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.24.1


In [4]:
# model.py
import os
import json
import yaml
import gdown

import pandas as pd
import numpy as np

import librosa
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import keras
from sklearn.model_selection import train_test_split

from pydub import AudioSegment, effects

import warnings
warnings.filterwarnings(action='ignore')

class model:
  """Chunk-level 'Depressed' / 'Non-depressed' voice classifier.

  On construction, loads the trained Keras model from `saved_model.h5`
  and fits a `StandardScaler` on the training split of `features.csv`
  (presumably to reproduce the scaling used at training time; confirm
  against the training code).

  `inference` splits a recording on silence, classifies every chunk and
  returns the per-class counts together with the share of chunks labelled
  'Depressed'.
  """

  def __init__(self):

      # load model
      self.model = keras.models.load_model("saved_model.h5")

      # train set scaling: every column except the last is a feature,
      # the 'labels' column is the target
      self.Features = pd.read_csv("features.csv")
      self.X = self.Features.iloc[: ,:-1].values
      self.Y = self.Features['labels'].values

      self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X,
                                                                              self.Y,
                                                                              random_state=0,
                                                                              shuffle=True)

      self.scaler = StandardScaler()
      self.x_train = self.scaler.fit_transform(self.x_train)


  @staticmethod
  def _summarize(output_list):
      """Turn a list of predicted labels into the result dictionary.

      Returns {"output": {label: count, ...}, "depression_rate": "<pct>"}.
      With no predictions at all the rate is reported as "0.0%" instead of
      raising ZeroDivisionError.
      """
      num_depressed = output_list.count('Depressed')
      num_non_depressed = output_list.count('Non-depressed')
      total = num_depressed + num_non_depressed
      depression_rate = num_depressed / total if total else 0.0

      model_output = dict()
      model_output["output"] = {"Depressed": num_depressed, "Non-depressed": num_non_depressed}
      model_output["depression_rate"] = "{:.1%}".format(depression_rate)
      return model_output


  def inference(self, filename):
      """Classify every silence-separated chunk of a WAV recording.

      Args:
        filename: Path of the WAV file to analyse.

      Returns:
        dict with the per-class chunk counts under "output" and the
        formatted share of 'Depressed' chunks under "depression_rate".

      Side effects: uses ./chunk and ./raw as scratch directories (both are
      removed afterwards, even on error) and displays an audio player for
      every chunk classified as 'Depressed'.
      """
      # temp directory -- start from an empty ./chunk so leftovers of an
      # earlier, interrupted run are not classified again
      shutil.rmtree("./chunk", ignore_errors=True)
      os.makedirs('./chunk', exist_ok=True)
      os.makedirs("./raw", exist_ok=True)

      output_list = []
      skipped = []
      try:
          split_audio(filename)

          chunk_paths = ["./chunk/" + wav for wav in sorted(os.listdir("./chunk"))]
          for path in chunk_paths:
              normalize_audio(path)

          # The label encoder does not depend on the chunk: fit it once.
          labels = np.array(['Depressed', 'Non-depressed'], dtype=object)
          encoder = OneHotEncoder()
          encoder.fit(labels.reshape(-1, 1))

          for path in chunk_paths:
              try:
                  feature_vector = np.array(get_features(path), dtype="object")[:1]
                  feature_vector = self.scaler.transform(feature_vector)
                  # add a trailing axis: (1, n_features) -> (1, n_features, 1)
                  feature_vector = np.expand_dims(feature_vector, axis=2)

                  pred_test = self.model.predict(feature_vector)
                  y_pred = encoder.inverse_transform(pred_test)

                  output_list.append(y_pred[0][0])
                  if y_pred[0][0] == "Depressed":
                      IPython.display.display(Audio(path, rate=16000))

              except Exception as exc:
                  # Best effort: one bad chunk must not abort the whole
                  # recording, but it should not vanish silently either.
                  skipped.append((path, exc))
      finally:
          shutil.rmtree("./chunk", ignore_errors=True)
          shutil.rmtree("./raw", ignore_errors=True)

      if skipped:
          print("Skipped {} chunk(s) that could not be processed; first error: {!r}".format(
              len(skipped), skipped[0][1]))

      return self._summarize(output_list)


In [9]:
# Check the sample file
# File names of the recordings extracted from sample.zip.
sample_list = [
    "Happy, delighted.wav",
    "Neutral.wav",
    "Sad.wav",
    "Shocked, suprised, anger, disgust.wav",
    "Tired, sad.wav",
]

# Last expression of the cell: renders an audio player for the first sample.
Audio(sample_list[0], rate=16000)

In [10]:
# Predict the result
# Constructing `model` loads saved_model.h5 and features.csv from the
# working directory, so the download cell must have been run first.
SER = model()
SER.inference(filename=sample_list[0])

# An audio player is displayed for every chunk classified as "Depressed";
# the returned dictionary holds the per-class chunk counts and the rate.

{'depression_rate': '4.5%', 'output': {'Depressed': 1, 'Non-depressed': 21}}

## 2. Test it with your own voice recording!

In [11]:
!pip install ffmpeg-python
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript recorder widget rendered by `get_audio`.
# It asks for microphone access, records until the button is pressed and
# exposes a global promise named `data` that resolves to the recording as
# a base64 data URL. The promise is settled by the FileReader callback
# itself (not by a fixed delay), and it is rejected when the microphone
# cannot be opened so the Python side does not wait forever.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Waiting for microphone access...");

my_btn.appendChild(t);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var recorder, gumStream;
var recordButton = my_btn;

// `data` is what the Python side awaits through eval_js("data").
var resolveData, rejectData;
var data = new Promise((resolve, reject) => {
  resolveData = resolve;
  rejectData = reject;
});

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    var reader = new FileReader();
    // Register the handler before starting the read, and hand the result
    // over as soon as it is complete.
    reader.onloadend = function() {
      resolveData(reader.result.toString());
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
  // Only claim to be recording once the recorder has really started.
  recordButton.innerText = "Recording... press to stop";
};

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

recordButton.onclick = toggleRecording;

navigator.mediaDevices.getUserMedia({audio: true})
  .then(handleSuccess)
  .catch(function(err) {
    recordButton.innerText = "Microphone unavailable: " + err;
    rejectData(err);
  });

</script>
"""

def get_audio():
  """Record audio in the browser and return it as a numpy array.

  Renders the `AUDIO_HTML` recorder, waits for the JavaScript `data`
  promise, decodes the base64 payload and converts it to WAV by piping it
  through ffmpeg.

  Returns:
    (audio, sr): the samples and the sample rate, in the order
    `scipy.io.wavfile.read` produces them but swapped (samples first).

  Raises:
    RuntimeError: if the browser delivered no recording, or ffmpeg
      produced no usable WAV output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # Expected form: "data:<mime>;base64,<payload>". Anything without a comma
  # means the recording never arrived (e.g. the placeholder value "0").
  if not isinstance(data, str) or ',' not in data:
      raise RuntimeError("No audio was captured from the browser; please record again.")
  binary = b64decode(data.split(',')[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Fewer than 8 bytes cannot even hold the RIFF header patched below.
  if len(output) < 8:
      raise RuntimeError(
          "ffmpeg did not produce WAV data (exit code {}): {}".format(
              process.returncode, (err or b"").decode(errors="replace")))

  # Replace bytes 4:8 of the output with the actual size of the RIFF chunk
  # (total length minus the 8-byte header), stored as 4 little-endian bytes.
  # NOTE(review): presumably needed because ffmpeg cannot seek back on a
  # pipe to fill in the size -- confirm.
  riff_chunk_size = len(output) - 8
  riff = output[:4] + riff_chunk_size.to_bytes(4, "little") + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
# Shows the recorder widget and waits for the recording to arrive.
# `audio` holds the samples, `sr` the sample rate of the recording.
audio, sr = get_audio()

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io.wavfile` is loaded, so the call below would only work when
# an earlier cell happened to import it.
import scipy.io.wavfile

# Save the recording next to the notebook so `inference` can read it.
scipy.io.wavfile.write('recording.wav', sr, audio)

In [None]:
# Classify the recording saved above as recording.wav.
# NOTE(review): `model()` reloads saved_model.h5 and features.csv on every
# construction; an existing `SER` instance could be reused instead.
SER = model()
SER.inference(filename="recording.wav")



{'caramel_rate': '80.0%', 'output': {'Depressed': 1, 'Non-depressed': 4}}