# Setting Dataset From Kaggle

In [2]:
! pip install -q kaggle

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"dindamajesty","key":"8ae43dc81423fa527c39798fde8aede5"}'}

In [4]:
! mkdir ~/.kaggle

In [5]:
! cp kaggle.json ~/.kaggle/

In [6]:
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
! kaggle datasets list

ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              234KB  2021-07-24 09:20:05           9914  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01           6086  
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           2262  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   2GB  2021-07-03 18:37:20           2507  
fatiimaezzahra/famous-iconic-women                          Famous Iconic Wo

In [8]:
! kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Downloading ravdess-emotional-speech-audio.zip to /content
 98% 420M/429M [00:03<00:00, 114MB/s]
100% 429M/429M [00:03<00:00, 122MB/s]


In [9]:
! mkdir ravdess-emotional-speech-audio

In [10]:
! unzip ravdess-emotional-speech-audio.zip -d ravdess-emotional-speech-audio

Archive:  ravdess-emotional-speech-audio.zip
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: ravdess-emotional-speech-audio/Actor_01/03-01-02-02-02-02-01.w

# Import libraries and Dataset

In [11]:
import pandas as pd
import librosa
import os, glob
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [12]:
for dirname, _, filenames in os.walk('/ravdess-emotional-speech-audio/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Preprocessing

Function to extract features

In [13]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft=np.abs(librosa.stft(X))
    result=np.array([])
    mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    result=np.hstack((result, mfccs))
    chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    result=np.hstack((result, chroma))
    mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    result=np.hstack((result, mel))
    return result

Dictionary of emotions

In [14]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

def gender(g):
    if int(g[0:2]) % 2 == 0:
        return 'female'
    else:
        return 'male'


Function to load data

In [15]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in tqdm(glob.glob("ravdess-emotional-speech-audio/Actor_*/*.wav")):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]] + '_' + gender(file_name.split("-")[-1])
        feature=extract_feature(file)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=1)

Split dataset for training and validating

In [16]:
X_train, X_val, y_train, y_val = load_data()

100%|██████████| 1440/1440 [06:12<00:00,  3.86it/s]


Check shapes

In [17]:
print((X_train.shape[0], X_val.shape[0]))

(1152, 288)


Scale data

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [19]:
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 180


# Build Model using Scikit Learn

Select model

In [58]:
from sklearn.neural_network import MLPClassifier

model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
model.fit(X_train,y_train)
print(model.score(X_train, y_train))

1.0


Make predictions on validation set

In [59]:
y_pred=model.predict(X_val)
print(model.score(X_val, y_val))

0.6388888888888888


In [60]:
df=pd.DataFrame({'Actual': y_val, 'Predicted':y_pred})
df

Unnamed: 0,Actual,Predicted
0,disgust_female,sad_female
1,calm_female,disgust_female
2,disgust_male,disgust_male
3,disgust_male,disgust_male
4,sad_male,sad_male
...,...,...
283,happy_female,happy_female
284,calm_male,calm_male
285,happy_male,happy_male
286,happy_female,happy_female


# Make prediction on audio recording data

In [61]:
# Code for recording audio from the browser
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import IPython
import uuid
from google.colab import output


class InvokeButton(object):
  def __init__(self, title, callback):
    self._title = title
    self._callback = callback

  def _repr_html_(self):
    from google.colab import output
    callback_id = 'button-' + str(uuid.uuid4())
    output.register_callback(callback_id, self._callback)

    template = """<button id="{callback_id}" style="cursor:pointer;background-color:#EEEEEE;border-color:#E0E0E0;padding:5px 15px;font-size:14px">{title}</button>
        <script>
          document.querySelector("#{callback_id}").onclick = (e) => {{
            google.colab.kernel.invokeFunction('{callback_id}', [], {{}})
            e.preventDefault();
          }};
        </script>"""
    html = template.format(title=self._title, callback_id=callback_id)
    return html

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open('audio.wav','wb+') as f:
    f.write(b)
  return 'audio.wav'

In [62]:
import IPython.display as ipd

def synth():
  print("Now recording for 10 seconds, say what you will...")
  record(5)
  print("Audio recording complete")
  in_fpath = Path("audio.wav")
InvokeButton('Start recording', synth)

Now recording for 10 seconds, say what you will...


<IPython.core.display.Javascript object>

Audio recording complete


In [69]:
ipd.Audio("audio.wav")

In [70]:
audio=extract_feature("audio.wav")

In [71]:
x_audio = []
x_audio.append(audio)
np.array(x_audio)

array([[-6.80208008e+02,  1.15726486e+02, -1.55880175e+01,
        -8.24496984e-01,  1.62278855e+00, -2.51722574e+00,
        -1.85417569e+00, -5.83212996e+00, -3.33107686e+00,
        -1.11673632e+01, -1.42849779e+01, -1.06128092e+01,
        -8.28648663e+00, -3.86088777e+00, -5.69326353e+00,
        -4.36932945e+00, -2.72033811e+00, -3.46044278e+00,
        -2.89467287e+00, -1.00255001e+00, -1.06433928e+00,
         3.19776487e+00,  4.68212068e-01,  1.68457055e+00,
        -1.63197780e+00,  5.83823025e-01,  5.18843055e-01,
         3.27878714e+00,  4.11042738e+00, -1.57946491e+00,
        -4.46969128e+00, -2.62020683e+00, -1.10985219e+00,
        -8.28337073e-01, -6.06346428e-01, -1.81487560e+00,
        -1.67805493e+00, -9.03460860e-01, -1.99172699e+00,
         7.55446911e-01,  6.15848362e-01,  5.85773528e-01,
         5.58524549e-01,  5.91568053e-01,  5.95906436e-01,
         5.16625464e-01,  4.99508888e-01,  4.98656154e-01,
         4.94254947e-01,  4.73056555e-01,  5.23905396e-0

In [72]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_audio_val = scaler.transform(x_audio)

In [73]:
y_pred_audio=model.predict(X_audio_val)
hasil = str(y_pred_audio[0])
emotion_value = hasil.split("_")[0]
gender_value = hasil.split("_")[1]
print("The system detects that you are a",gender_value, "and your current emotions is", emotion_value)

The system detects that you are a male and your current emotions is sad
