In [2]:
import gradio as gr
import librosa
import numpy as np
import pandas as pd
import subprocess
import tempfile
import soundfile as sf
import librosa.util
import os
import json

from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Load the saved Keras model from its HDF5 file.
MODEL_PATH = './Result/features_maxpool.h5'
model = load_model(MODEL_PATH)





In [4]:
# Read the feature table from CSV and preview its first rows.
FEATURES_CSV = './Features/features_lstm.csv'
Features = pd.read_csv(FEATURES_CSV)

Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.321275,0.729664,0.750033,0.730624,0.735275,0.713529,0.660531,0.684966,0.733049,0.753972,...,4e-06,3e-06,2.148075e-06,2e-06,5.116493e-06,8e-06,7e-06,5e-06,4.245834e-07,neutral
1,0.329644,0.812227,0.842488,0.818939,0.817773,0.826914,0.70506,0.680788,0.741229,0.77648,...,0.000103,0.000109,0.0001025398,0.000106,0.0001073939,0.000113,0.000109,0.00011,0.0001017026,neutral
2,0.320629,0.740885,0.739738,0.723335,0.736451,0.681295,0.652557,0.701629,0.746103,0.748172,...,4e-06,3e-06,2.148074e-06,2e-06,5.116453e-06,8e-06,7e-06,5e-06,4.245776e-07,neutral
3,0.173025,0.674528,0.752073,0.725325,0.729231,0.728289,0.665119,0.660569,0.720074,0.7585,...,2e-06,1e-06,9.037616e-07,1e-06,2.595818e-06,4e-06,3e-06,2e-06,2.633245e-07,neutral
4,0.187771,0.680198,0.720626,0.727919,0.707194,0.688234,0.673348,0.659386,0.66493,0.74404,...,1e-06,2e-06,1.496186e-06,1e-06,8.177922e-07,1e-06,3e-06,2e-06,1.389039e-07,neutral


In [5]:
# Feature matrix: every column except the trailing 'labels' column.
X = Features[Features.columns[:-1]].values
# Target vector: the label string of each row.
Y = Features.loc[:, 'labels'].values

In [6]:
# One-hot encode the labels. The fitted encoder is kept at module level
# because later cells call encoder.inverse_transform to recover label names.
encoder = OneHotEncoder()
label_column = np.asarray(Y).reshape(-1, 1)
Y = encoder.fit_transform(label_column).toarray()

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16643, 162), (16643, 4), (4161, 162), (4161, 4))

In [7]:
# Display the training feature matrix as produced by train_test_split.
x_train

array([[2.57256086e-01, 6.66075630e-01, 7.53175834e-01, ...,
        4.27676872e-03, 4.37785700e-03, 4.32584729e-03],
       [1.90836589e-01, 5.88535607e-01, 4.78869438e-01, ...,
        1.53513465e-04, 9.25308195e-05, 7.09372534e-06],
       [3.85632461e-02, 3.40149820e-01, 2.84093827e-01, ...,
        7.82231655e-06, 2.78822995e-06, 1.55291218e-07],
       ...,
       [3.51725260e-02, 1.81827247e-01, 2.22813800e-01, ...,
        4.83322128e-06, 2.92683194e-06, 2.38465447e-07],
       [1.31622314e-01, 5.22881925e-01, 4.80539739e-01, ...,
        8.85621412e-05, 3.21167572e-05, 2.31947320e-06],
       [8.08654785e-02, 4.12712008e-01, 3.97607505e-01, ...,
        7.77307068e-05, 3.92414040e-05, 3.12272323e-06]])

In [8]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits; it is reused later at prediction time.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16643, 162), (16643, 4), (4161, 162), (4161, 4))

In [9]:
# Display the training feature matrix after the StandardScaler transform.
x_train

array([[ 2.05917922,  1.14842133,  1.74900153, ...,  0.46360944,
         0.56244877,  0.65110203],
       [ 1.1412455 ,  0.59629822, -0.24527631, ..., -0.19767108,
        -0.17928799, -0.14844011],
       [-0.96320935, -1.17233078, -1.66134495, ..., -0.22103677,
        -0.19482132, -0.14972464],
       ...,
       [-1.01006993, -2.29966538, -2.10686646, ..., -0.22151616,
        -0.19479733, -0.14970924],
       [ 0.32288975,  0.12881171, -0.23313279, ..., -0.20808786,
        -0.18974492, -0.14932398],
       [-0.37858215, -0.6556523 , -0.8360714 , ..., -0.20982499,
        -0.18851173, -0.14917527]])

In [9]:
# Append a trailing axis of length 1: (samples, features) -> (samples, features, 1).
x_train = x_train[:, :, np.newaxis]
x_test = x_test[:, :, np.newaxis]
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((16643, 162, 1), (16643, 4), (4161, 162, 1), (4161, 4))

In [10]:
# Run the model on the held-out split, then decode both the model output and
# the one-hot ground truth back to label names with the fitted encoder.
pred_test = model.predict(x_test)
y_pred, y_actual = (
    encoder.inverse_transform(one_hot) for one_hot in (pred_test, y_test)
)



In [11]:
# Side-by-side table of predicted and actual labels for the test split.
df_pred_vs_actual = pd.DataFrame(
    {
        'Predicted Labels': y_pred.flatten(),
        'Actual Labels': y_actual.flatten(),
    }
)

df_pred_vs_actual.head(15)

Unnamed: 0,Predicted Labels,Actual Labels
0,happy,happy
1,sad,sad
2,neutral,neutral
3,sad,sad
4,neutral,neutral
5,angry,happy
6,happy,happy
7,happy,happy
8,angry,angry
9,sad,sad


In [44]:
def noise(data):
    """Return ``data`` with additive Gaussian noise.

    The noise scale is a random fraction (uniform draw times 0.04) of the
    signal's maximum value. Both draws come from NumPy's global random
    state, so seed it with ``np.random.seed`` for repeatable output.
    The input array is not modified.
    """
    peak = np.amax(data)
    scale = 0.04 * np.random.uniform() * peak
    perturbation = np.random.normal(size=data.shape[0])
    return data + scale * perturbation

def time_shift(data):
    """Circularly rotate ``data`` by a random number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated
    to an integer, taken from NumPy's global random state. Samples pushed
    past one end wrap around to the other (``np.roll``).
    """
    offset_in_thousands = np.random.uniform(low=-5, high=5)
    return np.roll(data, int(offset_in_thousands * 1000))

def stretch(data, rate=0.8):
    """Time-stretch ``data`` by ``rate`` using ``librosa.effects.time_stretch``.

    ``rate`` is forwarded unchanged to librosa; the default is 0.8.
    """
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched

def pitch_change(data, sampling_rate, n_steps=0.8):
    """Pitch-shift ``data`` using ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples to shift.
    sampling_rate : int
        Sampling rate of ``data``; forwarded to librosa as ``sr``.
    n_steps : float, optional
        Number of (possibly fractional) steps to shift by. Defaults to 0.8,
        the value that used to be hard-coded, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    np.ndarray
        The pitch-shifted signal returned by librosa.
    """
    # The signal is passed as ``y=`` like every other librosa call in this
    # notebook, so the call does not depend on positional-argument handling.
    return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=n_steps)

def higher_speed(data):
    """Return ``data`` time-stretched by librosa with a fixed rate of 1.25."""
    speed_factor = 1.25
    return librosa.effects.time_stretch(y=data, rate=speed_factor)

def lower_speed(data):
    """Return ``data`` time-stretched by librosa with a fixed rate of 0.75."""
    speed_factor = 0.75
    return librosa.effects.time_stretch(y=data, rate=speed_factor)

In [45]:
def extract_features(data, sample_rate=22050):
    """Summarise an audio signal as one flat feature vector.

    Each librosa feature is computed per frame and averaged over the frame
    axis. The averaged features are concatenated in this fixed order:
    zero-crossing rate, chroma (from the STFT magnitude), MFCC, RMS and
    mel spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio samples, e.g. the signal returned by ``librosa.load``.
    sample_rate : int, optional
        Sampling rate of ``data``, forwarded to librosa as ``sr``. Defaults
        to 22050, the value that used to be hard-coded in three places, so
        existing one-argument calls produce the same features as before.

    Returns
    -------
    np.ndarray
        1-D float64 feature vector.
    """
    def _mean_over_frames(frames):
        # Transpose so the frame axis comes first, then average it away,
        # leaving one value per feature coefficient.
        return np.mean(frames.T, axis=0)

    # ZCR
    zcr = _mean_over_frames(librosa.feature.zero_crossing_rate(y=data))

    # Chroma_stft
    # NOTE(review): librosa documents `S` as a power spectrogram, but the
    # magnitude is passed here. Left unchanged on purpose: changing it would
    # alter the feature values fed to the already-fitted scaler and model —
    # confirm against the pipeline that produced the training features.
    stft_magnitude = np.abs(librosa.stft(data))
    chroma = _mean_over_frames(
        librosa.feature.chroma_stft(S=stft_magnitude, sr=sample_rate)
    )

    # MFCC
    mfcc = _mean_over_frames(librosa.feature.mfcc(y=data, sr=sample_rate))

    # Root Mean Square Value
    rms = _mean_over_frames(librosa.feature.rms(y=data))

    # Mel spectrogram
    mel = _mean_over_frames(
        librosa.feature.melspectrogram(y=data, sr=sample_rate)
    )

    # The previous implementation accumulated into an empty float64 array,
    # which made the result float64 regardless of the input dtype; keep that.
    return np.hstack((zcr, chroma, mfcc, rms, mel)).astype(np.float64)

In [46]:
def get_features(path):
    """Load an audio clip and return features for it and six augmented copies.

    A 2.5 s excerpt starting 0.6 s into the file is loaded; the offset and
    duration skip the silence at the start and end of each recording.
    Features are then extracted from, in row order: the unmodified clip,
    the noisy clip, the time-shifted clip, the stretched clip, the
    pitch-shifted clip, the sped-up clip and the slowed-down clip.

    Returns a 2-D array with one row of features per variant (seven rows).
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # One transform per output row; the first entry is the identity so the
    # unaugmented clip comes first. Order matters: callers index rows by it.
    transforms = (
        lambda clip: clip,
        noise,
        time_shift,
        stretch,
        lambda clip: pitch_change(clip, sample_rate),
        higher_speed,
        lower_speed,
    )

    rows = []
    for transform in transforms:
        # Augment and extract one variant at a time, in the listed order.
        rows.append(extract_features(transform(data)))

    return np.vstack(rows)

In [47]:
def predict_audio_classification(path):
    """Predict the label of the audio file at ``path``.

    Features come from ``get_features``; they are standardised with the
    module-level ``scaler``, reshaped to ``(1, n_features, 1)``, classified
    by the module-level ``model`` and decoded to a label name with the
    module-level ``encoder``.

    Parameters
    ----------
    path : str
        Path of the audio file. Surrounding whitespace and quote characters
        (as left by copy-pasting a path into the text box) are ignored.

    Returns
    -------
    str
        The predicted label.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing file.
    """
    clip_path = str(path).strip().strip('"\'')
    if not os.path.isfile(clip_path):
        # Fail early with a readable message instead of the decoder
        # traceback that an unreadable path otherwise produces.
        raise FileNotFoundError(f"Audio file not found: {clip_path!r}")

    features = np.asarray(get_features(clip_path))

    # get_features returns one row per variant and row 0 is the unaugmented
    # clip. Classify that row: the previously used row 2 is the randomly
    # time-shifted copy, so the answer could differ between runs and did
    # not describe the recording itself.
    original_clip = features[:1]

    model_input = np.expand_dims(scaler.transform(original_clip), axis=2)
    probabilities = model.predict(model_input, batch_size=32)
    return encoder.inverse_transform(probabilities).flatten()[0]
    

In [48]:
# Gradio front end: the user types the path of an audio file into a text box
# and the value returned by predict_audio_classification is shown in a Label.
audio_input = gr.Textbox(label="Path audio file (.wav)")
label = gr.Label(label="Probabilitas prediksi")

demo = gr.Interface(
    fn=predict_audio_classification,
    inputs=audio_input,
    outputs=label,
)
demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


  data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Traceback (most recent call last):
  File "c:\Users\LENOVO YOGA\Music\FINAL PROJECT\finalproject\lib\site-packages\librosa\core\audio.py", line 175, in load
    y, sr_native = __soundfile_load(path, offset, duration, dtype)
  File "c:\Users\LENOVO YOGA\Music\FINAL PROJECT\finalproject\lib\site-packages\librosa\core\audio.py", line 208, in __soundfile_load
    context = sf.SoundFile(path)
  File "c:\Users\LENOVO YOGA\Music\FINAL PROJECT\finalproject\lib\site-packages\soundfile.py", line 658, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "c:\Users\LENOVO YOGA\Music\FINAL PROJECT\finalproject\lib\site-packages\soundfile.py", line 1216, in _open
    raise LibsndfileError(err, prefix="Error opening {0!r}: ".format(self.name))
soundfile.LibsndfileError



  data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Keyboard interruption in main thread... closing server.




In [41]:
# Manual check: load a sample recording with the same offset and duration
# that get_features uses, then show the samples and the sampling rate.
sample_clip_path = './BU TEJO NGAMUKK.mp3'
data, sample_rate = librosa.load(sample_clip_path, duration=2.5, offset=0.6)

data, sample_rate

(array([-0.01382498, -0.01306044,  0.00397783, ...,  0.16554311,
         0.20215431,  0.21546394], dtype=float32),
 22050)