<span style="font-size:36px"><b>Model Inference</b></span>

Copyright &copy; 2020 Gunawan Lumban Gaol

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

# Import Packages

In [37]:
import os
import sys
import glob

import numpy as np
import IPython.display as ipd
import librosa
import librosa.display as ld
import matplotlib.pyplot as plt
from pydub.utils import mediainfo
from ipywidgets import interact
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import load_model

from gurih.data.splitter import Splitter
from gurih.data.normalizer import AudioNormalizer
from gurih.data.transcriptor import ASRTranscriptor
from gurih.features.extractor import MFCCFeatureExtractor
from gurih.models.model import BaselineASRModel
# from gurih.models.utils import CharMap

# Load Saved Model

In [2]:
# Instantiate and compile the baseline ASR architecture; the saved weights
# are loaded into this object in a later cell.
model = BaselineASRModel(
    input_shape=(3000, 39),
    vocab_len=29,
)
model.compile()

Model directory is set to ../../models/
Documentation directory is set to ../../docs/

Model: "BaselineASR_f200_k11_s2_pvalid_nlstm200_ndense29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, 3000, 39)]        0         
_________________________________________________________________
masking (Masking)            (None, 3000, 39)          0         
_________________________________________________________________
conv1 (Conv1D)               (None, 1495, 200)         86000     
_________________________________________________________________
bidirectional (Bidirectional (None, 1495, 400)         641600    
_________________________________________________________________
the_output (TimeDistributed) (None, 1495, 30)          12030     
Total params: 739,630
Trainable params: 739,630
Non-trainable params: 0
_________________________________________________________

In [3]:
# The loader must be given an absolute path, so resolve the relative
# location of the saved .h5 file before handing it over.
filename = os.path.abspath(
    "../../models/BaselineASR_f200_k11_s2_pvalid_nlstm200_ndense29.h5"
)
model.load(filename)

Loaded model D:\Data Science Academy (2)\voice-to-text-bahasa\models\BaselineASR_f200_k11_s2_pvalid_nlstm200_ndense29.h5 from disk.


# Create Audio Transcript

In [4]:
# Single file: uncomment the two lines below (and skip the directory cell that follows) to transcribe one MP3
# mp3_file = "blabla.mp3"
# X = [mp3_file]

In [5]:
# Collect every MP3 found directly inside the sample directory; X is the
# batch of file paths used by the cells that follow.
input_dir = "../../dataset/sample/"
mp3_files = glob.glob(input_dir + "*.mp3")
X = mp3_files.copy()

If the audio duration exceeds the model's `max_seq_length`, the audio is split into segments before transcription.

In [None]:
def infer(x, model, sr=16000, force_mono=False):
    """
    Transcribe one audio file with a trained ASR model.

    The file is probed for its channel count, run through ``mono_process``
    to obtain MFCC features, and the features are decoded by an
    ``ASRTranscriptor``.

    Parameters
    ----------
    x : str
        Path to the audio file. It is probed with ``pydub.utils.mediainfo``
        and then passed unchanged to ``mono_process``.
        NOTE(review): confirm whether ``mono_process`` expects a single
        path or a list of paths.
    model : object
        Trained model handed to ``ASRTranscriptor``.
    sr : int, default 16000
        Sample rate forwarded to ``mono_process``.
    force_mono : bool, default False
        Allow a multi-channel file to go through the mono pipeline.

    Returns
    -------
    y_pred
        The value returned by ``ASRTranscriptor.predict`` for the
        extracted features.

    Raises
    ------
    NotImplementedError
        If the file has more than one channel and ``force_mono`` is False;
        per-channel transcription does not exist yet.
    """
    # The module-level import of CharMap is commented out in the imports
    # cell, so bring it into scope here.
    from gurih.models.utils import CharMap

    # mediainfo() returns the ffprobe fields as strings; cast before the
    # numeric comparison (str > int raises TypeError on Python 3).
    n_channels = int(mediainfo(x)['channels'])

    if n_channels > 1 and not force_mono:
        # Fail loudly instead of continuing without extracted features.
        raise NotImplementedError(
            "Per-channel transcription is not implemented for "
            "multi-channel audio. You can force mono process by passing "
            "force_mono=True."
        )

    x_freq = mono_process(x, sr)

    # Decode the extracted features, not the raw file path.
    transcriptor = ASRTranscriptor(model, CharMap.IDX_TO_CHAR_MAP)
    y_pred = transcriptor.predict(x_freq)

    return y_pred

def mono_process(x, sr=16000, plot=False):
    """
    Turn audio into MFCC features: normalize, split, then extract.

    Parameters
    ----------
    x : object
        Input accepted by ``AudioNormalizer.fit_transform``.
        NOTE(review): ``infer`` passes a single file path here; confirm
        whether a list of paths is expected instead.
    sr : int, default 16000
        Sample rate given to the normalizer, the MFCC extractor and the
        plotting calls.
    plot : bool, default False
        When true, draw the normalized waveform and the MFCC features.

    Returns
    -------
    x_freq
        The output of ``MFCCFeatureExtractor.fit_transform`` on the split
        audio.
    """
    # Normalize audio
    normalizer = AudioNormalizer(
        sample_rate=sr,
        mono=True,
        write_audio_output=False,  # don't output normalized audio
        output_dir=".",
        encode=False,  # don't output .json
    )
    x_norm = normalizer.fit_transform(x)

    # Split audio. Feed the *normalized* signal forward; previously the raw
    # input was split, so normalization only affected the plot.
    splitter = Splitter(
        max_frame_length=80000,
        strides=80000,
        padding='same',
        low_memory=True,
    )
    x_splitted = splitter.fit_transform(x_norm)

    # Extract MFCC Features
    mfcc_extractor = MFCCFeatureExtractor(
        sample_rate=sr,
        frame_size=0.025,
        frame_stride=0.01,
        filter_num=26,
        cep_num=13,
        NFFT=512,
        low_freq=0,
        high_freq=None,
        pre_emphasis_coeff=0.97,
        cep_lifter=22,
        dct_type=2,
        dct_norm="ortho",
        append_energy=True,
        append_delta=True,
        low_memory=False,
        write_output=False,
        output_dir=".",
    )
    x_freq = mfcc_extractor.fit_transform(x_splitted)

    # Create figure for visualization
    if plot:
        plt.figure(figsize=(15, 6))
        plt.subplot(2, 1, 1)
        _ = ld.waveplot(np.asfortranarray(x_norm), sr=sr)
        plt.title("Normalized Audio")
        plt.subplot(2, 1, 2)
        _ = ld.specshow(x_freq, sr=sr)
        plt.title("MFCC Features Audio")
        plt.show()

    return x_freq

In [10]:
@interact
def plot_sample(x=X):
    """
    Plot the waveform of one audio file and return an inline player for it.

    For multi-channel audio three panels are drawn: the full signal, then
    channel 0 labelled "Operator Side" and channel 1 labelled "Client Side".
    For single-channel audio only the full-signal panel is drawn.

    Parameters
    ----------
    x : str
        Path of the audio file to load; the default ``X`` is what
        ``interact`` turns into the selection widget.

    Returns
    -------
    IPython.display.Audio
        Audio player for the selected file.
    """
    data, sr = librosa.load(x, mono=False, sr=22050)

    # With mono=False librosa still returns a 1-D array for single-channel
    # files; data[0] / data[1] would then be samples, not channels.
    if data.ndim == 1:
        panels = [(data, x)]
    else:
        panels = [
            (data, x),
            (np.asfortranarray(data[0]), "Operator Side"),
            (np.asfortranarray(data[1]), "Client Side"),
        ]

    plt.figure(figsize=(12, 7))
    for row, (signal, title) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, row)
        _ = ld.waveplot(signal, sr=sr)
        plt.title(title)
    plt.tight_layout()
    plt.show()
    return ipd.Audio(x)

interactive(children=(Dropdown(description='x', options=('../../dataset/sample\\2019-09-03_Annisa Rahmawaty_39…