In [1]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset

from utils.feature_extractor_utils import *

# Load the pretrained model

In [2]:
class BiLSTM(nn.Module):
    """Stacked bidirectional LSTM followed by two linear layers.

    The network consumes a sequence shaped ``(seq_len, batch, input_dim)``,
    keeps only the LSTM output of the last time step and maps it through
    ``dense_hidden`` and ``dense_out`` to ``output_dim`` scores.

    Args:
        input_dim: Size of each input feature vector (default 30).
        hidden_dim: Hidden size of each LSTM direction (default 60).
        dense_dim: Width of the intermediate linear layer (default 512).
        output_dim: Number of output scores (default 4).
        num_layers: Number of stacked bidirectional LSTM layers (default 2).
        use_gpu: If True, the ``h_0`` / ``c_0`` tensors are created on CUDA.
        batch_size: Batch dimension used for the ``h_0`` / ``c_0`` tensors.
        is_training: If True, ``forward`` returns raw logits (suitable for a
            cross-entropy loss); otherwise it returns softmax probabilities.
        dropout: Stored on the instance only; no dropout layer is built here.
    """

    def __init__(
        self,
        input_dim=30,
        hidden_dim=60,
        dense_dim=512,
        output_dim=4,
        num_layers=2,
        use_gpu=False,
        batch_size=1,
        is_training=False,
        dropout=0.2
    ):
        super(BiLSTM, self).__init__()
        # init self values
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout = dropout  # kept for reference; not applied to any layer
        self.num_layers = num_layers
        # Honour the constructor argument instead of forcing training mode.
        self.is_training = is_training

        # define layers
        self.bilstm = nn.LSTM(
            input_size=input_dim,
            num_layers=num_layers,
            hidden_size=hidden_dim,
            bidirectional=True,
        )
        # The bidirectional LSTM concatenates both directions => hidden_dim * 2.
        self.dense_hidden = nn.Linear(hidden_dim * 2, dense_dim)
        self.dense_out = nn.Linear(dense_dim, output_dim)

        # Zero hidden / cell tensors shaped
        # (num_layers * num_directions, batch, hidden_size).
        # They are plain attributes (not parameters or buffers), so they do not
        # appear in state_dict(); forward() does not pass them to the LSTM,
        # which therefore starts from its own default initial state.
        state_shape = (2 * num_layers, self.batch_size, hidden_dim)
        self.h_0 = torch.zeros(*state_shape)
        self.c_0 = torch.zeros(*state_shape)
        if use_gpu:
            self.h_0 = self.h_0.cuda()
            self.c_0 = self.c_0.cuda()

    def forward(self, audio_features):
        """Score a batch of feature sequences.

        Args:
            audio_features: Tensor shaped ``(seq_len, batch, input_size)``.

        Returns:
            Raw logits when ``self.is_training`` is True, otherwise softmax
            probabilities taken over the last (class) dimension.
        """
        lstm_output, _ = self.bilstm(audio_features)

        # Keep only the output of the last time step:
        # (seq_len, batch, 2 * hidden) => (batch, 2 * hidden)
        hidden_1 = self.dense_hidden(lstm_output[-1])
        y = self.dense_out(hidden_1)

        # Cross-entropy loss expects logits, so skip softmax while training.
        if self.is_training:
            return y
        # Explicit dim: the implicit-dim form of softmax is deprecated.
        return F.softmax(y, dim=-1)

In [3]:
# Build the network with its default constructor arguments (use_gpu=False).
bilstm = BiLSTM()
# map_location="cpu" lets a checkpoint saved from a CUDA device load on a
# CPU-only machine, matching the CPU model built above.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
bilstm.load_state_dict(
    torch.load("../data/bilstm_state_dict.pt", map_location="cpu")
)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

# Predict the label of a WAV file with the pretrained model

Label-to-index mapping for the predicted class: {'ang': 0, 'exc': 1, 'neu': 2, 'sad': 3}

In [4]:
# Path to the sample WAV file to classify (a file path, despite "directory" in the name).
wav_file_directory = "../data/speech_sample.wav"

In [5]:
def predict_label_of_wav(wav_file, model=None):
    """Predict the class index of a single WAV file.

    Args:
        wav_file: Value passed straight to ``feature_generator`` (the notebook
            passes a path string).
        model: Callable model used for scoring. Defaults to the notebook-level
            ``bilstm`` instance when omitted.

    Returns:
        The index of the highest-scoring class for the single item in the
        batch, as a NumPy integer scalar.
    """
    if model is None:
        model = bilstm  # fall back to the model loaded earlier in the notebook

    # NOTE(review): assumes feature_generator returns a 2-D array laid out as
    # (input_size, seq_len) — confirm against utils.feature_extractor_utils.
    extracted_feature = feature_generator(wav_file)
    extracted_feature = np.expand_dims(extracted_feature, 0)  # batch_dim
    # Move the last axis to the front: (batch, a, b) => (b, batch, a), the
    # (seq_len, batch, input_size) layout the model's forward pass expects.
    feature_tensor = torch.tensor(extracted_feature).float().permute(2, 0, 1)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        scores = model(feature_tensor)

    predicted_label = scores.argmax(dim=1).cpu().numpy()[0]
    return predicted_label

In [6]:
# Run the prediction on the sample file; the displayed integer is the predicted class index.
predict_label_of_wav(wav_file_directory)



3