In [19]:
import os
import torch
import librosa
import numpy as np
import pandas as pd
# import soundfile as sf
import IPython.display as ipd

import models
from data import get_SLU_datasets, read_config, SLUDataset

# Parse the experiment configuration file.
config_path = './unfreeze_word_layers.cfg'
config = read_config(config_path)

# All three return values are discarded.
# NOTE(review): presumably the call is kept because it fills in fields on
# `config` that the model constructor needs — confirm in data.py.
_, _, _ = get_SLU_datasets(config)

# Root directory that the CSV and wav paths used below are joined onto.
base_path = config.slu_path

In [16]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# 1. Load the SLU Model

In [57]:
# Build the model from the parsed config, place it on DEVICE and switch it to
# eval mode. Moving it to DEVICE keeps the weights on the same device as the
# input tensors that the inference cells create with `device=DEVICE`.
model = models.Model(config).to(DEVICE).eval()

# Restore the trained weights; map_location remaps the saved tensors onto DEVICE.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
checkpoint_path = "./experiments/unfreeze_word_layers/training/model_state.pth"
model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))

<All keys matched successfully>

# 2. SLU Inference on the Test Dataset

In [48]:
# Directory that holds the per-split CSV files.
data_dir = os.path.join(base_path, "data")

# Read the three splits in order; the first CSV column becomes the index.
train_df, valid_df, test_df = (
    pd.read_csv(os.path.join(data_dir, csv_name), index_col=0)
    for csv_name in ("train_data.csv", "valid_data.csv", "test_data.csv")
)

# Report the number of rows in each split.
print(f'Train Dataset Size : {len(train_df)}')
print(f'Valid Dataset Size : {len(valid_df)}')
print(f'Test Dataset Size : {len(test_df)}')

Train Dataset Size : 23132
Valid Dataset Size : 3118
Test Dataset Size : 3793


In [71]:
# Show the training rows whose transcription contains "Switch" (case-sensitive).
has_switch = train_df['transcription'].str.contains('Switch')
train_df[has_switch]

Unnamed: 0,path,speakerId,transcription,action,object,location
3,wavs/speakers/2BqVo8kVB2Skwgyb/1811b6e0-4474-1...,2BqVo8kVB2Skwgyb,Switch on the lights,activate,lights,none
4,wavs/speakers/2BqVo8kVB2Skwgyb/1d9f3920-4474-1...,2BqVo8kVB2Skwgyb,Switch off the lights,deactivate,lights,none
20,wavs/speakers/2BqVo8kVB2Skwgyb/907594e0-4478-1...,2BqVo8kVB2Skwgyb,Switch on the lamp,activate,lamp,none
26,wavs/speakers/2BqVo8kVB2Skwgyb/a7f64650-4478-1...,2BqVo8kVB2Skwgyb,Switch on the kitchen lights,activate,lights,kitchen
28,wavs/speakers/2BqVo8kVB2Skwgyb/ae75f0c0-4478-1...,2BqVo8kVB2Skwgyb,Switch on the lights in the kitchen,activate,lights,kitchen
...,...,...,...,...,...,...
23109,wavs/speakers/zZezMeg5XvcbRdg3/620eaab0-45e0-1...,zZezMeg5XvcbRdg3,Switch language,change language,none,none
23127,wavs/speakers/zZezMeg5XvcbRdg3/b946b340-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my Chinese. Switch the lang...,change language,Chinese,none
23128,wavs/speakers/zZezMeg5XvcbRdg3/beb27cb0-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my German. Switch the language,change language,German,none
23129,wavs/speakers/zZezMeg5XvcbRdg3/c45f94e0-45e0-1...,zZezMeg5XvcbRdg3,I need to practice my Korean. Switch the language,change language,Korean,none


In [56]:
# Draw 10 test rows with a fixed seed so the sample — and the row a later cell
# picks out by position — is the same on every Restart & Run All.
SAMPLE_SEED = 42
test_df_samples = test_df.sample(10, random_state=SAMPLE_SEED).reset_index(drop=True)
test_df_samples

Unnamed: 0,path,speakerId,transcription,action,object,location
0,wavs/speakers/k5bqyxx2lzIbrlg9/ed66b390-4528-1...,k5bqyxx2lzIbrlg9,Switch on the lights in the bedroom,activate,lights,bedroom
1,wavs/speakers/ppymZZDb2Bf4NQnE/b0a0f130-44f3-1...,ppymZZDb2Bf4NQnE,I need to practice my Chinese. Switch the lang...,change language,Chinese,none
2,wavs/speakers/k5bqyxx2lzIbrlg9/0d65fea0-4527-1...,k5bqyxx2lzIbrlg9,That’s too loud,decrease,volume,none
3,wavs/speakers/V4ZbwLm9G5irobWn/610afcc0-4524-1...,V4ZbwLm9G5irobWn,Turn up the heat,increase,heat,none
4,wavs/speakers/V4ejqNL4xbUKkYrV/e6e7be40-459d-1...,V4ejqNL4xbUKkYrV,Use a different language,change language,none,none
5,wavs/speakers/ppymZZDb2Bf4NQnE/04da2390-44f2-1...,ppymZZDb2Bf4NQnE,Kitchen heat up,increase,heat,kitchen
6,wavs/speakers/k5bqyxx2lzIbrlg9/5d5e7d40-4529-1...,k5bqyxx2lzIbrlg9,Turn volume down,decrease,volume,none
7,wavs/speakers/V4ZbwLm9G5irobWn/112d2020-4524-1...,V4ZbwLm9G5irobWn,Turn on the lights,activate,lights,none
8,wavs/speakers/4BrX8aDqK2cLZRYl/79ded690-452e-1...,4BrX8aDqK2cLZRYl,Make it hotter,increase,heat,none
9,wavs/speakers/7B4XmNppyrCK977p/5bba89a0-45cb-1...,7B4XmNppyrCK977p,More heat,increase,heat,none


In [63]:
# Pick one row of the sampled test set by its (reset) index label.
sample_row = 8
test_path = os.path.join(base_path, test_df_samples.loc[sample_row, 'path'])

# sr=None keeps the file's native sampling rate instead of resampling.
wav, sr = librosa.load(path=test_path, sr=None)

# Render an inline audio player for the loaded waveform.
ipd.Audio(data=wav, rate=sr)

In [65]:
# Float32 tensor on DEVICE with a leading axis of size 1 added in front
# (presumably the batch axis expected by the model — confirm in models.py).
signal = torch.tensor(wav, device=DEVICE).to(torch.float32)[None]

# Decode the intent for the waveform.
model.decode_intents(signal)

[['increase', 'heat', 'none']]

# 3. SLU Inference on Custom Utterances

In [58]:
# Path of the user-supplied recording, relative to the notebook's working directory.
test_path = "test_wav/test.wav"

# sr=None keeps the file's native sampling rate instead of resampling.
wav, sr = librosa.load(path=test_path, sr=None)

# Render an inline audio player for the loaded waveform.
ipd.Audio(data=wav, rate=sr)

In [8]:
# Float32 tensor on DEVICE with a leading axis of size 1 added in front
# (presumably the batch axis expected by the model — confirm in models.py).
signal = torch.tensor(wav, device=DEVICE).to(torch.float32)[None]

# Decode the intent for the custom recording.
model.decode_intents(signal)