In [4]:
import json
import pandas as pd
import soundfile as sf
from io import StringIO
from transformers import (
    Wav2Vec2ForCTC, 
    AutoTokenizer, 
    AutoModelForSequenceClassification
)

In [5]:
# Load the SLURP development split (JSON Lines: one record per line).
# The path is handed to pandas directly so that pandas opens *and closes* the
# file itself; the previous bare `open(...)` handle was never closed.
data = pd.read_json("../data/slurp_dataset/slurp/devel.jsonl", lines=True)

In [6]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,slurp_id,sentence,sentence_annotation,intent,action,tokens,scenario,recordings,entities
0,13804,siri what is one american dollar in japanese yen,siri what is one [currency_name : american dol...,qa_currency,currency,"[{'surface': 'siri', 'id': 0, 'lemma': 'siri',...",qa,"[{'file': 'audio-1434542201-headset.flac', 'we...","[{'span': [4, 5], 'type': 'currency_name'}, {'..."
1,16421,how many unread emails do i have,how many unread emails do i have,email_query,query,"[{'surface': 'how', 'id': 0, 'lemma': 'how', '...",email,"[{'file': 'audio-1499695168-headset.flac', 'we...",[]
2,3843,order me chinese food,order me [food_type : chinese] food,takeaway_order,order,"[{'surface': 'order', 'id': 0, 'lemma': 'order...",takeaway,"[{'file': 'audio-1490201253-headset.flac', 'we...","[{'span': [2], 'type': 'food_type'}]"
3,3296,does the nearby chinese restaurant do delivery,does the nearby [food_type : chinese] [busines...,takeaway_query,query,"[{'surface': 'does', 'id': 0, 'lemma': 'do', '...",takeaway,"[{'file': 'audio-1502299642-headset.flac', 'we...","[{'span': [3], 'type': 'food_type'}, {'span': ..."
4,10732,remove pepper from my grocery list,remove pepper from my [list_name : grocery] list,lists_remove,remove,"[{'surface': 'remove', 'id': 0, 'lemma': 'remo...",lists,"[{'file': 'audio-1490184504-headset.flac', 'we...","[{'span': [4], 'type': 'list_name'}]"


In [7]:
# List the available columns before narrowing the frame down.
data.columns

Index(['slurp_id', 'sentence', 'sentence_annotation', 'intent', 'action',
       'tokens', 'scenario', 'recordings', 'entities'],
      dtype='object')

In [8]:
# Keep only the id, transcript, intent label and recording metadata.
# `.copy()` makes the result an independent frame, so later column assignments
# do not write into a slice of the original (avoids SettingWithCopyWarning).
data = data[["slurp_id", "sentence", "intent", "recordings"]].copy()
data.head()

Unnamed: 0,slurp_id,sentence,intent,recordings
0,13804,siri what is one american dollar in japanese yen,qa_currency,"[{'file': 'audio-1434542201-headset.flac', 'we..."
1,16421,how many unread emails do i have,email_query,"[{'file': 'audio-1499695168-headset.flac', 'we..."
2,3843,order me chinese food,takeaway_order,"[{'file': 'audio-1490201253-headset.flac', 'we..."
3,3296,does the nearby chinese restaurant do delivery,takeaway_query,"[{'file': 'audio-1502299642-headset.flac', 'we..."
4,10732,remove pepper from my grocery list,lists_remove,"[{'file': 'audio-1490184504-headset.flac', 'we..."


In [9]:
def filter_best_recording(recordings):
    """Return the file name of the lowest-WER headset recording.

    Args:
        recordings: iterable of dicts, each with a ``"file"`` name and a
            ``"wer"`` score.

    Returns:
        The ``"file"`` value of the recording whose file name contains
        ``"headset"`` and whose ``"wer"`` is smallest (the first one wins on
        ties), or ``None`` when no headset recording is present.
    """
    candidates = [rec for rec in recordings if "headset" in rec["file"]]
    if candidates:
        return min(candidates, key=lambda rec: rec["wer"])["file"]
    return None

In [10]:
# Replace each row's list of recording dicts with the file name of its best
# headset recording (None when the row has no headset recording).
# `assign` builds a new frame instead of setting an attribute on a frame that
# was produced by slicing, which avoids SettingWithCopyWarning and the
# attribute-vs-column ambiguity of `data.recordings = ...`.
data = data.assign(recordings=data["recordings"].apply(filter_best_recording))

In [11]:
# Check non-null counts: rows without a headset recording show up as nulls
# in `recordings`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2033 entries, 0 to 2032
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   slurp_id    2033 non-null   int64 
 1   sentence    2033 non-null   object
 2   intent      2033 non-null   object
 3   recordings  1890 non-null   object
dtypes: int64(1), object(3)
memory usage: 63.7+ KB


In [12]:
# Discard every row that has a missing value in any column (in practice the
# utterances for which no headset recording was found).
data = data.dropna(axis=0, how="any")

In [13]:
# Display the cleaned frame (pandas truncates the rendered rows).
data

Unnamed: 0,slurp_id,sentence,intent,recordings
0,13804,siri what is one american dollar in japanese yen,qa_currency,audio-1434542201-headset.flac
1,16421,how many unread emails do i have,email_query,audio-1499695168-headset.flac
2,3843,order me chinese food,takeaway_order,audio-1490201253-headset.flac
3,3296,does the nearby chinese restaurant do delivery,takeaway_query,audio-1434527358-headset.flac
4,10732,remove pepper from my grocery list,lists_remove,audio-1490184504-headset.flac
...,...,...,...,...
2028,5342,can you give me local news on wayne county she...,news_query,audio-1501407228-headset.flac
2029,5728,every light of room increase its intensity,iot_hue_lightup,audio-1501772225-headset.flac
2030,5989,i would like some coffee now,iot_coffee,audio-1497621031-headset.flac
2031,13202,what is the population of los angeles,qa_factoid,audio-1490799323-headset.flac


In [17]:
from src.utils.inference_onnx import Wave2Vec2ONNXInference

In [19]:
# Build the project's ONNX-backed speech-recognition wrapper.
# NOTE(review): the first argument looks like a Hugging Face model id and the
# second the path to the exported ONNX graph — confirm the parameter meaning
# against src.utils.inference_onnx.
asr = Wave2Vec2ONNXInference(
    "facebook/wav2vec2-base-960h",
    "../models/wav2vec2-base-960h.onnx"
)

In [21]:
from pathlib import Path

In [22]:
# Transcribe the audio file of the first remaining utterance.
asr.predict(
    Path("../data/slurp_dataset/audio/slurp_real", data["recordings"].iloc[0])
)

'sery what is one american dull in japanesian'

In [24]:
# Reference transcript of the same utterance, for comparison with the ASR output.
data["sentence"].iloc[0]

'siri what is one american dollar in japanese yen'

In [2]:
from src.utils.inference_onnx import NLUONNXInference

# Build the project's ONNX-backed intent-classification wrapper.
# NOTE(review): arguments appear to be a Hugging Face model id followed by the
# path to the exported ONNX graph — confirm against src.utils.inference_onnx.
nlu = NLUONNXInference(
    "sankar1535/slurp-intent_baseline-distilbert-base-uncased",
    "../models/slurp-intent_baseline-distilbert-base-uncased.onnx"
)

In [3]:
# Probe whether the NLU wrapper's processor exposes the label mapping.
# The processor is a tokenizer and has no `id2label`, so the lookup raises;
# catch and print the error so a Restart & Run All is not aborted by this cell.
try:
    print(nlu.processor.id2label)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'DistilBertTokenizerFast' object has no attribute 'id2label'

In [5]:
# Load the classifier itself so that its config (which holds the label mapping)
# can be inspected.
model = AutoModelForSequenceClassification.from_pretrained("sankar1535/slurp-intent_baseline-distilbert-base-uncased")

In [6]:
# Class index -> intent label mapping stored on the model config.
# NOTE(review): the mapping contains both prefixed and unprefixed variants
# (e.g. 'iot_wemo_off' and 'wemo_off', 'qa_factoid' and 'factoid') — verify it
# against the dataset's intent names before mapping predictions to labels.
model.config.id2label

{0: 'datetime_convert',
 1: 'lists_query',
 2: 'alarm_remove',
 3: 'iot_hue_lighton',
 4: 'post',
 5: 'transport_ticket',
 6: 'music_query',
 7: 'qa_maths',
 8: 'cooking_query',
 9: 'iot_hue_lightchange',
 10: 'iot_hue_lightup',
 11: 'likeness',
 12: 'sendemail',
 13: 'podcasts',
 14: 'general_greet',
 15: 'social_query',
 16: 'qa_factoid',
 17: 'iot_hue_lightdim',
 18: 'transport_query',
 19: 'factoid',
 20: 'iot_wemo_off',
 21: 'wemo_on',
 22: 'volume_other',
 23: 'recommendation_events',
 24: 'query',
 25: 'wemo_off',
 26: 'hue_lightoff',
 27: 'iot_cleaning',
 28: 'ticket',
 29: 'iot_wemo_on',
 30: 'recommendation_locations',
 31: 'alarm_query',
 32: 'play_game',
 33: 'remove',
 34: 'convert',
 35: 'currency',
 36: 'addcontact',
 37: 'play_music',
 38: 'definition',
 39: 'calendar_query',
 40: 'audio_volume_other',
 41: 'coffee',
 42: 'transport_traffic',
 43: 'play_radio',
 44: 'datetime_query',
 45: 'iot_coffee',
 46: 'takeaway_order',
 47: 'radio',
 48: 'settings',
 49: 'cooking_