In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from ast import literal_eval
import json

In [22]:
# Paths to the three smart-devices CSV exports loaded by the cells below.
(
    train_light_en,
    train_speaker_en,
    train_speaker_fr,
) = (
    "../Datasets/smart-devices-en-fr/smart-lights-en-close.csv",
    "../Datasets/smart-devices-en-fr/smart-speaker-en-close.csv",
    "../Datasets/smart-devices-en-fr/smart-speaker-fr-close.csv",
)

In [11]:
# Load the English smart-lights CSV; header=0 takes the column names from the first row.
df_light_en = pd.read_csv(
    filepath_or_buffer=train_light_en,
    header=0,
)

In [12]:
df_light_en

Unnamed: 0,id,phones,label,intent,text,data-type,file
0,0,"['ɒ', 's', 't', 'ɻ̩', 'v', 'ɒ', 'i', 'b̞', 'ɔ'...",5,SwitchLightOn,Activate all the lights in the entire house.,smart-lights-en-close,0.wav
1,1,"['a', 'x', 'ə', 'v', 'ɪ', 'n', 't', 'v', 'ɛ', ...",5,SwitchLightOn,Activate basement lights,smart-lights-en-close,1.wav
2,2,"['a', 'l', 't͡ʃʲ', 'a', 's', 't', 'ə', 'v', 'a...",2,SetLightBrightness,Adjust the bedroom light intensity to thirty nine,smart-lights-en-close,10.wav
3,3,"['t͡ɕ', 'ɛ', 'n', 'j', 'u', 'n', 'p', 'uə', 'ʁ...",3,SetLightColor,Can you please change the light color to pink,smart-lights-en-close,100.wav
4,4,"['ts', 'ɛ', 'ɪ', 'ə', 'n', 'd', 'ɑ', 'l', 'a',...",2,SetLightBrightness,Set the brightness to five.,smart-lights-en-close,1000.wav
...,...,...,...,...,...,...,...
1655,1655,"['ʃ', 'ɒ', 's', 'ɪ', 'm', 'a', 'j', 'ð', 'ɛ', ...",2,SetLightBrightness,Set the brightness level of light to twenty,smart-lights-en-close,995.wav
1656,1656,"['t͡ʃ', 'a', 'l', 's', 'ɛ', 'f', 'ʏ', 'r', 'a'...",2,SetLightBrightness,Set the brightness level to seventy-three,smart-lights-en-close,996.wav
1657,1657,"['t͡ʃ', 'a', 'l', 's', 'ɪ', 'b', 'a', 'ɪ', 'ɛ'...",2,SetLightBrightness,Set the brightness of the light bulbs to fifty.,smart-lights-en-close,997.wav
1658,1658,"['zʲ', 'a', 's', 't', 'ə', 'b', 'ɹ', 'a', 'ɪ',...",2,SetLightBrightness,Set the brightness on the light bulbs to fifty.,smart-lights-en-close,998.wav


In [13]:
# Load the English smart-speaker CSV; header=0 takes the column names from the first row.
df_speaker_en = pd.read_csv(
    filepath_or_buffer=train_speaker_en,
    header=0,
)

In [17]:
df_speaker_en['label'].unique()

array([8])

In [19]:
# Load the French smart-speaker CSV; header=0 takes the column names from the first row.
df_speaker_fr = pd.read_csv(
    filepath_or_buffer=train_speaker_fr,
    header=0,
)

In [21]:
df_speaker_fr['label'].unique()

array([8])

In [28]:
# Stack the English speaker and lights frames row-wise and renumber the rows 0..n-1.
df_en_all = pd.concat(
    objs=[df_speaker_en, df_light_en],
    ignore_index=True,
)

In [33]:
df_en_all['label'].value_counts()

8    1278
2     296
3     294
4     276
1     269
0     268
5     257
Name: label, dtype: int64

In [80]:
def save_train_test(df_train, df_test, save_dir="../memory"):
    """Write the train and test splits to ``train.csv`` and ``test.csv`` in ``save_dir``.

    Args:
        df_train: DataFrame holding the training rows.
        df_test: DataFrame holding the held-out rows.
        save_dir: Target directory. It is created (including parents) when it
            does not exist yet.

    Both files are written as UTF-8 and without the DataFrame index column.
    """
    # to_csv does not create missing directories, so make sure the target exists.
    # An empty save_dir means "current directory" and needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    for file_name, frame in (("train.csv", df_train), ("test.csv", df_test)):
        frame.to_csv(os.path.join(save_dir, file_name), encoding='utf-8', index=False)
    print("train.csv and test.csv saved in {}".format(os.path.abspath(save_dir)))

In [39]:
# Hold out 15% of the combined English data; the fixed seed keeps the shuffle reproducible.
# NOTE(review): the split is not stratified although label 8 dominates the counts,
# so per-label proportions in the test set can drift — consider stratify=df_en_all['label'].
train_en, test_en = train_test_split(
    df_en_all,
    test_size=0.15,
    shuffle=True,
    random_state=1,
)

In [41]:
test_en['label'].value_counts()

8    177
3     48
0     48
4     45
5     42
2     42
1     39
Name: label, dtype: int64

In [46]:
# Discard the shuffled row labels left by the split so both frames are indexed 0..n-1.
train_en, test_en = (
    split.reset_index(drop=True) for split in (train_en, test_en)
)

In [51]:
save_train_test(train_en, test_en)

In [62]:
def build_phone_vocab(df_all):
    """Build a phone -> integer index mapping from the ``phones`` column.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        df_all: DataFrame with a ``phones`` column. Each cell is either the
            string form of a list (e.g. ``"['a', 'b']"``, as read back from
            CSV) or an already-parsed sequence of phones.

    Returns:
        dict mapping each distinct phone to its index.

    The input frame is left untouched, so the function can be called again on
    the same frame (the previous version overwrote ``df_all['phones']`` with
    parsed lists, which made a second call fail inside ``literal_eval``).
    """
    phone2idx = {}
    for cell in df_all['phones']:
        # Convert string of list to list, i.e., "['a', 'b']" to ['a', 'b'];
        # cells that are already sequences are used as they are.
        sample = literal_eval(cell) if isinstance(cell, str) else cell
        for phone in sample:
            if phone not in phone2idx:
                phone2idx[phone] = len(phone2idx)
    return phone2idx

In [81]:
def save_phone_idx(phone2idx, save_dir="../memory"):
    """Serialize the phone -> index mapping to ``phone_idx.json`` in ``save_dir``.

    Args:
        phone2idx: dict mapping phones to integer indices.
        save_dir: Target directory. It is created (including parents) when it
            does not exist yet.
    """
    # open() does not create missing directories, so make sure the target exists.
    # An empty save_dir means "current directory" and needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, "phone_idx.json"), 'w', encoding='utf-8') as f:
        json.dump(phone2idx, f)
    print("phone_idx.json saved in {}".format(os.path.abspath(save_dir)))

In [63]:
phone2idx = build_phone_vocab(df_en_all)

In [72]:
save_phone_idx(phone2idx)

In [83]:
len(phone2idx)

121

### Save data as "train_x.npy" and "train_labels.npy"

In [11]:
def load_phone_idx(file_path="../memory/phone_idx.json"):
    """Read a JSON file as UTF-8 and return the decoded object.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        mapping = json.load(handle)
    return mapping

In [2]:
def read_data(filename):
    """Load a CSV into a DataFrame using pandas' default parsing options.

    Args:
        filename: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(filepath_or_buffer=filename)
    return frame

In [3]:
ls

preprocess_data.ipynb  preprocess_enfr.py


In [35]:
# Reload the saved splits from disk.
# NOTE(review): these paths live under ../memory/enfr, while save_train_test
# defaults to ../memory — confirm the files were written/moved there.
train_en = read_data("../memory/enfr/train.csv")
test_en = read_data("../memory/enfr/test.csv")

In [34]:
def get_xy(df, phone2idx):
    """Encode the ``phones`` column as index sequences and return them with the labels.

    Args:
        df: DataFrame with a ``phones`` column (string form of a list, e.g.
            ``"['a', 'b']"``, or an already-parsed sequence) and a ``label`` column.
        phone2idx: dict mapping each phone to its integer index.

    Returns:
        (X_all, labels): ``X_all`` is a 1-D object array with one Python list
        of indices per row of ``df``; ``labels`` is ``df['label']`` as a NumPy
        array. Both always have shape ``(len(df),)``.

    Raises:
        KeyError: if a phone is missing from ``phone2idx``.
    """
    labels = df['label'].to_numpy()
    encoded = []
    for cell in df['phones']:
        # Cells read from CSV hold the list's string repr; parse those only.
        sample = literal_eval(cell) if isinstance(cell, str) else cell
        encoded.append([phone2idx[phone] for phone in sample])
    # np.array(encoded, dtype=object) collapses into a 2-D array whenever all
    # sequences share one length (or there is a single sample). Filling a
    # pre-allocated 1-D object array keeps one list per sample in every case.
    X_all = np.empty(len(encoded), dtype=object)
    for position, sequence in enumerate(encoded):
        X_all[position] = sequence
    return X_all, labels

In [26]:
phone2idx = load_phone_idx("../memory/enfr/phone_idx.json")

In [37]:
train_x, train_labels = get_xy(train_en, phone2idx)

In [28]:
train_x.shape == train_labels.shape

True

In [30]:
def save_numpy_data(filename, data):
    """Write ``data`` to ``filename`` in NumPy's ``.npy`` format.

    The file is opened here in binary mode and the open handle is passed to
    ``np.save``, so ``filename`` is used exactly as given.

    Args:
        filename: Destination path.
        data: Array (or array-like) to store.
    """
    with open(filename, mode='wb') as sink:
        np.save(file=sink, arr=data)

In [32]:
# Persist the encoded training sequences and their labels as .npy files.
save_numpy_data(filename="../memory/enfr/en/train_x.npy", data=train_x)
save_numpy_data(filename="../memory/enfr/en/train_labels.npy", data=train_labels)