In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time
from pathlib import Path

from tensorflow.keras.models import model_from_json
from scipy.io import wavfile
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Dense, Input, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard

train_path = './data/train/audio/'
test_path = './data/test/'
train_words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

In [2]:
# '''
# t_data_dir = Path(train_path)
# # print(t_data_dir)
# files = [(str(file), file.parts[-2]) for file in t_data_dir.glob("**/*.wav") if file]
# # print(*files)
# t_df = pd.DataFrame(files, columns=['path', 'word'])
# t_df.head()

# words = t_df['word'].unique().tolist()
# # print(words)
# silence = ['_background_noise_']
# unknown = [word for word in words if word not in silence + train_words]
# known = [word for word in words if word in silence + train_words]
# # print(unknown)
# # print(known)
# tt_df = t_df.copy()
# tt_df.loc[tt_df['word'].isin(silence), 'word'] = 'unknown'
# tt_df.loc[tt_df['word'].isin(unknown), 'word'] = 'unknown'
# tt_df.head()
# labelbinarizer = LabelBinarizer()
# label = labelbinarizer.fit_transform(tt_df['word'])
# print(label)
# print(len(label))
# print(label[0])
# print(tt_df['word'][0])
# print(label[3000])
# print(tt_df['word'][3000])
# print(label[8000])
# print(tt_df['word'][8000])
# print(label[8000][4])
# print(len(tt_df['word']))
# '''

In [3]:
# if 0:
#     print(0)
# else:
#     print(1)

In [4]:
# def find_label_lst(df, labels):
#     ret = {}
#     tmp = {}
    
#     len_labels = len(labels)
#     len_label = len(labels[0])
#     for i in range(len_labels):
#         for j in range(len_label):
#             if labels[i][j] and j not in ret and df['word'][i] not in ret.values():
#                 ret[j] = df['word'][i]
#                 tmp[j] = i
                
#     print(ret)
# #     print(tmp)
#     arr = []
#     for i in range(len(ret)):
#         arr.append(ret[i])
#     print(arr)


# find_label_lst(tt_df, label)

In [5]:
# arr = {}
# arr[3] = 2
# arr[1] = 5
# arr[0] = 1
# arr[2] = 3
# if 5 in arr:
#     arr[3] = 3

# lst = []
# for i in range(4):
#     lst.append(arr[i])
# print(arr)
# print(lst)

In [6]:
class get_data:
    def __init__(self, path, train_words, is_train=True):
        self.path = path
        self.train_words = train_words
        self.get_data()
        self.prepare_data()
        if is_train:
            self.split_data()
        
    def prepare_data(self):
        words = self.df['word'].unique().tolist()
        silence = ['_background_noise_']
        unknown = [word for word in words if word not in silence + self.train_words]
        # Mark silence files as unknown too
        self.df.loc[self.df['word'].isin(silence), 'word'] = 'unknown'
        self.df.loc[self.df['word'].isin(unknown), 'word'] = 'unknown'
            
    def get_data(self):
        data_dir = Path(self.path)
        files = [(str(file), file.parts[-2]) for file in data_dir.glob("**/*.wav") if file]
        self.df = pd.DataFrame(files, columns=['path', 'word'])
#         print(self.df.isnull().values.any())
        
    def split_data(self):
        labelbinarizer = LabelBinarizer()
        self.X = self.df.path
        self.y = labelbinarizer.fit_transform(self.df['word'])
        self.ts_f, self.vs_f, self.ts_l, self.vs_l =\
        train_test_split(
            self.X,
            self.y,
            test_size=0.2,
            stratify=self.y
        )
        
    def find_label_lst(self):
        df = self.df
        labels = self.y
        ret = {}

        len_labels = len(labels)
        len_label = len(labels[0])
        for i in range(len_labels):
            for j in range(len_label):
                if labels[i][j] and j not in ret and df['word'][i] not in ret.values():
                    ret[j] = df['word'][i]  
        arr = []
        for i in range(len(ret)):
            arr.append(ret[i])
        self.pred_label = arr

In [73]:
class create_model:
    def __init__(
        self,
        train_words,
        shape=(129, 124, 1),
        save_model=False,
        load_model=False,
        save_model_name='default',
        load_model_name='default',
        best_model_name='best_model'
    ):
        self.shape = shape
        self.train_words = train_words
        
        self.save_model = save_model
        self.load_model = load_model
        self.save_model_json_path = './model/' + save_model_name + '.json'
        self.save_model_HDF5_path = './model/' + save_model_name + '.h5'
        self.load_model_json_path = './model/' + load_model_name + '.json'
        self.load_model_HDF5_path = './model/' + load_model_name + '.h5'
        
        if load_model:
            self.load()
        else:
            self.build()
        self.compile_model()
        print('create model !')
        
    def save(self):
        model_json = self.model.to_json()
        with open(self.save_model_json_path, 'w') as json_file:
            json_file.write(model_json)
        self.model.save_weights(self.save_model_HDF5_path)
        print("Saving the model...")
        
    def load(self):
        try:
            json_file = open(self.load_model_json_path, 'r')
            loaded_model_json = json_file.read()
            json_file.close()

            self.model = model_from_json(loaded_model_json)
            # load weights into new model
            self.model.load_weights(self.load_model_HDF5_path)
            print("Loaded model...")
        except:
            print("Loading error!")
    
    def build(self):
        input_layer = Input(shape=self.shape)
        
        ''' basic one
        model = BatchNormalization()(input_layer)
        model = Conv2D(16, (3, 3), activation='elu')(model)
        model = Dropout(0.25)(model)
        model = MaxPooling2D((2, 2))(model)

        model = Flatten()(model)
        model = Dense(32, activation='elu')(model)
        model = Dropout(0.25)(model)
        
        model = Dense(len(self.train_words) + 1, activation='softmax')(model)
        '''
        
        model = BatchNormalization()(input_layer)
        
        model = Conv2D(8, (2, 2), activation='relu')(model)
        model = Conv2D(8, (2, 2), activation='relu')(model)
        model = Dropout(0.2)(model)
        model = MaxPooling2D((2, 2))(model)
        
        model = Conv2D(16, (3, 3), activation='relu')(model)
        model = Conv2D(16, (3, 3), activation='relu')(model)
        model = Dropout(0.2)(model)
        model = MaxPooling2D((2, 2))(model)
        
        model = Conv2D(32, (3, 3), activation='relu')(model)
        model = Dropout(0.2)(model)
        model = MaxPooling2D((2, 2))(model)

        model = Flatten()(model)
        model = BatchNormalization()(Dense(128, activation='relu')(model))
        model = BatchNormalization()(Dense(128, activation='relu')(model))
        
        # 11 because background noise has been taken out
        model = Dense(len(self.train_words) + 1, activation='softmax')(model)

        self.model = Model(inputs=input_layer, outputs=model)
        
    def compile_model(self):
        self.model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
    def train(
        self,
        ts_f,
        ts_l,
        vs_f,
        vs_l,
        epochs=10,
        batch_size=32,
        has_tb=False,
        tb_path='./tensorboard/test_v0',
    ):
        self.ts_f = ts_f
        self.ts_l = ts_l
        self.vs_f = vs_f
        self.vs_l = vs_l
        self.epochs = epochs
        self.batch_size = batch_size
        self.has_tb = has_tb
        self.tb_path = tb_path
        
        self.train_gen = self.batch_generator(self.ts_f.values, self.ts_l, self.batch_size)
        self.vali_gen = self.batch_generator(self.vs_f.values, self.vs_l, self.batch_size)
        tensorboard = TensorBoard(
            log_dir=self.tb_path,
            batch_size=self.batch_size
        )
        
        fit_arg = dict(
            generator=self.train_gen,
            epochs=self.epochs,
            steps_per_epoch=self.ts_f.shape[0] // self.batch_size,
            validation_data=self.vali_gen,
            validation_steps=self.vs_f.shape[0] // self.batch_size
        )
        if self.has_tb:
            fit_arg['callbacks'] = [tensorboard]
        if self.save_model:
            
        self.model.fit_generator(**fit_arg)
        if self.save_model:
            self.save()
    
    def batch_generator(self, X, y, batch_size=16):
        # Return a random image from X, y
        while True:
            idx_lst = np.random.randint(0, X.shape[0], batch_size)
            imgs = X[idx_lst]
            labels = y[idx_lst]
            spgs = self.get_spectrogram(imgs)
            
            yield np.concatenate([spgs]), labels
            
    def get_spectrogram(self, paths, nsamples=16000):
        # Given list of paths, return spectrogram
#         for path in paths:
#             print(path)
        wavs = [wavfile.read(path)[1] for path in paths]
        
        data = []
        for wav in wavs:
            if wav.size < 16000:
                d = np.pad(wav, (nsamples - wav.size, 0), mode='constant')
            else:
                d = wav[0:nsamples]
            data.append(d)
            
        spg = [signal.spectrogram(d, nperseg=256, noverlap=128)[2] for d in data]
        spg = [s.reshape(129, 124, -1) for s in spg]
        return (spg)
    
#     def predict(self, test_df):
#         self.predictions = []
#         paths = test_df['path'].tolist()
        
#         for path in paths:
#             spg = self.get_spectrogram([path])
#             pred = self.model.predict(np.array(spg))
#             self.predictions.extend(pred)
            
#         labelbinarizer = LabelBinarizer()
#         labels = [labelbinarizer.inverse_transform(p.reshape(1, -1), threshold=0.5)[0] for p in self.predictions]
#         test_df['labels'] = labels
#         test_df['path'] = test_df['path'].apply(lambda x: str(x).split('/')[-1])
#         self.submission = pd.DataFrame(
#             {
#                 'fname': test_df['path'].tolist(),
#                 'label': labels
#             }
#         )

    def predict(self, test_df, pred_label):
        self.predictions = []
        paths = test_df['path'].tolist()
        len_paths = len(paths)
        for i in range(len_paths):
            spg = self.get_spectrogram([paths[i]])
            pred = self.model.predict(np.array(spg))
            pred = np.argmax(pred, axis=1)
            pred = pred_label[pred[0]]
            self.predictions.append(pred)
            if i % 1000 == 0:
                print(f"{i}/{len_paths}")

        test_df['pred'] = self.predictions
        test_df['fname'] = test_df['path'].apply(lambda x: str(x).split('/')[-1])
        self.submission = pd.DataFrame(
            {
                'fname': test_df['fname'].tolist(),
                'label': test_df['pred'].tolist()
            }
        )
        
    def save_predict(self, path='submission_v2.csv'):
        self.submission.to_csv(path, index=False)

In [8]:
train = get_data(train_path, train_words)

In [9]:
train.find_label_lst()
print(train.pred_label)

['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'unknown', 'up', 'yes']


In [74]:
model = create_model(
    train_words,
#     save_model=True,
#     save_model_name='default_8epos'
    load_model=True,
    load_model_name='default_8epos'
)

Loaded model...
create model !


In [34]:
model.train(
    train.ts_f,
    train.ts_l,
    train.vs_f,
    train.vs_l,
    epochs=8,
    has_tb=True,
    tb_path='./tensorboard/test_v8'
)

W0328 04:22:47.776945 140125303052032 callbacks.py:1197] `batch_size` is no longer needed in the `TensorBoard` Callback and will be ignored in TensorFlow 2.0.


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Saving the model...


In [84]:
model.train(
    train.ts_f,
    train.ts_l,
    train.vs_f,
    train.vs_l,
    epochs=1,
    has_tb=False,
    tb_path='./tensorboard/test_v9'
)

W0329 21:20:10.632958 139889126586112 callbacks.py:1197] `batch_size` is no longer needed in the `TensorBoard` Callback and will be ignored in TensorFlow 2.0.


 165/1618 [==>...........................] - ETA: 10:38 - loss: 0.0382 - accuracy: 0.9862





KeyboardInterrupt: 

In [86]:
print(*model.train_gen)

KeyboardInterrupt: 

In [60]:
test = get_data(test_path, train_words, is_train=False)

In [54]:
test.df.head()

Unnamed: 0,path,word
0,data/test/audio/clip_0201b98c2.wav,unknown
1,data/test/audio/clip_78befd0e0.wav,unknown
2,data/test/audio/clip_0e7824991.wav,unknown
3,data/test/audio/clip_773b7caf8.wav,unknown
4,data/test/audio/clip_f14da8108.wav,unknown


In [75]:
model.predict(test.df, train.pred_label)

0/158538
1000/158538
2000/158538
3000/158538
4000/158538
5000/158538
6000/158538
7000/158538
8000/158538
9000/158538
10000/158538
11000/158538
12000/158538
13000/158538
14000/158538
15000/158538
16000/158538
17000/158538
18000/158538
19000/158538
20000/158538
21000/158538
22000/158538
23000/158538
24000/158538
25000/158538
26000/158538
27000/158538
28000/158538
29000/158538
30000/158538
31000/158538
32000/158538
33000/158538
34000/158538
35000/158538
36000/158538
37000/158538
38000/158538
39000/158538
40000/158538
41000/158538
42000/158538
43000/158538
44000/158538
45000/158538
46000/158538
47000/158538
48000/158538
49000/158538
50000/158538
51000/158538
52000/158538
53000/158538
54000/158538
55000/158538
56000/158538
57000/158538
58000/158538
59000/158538
60000/158538
61000/158538
62000/158538
63000/158538
64000/158538
65000/158538
66000/158538
67000/158538
68000/158538
69000/158538
70000/158538
71000/158538
72000/158538
73000/158538
74000/158538
75000/158538
76000/158538
77000/158538

In [76]:
model.save_predict()

In [77]:
model.submission.head()

Unnamed: 0,fname,label
0,,unknown
1,,unknown
2,,unknown
3,,no
4,,unknown


In [78]:
submit_v0 = model.submission

In [82]:
# submit_v0['fname'] = test.df['path'].apply(lambda x: str(x).split('/')[-1])
submit_v0.to_csv('submission_v0.csv', index=False)

In [11]:
# paths = test.df['path'].tolist()
# path = paths[3000]

path = train.df['path'][3000]

# print(path)
spg = model.get_spectrogram([path])
# print(spg)
pred = model.model.predict(np.array(spg))
pred = np.argmax(pred, axis=1)
# pred = [lab]
print(pred)
print(train.pred_label[pred[0]])
# pridictions = []
# for path in paths:
#     spg = model.get_spectrogram([path])
#     pred = model.model.predict(np.array(spg))
#     predictions.extend(pred)

[7]
stop


In [61]:
s_point = 5000
t_df = test.df.iloc[s_point : s_point + 5,]
t_df

Unnamed: 0,path,word
5000,data/test/audio/clip_ab0bcda7b.wav,unknown
5001,data/test/audio/clip_ff0e145d3.wav,unknown
5002,data/test/audio/clip_c119f835f.wav,unknown
5003,data/test/audio/clip_277b78495.wav,unknown
5004,data/test/audio/clip_8aea89f4b.wav,unknown
