# Prepare Files from TensorFlow Speech Recognition Challenge
### Table of Contents:
1. [Imports](#imports)
2. [Load data](#loaddata)
3. [Extract silence clips](#silence)
4. [Read word clips](#words)
5. [Silence df](#silencedf)
6. [Re-add silence data](#addsilence)
7. [Pickle training data](#pickle)

<a id="imports"></a>
## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import glob
import scipy
import matplotlib.pyplot as plt
import re
import os
%matplotlib inline

<a id="loaddata"></a>
## 2. Load raw wav files

In [2]:
# Target classes; a label's position in this list is its integer class id.
POSSIBLE_LABELS = 'yes no up down left right on off stop go unknown silence'.split()
# Lookup tables in both directions: class id -> name and name -> class id.
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = {label: idx for idx, label in enumerate(POSSIBLE_LABELS)}

Index the wav files and split them into train and validation sets by speaker (code borrowed from [this kernel](https://www.kaggle.com/sainathadapa/keras-starter-code)).

In [3]:
def load_data(data_dir):
    """Index the wav files under ``data_dir`` and split them into train/validation.

    Expects ``<data_dir>/train/audio/<label>/<user_id>_*.wav`` files and
    ``<data_dir>/train/validation_list.txt``.  A clip is assigned to the
    validation set when its user id appears anywhere in the validation
    list, so every clip of such a speaker lands in validation.

    Folder names outside POSSIBLE_LABELS are labelled 'unknown'; files in
    the '_background_noise_' folder are labelled 'silence'.

    Args:
        data_dir: root directory that contains the ``train`` folder.

    Returns:
        (train_df, valid_df): two DataFrames with the columns
        ['label', 'label_id', 'user_id', 'wav_file'].
    """
    # Three groups: optional directory prefix, label folder, user id.
    # Raw string so the regex escapes are not parsed as (invalid) string escapes.
    pattern = re.compile(r"(.+\/)?(\w+)\/([^_]+)_.+wav")

    # Search under data_dir, like the validation list below (this used to be
    # relative to the current working directory).  normpath keeps the paths in
    # the familiar 'train/audio/...' form when data_dir is './'.
    audio_glob = os.path.normpath(os.path.join(data_dir, 'train/audio/*/*wav'))
    all_files = glob.glob(audio_glob)

    # Collect the user ids of every speaker named in the validation list.
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        matches = (pattern.match(line) for line in fin)
        valset = {m.group(3) for m in matches if m}

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        r = pattern.match(entry)
        if not r:
            continue

        label, uid = r.group(2), r.group(3)
        if label == '_background_noise_':
            label = 'silence'
        if label not in possible:
            label = 'unknown'

        sample = (label, name2id[label], uid, entry)
        # Split by speaker, not by file.
        if uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))

    columns_list = ['label', 'label_id', 'user_id', 'wav_file']

    train_df = pd.DataFrame(train, columns=columns_list)
    valid_df = pd.DataFrame(val, columns=columns_list)

    return train_df, valid_df

In [4]:
# Build the train/validation file index for the dataset rooted at './'.
train_df, valid_df = load_data('./')

There are 57929 train and 6798 val samples


### Pickle validation df for later

In [5]:
# Save the validation index now; the rest of the notebook only touches train_df.
valid_df.to_pickle('valid_df_dec10.pkl')

<a id="silence"></a>
## 3. Extract 1-sec clips of background noise

In [6]:
# Rows for the long background-noise recordings (labelled 'silence' by load_data).
silence_df = train_df.loc[train_df['label'] == 'silence']

In [7]:
silence_df

Unnamed: 0,label,label_id,user_id,wav_file
13113,silence,11,white,train/audio/_background_noise_/white_noise.wav
13114,silence,11,dude,train/audio/_background_noise_/dude_miaowing.wav
13115,silence,11,exercise,train/audio/_background_noise_/exercise_bike.wav
13116,silence,11,pink,train/audio/_background_noise_/pink_noise.wav
13117,silence,11,doing,train/audio/_background_noise_/doing_the_dishe...
13118,silence,11,running,train/audio/_background_noise_/running_tap.wav


In [8]:
from scipy.io import wavfile

In [9]:
# Read every background-noise file; each element is the (sample_rate, samples)
# pair returned by scipy.io.wavfile.read.
silence_data = np.array(silence_df.wav_file.apply(wavfile.read))



In [10]:
# Show the sample rate and length (in samples) of each background recording.
for sample_rate, samples in silence_data:
    print(sample_rate, samples.shape)

16000 (960000,)
16000 (988891,)
16000 (980062,)
16000 (960000,)
16000 (1522930,)
16000 (978488,)


### Grab 1200 random 1-sec chunks from each background-noise file

In [11]:
# Cut fixed-length "silence" training clips out of the long background recordings.
silence_chunks = []
clip_len = 16000        # samples per clip: 1 second at the 16 kHz rate printed above
chunks_per_file = 1200  # clips drawn from each background recording
# Local, seeded generator so Restart & Run All reproduces the same clips
# without touching NumPy's global random state.
rng = np.random.RandomState(0)
for _rate, samples in silence_data:
    # randint's upper bound is exclusive and `len - clip_len` is itself a
    # valid start offset, hence the +1.
    n_starts = len(samples) - clip_len + 1
    for _ in range(chunks_per_file):
        start = rng.randint(0, n_starts)
        # Slice with clip_len (not a second hard-coded 16000) so the two stay in sync.
        silence_chunks.append(samples[start:start + clip_len])

In [12]:
# Stack the equal-length clips into one 2-D array, one row per clip.
silence_chunks = np.array(silence_chunks)

In [13]:
silence_chunks.shape

(7200, 16000)

In [14]:
silence_chunks

array([[ -4163,   3851,  10032, ...,   7667,   1763,  -6531],
       [ 10960,  -8411,  -3434, ...,   5189,   6680, -10271],
       [-12228,   1646,   8555, ..., -12629,   5617,  -3621],
       ..., 
       [   510,  -1164,    136, ...,   -530,   1297,  -1638],
       [  -348,   3059,   -295, ...,  -1193,    726,   1191],
       [  2987,  -1575,  -3197, ...,  -1839,   -974,   1681]], dtype=int16)

<a id="words"></a>
## 4. Read data for word files

In [15]:
# Read every spoken-word clip (everything except the background-noise files).
# Filter on the label name, as the silence_df cell does, rather than on the
# magic id 11, which would silently go stale if POSSIBLE_LABELS were reordered.
# Each element is the (sample_rate, samples) pair returned by wavfile.read.
word_data = np.array(train_df[train_df.label != 'silence'].wav_file.apply(wavfile.read))

In [16]:
# Keep only the sample arrays (drop the sample rate) in a 1-D object array.
# The clips have different lengths, and np.array() on a ragged list raises
# ValueError on NumPy >= 1.24, so fill a preallocated object array instead;
# this also stays 1-D even if every clip happened to have the same length.
word_data2 = np.empty(len(word_data), dtype=object)
for pos, (_rate, samples) in enumerate(word_data):
    word_data2[pos] = samples
word_data2.shape

(57923,)

In [17]:
# Drop the six raw background-noise rows; they are re-added below as 1-sec chunks.
# Filter by label name (as the silence_df cell does) instead of the magic id 11.
train_df = train_df[train_df.label != 'silence']
train_df.shape

(57923, 4)

In [18]:
train_df.head(1)

Unnamed: 0,label,label_id,user_id,wav_file
0,left,4,cb8f8307,train/audio/left/cb8f8307_nohash_1.wav


<a id="silencedf"></a>
## 5. Create silence DataFrame

In [19]:
# One metadata row per extracted silence chunk, in the same order the chunks
# were produced (all chunks of file 0, then all chunks of file 1, ...).
# The repeat count is derived from the data rather than hard-coded, so the
# rows cannot drift out of step with silence_chunks.
chunks_per_file, leftover = divmod(len(silence_chunks), len(silence_df))
assert leftover == 0, 'silence_chunks is not evenly split across the background-noise files'
# A single positional take replaces the DataFrame.append loop
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
silence_df2 = silence_df.iloc[np.arange(len(silence_df)).repeat(chunks_per_file)]
silence_df2.shape

(7200, 4)

In [20]:
silence_df

Unnamed: 0,label,label_id,user_id,wav_file
13113,silence,11,white,train/audio/_background_noise_/white_noise.wav
13114,silence,11,dude,train/audio/_background_noise_/dude_miaowing.wav
13115,silence,11,exercise,train/audio/_background_noise_/exercise_bike.wav
13116,silence,11,pink,train/audio/_background_noise_/pink_noise.wav
13117,silence,11,doing,train/audio/_background_noise_/doing_the_dishe...
13118,silence,11,running,train/audio/_background_noise_/running_tap.wav


In [21]:
silence_df2.head(2)

Unnamed: 0,label,label_id,user_id,wav_file
13113,silence,11,white,train/audio/_background_noise_/white_noise.wav
13113,silence,11,white,train/audio/_background_noise_/white_noise.wav


<a id="addsilence"></a>
## 6. Add silence data back to training data

In [22]:
# Word rows first, then the silence rows. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# reset_index() deliberately keeps the old row labels in an 'index' column,
# because the next cell drops that column.
train_df = pd.concat([train_df, silence_df2]).reset_index()
train_df.tail(2)

Unnamed: 0,index,label,label_id,user_id,wav_file
65121,13118,silence,11,running,train/audio/_background_noise_/running_tap.wav
65122,13118,silence,11,running,train/audio/_background_noise_/running_tap.wav


In [23]:
# Discard the old row labels that reset_index left behind in the 'index' column.
train_df = train_df.drop(columns=["index"])
train_df.tail(2)

Unnamed: 0,label,label_id,user_id,wav_file
65121,silence,11,running,train/audio/_background_noise_/running_tap.wav
65122,silence,11,running,train/audio/_background_noise_/running_tap.wav


In [24]:
# Word clips first, then the silence chunks: the same row order as train_df.
all_sounds = [*word_data2, *silence_chunks]
len(all_sounds)

65123

<a id="pickle"></a>
## 7. Pickle training df

In [25]:
# Attach each clip's samples to its metadata row; all_sounds was built
# words-first, then silence, matching the row order of train_df.
train_df['sound'] = all_sounds

In [26]:
train_df.head()

Unnamed: 0,label,label_id,user_id,wav_file,sound
0,left,4,cb8f8307,train/audio/left/cb8f8307_nohash_1.wav,"[-7, 21, -10, 6, -13, 7, 15, -23, 14, -12, 29,..."
1,left,4,b7a0754f,train/audio/left/b7a0754f_nohash_2.wav,"[-2, 6, 7, 3, -2, 2, 6, 6, 4, 5, 5, 11, 9, -1,..."
2,left,4,0132a06d,train/audio/left/0132a06d_nohash_3.wav,"[1, 2, 1, 2, 0, 1, 0, -2, 0, 1, 1, -1, -2, 0, ..."
3,left,4,f92e49f3,train/audio/left/f92e49f3_nohash_4.wav,"[2, 1, -1, -1, 0, 3, 1, -3, -5, -4, -2, 2, 2, ..."
4,left,4,88053e92,train/audio/left/88053e92_nohash_1.wav,"[1, 7, 10, 6, 9, 11, 7, 1, -5, -9, -11, -14, -..."


In [27]:
train_df.tail()

Unnamed: 0,label,label_id,user_id,wav_file,sound
65118,silence,11,running,train/audio/_background_noise_/running_tap.wav,"[570, -849, -678, 1372, -2457, -339, 560, -118..."
65119,silence,11,running,train/audio/_background_noise_/running_tap.wav,"[-2927, -659, 2299, 198, 1218, -1074, -2849, 4..."
65120,silence,11,running,train/audio/_background_noise_/running_tap.wav,"[510, -1164, 136, 425, -1003, -2090, 373, 4706..."
65121,silence,11,running,train/audio/_background_noise_/running_tap.wav,"[-348, 3059, -295, -837, 991, 1134, -419, -579..."
65122,silence,11,running,train/audio/_background_noise_/running_tap.wav,"[2987, -1575, -3197, 1662, 1680, -171, 39, 972..."


In [48]:
# Write the full training frame (metadata plus raw samples) to disk.
train_df.to_pickle('train_df_dec10.pkl')

In [28]:
# Class balance check: number of training clips per label.
train_df.label.value_counts()

unknown    36818
silence     7200
stop        2134
yes         2116
up          2115
go          2112
right       2111
on          2110
left        2106
no          2105
off         2101
down        2095
Name: label, dtype: int64