Copyright 2019 Google LLC

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Make sure that Jupyter is installed by running the command below (the `--user` flag installs it into the user directory):

```shell
pip install jupyter --user
```

In [1]:
## Run the following command first and then restart the runtime. After that there is no need to run it again; continue from the cell below.
# !pip install --user --upgrade tensorflow-model-optimization

In [2]:
!git clone https://github.com/google-research/google-research.git

Cloning into 'google-research'...
remote: Enumerating objects: 35754, done.[K
remote: Counting objects: 100% (478/478), done.[K
remote: Compressing objects: 100% (370/370), done.[K
remote: Total 35754 (delta 166), reused 375 (delta 99), pack-reused 35276[K
Receiving objects: 100% (35754/35754), 284.31 MiB | 24.36 MiB/s, done.
Resolving deltas: 100% (19281/19281), done.
Checking out files: 100% (12286/12286), done.


In [3]:
import os
import sys
import tarfile
import urllib
import urllib.request
# Make the cloned repository importable so that `kws_streaming` can be found.
sys.path.append('./google-research')

In [4]:
# Set before TensorFlow is imported (next cell) so the setting is already in
# place when TensorFlow initializes.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # no need to use gpu

In [5]:
import tensorflow as tf
import tensorflow.compat.v1 as tf1
import logging
from kws_streaming.models import model_flags
from kws_streaming.models import models
from kws_streaming.layers.modes import Modes
from kws_streaming.train import test
from kws_streaming.models import utils
from kws_streaming.data import input_data
from kws_streaming.data import input_data_utils as du
from kws_streaming.models import model_params

In [6]:
# Show the TensorFlow version this notebook is running with.
tf1.__version__

'2.6.0'

In [7]:
# Session configuration with GPU memory growth enabled, built in one
# expression instead of mutating the proto after construction.
config = tf1.ConfigProto(gpu_options=tf1.GPUOptions(allow_growth=True))
sess = tf1.Session(config=config)

In [8]:
# Turn off eager execution through the TF1 compatibility API.
tf1.disable_eager_execution()

In [9]:
# Version of the speech commands dataset to use: 2 (v0.02) or 1 (v0.01).
DATA_VERSION = 2

In [10]:
current_dir = os.getcwd()

# Pick the archive URL and a version-specific local folder for the dataset.
if DATA_VERSION == 2:
  DATA_URL = "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
  DATA_PATH = os.path.join(current_dir, "data2/")
elif DATA_VERSION == 1:
  DATA_URL = "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz"
  DATA_PATH = os.path.join(current_dir, "data1/")
else:
  # Raise explicitly: a bare `assert False` carries no message and is stripped
  # when Python runs with -O, which would leave DATA_URL/DATA_PATH undefined.
  raise ValueError(
      "Unsupported DATA_VERSION: %r (expected 1 or 2)" % (DATA_VERSION,))

In [11]:
# Show the folder the dataset will be stored in.
DATA_PATH

'/content/data2/'

In [12]:
# Create the data folder in the current dir. exist_ok=True keeps this cell
# re-runnable: without it a second run fails because the folder already exists.
# Note: data will be downloaded into DATA_PATH; feel free to specify your own DATA_PATH.
os.makedirs(DATA_PATH, exist_ok=True)

In [13]:
# Archive file name: the last path component of the download URL.
base_name = os.path.basename(DATA_URL)
base_name

'speech_commands_v0.02.tar.gz'

In [14]:
# It can take some time to download 2.3GB. After unpacking total size is 5.4GB.
# The archive is downloaded and unpacked only if it is not already in DATA_PATH.
arch_file_name = os.path.join(DATA_PATH, base_name)
if not os.path.isfile(arch_file_name):
  # Download data. The previous `sys.version_info >= (2, 5)` check was also
  # true on Python 2, where urllib.request does not exist, so its fallback
  # branch could never run; this notebook targets Python 3 only.
  file_path = urllib.request.urlretrieve(DATA_URL, filename=arch_file_name)[0]

  # Unpack it; the context manager closes the archive even if extraction fails.
  file_name, file_extension = os.path.splitext(base_name)
  with tarfile.open(file_path) as tar:
    tar.extractall(DATA_PATH)

In [15]:
# default parameters for data splitting
flags = model_params.Params()
# point the flags at the dataset folder and URL chosen in the cells above
flags.data_dir = DATA_PATH
flags.data_url = DATA_URL
# NOTE(review): update_flags presumably derives further settings from the
# values set above — confirm in kws_streaming.models.model_flags
flags = model_flags.update_flags(flags)

In [16]:
# Build the data processor from the flags; the cells below read its split
# sizes, data index and word-to-index mapping.
audio_processor = input_data.AudioProcessor(flags)

In [17]:
# Report the number of samples in each data split.
testing_set_size = audio_processor.set_size('testing')
print(f"testing_set_size {testing_set_size}")
training_set_size = audio_processor.set_size('training')
print(f"training_set_size {training_set_size}")
validation_set_size = audio_processor.set_size('validation')
print(f"validation_set_size {validation_set_size}")

testing_set_size 4890
training_set_size 36923
validation_set_size 4445


In [18]:
# Reference split sizes for each dataset version.
# V2
# testing_set_size 4890
# training_set_size 36923
# validation_set_size 4445

# V1
# testing_set_size 3081
# training_set_size 22246
# validation_set_size 3093

In [19]:
# All words used for modeling, mapped to their class index: target words have
# their own index, while all unknown words share index 1.
audio_processor.word_to_index

{'_silence_': 0,
 'backward': 1,
 'bed': 1,
 'bird': 1,
 'cat': 1,
 'dog': 1,
 'down': 5,
 'eight': 1,
 'five': 1,
 'follow': 1,
 'forward': 1,
 'four': 1,
 'go': 11,
 'happy': 1,
 'house': 1,
 'learn': 1,
 'left': 6,
 'marvin': 1,
 'nine': 1,
 'no': 3,
 'off': 9,
 'on': 8,
 'one': 1,
 'right': 7,
 'seven': 1,
 'sheila': 1,
 'six': 1,
 'stop': 10,
 'three': 1,
 'tree': 1,
 'two': 1,
 'up': 4,
 'visual': 1,
 'wow': 1,
 'yes': 2,
 'zero': 1}

In [20]:
# Locate where the "<label>/<file name>" part of a sample path begins: it is
# the character right after the second-to-last '/'.
string = audio_processor.data_index["validation"][0]['file']
res = [pos for pos, char in enumerate(string) if char == '/']
start_file = res[-2] + 1
start_file

15

In [21]:
# Sanity check: the sliced path should look like "<label>/<file name>".
audio_processor.data_index["validation"][0]['file'][start_file:]

'bird/da7689f1_nohash_0.wav'

In [22]:
index_to_label = {}
unknown_category = []
# Map every class index to the label used for training. The silence and
# unknown indices get their dedicated labels; words that fall into the unknown
# class are also collected in unknown_category.
for word, word_index in audio_processor.word_to_index.items():
  if word_index == du.SILENCE_INDEX:
    label = du.SILENCE_LABEL
  elif word_index == du.UNKNOWN_WORD_INDEX:
    label = du.UNKNOWN_WORD_LABEL
    unknown_category.append(word)
  else:
    label = word
  index_to_label[word_index] = label

# training labels
index_to_label

{0: '_silence_',
 1: '_unknown_',
 2: 'yes',
 3: 'no',
 4: 'up',
 5: 'down',
 6: 'left',
 7: 'right',
 8: 'on',
 9: 'off',
 10: 'stop',
 11: 'go'}

In [23]:
# words belonging to the unknown category
unknown_category

['bed',
 'six',
 'cat',
 'forward',
 'two',
 'visual',
 'zero',
 'follow',
 'happy',
 'marvin',
 'four',
 'backward',
 'five',
 'nine',
 'house',
 'one',
 'eight',
 'three',
 'learn',
 'seven',
 'dog',
 'wow',
 'sheila',
 'bird',
 'tree']

In [24]:
def get_distribution(mode):
  """Counts samples per word and per training label for one data split.

  Reads the notebook-level `audio_processor`, `index_to_label` and
  `start_file` variables.

  Args:
    mode: key into audio_processor.data_index, e.g. 'testing' or 'training'.

  Returns:
    A tuple (distrib_words, distrib_label, files):
      distrib_words: dict mapping word -> number of samples.
      distrib_label: dict mapping training label -> number of samples.
      files: dict mapping word -> list of file paths with the first
        `start_file` characters removed.
  """
  label_counts = {}
  word_counts = {}
  files_by_word = {}
  for sample in audio_processor.data_index[mode]:
    word = sample['label']
    short_path = sample['file'][start_file:]
    label = index_to_label[audio_processor.word_to_index[word]]

    files_by_word.setdefault(word, []).append(short_path)
    label_counts[label] = label_counts.get(label, 0) + 1
    word_counts[word] = word_counts.get(word, 0) + 1
  return word_counts, label_counts, files_by_word

In [25]:
# distribution of labels in testing data
distrib_words_testing, distrib_labels_testing, files_testing = get_distribution('testing')
distrib_labels_testing

{'_silence_': 407,
 '_unknown_': 407,
 'down': 405,
 'go': 401,
 'left': 411,
 'no': 404,
 'off': 401,
 'on': 395,
 'right': 395,
 'stop': 410,
 'up': 424,
 'yes': 418}

In [26]:
# distribution of labels in training data
distrib_words_training, distrib_labels_training, files_training = get_distribution('training')
distrib_labels_training

{'_silence_': 3076,
 '_unknown_': 3076,
 'down': 3133,
 'go': 3105,
 'left': 3036,
 'no': 3129,
 'off': 2969,
 'on': 3085,
 'right': 3018,
 'stop': 3110,
 'up': 2947,
 'yes': 3227}

In [27]:
def parse_files(set_list_fname, label='yes'):
  """Returns the entries of a file list whose first path component is `label`.

  Args:
    set_list_fname: path to a text file with one relative path per line,
      e.g. "bird/da7689f1_nohash_0.wav".
    label: folder name (first path component) to select.

  Returns:
    List of the matching lines in file order, without the trailing newline.
  """
  set_files = []
  with open(set_list_fname) as f:
    for line in f:
      # Strip only the line terminator. Slicing with [:-1] would cut the last
      # character of a final line that has no trailing newline.
      entry = line.rstrip('\n')
      if entry.split('/', 1)[0] == label:
        set_files.append(entry)
  return set_files

In [28]:
def validate(my_list1, list2, print_in_list2=False):
  """Counts how many items of my_list1 are absent from / present in list2.

  Args:
    my_list1: items to look up.
    list2: collection the items are looked up in.
    print_in_list2: if true, print every item of my_list1 that is found in
      list2; otherwise print every item that is not found.

  Returns:
    A tuple (absent, present): the number of items of my_list1 that are not
    in list2, and the number that are.
  """
  absent_count = 0
  present_count = 0
  show_present = bool(print_in_list2)
  for item in my_list1:
    is_present = item in list2
    if is_present:
      present_count += 1
    else:
      absent_count += 1
    # Print present items when requested, absent items otherwise.
    if is_present == show_present:
      print(item)
  return absent_count, present_count

In [29]:
file_list = os.path.join(DATA_PATH, "testing_list.txt")

# Check that every wav file used for testing is listed in testing_list.txt
# (files that are not listed get printed by validate).
for word in files_testing.keys():
  if word == '_silence_':
    continue
  word_files = parse_files(file_list, label=word)
  _, cnt_val = validate(files_testing[word], word_files, False)
  assert cnt_val == len(files_testing[word])

In [30]:

distrib_words_training, distrib_labels_training, files_training = get_distribution('training')

# Check that no wav file used for testing also appears in the training data
# (overlapping files get printed by validate).
for word in files_testing.keys():
  if word == '_silence_':
    # silence file does not matter because it is multiplied by zero
    continue
  word_files = files_testing[word]
  _, cnt_val = validate(files_training[word], word_files, True)
  assert cnt_val == 0