In [1]:
# Folder holding the unzipped train and test data.
DATADIR = '../data'
# Output folder; the name itself is arbitrary.
OUTDIR = './model-k'
# Data Loading
import os
import re
from glob import glob
import struct
import pandas as pd
from tqdm import tqdm
from scipy.io import wavfile
import numpy as np
import webrtcvad


# Class names; a label's position in this list is its integer id.
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
# Lookup tables in both directions: id -> name and name -> id.
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = {label: idx for idx, label in enumerate(POSSIBLE_LABELS)}


def load_data(data_dir):
    """Split the training audio into train and validation samples by user id.

    Every file matching ``train/audio/*/*wav`` under ``data_dir`` is
    considered. A user id that occurs in ``train/validation_list.txt`` has
    all of its files placed in the validation list; every other file goes to
    the train list. Paths that do not match the expected
    ``<label>/<user_id>_<rest>wav`` layout are skipped.

    Args:
        data_dir: Directory that contains the ``train`` folder.

    Returns:
        Two lists ``(train, val)`` of ``(label, user_id, path)`` tuples.
        ``label`` is the class *name* (a string, not the integer id):
        the ``_background_noise_`` folder is reported as ``'silence'`` and
        any folder name not in ``POSSIBLE_LABELS`` as ``'unknown'``.
    """
    # Three groups: optional prefix, label (folder name) and user id (file
    # name up to the first underscore). A raw string is used so that `\/` and
    # `\w` reach the regex engine instead of being parsed as (invalid) string
    # escapes, which newer Python versions warn about.
    pattern = re.compile(r"(.+\/)?(\w+)\/([^_]+)_.+wav")

    def parse(path):
        # Match against a '/'-separated copy so paths that use another
        # os.sep (e.g. glob results on Windows) are still recognised.
        return pattern.match(path.replace(os.sep, '/'))

    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    # Collect the user ids that belong to the validation split.
    val_users = set()
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        for line in fin:
            match = parse(line)
            if match:
                val_users.add(match.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val = [], []
    for entry in all_files:
        match = parse(entry)
        if not match:
            continue
        label, uid = match.group(2), match.group(3)
        if label == '_background_noise_':
            label = 'silence'
        if label not in possible:
            label = 'unknown'

        # The original (un-normalised) path is what callers get back.
        sample = (label, uid, entry)
        if uid in val_users:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))
    return train, val

# Build the train/validation sample lists (split by user id) from DATADIR.
trainset, valset = load_data(DATADIR)

There are 57929 train and 6798 val samples


In [2]:
import struct
#from tqdm import tqdm

def detect_voice(file, aggressiveness=1):
    """Tell whether a WAV file contains speech according to WebRTC VAD.

    The recording is cut into consecutive 30 ms frames; the file counts as
    speech as soon as one complete frame is classified as speech.

    Args:
        file: Path of the WAV file, read with ``scipy.io.wavfile.read``.
            The samples are packed as 16-bit integers, so the file is
            presumably 16-bit mono PCM -- TODO confirm for all inputs.
        aggressiveness: Mode passed to ``webrtcvad.Vad.set_mode``.

    Returns:
        True if at least one frame is classified as speech, False if every
        classified frame is non-speech.

    Raises:
        ValueError: If no frame could be classified at all (for example the
            recording is shorter than one frame).
    """
    vad = webrtcvad.Vad()
    vad.set_mode(aggressiveness)
    sample_rate, samples = wavfile.read(file)
    # 'h' packs every sample as a native 16-bit signed integer.
    raw_samples = struct.pack("%dh" % len(samples), *samples)

    window_duration = 0.03  # duration in seconds
    samples_per_window = int(window_duration * sample_rate + 0.5)
    bytes_per_sample = 2

    # Scan every frame instead of returning the verdict of the first one:
    # a clip that opens with a short silence would otherwise always be
    # reported as non-speech, no matter what follows.
    # Only whole frames are visited; the VAD accepts fixed 10/20/30 ms
    # frames, so a trailing partial frame cannot be classified.
    classified_any = False
    last_start = len(samples) - samples_per_window
    for start in range(0, last_start + 1, samples_per_window):
        stop = start + samples_per_window
        frame = raw_samples[start * bytes_per_sample: stop * bytes_per_sample]
        try:
            if vad.is_speech(frame, sample_rate=sample_rate):
                return True
        except ValueError:
            # Same best-effort policy as before: skip a frame that cannot
            # be processed and try the next one.
            continue
        classified_any = True

    if not classified_any:
        raise ValueError('Not valid file')
    return False

In [3]:
# One DataFrame per split, one row per (label, user, file_path) sample tuple.
traindf, valdf = (
    pd.DataFrame.from_records(split, columns=['label', 'user', 'file_path'])
    for split in (trainset, valset)
)

In [4]:
# Register tqdm's pandas integration; the `progress_apply` calls below rely on it.
# `desc` is the prefix shown in front of each progress bar.
tqdm.pandas(desc="my_bar!")

In [5]:
# Add the detect_voice verdict for every file, first for the train frame,
# then for the validation frame (each shows its own progress bar).
for split_df in (traindf, valdf):
    split_df['is_speech'] = split_df['file_path'].progress_apply(detect_voice)

my_bar!: 100%|██████████| 57929/57929 [00:53<00:00, 1075.60it/s]
my_bar!: 100%|██████████| 6798/6798 [00:06<00:00, 1075.82it/s]


In [8]:
import IPython.display as ipd

# Listen to one training clip. The path is built from DATADIR so the cell
# follows the data location configured at the top of the notebook instead of
# repeating it as a literal.
sample_rate, samples = wavfile.read(
    os.path.join(DATADIR, 'train/audio/yes/41285056_nohash_2.wav'))
ipd.Audio(samples, rate=sample_rate)

In [118]:
import IPython.display as ipd

# Rebuild the per-frame VAD decisions in this cell: no other cell defines
# `segments`, so the cell used to fail with NameError on a fresh kernel.
# Uses `samples` and `sample_rate` from the clip loaded in the previous cell.
vad = webrtcvad.Vad()
vad.set_mode(1)

frame_len = int(0.03 * sample_rate + 0.5)  # samples per 30 ms frame
bytes_per_sample = 2
# 'h' packs every sample as a native 16-bit signed integer.
raw_samples = struct.pack("%dh" % len(samples), *samples)

# Whole frames only; a trailing partial frame is left out.
segments = [
    dict(start=start,
         stop=start + frame_len,
         is_speech=vad.is_speech(
             raw_samples[start * bytes_per_sample:(start + frame_len) * bytes_per_sample],
             sample_rate=sample_rate))
    for start in range(0, len(samples) - frame_len + 1, frame_len)
]

speech_frames = [samples[segment['start']:segment['stop']]
                 for segment in segments if segment['is_speech']]
if not speech_frames:
    # np.concatenate rejects an empty list; fail with a clearer message.
    raise ValueError('No speech frames detected in the clip')
speech_samples = np.concatenate(speech_frames)

ipd.Audio(speech_samples, rate=sample_rate)