# Setup

In [1]:
!pip install sox
!pip install numpy



In [2]:
import glob
import sox
import json
import numpy as np
import os

In [3]:
# Sampling rate (Hz) that every audio file in the dataset is expected to have.
sample_rate = 16000

# Folder with the wav recordings and their json annotation files, resolved
# against the current working directory. The trailing slash is kept on purpose.
input_data_dir = '{}/data/vad_data/'.format(os.getcwd())

# Checks and statistics

### Check if:

- the sampling rate is the same for all audio files,
- all wav files have a corresponding json file,
- the structure of json files is consistent

### Statistics:

- total duration of the dataset,
- min, max, mean, median, variance, and standard deviation of the per-file durations.

In [4]:
# Validate the dataset and collect the duration (in seconds) of every wav file.
seconds_all_files = []

# Pass 1: parse every json file exactly once, report suspicious ones, and
# remember its path without the extension so wav files can be matched to it.
json_stems = set()
for f_json in sorted(glob.glob(input_data_dir + "*.json")):
    # os.path.splitext removes only the extension. str.strip(".json") would
    # instead strip any of the characters '.', 'j', 's', 'o', 'n' from both
    # ends of the path and can produce false matches or misses.
    json_stems.add(os.path.splitext(f_json)[0])
    with open(f_json) as json_file:  # make sure the handle is closed
        data = json.load(json_file)
    # A missing or empty "speech_segments" entry is reported, not fatal.
    if not isinstance(data, dict) or not data.get("speech_segments"):
        print("Something might be wrong with file {}.".format(f_json))

# Pass 2: check each wav file for a matching json, its duration and its
# sampling rate. The membership test is evaluated per file, so one successful
# match can no longer hide missing json files for the wav files that follow.
for f_wav in sorted(glob.glob(input_data_dir + "*.wav")):
    if os.path.splitext(f_wav)[0] not in json_stems:
        print("A json file is missing for {}.".format(f_wav))

    seconds = sox.file_info.duration(f_wav)
    seconds_all_files.append(seconds)
    assert sox.file_info.sample_rate(f_wav) == sample_rate, \
        "Unexpected sampling rate in {}.".format(f_wav)

In [5]:
# Summary statistics over the per-file durations (in seconds) collected in
# `seconds_all_files`.
if not seconds_all_files:
    # np.min / np.max raise a generic ValueError on an empty sequence; raise
    # the same exception type, but say what is actually wrong.
    raise ValueError(
        'No durations were collected; check that {} contains wav files.'.format(input_data_dir))

total_dur = np.sum(seconds_all_files)
min_val = np.min(seconds_all_files)
max_val = np.max(seconds_all_files)
mean_val = np.mean(seconds_all_files)
median_val = np.median(seconds_all_files)
var_val = np.var(seconds_all_files)  # in seconds squared
std_val = np.std(seconds_all_files)  # in seconds

# total_dur is in seconds; dividing by 60 twice converts it to hours.
print('Total duration of the dataset is {dur} hours.'.format(dur=round(total_dur/60/60, 2)))
print('Min duration is {min} seconds, max duration is {max} seconds.'.format(min=round(min_val, 1), max=round(max_val, 1)))
print('Mean and median values are {mean} seconds and {median} seconds.'.format(mean=round(mean_val, 1), median=round(median_val, 1)))
# The variance of a quantity measured in seconds has the unit seconds squared.
print('Variance and standard deviation are {var} seconds squared and {std} seconds.'.format(var=round(var_val, 1), std=round(std_val, 1)))

Total duration of the dataset is 3.28 hours.
Min duration is 1.4 seconds, max duration is 17.2 seconds.
Mean and median values are 12.3 seconds and 13.9 seconds.
Variance and standard deviation are 15.2 seconds and 3.9 seconds.
