# Pre-Processing Base Model

+ **Goal**: Using the [LibriSpeech corpus](http://www.openslr.org/12) (read LibriVox audiobooks), located in `data/base/raw/test-clean` and `data/base/raw/train-clean`, pre-process the audio into spectrogram images and write them to `data/base/process/train-clean` and `data/base/process/test-clean`.

In [3]:
#!pip install psutil
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import librosa.display
from matplotlib import cm
from sklearn import preprocessing
import soundfile as sf
import pandas as pd
import psutil

# Raw audio to read: one sub-directory per speaker, then one per book.
rootdir = "/nb/transfer/data/base/raw/train-clean-100"
# Chunk files are written here. This has to be the same directory the
# "Moving To HDF5" step globs for `*.npy` files (.../base/processed);
# writing to its parent would leave the saved chunks undiscovered.
PROCESSED_DIR = '/nb/transfer/data/base/processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)  # np.save does not create directories
samples_per_observation = 16000 # 1-second windows (given librivox corpus)
# Snapshot of memory before any data is loaded.
print(psutil.virtual_memory())

svmem(total=13660872704, available=6186033152, percent=54.7, used=7124361216, free=211632128, active=12238913536, inactive=1002651648, buffers=54595584, cached=6270283776, shared=52719616)


## Getting spectrograms

In [9]:
## Sample rate in Hz (presumably of the corpus audio; matches the 16000-sample windows)
sr = 16000
## Scaler that maps values into [0, 1], the input range of the colormap below
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
## Any colormap can be used here: https://matplotlib.org/examples/color/colormaps_reference.html
convert = plt.get_cmap(cm.jet)

def get_spectrograms(rows):
    """
    Convert fixed-length audio windows into RGB spectrogram images.

    Args:
        rows: A 2-D np array with one audio window per row,
            shape (num_observations, samples_per_window)

    Returns:
        A float np array of shape (num_observations, bins, frames, 3).
        The image size is taken from the first spectrogram computed, so
        all rows must have the same length. Empty input returns an array
        of shape (0, 1025, 32, 3), the image size previously hard-coded
        for this pipeline.
    """
    imgs = None
    for i in range(rows.shape[0]):
        # Take the magnitude explicitly: amplitude_to_db only uses the
        # magnitude and warns when it is handed the complex STFT.
        magnitude = np.abs(librosa.stft(rows[i]))
        Xdb = librosa.amplitude_to_db(magnitude)
        # NOTE(review): MinMaxScaler scales every column on its own, so each
        # time frame is normalised to [0, 1] independently - confirm intended.
        Xdb = min_max_scaler.fit_transform(Xdb)
        # The colormap returns RGBA; keep the RGB channels only.
        img = convert(Xdb)[:, :, :3]
        # Reverse the first axis so the spectrogram is not upside down when
        # viewed as an image (presumably row 0 is the lowest frequency bin).
        img = np.flip(img, 0)
        if imgs is None:
            # Allocate once the real image shape is known instead of
            # hard-coding it for one particular window length.
            imgs = np.zeros((rows.shape[0],) + img.shape)
        imgs[i] = img
    if imgs is None:
        imgs = np.zeros((0, 1025, 32, 3))
    return imgs

## Reading audio

First we extract the numpy arrays from the audio itself.

In [5]:
def extract_observations(signal, label, samples_per_window=None):
    """
    Split a signal into fixed-length observations and label each of them.

    Args:
        signal: A 1-D np array of audio samples, shape (num_samples,)
        label: The integer speaker label given to every observation
        samples_per_window: Number of samples per observation. Defaults to
            the module-level `samples_per_observation`.

    Returns:
        A tuple (observations, labels). `observations` has shape
        (n, samples_per_window), where n is the number of complete windows
        in the signal; a trailing partial window is dropped. `labels` is an
        integer array of shape (n, 1) filled with `label`. A signal shorter
        than one window yields n == 0.
    """
    if samples_per_window is None:
        samples_per_window = samples_per_observation
    num_observations = signal.shape[0] // samples_per_window
    # Slice by the length to keep rather than `[:-remainder]`: a remainder
    # of 0 would turn that slice into `[:0]` and discard the whole signal.
    truncated = signal[:num_observations * samples_per_window]
    observations = truncated.reshape((num_observations, samples_per_window))
    # Builtin `int` instead of the `np.int` alias, which newer NumPy removed.
    labels = np.full((num_observations, 1), label, dtype=int)
    return observations, labels

def build_rows(path, speaker_label):
    """
    Read one audio file and cut it into labelled observations.

    Args:
        path: Path of the audio file, passed to `sf.read`
        speaker_label: The label attached to every observation of the file

    Returns:
        The (observations, labels) pair produced by `extract_observations`.
    """
    # sf.read also returns the sample rate, which is not used here.
    audio, _ = sf.read(path)
    rows, row_labels = extract_observations(audio, speaker_label)
    return rows, row_labels

In [6]:
## Just figuring out male/female ratio in our dataset
# Speaker metadata table; `df` is also used further down to look up each
# speaker's sex by ID. The first 12 lines of the file are skipped and the
# column names are supplied here (presumably the skipped lines are a header).
df = pd.read_csv(
    '/nb/transfer/data/base/raw/SPEAKERS.TXT',
    skiprows=12,
    # Raw string: `\s` and `\|` are regex escapes, not string escapes, and
    # newer Python versions warn about them in a plain literal.
    sep=r'\s+\|\s+',
    names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'],
    engine='python')
print(df.head())
males = df[df.SEX == 'M']
females = df[df.SEX == 'F']
print("Number of females: " + str(len(females)))
print("Number of males: " + str(len(males)))

   ID SEX           SUBSET  MINUTES              NAME
0  14   F  train-clean-360    25.03   Kristin LeMoine
1  16   F  train-clean-360    25.11    Alys AtteWater
2  17   M  train-clean-360    25.04    Gord Mackenzie
3  19   F  train-clean-100    25.19  Kara Shallenberg
4  20   F  train-other-500    30.07            Gesine
Number of females: 1201
Number of males: 1283


In [10]:
import psutil

## Features and labels of current dataset chunk
features = None
labels = None

## Features and labels of current speaker
nfeatures = None
nlabels = None

speaker_label = 0
chunk_count = 100
speakers_per_chunk = 5
## Stop after this many speakers have been processed
max_speakers = 50
oppos = {
    'M':'F',
    'F':'M'
}
current = 'F'


def _speaker_spectrograms(speaker_path, label):
    """
    Build the spectrograms and labels for every audio file of one speaker.

    Returns (features, labels) stacked over all files, or (None, None) when
    the speaker has no usable audio.
    """
    file_features = []
    file_labels = []
    # Each speaker has multiple books which he/she read
    for book_id in sorted(os.listdir(speaker_path)):
        book_id_path = os.path.join(speaker_path, book_id)
        # Going over all audio samples for a given book
        for file in sorted(os.listdir(book_id_path)):
            # Sometimes there are .txt files in audio samples' folders
            if file.endswith('.txt'):
                continue
            ndata, n_data_labels = build_rows(os.path.join(book_id_path, file), label)
            # Nothing to convert, e.g. the file yielded no complete window
            if ndata.size == 0:
                continue
            file_features.append(get_spectrograms(ndata))
            file_labels.append(n_data_labels)
    if not file_features:
        return None, None
    # Stack once per speaker: vstack copies the whole array on every call,
    # so growing it file by file would be quadratic.
    return np.vstack(file_features), np.vstack(file_labels)


def _save_chunk(chunk_features, chunk_labels, chunk_id):
    """Shuffle a chunk's rows (features and labels together) and save both arrays."""
    permutation = np.random.permutation(chunk_features.shape[0])
    chunk_features = np.take(chunk_features, permutation, axis=0)
    chunk_labels = np.take(chunk_labels, permutation, axis=0)
    np.save(os.path.join(PROCESSED_DIR, 'chunk_%d_features' % chunk_id), chunk_features)
    np.save(os.path.join(PROCESSED_DIR, 'chunk_%d_labels' % chunk_id), chunk_labels)


# Sorted so that the speaker -> label assignment is the same on every run;
# os.listdir returns entries in arbitrary order.
for speaker in sorted(os.listdir(rootdir)):
    # Speaker directories are named by numeric ID; ignore anything else
    if not speaker.isdigit():
        continue
    print(int(speaker))
    # Alternate genders to keep the gender ratio: skip a speaker who has the
    # same gender as the previously processed one.
    if df.loc[df['ID'] == int(speaker)].SEX.item() != oppos[current]:
        continue
    nfeatures, nlabels = _speaker_spectrograms(os.path.join(rootdir, speaker), speaker_label)
    if nfeatures is None:
        # Do not consume a label or a gender slot for an empty speaker
        print('Speaker %s has no usable audio, skipping' % speaker)
        continue
    current = oppos[current]
    ## Add this speaker to our current chunk
    if features is None:
        features = nfeatures
        labels = nlabels
    else:
        features = np.vstack((features, nfeatures))
        labels = np.vstack((labels, nlabels))
    ## Drop the per-speaker arrays so their memory can be reclaimed
    nfeatures = None
    nlabels = None

    speaker_label += 1
    print(features.shape)
    print(labels.shape)
    ## Save speakers_per_chunk speakers' spectrograms into a chunk
    if speaker_label % speakers_per_chunk == 0:
        _save_chunk(features, labels, chunk_count)
        chunk_count += 1
        # Start assembling a new chunk
        features = None
        labels = None
    if speaker_label == max_speakers:
        break
    print('Speaker %d processed' % speaker_label)
    print(psutil.virtual_memory()) ## comment out to silence the memory usage report

## Save the speakers left over when the loop ended between chunk boundaries
if features is not None:
    _save_chunk(features, labels, chunk_count)
    chunk_count += 1
    features = None
    labels = None
print("Dataset building finished")

481


KeyboardInterrupt: 

## Moving To HDF5

The training data is larger than memory, so we transfer everything into a single HDF5 file, one chunk at a time. See the `batcher.py` file for how we make this happen.

In [2]:
from collections import defaultdict
import glob
import os
import re

import numpy as np

from batcher import TrainingBatcher

# Directory layout, from the notebook root down to the processed data.
ROOT_DIR = '/nb'
TRANSFER_DIR = os.path.join(ROOT_DIR, 'transfer')
DATA_DIR = os.path.join(ROOT_DIR, 'transfer', 'data')
BASE_DIR = os.path.join(ROOT_DIR, 'transfer', 'data', 'base')
# Where the saved .npy chunks are read from.
PROCESSED_DIR = os.path.join(BASE_DIR, 'processed')
# Single HDF5 file that the chunks are merged into.
HDF5_FILE = os.path.join(PROCESSED_DIR, 'train.hdf5')

### Write To File

Executing the code below will write all numpy files to `HDF5_FILE`.

In [None]:
numpy_files = defaultdict(list)

# Group the saved arrays by chunk id. The id is parsed from the file name
# only, so digits elsewhere in the directory path cannot be mistaken for it,
# and .npy files that are not chunk files are ignored.
chunk_pattern = re.compile(r'chunk_(\d+)_(features|labels)\.npy$')
for f in glob.glob(os.path.join(PROCESSED_DIR, '*.npy')):
    match = chunk_pattern.match(os.path.basename(f))
    if match is None:
        continue
    numpy_files[int(match.group(1))].append(f)

with TrainingBatcher(HDF5_FILE, 'w') as data:
    # Write the chunks in ascending id order so the file layout is the same
    # on every run (glob returns files in arbitrary order).
    for key in sorted(numpy_files):
        paths = numpy_files[key]
        feature_paths = [p for p in paths if os.path.basename(p).endswith('_features.npy')]
        label_paths = [p for p in paths if os.path.basename(p).endswith('_labels.npy')]
        if not feature_paths or not label_paths:
            # E.g. saving was interrupted between the two np.save calls
            print('Chunk %d is incomplete, skipping' % key)
            continue
        X = np.load(feature_paths[0])
        y = np.load(label_paths[0])
        print(X.shape)
        print(y.shape)
        data.write(X, y)
        print(data.X_ds.shape)
        # Free the chunk before loading the next one
        del X
        del y

(7222, 1025, 32, 3)
(7222, 1)
(7222, 1025, 32, 3)
(6860, 1025, 32, 3)
(6860, 1)
(14082, 1025, 32, 3)
