In [None]:
# Setup the folder structure. It might already be there, and that's fine
from os import makedirs

makedirs("data/sentences", exist_ok=True)

Use the _magic_ [sh](https://ipython.readthedocs.io/en/stable/interactive/magics.html#cellmagic-sh) directive to run CLI commands.

In [None]:
%%sh
# We add -d to only output directories
tree . -d

In [None]:
# Download a dataset made of sentences found in movie reviews. The sentences need to be at least 3 seconds long when
# being read out loud. The sentences are packed in a gzip-compressed tarball archive as two files.
from urllib.request import urlretrieve

urlretrieve("http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz",
            "data/sentences/sentences.tar.gz")

In [None]:
%%sh
ls -al data/sentences/

In [None]:
# Extract the files from the tarball archive.
# We use the "with" syntax so that the file handler is closed automatically afterwards.
import tarfile

# The archive was fetched over plain HTTP, so treat its contents as untrusted.
# Where the interpreter supports extraction filters, the "data" filter refuses members
# that would land outside the target folder (absolute paths, "..", links pointing outside).
with tarfile.open("data/sentences/sentences.tar.gz") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall("data/sentences", filter="data")
    else:
        # Older interpreters have no filter support; extract as before.
        tar.extractall("data/sentences")

In [None]:
%%sh
# We can now see two files that have the extension 5000
# The 5000 represents how many samples are in each of the files.
ls -al data/sentences/

In [None]:
%%sh
# Sentences are one per line. here are the first 3
head -n3 data/sentences/quote.tok.gt9.5000

In [None]:
# Build one flat list of sentences out of the two corpus files.
# Both files are stored as ISO-8859-1, so they have to be decoded with that same encoding.
corpus_files = (
    "data/sentences/plot.tok.gt9.5000",
    "data/sentences/quote.tok.gt9.5000",
)

sentences = []
for corpus_file in corpus_files:
    with open(corpus_file, "r", encoding="ISO-8859-1") as fp:
        lines = fp.read().split("\n")
    # Keep at most the first 5000 lines of each file
    sentences.extend(lines[0:5000])

In [None]:
# First few sentences
sentences[:3]

In [None]:
makedirs("data/mp3", exist_ok=True)

In [None]:
import boto3

# This is a client object that allows us to communicate with the AWS service
# We need an instance of the Polly client
session = boto3.session.Session()
client = session.client("polly", region_name="us-east-1")

In [None]:
# Define a list with the voices in AWS Polly
voices = ["Ivy", "Joanna", "Joey", "Justin", "Kendra", "Kimberly", "Matthew", "Salli"]

In [None]:
# closing is a convenience stream utility so that we
# don't have to close the stream manually.
# io allows us to work with stream of data coming from Polly.
import random
from contextlib import closing
import io

# We need an id in the form of a counter for the sentences sequence,
# and the sentence itself
def process_input(i, sentence):
    """Synthesize one sentence with a randomly chosen Polly voice and save it as an mp3.

    Args:
        i: Position of the sentence in the sequence; it is zero-padded to
            5 digits in the file name.
        sentence: The plain text that Polly should read out loud.

    Returns:
        None. On success the audio is written to
        ``data/mp3/sample-<i>-<voice>.mp3``. If anything fails (e.g. AWS is
        throttling us) the sentence is skipped and a short notice is printed.
    """
    # We randomly pick a voice to say the sentence out loud.
    voice = random.choice(voices)

    # The path to the mp3 we are about to download and write to disk.
    # The voice that was used is part of the file name. We can use it for
    # labeling our training data later on.
    file_mask = "data/mp3/sample-{:05}-{}.mp3".format(i, voice)

    # The request itself is inside the try block as well: a throttled or rejected
    # request fails right here, and it has to be skipped like any other failure.
    try:
        # Ask Polly to do its magic
        response = client.synthesize_speech(
            OutputFormat="mp3",
            Text=sentence,
            TextType="text",
            VoiceId=voice
        )
        # Read the whole mp3 before touching the disk, so that a failed download
        # never leaves an empty or truncated file behind.
        with closing(response["AudioStream"]) as stream:
            audio = stream.read()
        # Finally write the mp3 to disk
        with open(file_mask, "wb") as out:
            out.write(audio)
    except Exception as err:
        # Best effort: one missing sample is fine, but say which one and why.
        print("Skipping {}: {!r}".format(file_mask, err))

In [None]:
# This takes about 5 seconds

# To move things faster we'll do "multi threading"
from multiprocessing.pool import ThreadPool

# The pool runs at most 10 calls at a time, one call per (index, sentence) pair.
# To keep the rest of the tutorial reasonably fast, we use just the first 200 sentences
# instead of the 10,000 we have available.
# This will, of course, have a negative impact on the final model.
indexed_sentences = enumerate(sentences[:200])
with ThreadPool(processes=10) as pool:
    pool.starmap(process_input, indexed_sentences)

In [None]:
%%sh
# Show a few mp3 files
# We use the pipe operator for applying a filter to the output
ls -al data/mp3/ | head -6

In [None]:
from os import listdir
from os.path import isfile, join

# Create a list of the mp3 files we have. We look at all the entries in the data/mp3 folder,
# but keep only the synthesized samples ("sample-<id>-<voice>.mp3"). Other mp3 files that end up
# in this folder (e.g. the ones copied in further down, when the notebook is run again)
# must not become part of the training set.
mp3_files = sorted(
    f for f in listdir("data/mp3")
    if isfile(join("data/mp3", f)) and f.startswith("sample-") and f.endswith(".mp3")
)

In [None]:
from IPython.display import Audio
# demo the audio
Audio(filename=f"data/mp3/{mp3_files[3]}", autoplay=True)

In [None]:
# We only want to use a couple of seconds, so we trim the recordings
sample_start = random.randint(500, 1000)
sample_finish = sample_start + 2000

In [None]:
makedirs("data/wav", exist_ok=True)

In [None]:
from pydub import AudioSegment

# Convert one mp3 file into a trimmed waveform (wav) file.
# The first argument is ignored; it only absorbs the index that enumerate supplies.
def process_mp3(_, mp3):
    """Trim data/mp3/<mp3> to the sample window and export it to data/wav/ as wav.

    The window is [sample_start:sample_finish], both read from the notebook globals.
    The wav file keeps the name of the mp3, with the last three characters replaced by "wav".
    """
    # Spectrograms are built from waveform files, so we convert with the pydub library
    source_path = f"data/mp3/{mp3}"
    target_path = f"data/wav/{mp3[:-3]}wav"
    full_clip = AudioSegment.from_mp3(source_path)
    trimmed_clip = full_clip[sample_start:sample_finish]
    trimmed_clip.export(target_path, format="wav")

In [None]:
# This takes about 20 seconds

# At most 20 conversions run at the same time.
# starmap unpacks each (index, file name) pair produced by enumerate; process_mp3 ignores
# the index (hence the _ in its arguments list) and works on the file name only.
indexed_mp3_files = enumerate(mp3_files)
with ThreadPool(processes=20) as pool:
    pool.starmap(process_mp3, indexed_mp3_files)

In [None]:
%%sh
# Show a few waveform files
ls -al data/wav/ | head -6

In [None]:
# Keep only the converted samples ("sample-<id>-<voice>.wav"); any other wav left in the
# folder (e.g. by the prediction demos of an earlier run) is not training data.
wav_files = sorted(
    f for f in listdir("data/wav/")
    if isfile(join("data/wav/", f)) and f.startswith("sample-") and f.endswith(".wav")
)

In [None]:
Audio(filename=f"data/wav/{wav_files[3]}", autoplay=True)
# The waveform file has a shorter length than the original, because we trim it to a small fixed length.

In [None]:
import numpy as np
# matplotlib allows us to plot graphical representations. We use a specific backend engine for this, agg.
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import wave

# Function for generating a spectrogram image file from a waveform audio file
def graph_spectrogram(wav_file):
    """Render data/wav/<wav_file> as a borderless spectrogram png in data/spectrograms/.

    Args:
        wav_file: File name (not a path) of a wav file inside data/wav/. The png
            keeps the same name, with the last three characters replaced by "png".

    The raw frames are interpreted as 16-bit integers.
    """
    # The context manager closes the wav file even when reading it fails
    with wave.open(f"data/wav/{wav_file}", "r") as wav:
        frames = wav.readframes(-1)
        frame_rate = wav.getframerate()
    sound_info = np.frombuffer(frames, "int16")

    fig = plt.figure()
    try:
        fig.set_size_inches((1.4, 1.4))
        # A single axes covering the whole figure, with ticks and frame hidden,
        # so that the saved image contains nothing but the spectrogram
        ax = plt.Axes(fig, [0., 0., 1., 1.])
        ax.set_axis_off()
        fig.add_axes(ax)
        plt.set_cmap("hot")
        plt.specgram(sound_info, Fs=frame_rate)
        plt.savefig(f"data/spectrograms/{wav_file[:-3]}png", format="png")
    finally:
        # Always release the figure; this function is called once per file and
        # figures that are never closed pile up in memory
        plt.close(fig)

In [None]:
makedirs("data/spectrograms", exist_ok=True)

In [None]:
# This takes about 10 seconds
# This is better if it happens sequentially, because of the amount of memory used by the plotting library

for wav_file in wav_files:
    graph_spectrogram(wav_file)

In [None]:
%%sh
# Show a few spectrogram files
ls -al data/spectrograms/ | head -6

In [None]:
# Keep only the spectrograms of the synthesized samples ("sample-<id>-<voice>.png").
# The prediction demos further down save their pngs into this folder too; when the notebook
# is run again those files carry no voice in their name and would break the labeling step.
spectrograms = sorted(
    join("data/spectrograms/", f)
    for f in listdir("data/spectrograms/")
    if isfile(join("data/spectrograms/", f)) and f.startswith("sample-") and f.endswith(".png")
)

In [None]:
from IPython.display import Image

Image(filename = spectrograms[3])

In [None]:
# We use pandas data frames, because it makes it easy for us to create a train/test dataset
import pandas as pd

df = pd.DataFrame({"spectrogram": spectrograms})
# The voice is encoded in the file name: sample-<id>-<voice>.png
df["voice"] = df["spectrogram"].str.extract(r"sample-\d+-(\w+)\.png", expand=False)
# Files that do not follow the naming scheme, or that name an unknown voice, cannot be
# labeled; drop them instead of failing on voices.index
df = df[df["voice"].isin(voices)].reset_index(drop=True)
# The label is the position of the voice in the voices list
df["label"] = df["voice"].map(voices.index)
# Same column order as before: spectrogram, label, voice
df = df[["spectrogram", "label", "voice"]]

In [None]:
df

In [None]:
# Stratified split: draw 80% of the rows of every voice for training ...
per_voice = df.groupby("voice")
train = per_voice.apply(lambda rows: rows.sample(frac=.8)).reset_index(0, drop=True)
# ... and every row that was not drawn goes to validation
in_train = df.index.isin(train.index)
validation = df.loc[~in_train, :]

In [None]:
train.groupby("voice")["label"].count().reset_index()

In [None]:
validation.groupby("voice")["label"].count().reset_index()

In [None]:
# The computer vision library for Python allows us to read images as numerical arrays
import cv2
import mxnet as mx

# MXNet understands numbers, so our spectrograms need to become just that. Labels as well.
def transform(row):
    """Turn one row into an (image, label) pair that can be fed to the network.

    Args:
        row: Mapping with a "spectrogram" entry (path of the image file) and a
            "label" entry (numeric label).

    Returns:
        A tuple (img, label): img is a float32 MXNet NDArray with the color axis
        moved to the front and the values divided by 255; label is a numpy float32.

    Raises:
        OSError: If the image file cannot be read.
    """
    img = cv2.imread(row["spectrogram"])
    # cv2.imread does not raise on a missing or unreadable file, it returns None.
    # Fail here with the offending path instead of with an obscure error further down.
    if img is None:
        raise OSError("Cannot read spectrogram image: {}".format(row["spectrogram"]))
    img = mx.nd.array(img)
    img = img.astype(np.float32)
    # MXNet CNNs require the color axis first: (colors, x, y) instead of (x, y, colors)
    img = mx.nd.transpose(img, (2, 0, 1))
    # Normalize data between 0 and 1 (assumes 8-bit color values)
    img = img / 255
    label = np.float32(row["label"])
    return img, label

In [None]:
train_nd = [transform(row) for _, row in train.iterrows()]
validation_nd = [transform(row) for _, row in validation.iterrows()]

In [None]:
train_nd[0]

In [None]:
batch_size = 16
epochs = 5

In [None]:
from mxnet.gluon.data import DataLoader

train_data = DataLoader(train_nd, batch_size, shuffle=True)
validation_data = DataLoader(validation_nd, batch_size, shuffle=True)

In [None]:
from mxnet.gluon.nn import Sequential, Conv2D, MaxPool2D, Dropout, Flatten, Dense

# A small convolutional network: two convolutions, one pooling step, dropout,
# and a dense layer with 8 outputs (the voices list has 8 entries).
net = Sequential()
with net.name_scope():
    # Two 3x3 convolutions with 32 channels each, no padding, ReLU activation
    net.add(Conv2D(channels=32, kernel_size=(3, 3), padding=0, activation="relu"))
    net.add(Conv2D(channels=32, kernel_size=(3, 3), padding=0, activation="relu"))
    net.add(MaxPool2D(pool_size=(2, 2))) # Reduces overfitting, reduces spatial input, reduces computation
    net.add(Dropout(.25)) # Reduces overfitting
    # Flatten the feature maps so they can be fed to the dense layer
    net.add(Flatten())
    net.add(Dense(8))

In [None]:
from mxnet.initializer import Xavier

# Also known as Glorot
net.initialize(Xavier(magnitude=2.24), ctx=mx.cpu())

In [None]:
from mxnet.gluon import Trainer

trainer = Trainer(net.collect_params(), optimizer="adam")

In [None]:
from mxnet.gluon.contrib import estimator
from mxnet.metric import Accuracy
from mxnet.gluon.loss import SoftmaxCrossEntropyLoss

est = estimator.Estimator(net=net, loss=SoftmaxCrossEntropyLoss(), metrics=Accuracy(), trainer=trainer)
# Train for the number of epochs configured next to batch_size, instead of repeating the literal here
est.fit(train_data=train_data, epochs=epochs, val_data=validation_data)

In [None]:
Audio(filename="Kimberly recites some shameless self promotion ad.mp3", autoplay=True)

In [None]:
%%sh
# Copy the ready-made sample into the mp3 folder because that's where the processing function expects it to be
cp Kimberly\ recites\ some\ shameless\ self\ promotion\ ad.mp3 data/mp3/

In [None]:
%%sh
ls -al data/mp3 | head -6

In [None]:
process_mp3(None, "Kimberly recites some shameless self promotion ad.mp3")

In [None]:
%%sh
ls -al data/wav | head -6

In [None]:
graph_spectrogram("Kimberly recites some shameless self promotion ad.wav")

In [None]:
%%sh
ls -al data/spectrograms | head -6

In [None]:
Image(filename = "data/spectrograms/Kimberly recites some shameless self promotion ad.png")

In [None]:
row = {
    "spectrogram": "data/spectrograms/Kimberly recites some shameless self promotion ad.png",
    "label": -1
}

In [None]:
img_as_ndarray, _ = transform(row)

In [None]:
img_as_ndarray.shape

In [None]:
one_ndarray_batch = mx.ndarray.expand_dims(img_as_ndarray, axis=0)

In [None]:
one_ndarray_batch

In [None]:
raw_prediction = net(one_ndarray_batch)

In [None]:
raw_prediction

In [None]:
# Index of the highest score = position of the predicted voice in the voices list.
# np.int was only an alias of the builtin int and no longer exists in current NumPy.
idx = mx.nd.argmax(raw_prediction, axis=1) \
    .asnumpy() \
    .astype(int) \
    .ravel()[0]

In [None]:
idx

In [None]:
voices[idx]

In [None]:
test_phrase = input("What phrase to pronounce?")
test_voice = input("Ivy, Joanna, Joey, Justin, Kendra, Kimberly, Matthew or Salli?")

In [None]:
session = boto3.session.Session()
client = session.client("polly", region_name="us-east-1")
test_response = client.synthesize_speech(
    OutputFormat="mp3",
    Text=test_phrase,
    TextType="text",
    VoiceId=test_voice
)
# If anything goes wrong while downloading or saving the audio, report it and keep going.
# The whole stream is read before test.mp3 is opened, so a failed download never
# leaves an empty or truncated file behind.
try:
    with closing(test_response["AudioStream"]) as stream:
        audio = stream.read()
    # Finally write the mp3 to disk
    with open("test.mp3", "wb") as out:
        out.write(audio)
except Exception as err:
    print("Could not save test.mp3: {!r}".format(err))

In [None]:
Audio(filename="test.mp3", autoplay=True)

In [None]:
%%sh
cp test.mp3 data/mp3/

In [None]:
# Run the freshly synthesized phrase through the same pipeline as the training data:
# mp3 -> trimmed wav -> spectrogram png -> NDArray -> network
process_mp3(None, "test.mp3")
graph_spectrogram("test.wav")
test_row = {
    "spectrogram": "data/spectrograms/test.png",
    # Placeholder; the label is not used for prediction
    "label": -1
}
test_img_as_ndarray, _ = transform(test_row)
# The network expects a batch, so add a leading batch axis of size 1
test_raw_prediction = net(mx.ndarray.expand_dims(test_img_as_ndarray, axis=0))
# np.int was only an alias of the builtin int and no longer exists in current NumPy
test_idx = mx.nd.argmax(test_raw_prediction, axis=1) \
    .asnumpy() \
    .astype(int) \
    .ravel()[0]
voices[test_idx]