In [1]:
%%time
%%capture
!pip install pydub > /dev/null

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 657 ms


In [2]:
%%time
import random
import base64
import json
import tarfile
import wave
from contextlib import closing
from os import listdir, makedirs
from os.path import isfile, join
from pickle import dump
from sagemaker.mxnet import MXNet
from shutil import rmtree, copy2
from urllib.request import urlretrieve
from tempfile import gettempdir

import boto3
import cv2
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
import sagemaker
from pydub import AudioSegment

CPU times: user 1.57 s, sys: 588 ms, total: 2.16 s
Wall time: 1.47 s


In [3]:
sagemaker_session = sagemaker.Session()

## Data Generation

In [4]:
rmtree("data/sentences", True)
makedirs("data/sentences")

urlretrieve("http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz",
            "data/sentences/sentences.tar.gz")

tar = tarfile.open("data/sentences/sentences.tar.gz")
tar.extractall("data/sentences")
tar.close()

with open("data/sentences/plot.tok.gt9.5000", "r", encoding = "ISO-8859-1") as first_file:
    first_sentences = first_file.read().split("\n")[0:5000]
with open("data/sentences/quote.tok.gt9.5000", "r", encoding = "ISO-8859-1") as second_file:
    second_sentences = second_file.read().split("\n")[0:5000]
    
rmtree("data/sentences", True)
makedirs("data/sentences")

with open("data/sentences/sentences.txt", "w") as sentences_file:
    for sentence in first_sentences + second_sentences:
        sentences_file.write("{}\n".format(sentence))

CPU times: user 457.37 s, sys: 56.00 s, total: 513.37 s
Wall time: 356.27 s


In [5]:
with open("data/sentences/sentences.txt", "r", encoding = "ISO-8859-1") as sentences_file:
    sentences = sentences_file.read().split("\n")[:-1]

In [6]:
voices = ["Ivy", "Joanna", "Joey", "Justin", "Kendra", "Kimberly", "Matthew", "Salli"]

In [7]:
%%time
client = boto3.client("polly")

i = 1

random.seed(42)

rmtree("data/mp3", True)
makedirs("data/mp3")

for sentence in sentences:
    voice = random.choice(voices)
    file_mask = "data/mp3/sample-{:05}-{}.mp3".format(i, voice)
    i += 1
    response = client.synthesize_speech(
        OutputFormat="mp3",
        Text=sentence,
        TextType="text",
        VoiceId=voice
    )
    with open(file_mask, "wb") as out:
        with closing(response["AudioStream"]) as stream:
            out.write(stream.read())

In [8]:
mp3_files = sorted([f for f in listdir("data/mp3") if isfile(join("data/mp3", f))])

In [9]:
%%time
rmtree("data/wav", True)
makedirs("data/wav")

sample_start = random.randint(500, 1000)
sample_finish = sample_start + 2000

for mp3 in mp3_files:
    sound = AudioSegment.from_mp3("data/mp3/{}".format(mp3))[sample_start:sample_finish]
    sound.export("data/wav/{}wav".format(mp3[:-3]), format="wav")

CPU times: user 457.37 s, sys: 56.00 s, total: 513.37 s
Wall time: 356.27 s


In [10]:
def graph_spectrogram(wav_file, out):
    wav = wave.open(wav_file, "r")
    frames = wav.readframes(-1)
    sound_info = np.frombuffer(frames, "int16")
    frame_rate = wav.getframerate()
    wav.close()
    fig = plt.figure()
    fig.set_size_inches((1.4, 1.4))
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    plt.set_cmap("hot")
    plt.specgram(sound_info, Fs=frame_rate)
    plt.savefig(out, format="png")
    plt.close(fig)

In [11]:
wav_files = sorted([f for f in listdir("data/wav/") if isfile(join("data/wav/", f))])

In [12]:
%%time
%%capture --no-stdout --no-display
rmtree("data/spectograms", True)
makedirs("data/spectograms")

for wav in wav_files:
    graph_spectrogram("data/wav/{}".format(wav), "data/spectograms/{}png".format(wav[:-3]))

CPU times: user 457.37 s, sys: 56.00 s, total: 513.37 s
Wall time: 356.27 s


In [13]:
spectograms = sorted([join("data/spectograms/", f) for f in listdir("data/spectograms/") if isfile(join("data/spectograms/", f))])

In [14]:
df = pd.DataFrame({
    "wav": [join("data/wav/", f) for f in wav_files],
    "mp3": [join("data/mp3/", f) for f in mp3_files],
    "spectogram": spectograms
})
df["label"] = df.spectogram.str.extract("sample-\\d+-(\\w+)\\.png", expand=False).apply(lambda x: voices.index(x))
df["voice"] = df.spectogram.str.extract('sample-\\d+-(\\w+)\\.png', expand=False)

In [15]:
train = df.groupby("voice").apply(lambda x: x.sample(frac=.8)).reset_index(0, drop=True)
validation = df.loc[~df.index.isin(train.index), :].groupby("voice").apply(lambda x: x.sample(frac=.5)).reset_index(0, drop=True)
test = df.loc[np.logical_not(np.logical_xor(~df.index.isin(train.index), ~df.index.isin(validation.index))), :]

In [16]:
def transform(row):
    img = cv2.imread(row["spectogram"])
    img = mx.nd.array(img)
    img = img.astype(np.float32)
    img = mx.nd.transpose(img, (2, 0, 1))
    img = img / 255
    label = np.float32(row["label"])
    return img, label

In [17]:
%%time
train_nd = [transform(row) for _, row in train.iterrows()]
validation_nd = [transform(row) for _, row in validation.iterrows()]

CPU times: user 26.8 s, sys: 2.54 s, total: 29.3 s
Wall time: 13.6 s


In [18]:
def save_to_disk(data, type):
    makedirs("{}/pvdwgmas/data/pickles/{}".format(gettempdir(), type))
    with open("{}/pvdwgmas/data/pickles/{}/data.p".format(gettempdir(), type), "wb") as out:
        dump(data, out)

In [19]:
%%time
rmtree("{}/pvdwgmas".format(gettempdir()), True)

save_to_disk(train_nd, "train")
save_to_disk(validation_nd, "validation")

CPU times: user 1.85 s, sys: 4.25 s, total: 6.1 s
Wall time: 7.44 s


In [20]:
%%time
inputs = sagemaker_session.upload_data(path="{}/pvdwgmas/data/pickles".format(gettempdir()),
                                       bucket="redacted", key_prefix="cosmin/sagemaker/demo")
rmtree("{}/pvdwgmas".format(gettempdir()), True)

CPU times: user 18.9 s, sys: 7.9 s, total: 26.8 s
Wall time: 26.9 s


In [21]:
rmtree("data/test", True)
makedirs("data/test")
for _, row in test.iterrows():
    makedirs("data/test/{}".format(row["voice"]), exist_ok=True)
    copy2(row["mp3"], "data/test/{}".format(row["voice"]))

## Model training

In [22]:
estimator = MXNet("voice-recognition-sagemaker-script.py", 
          role=sagemaker.get_execution_role(), 
          train_instance_count=1, 
          train_instance_type="ml.p2.xlarge",
          hyperparameters={"epochs": 5},
          py_version="py3")

In [23]:
estimator.fit(inputs)

INFO:sagemaker:Created S3 bucket: redacted
INFO:sagemaker:Creating training-job with name: sagemaker-mxnet-2018-05-29-10-33-20-284


..............................
[31m2018-05-29 10:38:13,223 INFO - root - running container entrypoint[0m
[31m2018-05-29 10:38:13,224 INFO - root - starting train task[0m
[31m2018-05-29 10:38:13,243 INFO - container_support.training - Training starting[0m
[31m2018-05-29 10:38:14,949 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'user_requirements_file': None, 'model_dir': '/opt/ml/model', 'input_dir': '/opt/ml/input', '_scheduler_ip': '10.32.0.4', 'output_dir': '/opt/ml/output', 'hosts': ['algo-1'], 'channel_dirs': {'training': '/opt/ml/input/data/training'}, 'code_dir': '/opt/ml/code', 'sagemaker_region': 'us-east-1', 'input_config_dir': '/opt/ml/input/config', '_scheduler_host': 'algo-1', 'container_log_level': 20, 'job_name': 'sagemaker-mxnet-2018-05-29-10-33-20-284', 'output_data_dir': '/opt/ml/output/data/', 'enable_cloudwatch_metrics': False, 'base_dir': '/opt/ml', 'user_script_name': 'voice-recognition-sagemaker-script.py', 'user_script_archive': 's3://redacted

  for idx, event in sagemaker.logs.multi_stream_iter(client, log_group, stream_names, positions):


[31mEpoch 0. Loss: 1.19020674213, Train_acc 0.927615951994, Test_acc 0.924924924925[0m
[31mEpoch 1. Loss: 0.0955917794597, Train_acc 0.910488811101, Test_acc 0.904904904905[0m
[31mEpoch 2. Loss: 0.0780380586131, Train_acc 0.982872859107, Test_acc 0.967967967968[0m
[31mEpoch 3. Loss: 0.0515212092374, Train_acc 0.987123390424, Test_acc 0.95995995996[0m
[31mEpoch 4. Loss: 0.0513322874282, Train_acc 0.995874484311, Test_acc 0.978978978979[0m
===== Job Complete =====
Billable seconds: 337


## Model Deployment

In [24]:
predictor = estimator.deploy(instance_type="ml.m4.xlarge", initial_instance_count=1)

INFO:sagemaker:Creating model with name: sagemaker-mxnet-2018-05-29-10-33-20-284
INFO:sagemaker:Creating endpoint with name sagemaker-mxnet-2018-05-29-10-33-20-284


--------------------------------------------------------------------------!

## Testing Inference

In [25]:
sagemaker_runtime_client = boto3.client("sagemaker-runtime")

### Single Sample

In [26]:
with open("Kimberly recites some shameless self promotion ad.mp3", "rb") as audio_file:
    payload = base64.b64encode(audio_file.read()).decode("utf-8")
    response = sagemaker_runtime_client.invoke_endpoint(
        EndpointName=predictor.endpoint,
        Body=payload,
        ContentType="audio/mp3",
        Accept="application/json"
    )["Body"].read()
    print("Kimberly predicted as {}".format(json.loads(response, encoding="utf-8")))

Kimberly predicted as Joanna


### Batch Mode

In [27]:
for directory in listdir("data/test"):
    batch = []
    cnt = 0
    total = 0
    detected = 0
    for file in listdir("data/test/{}".format(directory)):
        with open("data/test/{}/{}".format(directory, file), "rb") as audio_file:
            batch.append(base64.b64encode(audio_file.read()).decode("utf-8"))
            cnt += 1
            if cnt == 5:
                binary_json = json.dumps(batch).encode("utf-8")
                response = sagemaker_runtime_client.invoke_endpoint(
                    EndpointName=predictor.endpoint,
                    Body=binary_json,
                    ContentType="application/json",
                    Accept="application/json"
                )["Body"].read()
                individual_predictions = json.loads(response, encoding="utf-8")
                for prediction in individual_predictions:
                    total += 1 
                    if prediction == directory:
                        detected += 1
                cnt = 0
                batch = []
    print("""Recordings with {}:
        Total: {}
        Detected: {}
        Accuracy: {:0.2f}
    """.format(directory, str(total), str(detected), detected/total))

Recordings with Salli:
        Total: 125
        Detected: 121
        Accuracy: 0.97
    
Recordings with Kimberly:
        Total: 120
        Detected: 115
        Accuracy: 0.96
    
Recordings with Joey:
        Total: 125
        Detected: 125
        Accuracy: 1.00
    
Recordings with Justin:
        Total: 120
        Detected: 111
        Accuracy: 0.93
    
Recordings with Matthew:
        Total: 125
        Detected: 125
        Accuracy: 1.00
    
Recordings with Kendra:
        Total: 120
        Detected: 117
        Accuracy: 0.97
    
Recordings with Ivy:
        Total: 115
        Detected: 111
        Accuracy: 0.97
    
Recordings with Joanna:
        Total: 130
        Detected: 128
        Accuracy: 0.98
    
