In [1]:
import os
import io
import random
import json
import base64
import subprocess
from pickle import dump
from shutil import rmtree, copy2

import cv2
import boto3
import mxnet as mx
import numpy as np
from pandas import read_csv
import sagemaker

In [2]:
# SageMaker session shared by the notebook; it is used further down to upload
# the pickled datasets via `upload_data`.
sagemaker_session = sagemaker.Session()

## Synthetic Data Generation

In [3]:
# Tunable parameters of the synthetic dataset.
dataset_size = 12000  # number of samples produced by the generation loop
img_size = 128  # written as `img_dim` into the header of every .shape file
shape_size = 8  # written as `shp_dim` into the header of every .shape file
max_rows = 14  # upper bound (inclusive) for the random number of rows per sample
max_cols = 14  # upper bound (inclusive) for the random number of shapes per row
target_shape = "circle"  # occurrences of this shape are counted to form the label

In [4]:
# Vocabulary of shapes the generator picks from (by random index).
shapes = ["circle", "triangle", "square"]
# Images are square: width and height both equal img_size.
# NOTE(review): neither name is referenced in the cells visible here -- confirm they are still needed.
img_width = img_height = img_size

In [5]:
%%time

# Build the synthetic dataset: one ".shape" description file per sample plus a
# CSV index that maps an image path to its label (the number of `target_shape`
# occurrences in that sample).

# exist_ok=True keeps this cell re-runnable when an earlier run was interrupted
# before the "data" directory was cleaned up.
os.makedirs("data/raw", exist_ok=True)

# Fixed seed so that the generated dataset is reproducible.
random.seed(42)

with open("data/dataset.csv", "w") as dataset:
    dataset.write("img_path,target\n")
    for i in range(1, dataset_size + 1):
        filename = str(i).zfill(6) + ".shape"

        # Draw order matters for reproducibility: first the number of rows,
        # then for every row its number of columns followed by one draw per
        # shape. randint is kept (instead of random.choice) so the seeded
        # sequence stays exactly the same.
        grid = [
            [
                shapes[random.randint(0, len(shapes) - 1)]
                for _col in range(random.randint(1, max_cols))
            ]
            for _row in range(random.randint(1, max_rows))
        ]

        # Label: how many times the target shape occurs in the whole grid.
        target_number_of_shapes = sum(row.count(target_shape) for row in grid)

        # Shapes within a row are comma separated, rows are pipe separated and
        # the payload is wrapped in ">>>" / "<<<" after the dimension header.
        file_content = "img_dim:{},shp_dim:{}>>>{}<<<".format(
            img_size,
            shape_size,
            "|".join(",".join(row) for row in grid),
        )

        with open("./data/raw/{}".format(filename), "w") as shape_file:
            shape_file.write(file_content)

        # NOTE(review): the indexed path is "<n>.shape.png"; this assumes the
        # renderer appends ".png" to the source file name -- confirm.
        dataset.write("./raw/{}.png,{}\n".format(filename, target_number_of_shapes))

CPU times: user 2.89 s, sys: 2.12 s, total: 5.01 s
Wall time: 5.13 s


In [6]:
%%time
# Run the Shaper2Image compiler over data/raw (presumably it renders every
# .shape file into the PNG referenced by dataset.csv -- confirm).
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# run stops the notebook here instead of surfacing later as unreadable images.
# Passing an argument list avoids going through the shell.
subprocess.check_call([
    "java",
    "-cp", "bin/shaper-all.jar",
    "com.cosminsanda.shaper.compiler.Shaper2Image",
    "--source-dir", "data/raw",
]);

CPU times: user 3.12 ms, sys: 10.5 ms, total: 13.6 ms
Wall time: 49.8 s


0

In [7]:
# Load the generated index and carve it into three disjoint subsets.
df = read_csv("data/dataset.csv")

# Seeded sample of ~83% of the rows for training.
train = df.sample(frac=.8333, random_state=42)
# Half of the rows outside the training set become the validation set ...
validation = df.drop(train.index).sample(frac=.5, random_state=42)
# ... and the rows that belong to neither subset form the test set.
test = df.drop(train.index.union(validation.index))

In [8]:
def transform(row):
    """Load one dataset row as an ``(image, label)`` pair.

    Parameters
    ----------
    row : mapping
        Must provide ``"img_path"`` (image path relative to ``./data``) and
        ``"target"`` (the label value).

    Returns
    -------
    tuple
        ``(img, label)``: ``img`` is a float32 MXNet NDArray whose axes were
        reordered with ``(2, 0, 1)`` and whose values were divided by 255;
        ``label`` is the target converted to ``numpy.float32``.

    Raises
    ------
    FileNotFoundError
        If the image file cannot be read.
    """
    path = "./data/{}".format(row["img_path"])
    img = cv2.imread(path)
    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising; fail early with the offending path rather than deep inside MXNet.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    img = mx.nd.array(img)
    img = img.astype(np.float32)
    # Move the last axis (channels in cv2's output) to the front.
    img = mx.nd.transpose(img, (2, 0, 1))
    img = img / 255
    label = np.float32(row["target"])
    return img, label

In [9]:
%%time
# Materialise the training and validation splits (in that order) as in-memory
# lists of (image, label) pairs.
train_nd, validation_nd = (
    [transform(record) for _, record in split.iterrows()]
    for split in (train, validation)
)

CPU times: user 10.3 s, sys: 2.66 s, total: 13 s
Wall time: 11.7 s


In [10]:
def save_to_disk(data, type):
    """Pickle ``data`` to ``data/pickles/<type>/data.p``.

    Parameters
    ----------
    data : object
        Any picklable object.
    type : str
        Name of the sub-directory under ``data/pickles`` (e.g. ``"train"``).
        The name shadows the builtin but is kept for keyword callers.

    The target directory is created when missing; an existing directory is
    reused and an existing ``data.p`` is overwritten, so the function can be
    called repeatedly for the same ``type``.
    """
    target_dir = "data/pickles/{}".format(type)
    # exist_ok=True: re-running the notebook must not fail on a leftover directory.
    os.makedirs(target_dir, exist_ok=True)
    with open("{}/data.p".format(target_dir), "wb") as out:
        dump(data, out)

In [11]:
%%time
# Persist each prepared split into its own folder under data/pickles.
for split_name, split_data in (("train", train_nd), ("validation", validation_nd)):
    save_to_disk(split_data, split_name)

CPU times: user 3.74 s, sys: 7.38 s, total: 11.1 s
Wall time: 12.4 s


In [12]:
%%time
# Upload the local data/pickles directory to the given S3 bucket under the
# "sagemaker/demo" key prefix. The return value is kept in `inputs`
# (presumably the S3 location later handed to training -- confirm at its use).
# NOTE(review): the bucket name is a redaction placeholder; set a real bucket before running.
inputs = sagemaker_session.upload_data(path="data/pickles", bucket="redacted", key_prefix="sagemaker/demo")

CPU times: user 23.9 s, sys: 19.1 s, total: 43 s
Wall time: 5min 17s


In [13]:
# Start from an empty ./test folder, then file every test image under a
# sub-folder named after its label.
rmtree("./test", True)
os.makedirs("./test")
for img_path, target in zip(test["img_path"], test["target"]):
    os.makedirs("test/{}".format(target), exist_ok=True)
    copy2("./data/{}".format(img_path), "./test/{}".format(target))

In [14]:
# Remove the local "data" working directory (raw files, images, pickles).
# The second positional argument is ignore_errors=True, so removal errors
# (e.g. the directory not existing) are suppressed.
rmtree("data", True)

## Model Training

In [22]:
# Configure the SageMaker MXNet estimator: the training entry-point script, the
# execution role of the current environment, one ml.p2.xlarge training
# instance, 5 epochs passed as a hyperparameter, and a Python 3 container.
# NOTE(review): `train_instance_count` / `train_instance_type` look like the
# v1 SageMaker Python SDK parameter names -- confirm against the installed SDK.
estimator = sagemaker.mxnet.MXNet("object-counting-sagemaker-script.py", 
          role=sagemaker.get_execution_role(), 
          train_instance_count=1, 
          train_instance_type="ml.p2.xlarge",
          hyperparameters={"epochs": 5},
          py_version="py3")