# Google Quick Draw Embedding

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import ndjson

# Load every teapot drawing from the newline-delimited JSON export.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding (the notebook also writes UTF-8).
with open('data/full_simplified_teapot.ndjson', encoding='utf-8') as f:
    data = ndjson.load(f)

In [5]:
# Peek at the first record to see the schema: metadata fields ('word',
# 'countrycode', 'timestamp', 'recognized', 'key_id') plus 'drawing', a list
# of strokes where each stroke is a pair of coordinate lists.
data[0]

{'word': 'teapot',
 'countrycode': 'US',
 'timestamp': '2017-03-15 04:43:12.20679 UTC',
 'recognized': True,
 'key_id': '4699508627734528',
 'drawing': [[[49,
    38,
    0,
    11,
    19,
    29,
    54,
    75,
    85,
    92,
    111,
    119,
    134,
    158,
    214,
    228,
    240,
    249,
    255,
    254,
    250,
    235,
    225,
    198,
    174,
    157,
    94,
    40],
   [104,
    88,
    70,
    55,
    51,
    54,
    82,
    89,
    89,
    85,
    53,
    49,
    49,
    57,
    66,
    75,
    88,
    106,
    131,
    150,
    161,
    179,
    185,
    192,
    191,
    186,
    151,
    108]],
  [[175, 169, 166, 164, 175, 192, 208, 218, 220, 216],
   [56, 53, 44, 17, 3, 0, 11, 27, 54, 60]],
  [[178, 172, 174, 183, 205], [52, 44, 26, 19, 17]]]}

## Draw Sketches

In [7]:
import numpy as np
from skimage.draw import line_aa
import time

# Input coordinates are scaled from an in_size-wide range down to an
# out_size x out_size raster.
in_size = 256
out_size = 64
out_max = out_size - 1
scaling = out_size / in_size


def _to_pixel(coord):
    """Scale an input coordinate to the output grid, clamped to the last pixel."""
    return min(out_max, round(coord * scaling))


# One float image per drawing, filled in place below.
sketches = np.zeros((len(data), out_size, out_size), dtype=np.float64)

print(f'Draw {len(data)} sketches...')

draw_start = time.time()
batch_start = draw_start
report_every = 10000
for idx, record in enumerate(data):
    # Progress report once per `report_every` sketches.
    if (idx + 1) % report_every == 0:
        elapsed = time.time() - batch_start
        print(f'...drew {report_every} sketches in {elapsed:.1f} secs')
        batch_start = time.time()

    canvas = sketches[idx]  # view into `sketches`, so writes land in the big array
    for stroke in record['drawing']:
        xs, ys = stroke
        # Draw an anti-aliased segment between each pair of consecutive points.
        for k in range(1, len(xs)):
            rows, cols, intensity = line_aa(
                _to_pixel(ys[k - 1]),
                _to_pixel(xs[k - 1]),
                _to_pixel(ys[k]),
                _to_pixel(xs[k]),
            )  # row0, col0, row1, col1
            canvas[rows, cols] += intensity

total_minutes = (time.time() - draw_start) / 60
print(f'Done drawing in {total_minutes:.1f} min')

# Flatten each image to a vector and cap overlapping-stroke intensities at 1.
sketches_flat = sketches.reshape(sketches.shape[0], -1).clip(0, 1)

Draw 126804 sketches...
...drew 10000 sketches in 8.3 secs
...drew 10000 sketches in 8.5 secs
...drew 10000 sketches in 8.9 secs
...drew 10000 sketches in 8.9 secs
...drew 10000 sketches in 8.4 secs
...drew 10000 sketches in 8.3 secs
...drew 10000 sketches in 8.6 secs
...drew 10000 sketches in 8.4 secs
...drew 10000 sketches in 9.5 secs
...drew 10000 sketches in 9.3 secs
...drew 10000 sketches in 8.7 secs
...drew 10000 sketches in 8.6 secs
Done drawing in 1.8 min


## Embed Drawings

In [8]:
from umap import UMAP

# Neighborhood size as a fraction of the dataset: 0.001 = one per mille.
neighborhood = 0.001
# UMAP rejects n_neighbors < 2, so clamp the value for small inputs.
n_neighbors = max(2, round(sketches_flat.shape[0] * neighborhood))

# NOTE(review): no random_state is passed, so the layout differs between runs;
# consider fixing a seed if the saved embedding must be reproducible.
embeddings = UMAP(n_neighbors=n_neighbors).fit_transform(sketches_flat)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../miniconda3/envs/pilingjs/lib/python3.7/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../miniconda3/envs/pilingjs/lib/python3.7/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, m

## Scale Embeddings

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each embedding axis into [0.1, 0.9].
scaler = MinMaxScaler(feature_range=(0.1, 0.9))
scaled_embeddings = scaler.fit(embeddings).transform(embeddings)

## Save Embedding

In [11]:
import codecs, json

# Write the embedding as JSON. The `with` block guarantees the file is
# flushed and closed even if serialization fails part-way.
# NOTE(review): this saves the raw `embeddings`, not `scaled_embeddings` —
# confirm that is intended.
with codecs.open('data/full_simplified_teapot_umap_embedding.json', 'w', encoding='utf-8') as f:
    json.dump(embeddings.tolist(), f, separators=(',', ':'), sort_keys=True, indent=4)

## Compose Data

In [45]:
from functools import reduce
import pycountry

# Feature matrix with one row per drawing; columns are:
# x, y, recognized, countrycode, num_strokes, mean_stroke_length
features = np.zeros((embeddings.shape[0], 6))

features[:, 0:2] = embeddings # x, y

for i, drawing in enumerate(data):
    strokes = drawing['drawing']

    features[i, 2] = drawing['recognized']

    # Numeric country code from pycountry; 0 when the alpha-2 code has no
    # entry (lookup returned None) or the entry has no numeric code.
    country = pycountry.countries.get(alpha_2=drawing['countrycode'])
    numeric = getattr(country, 'numeric', None)
    features[i, 3] = int(numeric) if numeric else 0

    features[i, 4] = len(strokes)

    # Mean number of points per stroke; 0 for a drawing without strokes
    # rather than dividing by zero.
    n_points = sum(len(stroke[0]) for stroke in strokes)
    features[i, 5] = n_points / len(strokes) if strokes else 0

In [48]:
from sklearn.preprocessing import MinMaxScaler

# Bring every feature column into [0, 1] (MinMaxScaler's default range).
feature_scaler = MinMaxScaler()
features_scaled = feature_scaler.fit_transform(features)

## Sub-Sample Drawings

In [54]:
from apricot import FeatureBasedSelection

# Select 2000 drawings with apricot's feature-based selection.
selector = FeatureBasedSelection(
    2000,
    concave_func='sqrt',
    optimizer='two-stage',
    n_jobs=-1,
    verbose=False,
)

# Row indices are passed as the labels, so the second element of the result
# holds the indices of the selected drawings.
row_indices = np.arange(features_scaled.shape[0])
selection = selector.fit_transform(features_scaled, row_indices)[1]

## Save Data

In [67]:
import codecs, json

# One JSON record per selected drawing.
out = [
    {
        'countryCode': data[i]['countrycode'],
        'recognized': data[i]['recognized'],
        'umapEmbedding': features_scaled[i, 0:2].tolist(),
        # The drawing is already a nested list. Strokes have different
        # lengths, so wrapping it in np.array() would build a ragged array,
        # which recent NumPy versions refuse to create.
        'src': data[i]['drawing'],
    }
    for i in selection
]

# The `with` block guarantees the file is flushed and closed.
with codecs.open('data/teapot-umap-subsample.json', 'w', encoding='utf-8') as f:
    json.dump(out, f, separators=(',', ':'), sort_keys=True, indent=2)