In [0]:
import os
import re
import smtplib
import subprocess
import time
import random

import google3
import tensorflow as tf
import numpy as np

from google3.pyglib import app
from google3.pyglib import flags
from google3.pyglib import gfile
from google3.pyglib import logging
from google3.pyglib import resources
from google3.sstable.python import sstable

from colabtools import adhoc_import
with adhoc_import.Google3(build_targets=["//research/handwriting/proto:labeled_ink_py_pb2"]):
  from google3.research.handwriting.proto import labeled_ink_pb2



In [0]:
INPUTFILE="/cns/ok-d/home/handwriting/deselaers/diagrams/create_data_20181102/00005-MergeDataPhase/e=1:kid=71394:mkey=handwriting_requests_cns/data-?????-of-00011"
OUTPUTFILE="/cns/ok-d/home/handwriting/deselaers/diagrams.tf_record" 
NUM_OUTPUT_SHARDS=25

In [0]:
INPUTFILE="/cns/ok-d/home/handwriting/deselaers/diagrams/shapes_data_20181102/00005-MergeDataPhase/e=1:kid=71394:mkey=handwriting_requests_cns/data-?????-of-00011"
OUTPUTFILE="/cns/ok-d/home/handwriting/deselaers/shapes.tf_record" 
NUM_OUTPUT_SHARDS=25

In [0]:
def fetch_table_data(input_file):
  table = sstable.MergedSSTable(
        gfile.Glob(input_file),
        wrapper=sstable.TableWrapper(labeled_ink_pb2.LabeledInk.FromString))
  return table

def to_tfexample(ink_hash, labeled_ink):
  features = {}
  features['ink_hash'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[ink_hash]))
  #features['label_hash'] = tf.train.Feature(
  #      bytes_list=tf.train.BytesList(value=[labeled_ink.label.encode('utf-8')]))
  features['label'] = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[labeled_ink.label.encode('utf-8')]))

  strokes = []
  for stroke in labeled_ink.ink.strokes:
    s = np.zeros([len(stroke.x), 4], dtype=np.float32)   
    for i in range(0, len(stroke.x)):
      s[i, 0] = stroke.x[i]
      s[i, 1] = stroke.y[i]
      s[i, 2] = stroke.t[i]
    s[-1, 3] = 1
    strokes.append(s)
  all_strokes = np.concatenate(strokes, axis=0)
  features['ink'] =  tf.train.Feature(
        float_list=tf.train.FloatList(value=all_strokes.astype(float).flatten()))
  features['shape'] =  tf.train.Feature(
        int64_list=tf.train.Int64List(value=all_strokes.shape))
  features['num_strokes'] =  tf.train.Feature(
        int64_list=tf.train.Int64List(value=[len(labeled_ink.ink.strokes)]))     
  example = tf.train.Example(features=tf.train.Features(feature=features))
  return ink_hash, example

def create_tfrecord_writers(output_file, num_output_shards):
  writers = []
  for i in range(num_output_shards):
    writers.append(tf.python_io.TFRecordWriter("%s-%05i-of-%05i" % (output_file, i, num_output_shards)))
  return writers

def close_tfrecord_writers(writers):
  for w in writers:
    w.close()

def _pick_output_shard():
    return random.randint(0, NUM_OUTPUT_SHARDS - 1)

def write_tfexample(writers, tf_example):
  writers[_pick_output_shard()].write(tf_example.SerializeToString())

In [61]:
table = fetch_table_data(INPUTFILE)
tfrecord_writers = create_tfrecord_writers(OUTPUTFILE, NUM_OUTPUT_SHARDS)

for key, labeled_ink in table.iteritems():
  inkhash, tf_example = to_tfexample(key, labeled_ink)
  print(key)
  
  write_tfexample(tfrecord_writers, tf_example)

close_tfrecord_writers(tfrecord_writers)

000b7d47bbbb78bc
00128d8001b0cd5a
00232c0c70aa01bb
00275b92b7271822
0029af826e879709
002f484f364868e2
0044bfb8666ed1d4
0048425a43bd45bb
00491ece6b91557e
004f59f5910ffc14
0050c52c8dcde1ca
0051f116ec7d820f
00602687c9fc1afe
0061e7edbae62383
006986c8f571d1e4
007333d522c1f0dc
007d90001f4e03bc
008ffb2238a05b1c
00929b1ea8686f47
00a3b627b2e62d7d
00a4d9561c8ca0b0
00a5b5f53a1b97fc
00a6684bda2b0013
00b6260104749e96
00bc62c70cdc467b
00bd45f14217c1e5
00c85387b9950434
00cdb92fedbcf65c
00d746037eadc768
00d8395cfd1ea173
00d9fe94359a7f31
00def299e89f9109
00e86aaab1ef83f3
00f443dc2dd566d3
0100757092dc9977
0105ed13eb94eef6
010cf4f1c7d1db00
010e7ddaf37cf230
0111fa2a75ebd99e
0125a53411af9d5d
012694bb6c97903c
01271d608528861f
01309e2e5f173c8a
014c2f2707cc8337
014d1ad3d84ed2f6
015d1efc72820802
016d2b576d49d53d
0178e275af2fb349
0178e509b89af941
0182675b9876d73e
0182ac5c1e186fd9
01a33cc427f02600
01a3cf5e4d3ed3b2
01ad25a79c1dc032
01b22acb46b7a8ff
01bb7872ba4e4d5e
01be6aa4a3505f14
01cc04ea93f72baf
01d0097e316150