# Setup 

In [65]:
import os
import json
import numpy as np
from tqdm import tqdm
from PIL import Image
from numpy import load
from google.cloud import storage
from google.cloud.storage import transfer_manager

# Extract 

Extract from Kuzushiji dataset

In [11]:
# !unzip ../datasets/kuzushiji.zip -d ../datasets/kuzushiji

In [61]:
labels_train = load("../datasets/kuzushiji/kmnist-train-labels.npz")['arr_0']
img_train = load("../datasets/kuzushiji/kmnist-train-imgs.npz")['arr_0']
labels_test = load("../datasets/kuzushiji/kmnist-test-labels.npz")['arr_0']
img_test = load("../datasets/kuzushiji/kmnist-test-imgs.npz")['arr_0']

# Transform

Transform numpy arrays into PNG images, labels into JSONL

In [13]:
# !mkdir ../datasets/kuzushiji_images
# !mkdir ../datasets/kuzushiji_images/train
# !mkdir ../datasets/kuzushiji_images/test

In [37]:
def export_to_png(npzfile, folder):   
    for i in tqdm(range(npzfile.shape[0])):
        img = Image.fromarray(npzfile[i])
        img.save(os.path.join(folder,"%05d.png" % i))

#export_to_png(img_train, "../datasets/kuzushiji_images/train")
export_to_png(img_test, "../datasets/kuzushiji_images/test")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 13091.46it/s]


## Generate JSONL

In [69]:
def make_rec(bucket, filename, label, phase):
    # https://cloud.google.com/vertex-ai/docs/image-data/classification/prepare-data
    rec = {
        "imageGcsUri": f"gs://{bucket}/{filename}",
        "classificationAnnotation": {
            "displayName": label,
            "annotationResourceLabels": {
                "aiplatform.googleapis.com/annotation_set_name": "hiragana",
                "env": "dev"
            }
        },
        "dataItemResourceLabels": {
        "aiplatform.googleapis.com/ml_use": phase
        }
    }
    return rec

train_json = [make_rec("kuzushiji-mnist", "%05d.png" % i, str(int(label)), "train") for i,label in enumerate(list(labels_train)) ]
test_json = [make_rec("kuzushiji-mnist", "%05d.png" % i, str(int(label)), "test") for i,label in enumerate(list(labels_test)) ]

In [78]:
with open("../datasets/kuzushiji/dataset.jsonl", "w") as f:
    for ln in train_json[0:10]:
        f.write(json.dumps(ln) + "\n")
    for ln in test_json[0:10]:        
        f.write(json.dumps(ln) + "\n")

In [76]:
!cat ../datasets/kuzushiji/dataset.jsonl

{"imageGcsUri": "gs://kuzushiji-mnist/00000.png", "classificationAnnotation": {"displayName": "8", "annotationResourceLabels": {"aiplatform.googleapis.com/annotation_set_name": "hiragana", "env": "dev"}}, "dataItemResourceLabels": {"aiplatform.googleapis.com/ml_use": "train"}}{"imageGcsUri": "gs://kuzushiji-mnist/00001.png", "classificationAnnotation": {"displayName": "7", "annotationResourceLabels": {"aiplatform.googleapis.com/annotation_set_name": "hiragana", "env": "dev"}}, "dataItemResourceLabels": {"aiplatform.googleapis.com/ml_use": "train"}}{"imageGcsUri": "gs://kuzushiji-mnist/00002.png", "classificationAnnotation": {"displayName": "0", "annotationResourceLabels": {"aiplatform.googleapis.com/annotation_set_name": "hiragana", "env": "dev"}}, "dataItemResourceLabels": {"aiplatform.googleapis.com/ml_use": "train"}}{"imageGcsUri": "gs://kuzushiji-mnist/00003.png", "classificationAnnotation": {"displayName": "1", "annotationResourceLabels": {"aiplatform.googleapis.com/annotation_set

# Load

Load into GCP Bucket

In [73]:
if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
    print("GCP credentials are needed for storage bucket")

In [37]:
client = storage.Client()
bucket = client.get_bucket('kuzushiji-mnist')

In [58]:
setnames = ["train","test"]

for setname in setnames:
    root_path = "../datasets/kuzushiji_images/"
    filenames = [fn for fn in os.listdir(root_path + setname) ]
    results = transfer_manager.upload_many_from_filenames(
       bucket, 
       filenames, 
       source_directory= root_path + setname, 
       blob_name_prefix= setname + "/",
       max_workers=1000
    )
    print("Failed:", len([result for result in results if isinstance(result, Exception)]))



Failed: 0
