In [25]:
import os
import h5py
import json
import pandas as pd
from io import BytesIO
from PIL import Image
import numpy as np

In [23]:
root = '/home/jluesch/Documents/data/plankton'

In [24]:

files = [el for el in os.listdir(root) if os.path.isfile(os.path.join(root, el))]

In [27]:
all_means = []

for hdf5_file in files:
    print(hdf5_file, end=" ")
    hdf5_file_path = os.path.join(root, hdf5_file)
    file = h5py.File(hdf5_file_path, "r")

    # Read the JSON string from the 'file_index' dataset
    file_index_json = file["file_index"][()]
    file_index = json.loads(file_index_json)

    # Add the HDF5 file name to each entry and accumulate the file entries
    for entry in file_index["files"][:4]:
        img_bytes = file[entry["path"]][()]
        f = BytesIO(img_bytes)
        img = Image.open(f).convert(mode="RGB")

        all_means.append(np.mean(img, axis=(0, 1)))

2011-TRAIN.hdf5 2007-TRAIN.hdf5 2013-VAL.hdf5 2009-VAL.hdf5 2008-VAL.hdf5 2014-TRAIN.hdf5 2012-VAL.hdf5 2010-TRAIN.hdf5 

In [31]:
print(np.mean(all_means, axis=0)/255., np.std(all_means, axis=0)/255.)


[0.68622917 0.68622917 0.68622917] [0.10176649 0.10176649 0.10176649]


# LMDB Testing

In [2]:
from io import BytesIO
import pandas as pd
import h5py
import json
from PIL import Image
import lmdb
import os

In [2]:

data_path = "/home/jluesch/Documents/data/plankton"
data_files = sorted([el for el in os.listdir(data_path) if el.endswith(".hdf5")])
data_files = data_files[1:]
print(data_files)
print(data_files[0][:-5])

['2008-VAL.hdf5', '2009-VAL.hdf5', '2010-TRAIN.hdf5', '2011-TRAIN.hdf5', '2012-VAL.hdf5', '2013-VAL.hdf5', '2014-TRAIN.hdf5']
2008-VAL


In [3]:
map_size = int(1e11)
for data_file in data_files:
    train_data_path = f"/home/jluesch/Documents/data/plankton/{data_file}"

    lmdb_imgs_path = f"/home/jluesch/Documents/data/plankton/lmdb/{data_file[:-5]}_imgs"
    lmdb_labels_path = f"/home/jluesch/Documents/data/plankton/lmdb/{data_file[:-5]}_labels"
    print(lmdb_imgs_path, lmdb_labels_path)

    env_imgs = lmdb.open(lmdb_imgs_path, map_size=map_size)
    env_labels = lmdb.open(lmdb_labels_path, map_size=map_size)

    file = h5py.File(train_data_path, "r")
    # Read the JSON string from the 'file_index' dataset
    file_index_json = file["file_index"][()]
    file_index = json.loads(file_index_json)
    print(f"#SAMPLES: {file_index['files']}")

    with env_labels.begin(write=True) as txn_labels:
        with env_imgs.begin(write=True) as txn_imgs:
            for entry in file_index["files"]:
                if entry["index"] % 50000 == 0:
                    print(entry["index"], entry["class_id"], entry["path"])

                img_bytes = file[entry["path"]][()]
                txn_imgs.put(str(entry["index"]).encode("utf-8"), img_bytes)
                txn_labels.put(str(entry["index"]).encode("utf-8"), str(entry["class_id"]).encode("utf-8"))
    env_imgs.close()
    env_labels.close()
    print(f"FINISHED {lmdb_imgs_path}")

/home/jluesch/Documents/data/plankton/2008-VAL.hdf5
dict_keys(['index', 'path', 'class_str', 'class_id'])
0 0 0 2008_amoeba_IFCB1_2008_043_185649_01270.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
40000 40000 21 2008_detritus_IFCB1_2008_116_215800_01249.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
80000 80000 41 2008_Heterocapsa_triquetra_IFCB1_2008_056_161935_02442.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
120000 120000 51 2008_mix_IFCB1_2008_056_161935_01314.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
160000 160000 51 2008_mix_IFCB1_2008_073_144023_00095.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
200000 200000 51 2008_mix_IFCB1_2008_095_001923_02526.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
240000 240000 51 2008_mix_IFCB1_2008_114_175618_01003.png
dict_keys(['index', 'path', 'class_str', 'class_id'])
280000 280000 51 2008_mix_IFCB1_2008_199_143321_03547.png
dict_keys(['index', 'path', 'class_str', 

In [5]:
import matplotlib.pyplot as plt
import torch
import torchvision

lmdb_env_imgs = lmdb.open(
    "/home/jluesch/Documents/data/plankton/lmdb/2007-TRAIN_imgs",
    readonly=True,
    lock=False,
    readahead=False,
    meminit=False,
)
lmdb_env_labels = lmdb.open(
    "/home/jluesch/Documents/data/plankton/lmdb/2007-TRAIN_labels",
    readonly=True,
    lock=False,
    readahead=False,
    meminit=False,
)

lmdb_txn_imgs = lmdb_env_imgs.begin()
lmdb_cursor_imgs = lmdb_txn_imgs.cursor()

lmdb_txn_labels = lmdb_env_labels.begin()
lmdb_cursor_labels = lmdb_txn_labels.cursor()

for i, ((key_i, value_i), (key_l, val_l)) in enumerate(zip(lmdb_cursor_imgs, lmdb_cursor_labels)):
    if i > 3:
        break
    print("key", int(key_i.decode()))
    f = BytesIO(value_i)

    img = Image.open(f).convert(mode="RGB")
    to_tensor = torchvision.transforms.ToTensor()
    img_tensor = to_tensor(img)
    buff = BytesIO()
    torch.save(img_tensor, buff)
    buff.seek(0)
    print(buff.read())
    print(img.size)

    #plt.imshow(img)
    #plt.show()
    print('label', val_l.decode())

key 0
b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04J\x00L\x02\x00tq\x05QK\x00K\x03K\xe0K\xe0\x87q\x06M\x00\xc4K\xe0K\x01\x87q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08/\xf8Ho\xa2\x00\x00\x00\xa2\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00"\x00archive/data/0FB\x1e\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?\xb3\xb22?