In [4]:
import math
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from pathlib import Path
from sklearn.model_selection import train_test_split

In [5]:
# Reproducibility: the same seed drives the preview sample and the train/test split.
SEED = 1

# Upper bound on the number of examples stored in a single TFRecord shard.
RECORDS_PER_TFRECORD_FILE = 2000

# Root folder holding both the raw inputs ("01-raw") and the generated shards ("02-tfrecords").
DATA_ROOT = Path("/data")

# Every image is resized (with padding) to this square resolution before serialization.
IMAGE_HEIGHT = IMAGE_WIDTH = 224

### Preparing cats-dogs dataset

#### Loading file list

In [7]:
# Read the space-separated listing, skipping its first 6 lines and keeping only
# columns 0 and 2, which are named `filename` and `species` here.
# The species code is shifted down by one so that labels start at 0.
meta_df = (
    pd.read_csv(
        Path(DATA_ROOT, "01-raw", '01-catdog', 'list.txt'),
        sep=' ',
        skiprows=6,
        usecols=[0, 2],
        names=['filename', 'species'],
    )
    .assign(species=lambda frame: frame['species'] - 1)
)

# Preview a few reproducible random rows.
meta_df.sample(5, random_state=SEED)

Unnamed: 0,filename,species
4387,Bombay_215,0
2534,english_setter_190,1
1936,american_bulldog_193,1
5551,keeshond_91,1
4363,Birman_87,0


#### Creating train-test datasets

In [8]:
# NOTE: the unpacking order is deliberately swapped. `train_test_split` returns
# (remainder, held-out part); with test_size=0.8 the held-out part is the large
# 80% share, so it is bound to `train` and the 20% remainder to `test`.
test, train = train_test_split(
    meta_df,
    test_size=0.8,
    stratify=meta_df.species,
    random_state=SEED,
)

# Despite their names, these are the number of TFRecord shards needed per split
# (rows / RECORDS_PER_TFRECORD_FILE, rounded up), not row counts.
train_size, test_size = (
    math.ceil(len(split) / RECORDS_PER_TFRECORD_FILE) for split in (train, test)
)

In [9]:
# Row counts of the two splits, displayed as (train, test).
train.shape[0], test.shape[0]

(5880, 1469)

In [10]:
def generate_path(split_name, chunk_number):
    """Build the output path of one TFRecord shard.

    The result has the form
    ``<DATA_ROOT>/02-tfrecords/<split_name>/<split_name>-<NNN>.tfrecord``,
    where ``NNN`` is ``chunk_number`` left-padded with zeros to three characters.

    Args:
        split_name: Name of the split; used both as sub-folder and file prefix.
        chunk_number: Index of the shard within the split.

    Returns:
        A ``pathlib.Path`` pointing at the shard file (the file is not created).
    """
    padded_chunk = str(chunk_number).zfill(3)
    shard_name = f"{split_name}-{padded_chunk}.tfrecord"
    return Path(DATA_ROOT, '02-tfrecords', split_name) / shard_name

In [11]:
def read_and_preprocess_image(path):
    """Load a JPEG file and return it as a fixed-size uint8 RGB tensor.

    Args:
        path: Location of the JPEG file, as accepted by ``tf.io.read_file``.

    Returns:
        A ``tf.uint8`` tensor holding the image decoded with 3 channels and
        resized with padding to ``IMAGE_HEIGHT`` x ``IMAGE_WIDTH``.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)

    # resize_with_pad keeps the aspect ratio and pads the remainder of the canvas.
    padded = tf.image.resize_with_pad(decoded, IMAGE_HEIGHT, IMAGE_WIDTH)

    # Convert the resized result back to 8-bit integers for compact storage.
    return tf.cast(padded, tf.uint8)

In [14]:
def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in an int64-list ``tf.train.Feature``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _bytes_feature(value):
    """Wrap one string / bytes value in a bytes-list ``tf.train.Feature``."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)

def write_tfrecord(items, tfrecord_filename):
    """Serialize ``(filename, label)`` pairs into one GZIP-compressed TFRecord file.

    For every pair, the image ``<DATA_ROOT>/01-raw/01-catdog/images/<filename>.jpg``
    is loaded through ``read_and_preprocess_image`` and written as a
    ``tf.train.Example`` with two features:

    * ``'label'`` - the label as a single int64 value;
    * ``'image'`` - the raw bytes of the preprocessed image tensor. The tensor
      shape is not stored in the record.

    Args:
        items: Iterable of ``(filename, label)`` pairs, where ``filename`` is the
            image name without the ``.jpg`` extension.
        tfrecord_filename: Destination file (``pathlib.Path`` or ``str``). Missing
            parent directories are created.
    """
    root = Path(DATA_ROOT, '01-raw', '01-catdog', 'images')

    # Accept plain strings as well as Path objects.
    tfrecord_filename = Path(tfrecord_filename)
    tfrecord_filename.parent.mkdir(parents=True, exist_ok=True)
    options = tf.io.TFRecordOptions(compression_type="GZIP")

    # The context manager closes (and flushes) the writer even when reading,
    # decoding or serializing an image raises part-way through the shard.
    with tf.io.TFRecordWriter(str(tfrecord_filename), options) as writer:
        for filename, label in items:
            image = read_and_preprocess_image(str(Path(root, f'{filename}.jpg')))
            feature = {
                # int() normalizes NumPy integer scalars to a plain Python int.
                'label': _int64_feature(int(label)),
                'image': _bytes_feature(image.numpy().tobytes()),
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [15]:
# Write every split as consecutive shards of at most RECORDS_PER_TFRECORD_FILE
# rows each, train first and then test, with one progress bar per split.
for split_name, split_df, shard_count in (("train", train, train_size), ("test", test, test_size)):
    for shard_index in tqdm(range(shard_count)):
        start = shard_index * RECORDS_PER_TFRECORD_FILE
        stop = start + RECORDS_PER_TFRECORD_FILE
        write_tfrecord(split_df.iloc[start:stop].values, generate_path(split_name, shard_index))

100%|██████████| 3/3 [01:20<00:00, 26.79s/it]
100%|██████████| 1/1 [00:19<00:00, 19.65s/it]
