In [1]:
import os
import math
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
SEED = 1
RECORDS_PER_TFRECORD_FILE = 2000
DATA_ROOT = Path("/", "data")
IMAGE_HEIGHT, IMAGE_WIDTH = 224, 224

### Preparing dataset

#### Loading cats-dogs file list

In [3]:
catdog_df = pd.read_csv(Path(DATA_ROOT, '01-raw', '01-catdog', 'list.txt'), names=['filename', 'species'], sep=' ', skiprows=6, usecols=[0, 2])
catdog_df.filename = str(Path(DATA_ROOT, '01-raw', '01-catdog', 'images')) + os.sep + catdog_df.filename + '.jpg'
catdog_df.species = catdog_df.species - 1
catdog_df.sample(5, random_state=SEED)

Unnamed: 0,filename,species
4387,/data/01-raw/01-catdog/images/Bombay_215.jpg,0
2534,/data/01-raw/01-catdog/images/english_setter_1...,1
1936,/data/01-raw/01-catdog/images/american_bulldog...,1
5551,/data/01-raw/01-catdog/images/keeshond_91.jpg,1
4363,/data/01-raw/01-catdog/images/Birman_87.jpg,0


#### Loading horses file list

In [4]:
filelist = Path(DATA_ROOT, '01-raw', '02-horse').glob('*.jpeg')
horse_df = pd.DataFrame({
    'filename': [str(f.absolute()) for f in filelist],
    'species': 2,
    
})
horse_df.sample(5, random_state=SEED)

Unnamed: 0,filename,species
1000,/data/01-raw/02-horse/OIP-_dXqiMjcGhETeeog11EI...,2
654,/data/01-raw/02-horse/OIP-abWfS5BJ1aUVqejincm6...,2
1102,/data/01-raw/02-horse/OIP-k8fBJWzNnV_pB9wefz_Z...,2
1975,/data/01-raw/02-horse/OIP-lMLWftM5UUc0DCr3QiC-...,2
730,/data/01-raw/02-horse/OIP-wmlrz98DPpqYNZ5JcXWf...,2


In [5]:
df = pd.concat([catdog_df, horse_df])
df

Unnamed: 0,filename,species
0,/data/01-raw/01-catdog/images/Abyssinian_100.jpg,0
1,/data/01-raw/01-catdog/images/Abyssinian_101.jpg,0
2,/data/01-raw/01-catdog/images/Abyssinian_102.jpg,0
3,/data/01-raw/01-catdog/images/Abyssinian_103.jpg,0
4,/data/01-raw/01-catdog/images/Abyssinian_104.jpg,0
...,...,...
2618,/data/01-raw/02-horse/OIP-Ca23ef0W6FQVPkgVkDLb...,2
2619,/data/01-raw/02-horse/OIP-dUUmM7yVE3vhAxInRvhl...,2
2620,/data/01-raw/02-horse/OIP-BRGXdaNFWe0VrnDbBUhp...,2
2621,/data/01-raw/02-horse/OIP-MwFXI4MkHT-sSutR2HMO...,2


#### Creating train-test datasets

In [6]:
test, train = train_test_split(df, test_size=0.8, random_state=SEED, stratify=df.species)
train_size = math.ceil(len(train) / RECORDS_PER_TFRECORD_FILE)
test_size = math.ceil(len(test) / RECORDS_PER_TFRECORD_FILE)

In [7]:
len(train), len(test)

(7978, 1994)

In [8]:
def generate_path(split_name, chunk_number):
    return Path(DATA_ROOT, '02-tfrecords', split_name, f"{split_name}-{str(chunk_number).zfill(3)}.tfrecord")

In [9]:
def read_and_preprocess_image(path):
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize_with_pad(image, IMAGE_HEIGHT, IMAGE_WIDTH)
    image = tf.cast(image, tf.uint8)

    return image

In [10]:
def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def write_tfrecord(items, tfrecord_filename):
    tfrecord_filename.parents[0].mkdir(parents=True, exist_ok=True)
    writer = tf.io.TFRecordWriter(str(tfrecord_filename), tf.io.TFRecordOptions(compression_type="GZIP"))
    
    for filename, label in items:
        image = read_and_preprocess_image(filename)
        feature = {
            'label': _int64_feature(label),
            'image': _bytes_feature(image.numpy().tobytes()),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
    writer.close()

In [11]:
for i in tqdm(range(train_size)):
    write_tfrecord(train[i*RECORDS_PER_TFRECORD_FILE:(i + 1)*RECORDS_PER_TFRECORD_FILE].values, generate_path("train", i))

for i in tqdm(range(test_size)):
    write_tfrecord(test[i*RECORDS_PER_TFRECORD_FILE:(i + 1)*RECORDS_PER_TFRECORD_FILE].values, generate_path("test", i))

100%|██████████| 4/4 [01:40<00:00, 25.03s/it]
100%|██████████| 1/1 [00:26<00:00, 26.38s/it]
