# Convolutional Neural Network: Colorectal Histology Image Classification

In [72]:
import tensorflow_datasets as tfds
import tensorflow as tf
AUTOTUNE = tf.data.AUTOTUNE

# Fetch the colorectal histology dataset from TFDS.
# TFDS ships it with a single 'train' split, so the split-slicing syntax is
# used to carve our own train and test partitions out of that one split.
SPLIT_SPEC = ['train[:80%]', 'train[80%:]']  # first 80% -> train, last 20% -> test

splits, ds_info = tfds.load(
    'colorectal_histology',
    split=SPLIT_SPEC,
    shuffle_files=True,  # shuffle the order in which the input files are read
    as_supervised=True,  # yield (image, label) tuples instead of feature dicts
    with_info=True       # also return the DatasetInfo metadata object
)
train_ds, test_ds = splits

# Report the size of each partition, followed by the dataset metadata.
print(f"Number of training samples: {len(train_ds)}")
print(f"Number of testing samples: {len(test_ds)}")
print(f"Dataset info: {ds_info}")

# Pull one (image, label) example to confirm the element structure.
for image, label in train_ds.take(1):
    print(f"Image shape: {image.shape}, Label: {label}")

Number of training samples: 4000
Number of testing samples: 1000
Dataset info: tfds.core.DatasetInfo(
    name='colorectal_histology',
    full_name='colorectal_histology/2.0.0',
    description="""
    Classification of textures in colorectal cancer histology. Each example is a 150 x 150 x 3 RGB image of one of 8 classes.
    """,
    homepage='https://zenodo.org/record/53169#.XGZemKwzbmG',
    data_dir='/home/ubuntu/tensorflow_datasets/colorectal_histology/2.0.0',
    file_format=tfrecord,
    download_size=246.14 MiB,
    dataset_size=179.23 MiB,
    features=FeaturesDict({
        'filename': Text(shape=(), dtype=string),
        'image': Image(shape=(150, 150, 3), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=8),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'train': <SplitInfo num_examples=5000, num_shards=2>,
    },
    citation="""@article{kather2016multi,
      title={Multi-class texture analysi

2025-11-05 18:45:49.446783: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [73]:
# Peek at the first three training examples: print each image's shape and its label.
first_three = train_ds.take(3)
for image, label in first_three:
    print(f"{image.shape}, {label}")

# Show the concrete tf.data type returned by TFDS (no normalization happens in this cell).
train_ds_type = type(train_ds)
print(train_ds_type)

(150, 150, 3), 4
(150, 150, 3), 5
(150, 150, 3), 6
<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>


2025-11-05 18:45:49.608691: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Preprocess Data

In [None]:
# Per-example preprocessing: turn the raw image into a float32 tensor scaled to [0, 1].
def preprocess_image(image, label):
    """Convert one image to float32 and pass its label through untouched.

    Args:
        image: Image tensor for a single example. The dataset metadata reports
            uint8 images of shape (150, 150, 3); for integer inputs
            ``tf.image.convert_image_dtype`` rescales the values into [0, 1].
        label: Class label for the example; returned unchanged.

    Returns:
        A ``(image, label)`` tuple where ``image`` is the float32 version of
        the input image.
    """
    # convert_image_dtype changes the dtype and handles the rescaling itself.
    scaled_image = tf.image.convert_image_dtype(image, tf.float32)

    # No augmentation is applied here.
    # NOTE: this function is mapped over the test split as well as the train
    # split, so random ops (e.g. tf.image.random_flip_left_right,
    # tf.image.random_brightness) belong in a separate, train-only step.
    return scaled_image, label

In [None]:
# Preprocess data

# Input-pipeline hyperparameters, named so they can be tuned in one place.
BATCH_SIZE = 32
# Smaller than the 4000-example training split, so the shuffle is approximate
# (a uniform shuffle needs a buffer at least as large as the dataset).
SHUFFLE_BUFFER_SIZE = 1000

# This cell rebinds train_ds / test_ds to transformed versions of themselves,
# so running it a second time would batch already-batched data. Fail loudly
# instead of silently producing 5-D "batches of batches".
for split_name, split_ds in (("train_ds", train_ds), ("test_ds", test_ds)):
    image_spec = split_ds.element_spec[0]  # elements are (image, label) tuples
    image_rank = image_spec.shape.rank
    if image_rank is not None and image_rank != 3:
        raise RuntimeError(
            f"{split_name} does not yield single (height, width, channels) images "
            f"(image spec: {image_spec}); it has probably been batched already. "
            "Re-run the dataset loading cell before re-running this one."
        )

# Convert every image to float32 in [0, 1] for both the train and test datasets.
train_ds = train_ds.map(preprocess_image, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.map(preprocess_image, num_parallel_calls=AUTOTUNE)

# Training data: shuffle, group into batches, and prefetch so tf.data can
# overlap input preparation with training.
train_ds = (
    train_ds
    .shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
# Test data only needs batching and prefetching; no shuffling.
test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Sanity-check one training batch: print its shapes and pixel range, then
# assert that the normalization really produced values inside [0, 1].
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape)
    min_pixel, max_pixel = tf.reduce_min(images), tf.reduce_max(images)
    print(min_pixel, max_pixel)
    assert 0.0 <= float(min_pixel) and float(max_pixel) <= 1.0, (
        "pixel values fall outside [0, 1] after preprocessing"
    )


(32, 150, 150, 3) (32,)
tf.Tensor(0.0, shape=(), dtype=float32) tf.Tensor(1.0, shape=(), dtype=float32)


2025-11-05 18:45:50.370151: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
