<a href="https://colab.research.google.com/github/brostromb/ai-ml-principles-exercises/blob/main/label_fashion_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
!{sys.executable} -m pip install numpy tensorflow wandb

import wandb
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
from IPython.display import clear_output
clear_output()

In [None]:
from datetime import datetime

# Stamp the run name with the current local time so repeated runs can be told apart.
date_and_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
run_name = f"preprocessing {date_and_time}"

# Start the Weights & Biases run; the handle is kept so later cells can write to its summary.
wandb_run = wandb.init(project="labels_fashion", name=run_name)

In [None]:
# Load Fashion-MNIST through Keras, then unpack each split into images (x) and labels (y).
train_split, test_split = keras.datasets.fashion_mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [None]:
def log_bar(x, y, title, x_name="x", y_name="y", keep_order=False):
    """Log a bar chart of ``y`` against ``x`` to Weights & Biases under ``title``.

    Args:
        x: Iterable of bar labels.
        y: Iterable of bar values, paired with ``x`` by position.
        title: Used both as the chart title and as the key passed to ``wandb.log``.
        x_name: Column name for the labels in the logged table.
        y_name: Column name for the values in the logged table.
        keep_order: If true, every label is prefixed with its zero-padded
            position so that sorting the labels alphabetically reproduces the
            input order.
    """
    labels = list(x)
    if keep_order:
        # Pad to the width of the largest index: without padding "10: ..." would
        # sort before "2: ..." as soon as there are more than ten bars. With ten
        # or fewer bars the width is 1 and the labels look exactly as before.
        width = len(str(max(len(labels) - 1, 0)))
        labels = [f"{idx:0{width}d}: {label}" for idx, label in enumerate(labels)]
    table = wandb.Table(
        data=[[label, value] for label, value in zip(labels, y)],
        columns=[x_name, y_name],
    )
    wandb.log({title: wandb.plot.bar(table, x_name, y_name, title=title)})


def create_histogram(data, min_value=None, max_value=None, bins=10):
    """Bin ``data`` and return bar-chart-ready labels and counts.

    Args:
        data: Array-like of numeric values (converted with ``np.asarray``).
        min_value: Lower edge of the first bin when ``bins`` is a count;
            defaults to the minimum of ``data``.
        max_value: Upper edge of the last bin when ``bins`` is a count;
            defaults to the maximum of ``data``.
        bins: Either the number of equal-width bins, or an explicit sequence of
            bin edges. With explicit edges ``min_value``/``max_value`` are not
            used.

    Returns:
        A ``(bin_names, numbers)`` tuple: ``bin_names`` is a list of
        ``"lower-upper"`` strings (one decimal place) and ``numbers`` is the
        count array returned by ``np.histogram``; both have one entry per bin.
    """
    data = np.asarray(data)
    if min_value is None:
        min_value = data.min()
    if max_value is None:
        max_value = data.max()

    if isinstance(bins, (int, np.integer)):
        # N bins are delimited by N + 1 edges; asking linspace for only `bins`
        # points would silently produce one bin fewer than requested.
        bin_edges = np.linspace(min_value, max_value, num=bins + 1)
    else:
        bin_edges = bins

    numbers, _ = np.histogram(data, bins=bin_edges)
    bin_names = [
        f"{lower:.1f}-{upper:.1f}"
        for lower, upper in zip(bin_edges[:-1], bin_edges[1:])
    ]

    return bin_names, numbers

In [None]:
from collections import Counter

# Log the class balance of each split as a bar chart: one bar per label,
# ordered by label value, with the image count as the bar height.
for labels, title in (
    (y_train, "Labels in training data"),
    (y_test, "Labels in test data"),
):
    num_images_per_label = Counter(labels)
    ordered_counts = sorted(num_images_per_label.items())
    x, num_images = zip(*[(str(label), count) for label, count in ordered_counts])
    log_bar(x, num_images, title, x_name="Label", y_name="# images")

In [None]:
# Show one raw training image inline and log the same image to W&B.
i = 0
sample_image = x_train[i]
print(f"Sample {i} is number {y_train[i]}")
plt.imshow(sample_image)

image = wandb.Image(sample_image, caption=f"Training sample {i} is a {y_train[i]}")
wandb.log({"Example training images": image})

In [None]:
# Show another raw training image inline and log it under the same W&B key.
i = 1000
sample_image = x_train[i]
print(f"Sample {i} is number {y_train[i]}")
plt.imshow(sample_image)

image = wandb.Image(sample_image, caption=f"Training sample {i} is a {y_train[i]}")
wandb.log({"Example training images": image})

In [None]:
# Show a third raw training image inline and log it under the same W&B key.
i = 2000
sample_image = x_train[i]
print(f"Sample {i} is number {y_train[i]}")
plt.imshow(sample_image)

image = wandb.Image(sample_image, caption=f"Training sample {i} is a {y_train[i]}")
wandb.log({"Example training images": image})

In [None]:
# Print the label stored for training sample 1000.
label_of_sample_1000 = y_train[1000]
print("Label: ", label_of_sample_1000)

In [None]:
# Record the value range (taken over both splits) and the dtype of the raw
# pixels in the run summary.
min_value = min(x_train.min(), x_test.min())
max_value = max(x_train.max(), x_test.max())
raw_stats = {"min": min_value, "max": max_value, "dtype": str(x_train.dtype)}
wandb_run.summary["raw"] = raw_stats

# Histogram of the raw training pixel intensities. Note that the bins span the
# training split's own min/max (create_histogram's defaults), not the combined
# min_value/max_value computed above.
bin_names, train_hist = create_histogram(x_train)
log_bar(
    bin_names,
    train_hist,
    "Raw training data",
    x_name="bin",
    y_name="# pixels",
    keep_order=True,
)

In [None]:
# Rescale both splits with the same affine map: divide by 128, then subtract 1
# (so an input of 0 becomes -1 and an input of 255 becomes 0.9921875).
x_train_norm, x_test_norm = (pixels / 128 - 1 for pixels in (x_train, x_test))

In [None]:
# Record the value range (taken over both splits) and the dtype of the
# preprocessed pixels in the run summary.
min_value = min(x_train_norm.min(), x_test_norm.min())
max_value = max(x_train_norm.max(), x_test_norm.max())
preprocessed_stats = {"min": min_value, "max": max_value, "dtype": str(x_train_norm.dtype)}
wandb_run.summary["preprocessed"] = preprocessed_stats

# Histogram of the preprocessed training pixels; as for the raw data, the bins
# span the training split's own min/max.
bin_names, train_hist = create_histogram(x_train_norm)
log_bar(
    bin_names,
    train_hist,
    "Preprocessed training data",
    x_name="bin",
    y_name="# pixels",
    keep_order=True,
)

In [None]:
# Show the first training image after preprocessing and log it to W&B.
i = 0
sample_image = x_train_norm[i]
print(f"Sample {i} is number {y_train[i]}")
plt.imshow(sample_image)

image = wandb.Image(sample_image, caption=f"Training sample {i} is a {y_train[i]}")
wandb.log({"Example training image (preprocessed)": image})

In [None]:
# Balance the test set: keep the same number of images for every label (the
# count of the rarest label), drawn without replacement, then shuffle so the
# kept samples are not grouped by label.

# A locally seeded generator makes the selection identical on every
# Restart & Run All instead of depending on the global NumPy random state.
SUBSAMPLE_SEED = 42
subsample_rng = np.random.default_rng(SUBSAMPLE_SEED)

num_images_per_label = Counter(y_test)
min_number_of_labels = min(num_images_per_label.values())

indexes_to_keep = []
for label in num_images_per_label:
    label_indexes = np.where(y_test == label)[0]
    indexes_to_keep.extend(
        subsample_rng.choice(
            label_indexes,
            size=min_number_of_labels,
            replace=False,
        ).tolist()
    )

subsample_rng.shuffle(indexes_to_keep)

# The same index list is applied to images and labels so they stay aligned.
x_test_norm_subsamp = x_test_norm[indexes_to_keep]
y_test_subsamp = y_test[indexes_to_keep]

assert len(x_test_norm_subsamp) == len(y_test_subsamp) == min_number_of_labels * len(num_images_per_label)

In [None]:
# Display the shape of the subsampled test images; the first dimension is the
# number of kept samples.
x_test_norm_subsamp.shape

In [None]:
# Spot-check that images and labels are still aligned after subsampling and
# shuffling: print the label of the first kept sample and show its image.
print(f"This should be a {y_test_subsamp[0]}")
plt.imshow(x_test_norm_subsamp[0])

# Build the logged image from the subsampled array itself. Reusing `image`
# would log a training sample left over from an earlier cell, which says
# nothing about whether the subsampled test data is in sync.
verification_image = wandb.Image(
    x_test_norm_subsamp[0],
    caption=f"Subsampled test sample 0 should be a {y_test_subsamp[0]}",
)
wandb.log({"Verify still in sync": verification_image})

In [None]:
# Log the label distribution of the subsampled test set, one bar per label
# ordered by label value.
num_images_per_label = Counter(y_test_subsamp)
ordered_counts = sorted(num_images_per_label.items())
labels_, num_images = zip(*[(str(label), number) for label, number in ordered_counts])
log_bar(
    labels_,
    num_images,
    "Labels in test data (subsampled)",
    x_name="Label",
    y_name="# images",
)