## Installation

In [None]:
!python -m pip install keras_cv datasets transformers tensorboard tensorflow ipywidgets opencv-python tensorflow-datasets scikit-learn
!git-lfs --version

Login to huggingface if first time

In [None]:
from huggingface_hub import notebook_login

notebook_login()


Show that the GPU is being used

In [1]:
from tensorflow.config.experimental import list_physical_devices
print(list_physical_devices('GPU'))

model_id = "google/vit-base-patch16-224-in21k"

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Now create the database, also this is the time to define data augmentation

In [2]:
from transformers import ViTImageProcessor
from datasets import load_dataset
from tensorflow import keras
from tensorflow.keras import layers
from keras_cv.layers import RandAugment
import tensorflow as tf

# Define the model ID and other parameters
num_layers = 2
magnitude = 0.15

# Load the ViTImageProcessor
image_processor = ViTImageProcessor.from_pretrained(model_id)

# Create RandAugment transformation
rand_augment = RandAugment(
    value_range=[-1,1],
    augmentations_per_image=num_layers,
    magnitude=magnitude,
)


def transform(batch):
    inputs = image_processor([x for x in batch["image"]], return_tensors="tf")
    inputs["labels"] = batch["label"]
    return inputs

def augment(batch):
    inputs = image_processor([x for x in batch["image"]], return_tensors="tf")
    transposed = tf.transpose(inputs["pixel_values"], perm=[0,3,2,1])
    augmented = rand_augment(transposed)
    inputs["pixel_values"] = tf.transpose(augmented, perm=[0,3,2,1])
    inputs["labels"] = batch["label"]
    return inputs

dataset = load_dataset("streetview_images_cropped", data_dir="./")

eval_size=.15
test_size=.05

dataset = dataset["train"].shuffle().train_test_split(test_size=test_size)
dataset_final_test = dataset['test'].with_transform(transform)

dataset = dataset["train"].train_test_split(test_size=eval_size)
dataset['train'] = dataset['train'].with_transform(augment)
dataset['test'] = dataset['test'].with_transform(transform)
processed_dataset = dataset

Using TensorFlow backend


Resolving data files:   0%|          | 0/42570 [00:00<?, ?it/s]

Specify hyperparameters

In [3]:
from huggingface_hub import HfFolder
import tensorflow as tf

class_labels = processed_dataset['train'].features["label"].names
num_images_train = processed_dataset['train'].num_rows
id2label = {str(i): label for i, label in enumerate(class_labels)}
label2id = {v: k for k, v in id2label.items()}

num_train_epochs = 5
train_batch_size = 32
eval_batch_size = 32
learning_rate = 6e-5
weight_decay_rate=0.01
num_warmup_steps=0
output_dir=model_id.split("/")[1]
hub_token = HfFolder.get_token()
hub_model_id = f'dl-au-tamas-jedrek/{model_id.split("/")[1]}-street-view'


In [4]:
import json

with open("data/distances.json", "r") as infile:
    distances = json.load(infile)

#make matrix with label2id
import numpy as np
mat_distances = np.zeros((len(label2id), len(label2id)))
for key in distances.keys():
    for key2 in distances[key].keys():
        mat_distances[int(label2id[key])][int(label2id[key2])] = distances[key][key2]
mat_distances

array([[0.        , 2.12150204, 2.10150743, ..., 1.43083091, 2.64981238,
        1.05595098],
       [2.12150204, 0.        , 0.0631969 , ..., 0.7217327 , 2.86360224,
        2.20667979],
       [2.10150743, 0.0631969 , 0.        , ..., 0.68933549, 2.90697122,
        2.21733372],
       ...,
       [1.43083091, 0.7217327 , 0.68933549, ..., 0.        , 2.77736714,
        1.73784865],
       [2.64981238, 2.86360224, 2.90697122, ..., 2.77736714, 0.        ,
        1.60919535],
       [1.05595098, 2.20667979, 2.21733372, ..., 1.73784865, 1.60919535,
        0.        ]])

Get model, specify loss and metrics

In [5]:
from transformers import TFViTForImageClassification, create_optimizer
import tensorflow as tf

# create optimizer wight weigh decay
num_train_steps = num_images_train * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

# load pre-trained ViT model
model = TFViTForImageClassification.from_pretrained(
    model_id,
    num_labels=len(class_labels),
    id2label=id2label,
    label2id=label2id,
)

tensor_distances = tf.convert_to_tensor(mat_distances, dtype=tf.float32)
def customLoss(y_true, y_pred):
    y_pred_label = tf.argmax(y_pred, axis=1)
    y_true = tf.reshape(y_true, [-1])
    indices = tf.stack((y_true, y_pred_label), axis=1)
    dist = tf.gather_nd(tensor_distances, indices)
    return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) + (2 * dist)
# define loss
#loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = customLoss

# define metrics 
metrics=[
    tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.SparseTopKCategoricalAccuracy(3, name="top-3-accuracy"),
]

2023-11-19 12:01:18.730850: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2023-11-19 12:01:18.730870: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-11-19 12:01:18.730873: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-11-19 12:01:18.730901: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-19 12:01:18.730917: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some layers from the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing TFViTForImageClassification: ['vit/pooler/dense/bias:0', 'vit/pooler/dense/kern

Model: "tf_vi_t_for_image_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vit (TFViTMainLayer)        multiple                  85798656  
                                                                 
 classifier (Dense)          multiple                  99201     
                                                                 
Total params: 85897857 (327.67 MB)
Trainable params: 85897857 (327.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
model.vit.embeddings.trainable = True
model.summary()
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

Model: "tf_vi_t_for_image_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vit (TFViTMainLayer)        multiple                  85798656  
                                                                 
 classifier (Dense)          multiple                  99201     
                                                                 
Total params: 85897857 (327.67 MB)
Trainable params: 85897857 (327.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Transform dataset for training

In [8]:
tf_train_dataset = model.prepare_tf_dataset(processed_dataset['train'], batch_size=train_batch_size, shuffle=True)
tf_eval_dataset = model.prepare_tf_dataset(processed_dataset['test'], batch_size=eval_batch_size, shuffle=True)
tf_test_dataset = model.prepare_tf_dataset(dataset_final_test, batch_size=eval_batch_size, shuffle=True)

InvalidArgumentError: Exception encountered when calling layer 'rand_augment' (type RandAugment).

in user code:

    File "/opt/homebrew/anaconda3/envs/tf/lib/python3.9/site-packages/keras_cv/src/layers/preprocessing/rand_augment.py", line 127, in _augment  *
        result = super()._augment(sample)
    File "/opt/homebrew/anaconda3/envs/tf/lib/python3.9/site-packages/keras_cv/src/layers/preprocessing/random_augmentation_pipeline.py", line 104, in _augment  *
        result = tf.cond(
    File "/opt/homebrew/anaconda3/envs/tf/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/homebrew/anaconda3/envs/tf/lib/python3.9/site-packages/keras_cv/src/layers/preprocessing/base_image_augmentation_layer.py", line 420, in call
        outputs = self._format_output(self._augment(inputs), metadata)
    File "/opt/homebrew/anaconda3/envs/tf/lib/python3.9/site-packages/keras_cv/src/layers/preprocessing/random_choice.py", line 90, in _augment
        selected_op = self._random_generator.random_uniform(

    InvalidArgumentError: Exception encountered when calling layer 'random_choice' (type RandomChoice).
    
    {{function_node __wrapped__StatelessRandomUniformIntV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} minval must be 0-D, got shape [2]
    	 [[{{node StatelessRandomUniformIntV2}}]] [Op:StatelessRandomUniformIntV2] name: 
    
    Call arguments received by layer 'random_choice' (type RandomChoice):
      • inputs={'images': 'tf.Tensor(shape=(224, 224, 3), dtype=float32)'}


Call arguments received by layer 'rand_augment' (type RandAugment):
  • inputs=tf.Tensor(shape=(1, 224, 224, 3), dtype=float32)

Run to display train images

In [None]:
import matplotlib.pyplot as plt

sample_images, sample_labels = next(iter(tf_train_dataset))
plt.figure(figsize=(10, 10))
for i, image in enumerate(sample_images[:9]):
    ax = plt.subplot(3, 3, i + 1)
    transposed = tf.transpose(image)
    plt.imshow(transposed.numpy())
    plt.axis("off")

Push metrics to hub after every epoch

In [None]:
import os
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard as TensorboardCallback, EarlyStopping

callbacks = []
callbacks.append(TensorboardCallback(log_dir=os.path.join(output_dir, "logs")))
#callbacks.append(EarlyStopping(monitor="val_accuracy",patience=1))
callbacks.append(PushToHubCallback(
    output_dir,
    hub_model_id=hub_model_id,
    hub_token=hub_token,
))



Train model

In [None]:
from transformers import logging as transformers_logging
transformers_logging.set_verbosity_info()
train_results = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=callbacks,
    epochs=num_train_epochs,
    verbose=1
)

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

predictions = np.array([])
true_labels = np.array([])

for x, y in tf_test_dataset:
    y_prob = model.predict(x)
    # Apply softmax to obtain probabilities
    probabilities = tf.nn.softmax(y_prob.logits, axis=-1).numpy()
    # Get the predicted labels (class with the highest probability)
    y_pred = tf.argmax(probabilities, axis=-1).numpy()

    predictions = np.concatenate([predictions, y_pred])
    print(y_pred)
    print(y.numpy())
    true_labels = np.concatenate([true_labels, y.numpy()])

cm = confusion_matrix(true_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()