## Installation

In [1]:
!python -m pip install datasets transformers tensorboard tensorflow ipywidgets opencv-python tensorflow-datasets
!git-lfs --version

git-lfs/3.4.0 (GitHub; darwin arm64; go 1.20.6)


Log in to the Hugging Face Hub (only needed the first time)

In [2]:
from huggingface_hub import notebook_login

# Render the Hugging Face login widget in the cell output.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Check that TensorFlow can see the GPU, and select the pretrained checkpoint

In [74]:
from tensorflow.config.experimental import list_physical_devices

# List the GPUs TensorFlow can see; an empty list means none was detected.
visible_gpus = list_physical_devices('GPU')
print(visible_gpus)

# Hub id of the pretrained checkpoint that the rest of the notebook fine-tunes.
model_id = "google/vit-base-patch16-224-in21k"

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Now create the dataset. This is also the place to define data augmentation.

In [75]:
from transformers import ViTImageProcessor
from datasets import load_dataset

# Image processor matching the pretrained checkpoint selected by `model_id`.
image_processor = ViTImageProcessor.from_pretrained(model_id)


def transform(batch):
    """Turn a raw batch into model inputs.

    Args:
        batch: mapping with an "image" entry (iterable of images) and a
            "label" entry.

    Returns:
        The output of ``image_processor`` (TensorFlow tensors) with the
        batch labels stored under the "labels" key.
    """
    # NOTE(review): images are passed through unchanged — assumes they are
    # already in a mode the processor accepts (e.g. RGB); confirm.
    images = list(batch["image"])
    encoded = image_processor(images, return_tensors="tf")
    encoded["labels"] = batch["label"]
    return encoded

# Load the street-view images and attach `transform`, so that it is applied
# whenever examples are accessed.
dataset = load_dataset("streetview_images_cropped", data_dir="./").with_transform(transform)

loading configuration file preprocessor_config.json from cache at /Users/au724747/.cache/huggingface/hub/models--google--vit-base-patch16-224-in21k/snapshots/7cbdb7ee3a6bcdf99dae654893f66519c480a0f8/preprocessor_config.json
size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'shortest_edge', 'longest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.
Image processor ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}



Resolving data files:   0%|          | 0/42570 [00:00<?, ?it/s]

Separate test dataset

In [76]:
# Hold out 15% of the examples as the test split.
test_size = 0.15
# Fixed seed so the split is identical on every run. Without it the held-out
# images change between runs, which makes results non-reproducible and lets
# test images leak into training when a run is resumed from a saved model.
split_seed = 42

processed_dataset = (
    dataset["train"]
    .shuffle(seed=split_seed)
    .train_test_split(test_size=test_size, seed=split_seed)
)
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 36184
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 6386
    })
})

Specify hyperparameters

In [77]:
from huggingface_hub import HfFolder
import tensorflow as tf

train_split = processed_dataset["train"]

# Label vocabulary. Ids are kept as strings in both mappings:
# id2label maps "0" -> name and label2id maps name -> "0".
class_labels = train_split.features["label"].names
num_images_train = train_split.num_rows
id2label = {str(idx): name for idx, name in enumerate(class_labels)}
label2id = {name: idx for idx, name in id2label.items()}

# Training hyperparameters.
num_train_epochs = 20
train_batch_size = 32
eval_batch_size = 32
learning_rate = 6e-5
weight_decay_rate = 0.01
num_warmup_steps = 0

# Local output directory and Hub repository, both derived from the
# checkpoint name (the part of `model_id` after the "/").
model_name = model_id.split("/")[1]
output_dir = model_name
hub_token = HfFolder.get_token()
hub_model_id = f"dl-au-tamas-jedrek/{model_name}-street-view"


In [78]:
import json

import numpy as np

# Pairwise distances between classes, keyed by label name on both levels.
# Decode explicitly as UTF-8: label names contain non-ASCII characters
# (e.g. "ø"), and with a different platform default encoding the keys would
# no longer match the entries of `label2id`.
with open("data/distances.json", "r", encoding="utf-8") as infile:
    distances = json.load(infile)

# Dense matrix indexed by the integer form of the ids in `label2id`.
# Pairs that are absent from the file keep the initial value 0.
num_classes = len(label2id)
mat_distances = np.zeros((num_classes, num_classes))
for label_a, row in distances.items():
    row_idx = int(label2id[label_a])
    for label_b, distance in row.items():
        mat_distances[row_idx, int(label2id[label_b])] = distance
mat_distances

array([[0.        , 2.12150204, 2.10150743, ..., 1.43083091, 2.64981238,
        1.05595098],
       [2.12150204, 0.        , 0.0631969 , ..., 0.7217327 , 2.86360224,
        2.20667979],
       [2.10150743, 0.0631969 , 0.        , ..., 0.68933549, 2.90697122,
        2.21733372],
       ...,
       [1.43083091, 0.7217327 , 0.68933549, ..., 0.        , 2.77736714,
        1.73784865],
       [2.64981238, 2.86360224, 2.90697122, ..., 2.77736714, 0.        ,
        1.60919535],
       [1.05595098, 2.20667979, 2.21733372, ..., 1.73784865, 1.60919535,
        0.        ]])

Get model, specify loss and metrics

In [79]:
from transformers import TFViTForImageClassification, create_optimizer
import tensorflow as tf

# Create the optimizer with weight decay, plus its learning-rate schedule.
# The schedule length is measured in optimizer steps (batches), not images.
# Counting images made the decay about `train_batch_size` times too long, so
# the learning rate barely decreased during training.
# Floor division matches the training pipeline: prepare_tf_dataset drops the
# last partial batch when shuffle=True.
steps_per_epoch = num_images_train // train_batch_size
num_train_steps = steps_per_epoch * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

# Load the pretrained ViT checkpoint with a classification head sized for
# the label set of this dataset.
classifier_config = dict(
    num_labels=len(class_labels),
    id2label=id2label,
    label2id=label2id,
)
model = TFViTForImageClassification.from_pretrained(model_id, **classifier_config)

# Print the layer / parameter overview.
model.summary()

# Lookup table of pairwise class distances, indexed [true_id, predicted_id].
tensor_distances = tf.convert_to_tensor(mat_distances, dtype=tf.float32)


def customLoss(y_true, y_pred):
    """Sparse categorical cross-entropy scaled by a class-distance weight.

    The weight of each sample is the entry of ``tensor_distances`` for the
    pair (true class, arg-max predicted class).

    Args:
        y_true: integer class ids; flattened to one id per sample.
        y_pred: logits, one row per sample (arg-max is taken over axis 1).

    Returns:
        Per-sample cross-entropy (from logits) multiplied by the looked-up
        distance.
    """
    # NOTE(review): for a correct prediction the weight is a diagonal entry
    # of the distance matrix; if the diagonal is 0, such samples contribute
    # zero loss and zero gradient — confirm this is intended.
    true_ids = tf.reshape(y_true, [-1])
    predicted_ids = tf.argmax(y_pred, axis=1)
    pair_indices = tf.stack((true_ids, predicted_ids), axis=1)
    pair_distance = tf.gather_nd(tensor_distances, pair_indices)
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        true_ids, y_pred, from_logits=True
    )
    return cross_entropy * pair_distance


# Loss used for training. The plain alternative is kept for reference:
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = customLoss

# Metrics tracked during training and validation: top-1 and top-3 accuracy.
top1_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")
top3_accuracy = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3, name="top-3-accuracy")
metrics = [top1_accuracy, top3_accuracy]

loading configuration file config.json from cache at /Users/au724747/.cache/huggingface/hub/models--google--vit-base-patch16-224-in21k/snapshots/7cbdb7ee3a6bcdf99dae654893f66519c480a0f8/config.json
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "Aabenraa Kommune",
    "1": "Aalborg Kommune000",
    "10": "Aarhus Kommune10001",
    "100": "Roskilde Kommune",
    "101": "Rudersdal Kommune",
    "102": "R\u00f8dovre Kommune",
    "103": "Sams\u00f8 Kommune",
    "104": "Silkeborg Kommune0",
    "105": "Silkeborg Kommune1",
    "106": "Skanderborg Kommune",
    "107": "Skive Kommune",
    "108": "Slagelse Kommune",
    "109": "Solr\u00f8d Kommune",
    "11": "Aarhus Kommune1001",
    "110": "Sor\u00f8 Kommune",
    "111": "Stevns Kommune",
    "112": "Str

Model: "tf_vi_t_for_image_classification_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vit (TFViTMainLayer)        multiple                  85798656  
                                                                 
 classifier (Dense)          multiple                  99201     
                                                                 
Total params: 85897857 (327.67 MB)
Trainable params: 85897857 (327.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [80]:
# NOTE(review): the model summary reports 0 non-trainable parameters, so this
# assignment looks redundant — confirm before removing it.
model.vit.embeddings.trainable = True
# Attach the optimizer, loss and metrics defined in the earlier cells.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

Transform dataset for training

In [81]:
# Training batches are shuffled.
tf_train_dataset = model.prepare_tf_dataset(processed_dataset['train'], batch_size=train_batch_size, shuffle=True)
# Evaluation gains nothing from shuffling. In addition, prepare_tf_dataset
# defaults drop_remainder to the value of shuffle, so shuffle=True silently
# excluded the final partial batch from every evaluation. With shuffle=False
# all test examples are scored.
tf_eval_dataset = model.prepare_tf_dataset(processed_dataset['test'], batch_size=eval_batch_size, shuffle=False)

Log to TensorBoard and push the model to the Hub after every epoch

In [82]:
import os
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard as TensorboardCallback, EarlyStopping

# Callbacks passed to model.fit:
#   - TensorBoard logs are written to <output_dir>/logs
#   - the model is pushed to the Hub repository `hub_model_id`
callbacks = [
    TensorboardCallback(log_dir=os.path.join(output_dir, "logs")),
    # EarlyStopping(monitor="val_accuracy", patience=1),
    PushToHubCallback(output_dir, hub_model_id=hub_model_id, hub_token=hub_token),
]



/Users/au724747/projects/dl-geolocation-vit/vit-base-patch16-224-in21k is already a clone of https://huggingface.co/dl-au-tamas-jedrek/vit-base-patch16-224-in21k-street-view. Make sure you pull the latest changes with `repo.git_pull()`.


Train model

In [83]:
from transformers import logging as transformers_logging
# Emit info-level messages from transformers while training.
transformers_logging.set_verbosity_info()

# Train for `num_train_epochs` epochs, validating on the held-out split.
train_results = model.fit(
    tf_train_dataset,
    epochs=num_train_epochs,
    validation_data=tf_eval_dataset,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/20
Tensor("ExpandDims:0", shape=(32, 1), dtype=int64) Tensor("tf_vi_t_for_image_classification_17/classifier/BiasAdd:0", shape=(32, 129), dtype=float32)
Tensor("ExpandDims:0", shape=(32, 1), dtype=int64) Tensor("tf_vi_t_for_image_classification_17/classifier/BiasAdd:0", shape=(32, 129), dtype=float32)


Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 2/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 3/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 4/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 5/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 6/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 7/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 8/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 9/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 10/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 11/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 12/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 13/20

Configuration saved in vit-base-patch16-224-in21k/config.json
Model weights saved in vit-base-patch16-224-in21k/tf_model.h5


Epoch 14/20