# Finetuning on MIT Indoor Scenes

https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019

https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb


In [None]:
import PIL
import torch
import pprint
import os
import pandas as pd

from datasets import Dataset
from pathlib import Path
from transformers import ViTFeatureExtractor, ViTForImageClassification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


# Read data

In [None]:
# MODEL = "google/vit-base-patch16-224"
MODEL = "google/vit-base-patch16-224-in21k"
train_images = "/Users/vincent/datasets/mit_indoor_scenes/dataset/TrainImages.txt"
test_images = "/Users/vincent/datasets/mit_indoor_scenes/dataset/TestImages.txt"
train_images = pd.read_table(train_images)
test_images = pd.read_table(test_images)
data_base_path = "/Users/vincent/datasets/mit_indoor_scenes/dataset/indoorCVPR_09/Images/"

dataset_train = {}
dataset_val = {}
dataset_test = {}


## Get all Paths

In [None]:
def get_path_to_images(df: pd.DataFrame, data_base_path: str):
    return [Path(data_base_path, path) for path in df[df.columns[0]]]
list_train_and_val = get_path_to_images(train_images, data_base_path)
list_test = get_path_to_images(test_images, data_base_path)

In [None]:
list_train, list_val = train_test_split(list_train_and_val,train_size=0.8)

## Filter out JPEG Images

In [None]:
def get_jpeg_images(data_base_path: str, dataset: dict) -> dict:
    # python dicts are ordered since py3.6
    test_files_pixels_map_possible = {}
    test_files_pixels_map_not_possible = {}
    # for path in Path(data_base_path).rglob("*"):
    for path in data_base_path:
        path_as_str = str(path)
        if os.path.isfile(path):
            pixels = PIL.Image.open(path_as_str)
            # we only want jpeg type of images to avoid downstream errors
            if isinstance(pixels, PIL.JpegImagePlugin.JpegImageFile):
                # resize to 224 * 224
                pixels.thumbnail((224,224), PIL.Image.ANTIALIAS)
                test_files_pixels_map_possible[path_as_str] = pixels
            else:
                test_files_pixels_map_not_possible[path_as_str] = pixels


    pixels = list(test_files_pixels_map_possible.values())
    paths = list(test_files_pixels_map_possible.keys())

    dataset["paths"] = paths
    dataset["pixels"] = pixels
    print(".", end=" ")
    return dataset

In [None]:
dataset_train = get_jpeg_images(data_base_path=list_train, dataset=dataset_train)
dataset_val = get_jpeg_images(data_base_path=list_val, dataset=dataset_val)
dataset_test = get_jpeg_images(data_base_path=list_test, dataset=dataset_test)

## Extract Features

In [None]:

def extract_features(dataset: dict) -> dict:
    """Convert pixels into features."""
    feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)
    print(".", end=" ")
    pixel_values = []
    dict_ = {
        "paths": [],
        "pixels": []
        }
    for i, image in enumerate(dataset["pixels"]):
        try: 
            batch = feature_extractor(images=image, return_tensors="pt")
            # we want to go from 
            #
            #        {'pixel_values': tensor([[[[ 1.0000,  1.0000,  0.5686,  ..., -0.1451, -0.2471, -0.2078],
            #                [ 1.0000,  0.9843,  0.5137,  ..., -0.0745,  0.1608,  0.4353],
            # to
            #
            #        {'pixel_values'} : [tensor([[[ 1.0000,  1.0000,  0.5686,  ..., -0.1451, -0.2471, -0.2078],
            #                [ 1.0000,  0.9843,  0.5137 ...
            # at this point we are not sure why but we are following the notebook from Niels Rogge (link in title)
            # pixel_values = [ image for image in batch["pixel_values"]]
            pixel_values.append(batch["pixel_values"][0])
            # we only keep the records that we can properly process
            dict_["paths"].append(dataset["paths"][i])
            dict_["pixels"].append(dataset["pixels"][i])
        except ValueError as e:
            # some pictures cannot be processed. 
            # we catch the error here, discard the picture 
            # and continue to the next
            print(f"ValueError on {image} : {e}. discarding ...")
        
    dict_["pixel_values"] = pixel_values
    return dict_

In [None]:
dataset_train = extract_features(dataset_train)
dataset_val = extract_features(dataset_val)
dataset_test = extract_features(dataset_test)

## Construct Labels

In [None]:
def fit_labels(dataset: dict) -> dict:
    labels_list = [path.split("/")[-2] for path in dataset["paths"]]
    le = preprocessing.LabelEncoder()
    le.fit(labels_list)
    print(le.classes_)
    return le


def transform_labels(dataset: dict, label_encoder:preprocessing.LabelEncoder) -> dict:
    labels_list = [path.split("/")[-2] for path in dataset["paths"]]
    targets = label_encoder.transform(labels_list)
    targets = torch.as_tensor(targets)
    dataset["labels"] = targets
    return dataset

In [None]:
label_encoder = fit_labels(dataset=dataset_train)
targets = [ int(x) for x in list(label_encoder.transform(label_encoder.classes_))]
classes = label_encoder.classes_
class_no = len(classes)
print(f"classes: {classes}")
print(f"classes no: {class_no}")

dataset_train = transform_labels(dataset_train, label_encoder)
dataset_val = transform_labels(dataset_val, label_encoder)
dataset_test = transform_labels(dataset_test, label_encoder)

In [None]:
assert len(dataset_train["labels"]) == len(dataset_train["paths"]) == len(dataset_train["pixel_values"])
assert len(dataset_val["labels"]) == len(dataset_val["paths"]) == len(dataset_val["pixel_values"])

## Construct Dataset

In [None]:
dataset_train = Dataset.from_dict(dataset_train)
dataset_val = Dataset.from_dict(dataset_val)
dataset_test = Dataset.from_dict(dataset_test)

In [None]:
def collate_fn(examples):
    # Dataset.from_dict removes the tensor and converts it as a list
    # we convert the list here back as a tensor because we lost this
    # when converting to a "Dataset"
    pixel_values = torch.stack([torch.tensor(example["pixel_values"]) for example in examples])
    labels = torch.tensor([example["labels"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
        f"mit-indoor-scenes",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir='logs',
        remove_unused_columns=False,
)

# evaluation
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


# model
id2label = {id:label for id, label in zip(targets, classes)}
label2id = {label:id for id,label in zip(targets, classes )}
num_labels = len(classes)


pprint.pprint(f"id2label: {id2label}")
pprint.pprint(f"label2id: {label2id}")
pprint.pprint(f"num labels: {num_labels}")


In [None]:
model = ViTForImageClassification.from_pretrained(
            MODEL,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id
)

In [None]:
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)
trainer = Trainer(
    model,
    args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

trainer.train()

## Evaluation

In [None]:
outputs = trainer.predict(dataset_test)

In [None]:
pprint.pprint(outputs.metrics)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

y_true = outputs.label_ids
y_pred = outputs.predictions.argmax(1)

labels = list(label2id.keys())
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
fig, ax = plt.subplots(figsize=(100,100))
disp.plot(xticks_rotation=45, ax=ax)

## Push To HUB

- the best model was checkpoint 1281.
- load this model
- create a repo on huggingface to push a model "vincentclaes/mit-indoor-scenes"
- use the repo path to push the model.

In [None]:
from transformers import AutoModel
model = AutoModel.from_pretrained("mit-indoor-scenes/checkpoint-1281")

In [None]:
model.push_to_hub("vincentclaes/mit-indoor-scenes")

In [None]:
# feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
# inputs = feature_extractor(image, return_tensors="pt")
# model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
# outputs = model(**inputs)

# yeey, we can load the model pushed to the huggingface repo.
from transformers import ViTFeatureExtractor
from PIL import Image

image = Image.open("images/coffee-machine-in-office.jpeg")

feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)
inputs = feature_extractor(image, return_tensors="pt")
model = AutoModel.from_pretrained("vincentclaes/mit-indoor-scenes")
outputs = model(**inputs)

In [71]:

vars(outputs)

{'last_hidden_state': tensor([[[-0.2738,  0.1057,  0.1835,  ..., -0.3797, -0.1796,  0.4518],
          [-0.1293, -0.0596, -0.0943,  ..., -0.2971,  0.0511,  0.3609],
          [-0.1005, -0.0538, -0.0512,  ..., -0.2894,  0.0361,  0.4253],
          ...,
          [-0.2806, -0.1837, -0.0868,  ..., -0.0997,  0.0631,  0.4024],
          [-0.2779, -0.0071, -0.0388,  ..., -0.0938, -0.1475,  0.4918],
          [-0.3208, -0.1019,  0.0198,  ..., -0.2735, -0.1187,  0.5043]]],
        grad_fn=<NativeLayerNormBackward0>),
 'pooler_output': tensor([[ 5.4609e-03,  4.2200e-02, -5.9996e-02,  8.9833e-03,  1.8819e-02,
          -3.3277e-02,  2.1682e-01, -2.0431e-02,  1.5139e-02,  1.6508e-01,
          -1.5392e-01, -2.0770e-02, -1.3163e-01, -1.0191e-01,  1.0971e-01,
           6.0922e-02, -1.5648e-01, -2.3313e-02,  1.7571e-01,  2.2045e-01,
           1.1908e-01,  7.4136e-02,  8.9425e-02, -6.4859e-02, -1.2866e-01,
          -4.3641e-03,  6.9227e-03, -8.7397e-02,  3.3734e-02,  5.4693e-02,
           2.3472e