# Load the model

In [49]:
from transformers import SegformerForSemanticSegmentation

# Load the model
model_directory = './scripts/segformer_output'
model_directory = 'guimCC/segformer-v0-gta'
original_model = SegformerForSemanticSegmentation.from_pretrained(model_directory)



In [46]:
from peft import PeftConfig, PeftModel

model_id = "guimCC/segformer-v0-gta-cityscapes"
#model_id = "./scripts/segformer-gta-cityscapes-lora"

from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor

checkpoint_dir = "guimCC/segformer-v0-gta"

image_processor = SegformerImageProcessor()

model = SegformerForSemanticSegmentation.from_pretrained(checkpoint_dir)

config = PeftConfig.from_pretrained(model_id)

# Load the Lora model
lora_model = PeftModel.from_pretrained(model, model_id)




In [38]:
lora_model.push_to_hub("guimCC/segformer-v0-gta-cityscapes")

CommitInfo(commit_url='https://huggingface.co/guimCC/segformer-v0-gta-cityscapes/commit/2b21925b822c9b80383960960b0082724d997407', commit_message='Upload model', commit_description='', oid='2b21925b822c9b80383960960b0082724d997407', pr_url=None, pr_revision=None, pr_num=None)

# Load the dataset

## GTA

In [3]:
from datasets import concatenate_datasets, DatasetDict, load_from_disk
import os

def load_batches(split_name, directory):
    batches = []
    batch_num = 0
    while True:
        batch_dir = os.path.join(directory, f"{split_name}_batch_{batch_num}.arrow")
        if not os.path.exists(batch_dir):
            break
        batch_dataset = load_from_disk(batch_dir)
        batches.append(batch_dataset)
        batch_num += 1
    return concatenate_datasets(batches) if batches else None

# Load each split
dataset_path = './gta_dataset'

train_dataset = load_batches('train', dataset_path)
validation_dataset = load_batches('validation', dataset_path)
test_dataset = load_batches('test', dataset_path)

# Create a DatasetDict
hf_datasets = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

gta_train_ds = hf_datasets["train"]
gta_test_ds = hf_datasets["test"].train_test_split(test_size=0.1)['test']
gta_val_ds = hf_datasets["validation"].train_test_split(test_size=0.1)['test']

## Cityscapes

In [3]:
from datasets import concatenate_datasets, DatasetDict, load_from_disk

# Load the dataset from disk
loaded_dataset = load_from_disk("./cityscapes_train_1000_dataset_v3")
print("Dataset loaded successfully.")

# Prepare train and test splits

loaded_dataset = loaded_dataset.train_test_split(test_size=0.1)
cty_test_ds = loaded_dataset["test"]


Dataset loaded successfully.


# Perform evaluation

## GTA

In [4]:
from transformers import SegformerImageProcessor
from PIL import Image
import numpy as np

processor = SegformerImageProcessor()


import json
from huggingface_hub import cached_download, hf_hub_url

repo_id = "huggingface/label-files"
filename = "cityscapes-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}


id2label[19] = 'ignore'
label2id['ignore'] = 19
num_labels = len(id2label)


def val_transforms(example_batch):
    images = [Image.fromarray(np.array(x, dtype=np.uint8)) for x in example_batch['image']]
    labels = [Image.fromarray(np.array(x, dtype=np.uint8), mode='L') for x in example_batch['mask']]
    
    # Ensure labels are within the expected range
    labels = [Image.fromarray(np.minimum(np.array(label), num_labels - 1), mode='L') for label in labels]
    
    inputs = processor(images=images, segmentation_maps=labels, return_tensors="pt")
    return inputs


import torch
from torch import nn
import evaluate

metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
  with torch.no_grad():
    logits, labels = eval_pred
    logits_tensor = torch.from_numpy(logits)
    # scale the logits to the size of the label
    logits_tensor = nn.functional.interpolate(
        logits_tensor,
        size=labels.shape[-2:],
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1)

    pred_labels = logits_tensor.detach().cpu().numpy()
    metrics = metric.compute(
        predictions=pred_labels,
        references=labels,
        num_labels=len(id2label),
        ignore_index=19,
        reduce_labels=processor.do_reduce_labels,
    )
    
    # add per category metrics as individual key-value pairs
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
    
    return metrics
  
gta_test_ds.set_transform(val_transforms)




## Cityscapes

In [53]:
from transformers import SegformerImageProcessor
from PIL import Image
import numpy as np

import json
from huggingface_hub import cached_download, hf_hub_url

repo_id = "huggingface/label-files"
filename = "cityscapes-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}


id2label[19] = 'ignore'
label2id['ignore'] = 19
num_labels = len(id2label)


# Preprocessing
from torchvision.transforms import ColorJitter

# Transofrms the color properities
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
from PIL import Image
import numpy as np



def handle_grayscale_image(image):
    np_image = np.array(image)
    if np_image.ndim == 2:
        tiled_image = np.tile(np.expand_dims(np_image, -1), 3)
        return Image.fromarray(tiled_image)
    else:
        return Image.fromarray(np_image)


def val_transforms(example_batch):
    images = [handle_grayscale_image(x) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs

import torch
from torch import nn
import evaluate

metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
    
    with torch.no_grad(): # Don't want to store the gradients while computing this metric since it's validation
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        # scale the logits to the size of the label
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        # currently using _compute instead of compute
        # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
        metrics = metric._compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=image_processor.do_reduce_labels,
        )

        # add per category metrics as individual key-value pairs
        per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
        per_category_iou = metrics.pop("per_category_iou").tolist()

        metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
        metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

        return metrics
  
cty_test_ds.set_transform(val_transforms)


# Evaluate

## Segformer

In [54]:
from transformers import Trainer, TrainingArguments

# Assuming evaluation doesn't require gradient updates
eval_args = TrainingArguments(
    output_dir='./segformer_evaluation/sgf-v0-cty',  # Directory to store evaluation results
    do_train=False,
    do_eval=True,
    eval_steps=10,
    logging_steps=5,
    per_device_eval_batch_size=10,
    dataloader_num_workers=0,
    remove_unused_columns=False,

)

trainer = Trainer(
    model=original_model,
    args=eval_args,
    eval_dataset=cty_test_ds,
    compute_metrics=compute_metrics  # Your metrics function as defined earlier
)

In [55]:
eval_results = trainer.evaluate()
print(eval_results)

  return F.conv2d(input, weight, bias, self.stride,


{'eval_loss': 1.2160677909851074, 'eval_mean_iou': 0.08249961911508377, 'eval_mean_accuracy': 0.3430836199348098, 'eval_overall_accuracy': 0.7057258167711694, 'eval_accuracy_road': nan, 'eval_accuracy_sidewalk': 0.452280847430867, 'eval_accuracy_building': 0.8751869702509556, 'eval_accuracy_wall': 0.33387868724360226, 'eval_accuracy_fence': 0.23689513284076605, 'eval_accuracy_pole': 0.149771875716871, 'eval_accuracy_traffic light': 0.0, 'eval_accuracy_traffic sign': 0.0, 'eval_accuracy_vegetation': 0.9201750423182221, 'eval_accuracy_terrain': 0.8118086934949409, 'eval_accuracy_sky': 0.9956422699811825, 'eval_accuracy_person': 0.3508328122335285, 'eval_accuracy_rider': 0.0, 'eval_accuracy_car': 0.7460295161214794, 'eval_accuracy_truck': 0.3030033111941599, 'eval_accuracy_bus': 0.0, 'eval_accuracy_train': 0.0, 'eval_accuracy_motorcycle': 0.0, 'eval_accuracy_bicycle': 0.0, 'eval_accuracy_ignore': nan, 'eval_iou_road': 0.0, 'eval_iou_sidewalk': 0.022881098458391723, 'eval_iou_building': 0.

  acc = total_area_intersect / total_area_label


## Lora

In [56]:
from transformers import Trainer, TrainingArguments

# Assuming evaluation doesn't require gradient updates
lora_eval_args = TrainingArguments(
    output_dir='./segformer_evaluation/sgf-v0-lora-cty',  # Directory to store evaluation results
    do_train=False,
    do_eval=True,
    eval_steps=10,
    logging_steps=5,
    per_device_eval_batch_size=10,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    label_names=["labels"]

)

# Compute Metrics Issue: https://github.com/huggingface/transformers/issues/29186
lora_trainer = Trainer(
    model=lora_model,
    args=lora_eval_args,
    eval_dataset=cty_test_ds,
    compute_metrics=compute_metrics  # Your metrics function as defined earlier
)

In [57]:
lora_eval_results = lora_trainer.evaluate()
print(lora_eval_results)

  return F.conv2d(input, weight, bias, self.stride,


{'eval_loss': 0.16964639723300934, 'eval_mean_iou': 0.23273853558468585, 'eval_mean_accuracy': 0.8557465871805577, 'eval_overall_accuracy': 0.922382745940558, 'eval_accuracy_road': nan, 'eval_accuracy_sidewalk': 0.8975153645087158, 'eval_accuracy_building': 0.9098147840378188, 'eval_accuracy_wall': 0.8705801914436413, 'eval_accuracy_fence': 0.8380793262853692, 'eval_accuracy_pole': 0.7568246119338312, 'eval_accuracy_traffic light': 0.789844189975596, 'eval_accuracy_traffic sign': 0.7714963744232037, 'eval_accuracy_vegetation': 0.9276372008469087, 'eval_accuracy_terrain': 0.9637375294597132, 'eval_accuracy_sky': 0.9979383652044556, 'eval_accuracy_person': 0.7645671081803195, 'eval_accuracy_rider': 0.6194160219615673, 'eval_accuracy_car': 0.9761392740201217, 'eval_accuracy_truck': 0.702689619115853, 'eval_accuracy_bus': 0.9686170212765958, 'eval_accuracy_train': 0.8235132158590308, 'eval_accuracy_motorcycle': 0.9681533440864248, 'eval_accuracy_bicycle': 0.8568750266308748, 'eval_accuracy

  iou = total_area_intersect / total_area_union
  acc = total_area_intersect / total_area_label


# Visualize

## GTA

In [13]:
# Formatting the output in a pretty table using Python's tabulate library

from tabulate import tabulate

output = {
    'eval_loss': 0.5304863452911377,
    'eval_mean_iou': 0.3863717727599142,
    'eval_mean_accuracy': 0.4594820622645841,
    'eval_overall_accuracy': 0.877796907688061,
    'eval_accuracy_road': 0.9489809431222332,
    'eval_accuracy_sidewalk': 0.8455630469421747,
    'eval_accuracy_building': 0.9184319410853448,
    'eval_accuracy_wall': 0.44873866548323804,
    'eval_accuracy_fence': 0.2258698878655819,
    'eval_accuracy_pole': 0.20655266367158562,
    'eval_accuracy_traffic light': 0.0,
    'eval_accuracy_traffic sign': 0.0,
    'eval_accuracy_vegetation': 0.8701828218735161,
    'eval_accuracy_terrain': 0.48447198018374665,
    'eval_accuracy_sky': 0.9759494728206349,
    'eval_accuracy_person': 0.46963642497888913,
    'eval_accuracy_rider': 0.0,
    'eval_accuracy_car': 0.9183919255986837,
    'eval_accuracy_truck': 0.7639999478392405,
    'eval_accuracy_bus': 0.6533894615622279,
    'eval_accuracy_train': 0.0,
    'eval_accuracy_motorcycle': 0.0,
    'eval_accuracy_bicycle': 0.0,
    'eval_accuracy_ignore': 'nan',
    'eval_iou_road': 0.9180418984751986,
    'eval_iou_sidewalk': 0.7313570464885094,
    'eval_iou_building': 0.8009566177938597,
    'eval_iou_wall': 0.37947135831448287,
    'eval_iou_fence': 0.2024631043010466,
    'eval_iou_pole': 0.17764894431211686,
    'eval_iou_traffic light': 0.0,
    'eval_iou_traffic sign': 0.0,
    'eval_iou_vegetation': 0.7098098136862394,
    'eval_iou_terrain': 0.4269937162526332,
    'eval_iou_sky': 0.9295866001833509,
    'eval_iou_person': 0.35261619649221454,
    'eval_iou_rider': 0.0,
    'eval_iou_car': 0.8107652652247773,
    'eval_iou_truck': 0.6620217799686436,
    'eval_iou_bus': 0.6257031137052117,
    'eval_iou_train': 0.0,
    'eval_iou_motorcycle': 0.0,
    'eval_iou_bicycle': 0.0,
    'eval_iou_ignore': 0.0,
    'eval_runtime': 686.5265,
    'eval_samples_per_second': 0.897,
    'eval_steps_per_second': 0.012
}

# Convert the dictionary to a list of lists for tabulate
data = [[key, value] for key, value in output.items()]

# Create the table
table = tabulate(data, headers=["Metric", "Value"], tablefmt="pretty")

print(table)


+-----------------------------+---------------------+
|           Metric            |        Value        |
+-----------------------------+---------------------+
|          eval_loss          | 0.5304863452911377  |
|        eval_mean_iou        | 0.3863717727599142  |
|     eval_mean_accuracy      | 0.4594820622645841  |
|    eval_overall_accuracy    |  0.877796907688061  |
|     eval_accuracy_road      | 0.9489809431222332  |
|   eval_accuracy_sidewalk    | 0.8455630469421747  |
|   eval_accuracy_building    | 0.9184319410853448  |
|     eval_accuracy_wall      | 0.44873866548323804 |
|     eval_accuracy_fence     | 0.2258698878655819  |
|     eval_accuracy_pole      | 0.20655266367158562 |
| eval_accuracy_traffic light |         0.0         |
| eval_accuracy_traffic sign  |         0.0         |
|  eval_accuracy_vegetation   | 0.8701828218735161  |
|    eval_accuracy_terrain    | 0.48447198018374665 |
|      eval_accuracy_sky      | 0.9759494728206349  |
|    eval_accuracy_person   

: 

## Cityscapes

In [13]:
# Formatting the output in a pretty table using Python's tabulate library

from tabulate import tabulate

output = {'eval_loss': 1.146903395652771, 'eval_mean_iou': 0.08527295022108815, 'eval_mean_accuracy': 0.35031907385611705, 'eval_overall_accuracy': 0.6403262338725791, 'eval_accuracy_road': 0.5707251169399141, 'eval_accuracy_sidewalk': 0.372826855881155, 'eval_accuracy_building': 0.8886840766712605, 'eval_accuracy_wall': 0.3608147036150444, 'eval_accuracy_fence': 0.10439456945390266, 'eval_accuracy_pole': 0.09889556124805779, 'eval_accuracy_traffic light': 0.0, 'eval_accuracy_traffic sign': 0.0, 'eval_accuracy_vegetation': 0.9053155791789973, 'eval_accuracy_terrain': 0.8172969812841195, 'eval_accuracy_sky': 0.9914449084746553, 'eval_accuracy_person': 0.10675872093023256, 'eval_accuracy_rider': 0.0, 'eval_accuracy_car': 0.7414439247994549, 'eval_accuracy_truck': 0.48972868217054266, 'eval_accuracy_bus': 0.20773272261888726, 'eval_accuracy_train': 0.0, 'eval_accuracy_motorcycle': 0.0, 'eval_accuracy_bicycle': 0.0, 'eval_accuracy_ignore': 'nan', 'eval_iou_road': 0.1931597801565552, 'eval_iou_sidewalk': 0.01675527174811858, 'eval_iou_building': 0.037787175770186185, 'eval_iou_wall': 0.05374355627270035, 'eval_iou_fence': 0.07243608432947611, 'eval_iou_pole': 0.0641508748672036, 'eval_iou_traffic light': 0.0, 'eval_iou_traffic sign': 0.0, 'eval_iou_vegetation': 0.06478703055444684, 'eval_iou_terrain': 0.3191420114236002, 'eval_iou_sky': 0.5165666641106099, 'eval_iou_person': 0.009816369079439752, 'eval_iou_rider': 0.0, 'eval_iou_car': 0.1253018669789748, 'eval_iou_truck': 0.11226121723678366, 'eval_iou_bus': 0.11955110189366774, 'eval_iou_train': 0.0, 'eval_iou_motorcycle': 0.0, 'eval_iou_bicycle': 0.0, 'eval_iou_ignore': 0.0, 'eval_runtime': 22.695, 'eval_samples_per_second': 4.406, 'eval_steps_per_second': 0.088}

# Convert the dictionary to a list of lists for tabulate
data = [[key, value] for key, value in output.items()]

# Create the table
table = tabulate(data, headers=["Metric", "Value"], tablefmt="pretty")

print(table)


+-----------------------------+----------------------+
|           Metric            |        Value         |
+-----------------------------+----------------------+
|          eval_loss          |  1.146903395652771   |
|        eval_mean_iou        | 0.08527295022108815  |
|     eval_mean_accuracy      | 0.35031907385611705  |
|    eval_overall_accuracy    |  0.6403262338725791  |
|     eval_accuracy_road      |  0.5707251169399141  |
|   eval_accuracy_sidewalk    |  0.372826855881155   |
|   eval_accuracy_building    |  0.8886840766712605  |
|     eval_accuracy_wall      |  0.3608147036150444  |
|     eval_accuracy_fence     | 0.10439456945390266  |
|     eval_accuracy_pole      | 0.09889556124805779  |
| eval_accuracy_traffic light |         0.0          |
| eval_accuracy_traffic sign  |         0.0          |
|  eval_accuracy_vegetation   |  0.9053155791789973  |
|    eval_accuracy_terrain    |  0.8172969812841195  |
|      eval_accuracy_sky      |  0.9914449084746553  |
|    eval_

## Cityscapes LoRA

In [63]:
# Formatting the output in a pretty table using Python's tabulate library

from tabulate import tabulate

output = {'eval_loss': 0.16964639723300934, 'eval_mean_iou': 0.23273853558468585, 'eval_mean_accuracy': 0.8557465871805577, 'eval_overall_accuracy': 0.922382745940558, 'eval_accuracy_road': 'nan', 'eval_accuracy_sidewalk': 0.8975153645087158, 'eval_accuracy_building': 0.9098147840378188, 'eval_accuracy_wall': 0.8705801914436413, 'eval_accuracy_fence': 0.8380793262853692, 'eval_accuracy_pole': 0.7568246119338312, 'eval_accuracy_traffic light': 0.789844189975596, 'eval_accuracy_traffic sign': 0.7714963744232037, 'eval_accuracy_vegetation': 0.9276372008469087, 'eval_accuracy_terrain': 0.9637375294597132, 'eval_accuracy_sky': 0.9979383652044556, 'eval_accuracy_person': 0.7645671081803195, 'eval_accuracy_rider': 0.6194160219615673, 'eval_accuracy_car': 0.9761392740201217, 'eval_accuracy_truck': 0.702689619115853, 'eval_accuracy_bus': 0.9686170212765958, 'eval_accuracy_train': 0.8235132158590308, 'eval_accuracy_motorcycle': 0.9681533440864248, 'eval_accuracy_bicycle': 0.8568750266308748, 'eval_accuracy_ignore': 'nan', 'eval_iou_road': 0.0, 'eval_iou_sidewalk': 0.13305125444486632, 'eval_iou_building': 0.05004134800803356, 'eval_iou_wall': 0.15970041613846794, 'eval_iou_fence': 0.3285951715453578, 'eval_iou_pole': 0.08824971764845747, 'eval_iou_traffic light': 0.2151458594329251, 'eval_iou_traffic sign': 0.16679112773125532, 'eval_iou_vegetation': 0.088245352649776, 'eval_iou_terrain': 0.4874015098830665, 'eval_iou_sky': 0.4333834831977473, 'eval_iou_person': 0.07837338585613725, 'eval_iou_rider': 0.11151547827649728, 'eval_iou_car': 0.14361598951923962, 'eval_iou_truck': 0.3716916584726319, 'eval_iou_bus': 0.2656261395959449, 'eval_iou_train': 0.6419832582099163, 'eval_iou_motorcycle': 0.4894189927184466, 'eval_iou_bicycle': 0.16920203278026452, 'eval_iou_ignore': 'nan', 'eval_runtime': 15.089, 'eval_samples_per_second': 6.627, 'eval_steps_per_second': 0.133}

# Convert the dictionary to a list of lists for tabulate
data = [[key, value] for key, value in output.items()]

# Create the table
table = tabulate(data, headers=["Metric", "Value"], tablefmt="pretty")

print(table)


+-----------------------------+---------------------+
|           Metric            |        Value        |
+-----------------------------+---------------------+
|          eval_loss          | 0.16964639723300934 |
|        eval_mean_iou        | 0.23273853558468585 |
|     eval_mean_accuracy      | 0.8557465871805577  |
|    eval_overall_accuracy    |  0.922382745940558  |
|     eval_accuracy_road      |         nan         |
|   eval_accuracy_sidewalk    | 0.8975153645087158  |
|   eval_accuracy_building    | 0.9098147840378188  |
|     eval_accuracy_wall      | 0.8705801914436413  |
|     eval_accuracy_fence     | 0.8380793262853692  |
|     eval_accuracy_pole      | 0.7568246119338312  |
| eval_accuracy_traffic light |  0.789844189975596  |
| eval_accuracy_traffic sign  | 0.7714963744232037  |
|  eval_accuracy_vegetation   | 0.9276372008469087  |
|    eval_accuracy_terrain    | 0.9637375294597132  |
|      eval_accuracy_sky      | 0.9979383652044556  |
|    eval_accuracy_person   