# Configure GPU memory


In [1]:
import os
import tensorflow as tf
from transformers.utils import logging

# GPU memory configuration.
# NOTE(review): the XLA_PYTHON_CLIENT_* variables are documented by JAX (its XLA
# client) rather than TensorFlow -- confirm they are needed in this notebook.
# They do not impose a fixed cap such as 4 GB: PREALLOCATE=false disables
# up-front allocation and MEM_FRACTION=0.9 is a fraction of the GPU memory.
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.9"

logging.set_verbosity_info()

# Ask TensorFlow to grow its GPU allocation on demand instead of reserving
# the whole device at start-up.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be set before the GPUs are initialised; report the
    # problem instead of aborting when this cell is re-run in a live kernel.
    print(err)

2025-02-22 13:51:47.437867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740228707.448585  567204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740228707.451999  567204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Evaluate pretrained DETR on the COCO dataset

NOTE: GenAI tools were used for debugging and coding assistance.

Load Dataset (COCO)

In [4]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from PIL import Image
import os
import torch
import time
from transformers import AutoImageProcessor, DetrForObjectDetection
from prettytable import PrettyTable

# Define paths.
# The dataset root can be overridden with the COCO_ROOT_DIR environment
# variable so the notebook is not tied to one user's home directory; the
# default keeps the original location.
coco_root_dir = os.environ.get(
    "COCO_ROOT_DIR", "/home/utn/firi22ka/Desktop/jenga/mlp/g10/datasets"
)
coco_annotations_path = f"{coco_root_dir}/annotations/instances_val2017.json"
coco_val_images = f"{coco_root_dir}/val2017"
num_img = 1000  # number of images to be evaluated
# Load COCO annotations
coco_gt = COCO(coco_annotations_path)
# Select the first num_img image IDs
image_ids = coco_gt.getImgIds()[:num_img]
print(f'Selected {len(image_ids)} for evaluation')



loading annotations into memory...
Done (t=0.56s)
creating index...
index created!
Selected 1000 for evaluation


Helper function to calculate IoU

In [5]:
def calculate_iou(boxA, boxB):
    """
    Return the Intersection over Union (IoU) of two axis-aligned boxes.

    Both boxes are indexable as [x_min, y_min, x_max, y_max].
    Returns 0 when the union area is not positive.
    """
    # Corners of the overlapping rectangle (if the boxes overlap at all).
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp each side at zero so disjoint boxes yield an empty intersection.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    union = area_a + area_b - intersection

    if union > 0:
        return intersection / union
    return 0

Define the evaluation function for the COCO dataset

In [None]:
def run_coco_evaluation(coco_gt, image_ids, coco_val_images, image_processor, model,
                        score_threshold=0.9, iou_threshold=0.5):
    """
    Process the selected COCO images with the given model and
    calculate both COCO evaluation metrics and simple manual metrics.

    Args:
        coco_gt: pycocotools COCO object holding the ground-truth annotations.
        image_ids: image IDs to evaluate (may be empty).
        coco_val_images: directory containing the image files.
        image_processor: processor used for pre- and post-processing.
        model: detection model called as ``model(**inputs)``.
        score_threshold: minimum confidence for a detection to be kept
            (default 0.9, the value previously hard-coded).
        iou_threshold: minimum IoU for a detection to count as a manual
            true positive (default 0.5, the value previously hard-coded).

    Returns:
        dict with the keys "mAP", "mAR", "Manual Precision", "Manual Recall",
        "Manual F1 Score", "Mean IoU" and "Avg Inference Time per Image".

    Notes:
        * The manual matching is class-agnostic: only box overlap is checked,
          predicted labels are not compared with the ground-truth category.
        * The measured inference time covers pre-processing plus the forward
          pass; post-processing is excluded.
    """
    predictions = []
    total_inference_time = 0.0
    total_detections = 0
    total_iou = 0.0
    manual_tp, manual_fp, manual_fn = 0, 0, 0

    for img_id in image_ids:
        # Load image info and file
        img_info = coco_gt.loadImgs(img_id)[0]
        img_path = os.path.join(coco_val_images, img_info['file_name'])
        # Image.open is lazy and keeps the file open; the context manager
        # releases the handle as soon as the RGB copy has been created.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")

        # Load ground-truth boxes (convert [x, y, w, h] -> [x_min, y_min, x_max, y_max])
        ann_ids = coco_gt.getAnnIds(imgIds=img_id)
        anns = coco_gt.loadAnns(ann_ids)
        gt_boxes = []
        for ann in anns:
            x, y, w, h = ann['bbox']
            gt_boxes.append([x, y, x + w, y + h])

        # Measure inference time with a monotonic, high-resolution clock
        start_time = time.perf_counter()
        inputs = image_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        total_inference_time += time.perf_counter() - start_time

        # Post-process outputs (results contains boxes, labels, and scores).
        # PIL reports size as (width, height); reversed to (height, width).
        target_sizes = torch.tensor([image.size[::-1]])
        results = image_processor.post_process_object_detection(
            outputs, threshold=score_threshold, target_sizes=target_sizes
        )[0]
        pred_boxes = results["boxes"].tolist()
        pred_scores = results["scores"].tolist()
        pred_labels = results["labels"].tolist()

        total_detections += len(pred_boxes)

        # Accumulate predictions in COCO format ([x, y, width, height] boxes)
        for bbox, score, label in zip(pred_boxes, pred_scores, pred_labels):
            x_min, y_min, x_max, y_max = bbox
            width = x_max - x_min
            height = y_max - y_min
            predictions.append({
                "image_id": img_id,
                "category_id": int(label),
                "bbox": [x_min, y_min, width, height],
                "score": float(score)
            })

        # Manually computed metrics: each detection is compared with its
        # best-overlapping ground-truth box; a ground-truth box can be
        # matched at most once.
        matched_gt = set()
        for bbox in pred_boxes:
            best_iou = 0
            best_gt_idx = -1
            for idx, gt_box in enumerate(gt_boxes):
                iou = calculate_iou(bbox, gt_box)
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx
            total_iou += best_iou
            if best_iou >= iou_threshold and best_gt_idx not in matched_gt:
                manual_tp += 1
                matched_gt.add(best_gt_idx)
            else:
                manual_fp += 1
        # Ground-truth boxes that no detection claimed are false negatives
        manual_fn += (len(gt_boxes) - len(matched_gt))

    manual_precision = manual_tp / (manual_tp + manual_fp) if (manual_tp + manual_fp) > 0 else 0
    manual_recall = manual_tp / (manual_tp + manual_fn) if (manual_tp + manual_fn) > 0 else 0
    manual_f1 = (2 * manual_precision * manual_recall / (manual_precision + manual_recall)) if (manual_precision + manual_recall) > 0 else 0
    mean_iou = total_iou / total_detections if total_detections > 0 else 0
    # Guarded like the other ratios so an empty image list does not raise
    num_images = len(image_ids)
    avg_inference_time = total_inference_time / num_images if num_images > 0 else 0

    if predictions:
        coco_dt = coco_gt.loadRes(predictions)
        coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
        # Restrict the evaluation to the images that were actually processed
        coco_eval.params.imgIds = image_ids
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        mAP = coco_eval.stats[0]  # mAP @[0.5:0.95]
        mAR = coco_eval.stats[8]  # mAR @[0.5:0.95], maxDets=100
    else:
        # loadRes cannot be called without detections; report zeros instead
        mAP, mAR = 0, 0

    metrics = {
        "mAP": mAP,
        "mAR": mAR,
        "Manual Precision": manual_precision,
        "Manual Recall": manual_recall,
        "Manual F1 Score": manual_f1,
        "Mean IoU": mean_iou,
        "Avg Inference Time per Image": avg_inference_time
    }
    return metrics

Function to evaluate a model on the COCO dataset

In [7]:
def evaluate_model(model_dir, model_label="Model"):
    """
    Evaluate the DETR checkpoint stored at (or identified by) model_dir.

    Loads the image processor and model, runs run_coco_evaluation on the
    notebook-level coco_gt / image_ids / coco_val_images, prints the metrics
    as a table and returns the metrics dict.
    """
    print(f"\nEvaluating {model_label} from: {model_dir}")
    image_processor = AutoImageProcessor.from_pretrained(model_dir)
    model = DetrForObjectDetection.from_pretrained(model_dir)

    metrics = run_coco_evaluation(coco_gt, image_ids, coco_val_images, image_processor, model)

    # (row title, key in the metrics dict, format spec) for every table row
    row_specs = (
        ("COCO mAP (0.5:0.95)", "mAP", ".3f"),
        ("COCO mAR (maxDets=100)", "mAR", ".3f"),
        ("Manual Precision", "Manual Precision", ".3f"),
        ("Manual Recall", "Manual Recall", ".3f"),
        ("Manual F1 Score", "Manual F1 Score", ".3f"),
        ("Mean IoU", "Mean IoU", ".3f"),
        ("Avg Inference Time per Image (s)", "Avg Inference Time per Image", ".4f"),
    )

    # Print results in a table format
    table = PrettyTable()
    table.field_names = ["Metric", "Value"]
    for title, key, spec in row_specs:
        table.add_row([title, format(metrics[key], spec)])
    print(table)

    return metrics

In [8]:
# Baseline run: evaluate the pretrained DETR checkpoint; the returned metrics
# dict is the cell's displayed value.
pretrained_dir = "facebook/detr-resnet-50"
evaluate_model(model_dir=pretrained_dir, model_label="Pretrained Model")

loading configuration file preprocessor_config.json from cache at /home/utn/firi22ka/.cache/huggingface/hub/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}




Evaluating Pretrained Model from: facebook/detr-resnet-50


loading configuration file config.json from cache at /home/utn/firi22ka/.cache/huggingface/hub/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b/config.json
Model config DetrConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,
  "backbone": "resnet50",
  "backbone_config": null,
  "backbone_kwargs": {
    "in_chans": 3,
    "out_indices": [
      1,
      2,
      3,
      4
    ]
  },
  "bbox_cost": 5,
  "bbox_loss_coefficient": 5,
  "class_cost": 1,
  "classifier_dropout": 0.0,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dice_loss_coefficient": 1,
  "dilation": false,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_coefficient": 0.1,
  "giou_cost": 2,
  

Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.65s).
Accumulating evaluation results...
DONE (t=0.26s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.378
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.541
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.400
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.177
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.430
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.297
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.423
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.427
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=10

{'mAP': np.float64(0.377783967177354),
 'mAR': np.float64(0.426712983865974),
 'Manual Precision': 0.7605562150245888,
 'Manual Recall': 0.5794573643410853,
 'Manual F1 Score': 0.6577693040991421,
 'Mean IoU': 0.6920766636999975,
 'Avg Inference Time per Image': 0.5387422678470611}

# Evaluate DETR fine-tuned on the COCO dataset (10000 images)

In [9]:
# Local checkpoint directory of the fine-tuned model ("detr_finetuned_coco_10000");
# the returned metrics dict is the cell's displayed value.
finetuned_dir = "/home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_10000"
evaluate_model(model_dir=finetuned_dir, model_label="Finetuned Model")


loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_10000/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}

loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_10000/config.json
Model config DetrConfig {
  "_name_or_path": "facebook/detr-resnet-50",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,
  "backbone": "resnet50",
  "b


Evaluating Finetuned Model from: /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_10000


All model checkpoint weights were used when initializing DetrForObjectDetection.

All the weights of DetrForObjectDetection were initialized from the model checkpoint at /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_10000.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DetrForObjectDetection for predictions without further training.


Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=1.01s).
Accumulating evaluation results...
DONE (t=0.25s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.258
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.386
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.276
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.114
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.316
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.373
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.215
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.297
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.300
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=10

{'mAP': np.float64(0.25789124581124223),
 'mAR': np.float64(0.2996246568652838),
 'Manual Precision': 0.7282948157401624,
 'Manual Recall': 0.45193798449612405,
 'Manual F1 Score': 0.5577613011241329,
 'Mean IoU': 0.6616348767408118,
 'Avg Inference Time per Image': 0.5581301186084747}

# Evaluate DETR fine-tuned with a frozen backbone on 100 images

In [10]:
# Local checkpoint directory of the fine-tuned model
# ("detr_finetuned_coco_backobone_freeze_100"); the returned metrics dict is
# the cell's displayed value.
finetuned_dir = "/home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_backobone_freeze_100"
evaluate_model(model_dir=finetuned_dir, model_label="Finetuned Model")


loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_backobone_freeze_100/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}

loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_backobone_freeze_100/config.json
Model config DetrConfig {
  "_name_or_path": "facebook/detr-resnet-50",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,



Evaluating Finetuned Model from: /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_backobone_freeze_100


All model checkpoint weights were used when initializing DetrForObjectDetection.

All the weights of DetrForObjectDetection were initialized from the model checkpoint at /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco_backobone_freeze_100.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DetrForObjectDetection for predictions without further training.


Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=1.04s).
Accumulating evaluation results...
DONE (t=0.25s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.290
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.494
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.289
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.082
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.323
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.503
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.243
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.339
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.342
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=10

{'mAP': np.float64(0.28998345007148746),
 'mAR': np.float64(0.34204071507658573),
 'Manual Precision': 0.6810150979762287,
 'Manual Recall': 0.5478036175710594,
 'Manual F1 Score': 0.6071888872977231,
 'Mean IoU': 0.6124405322729614,
 'Avg Inference Time per Image': 0.5331497633457184}

# Evaluate fine-tuned DETR

In [11]:
# Local checkpoint directory of the fine-tuned model ("detr_finetuned_coco");
# the returned metrics dict is the cell's displayed value.
finetuned_dir = "/home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco"
evaluate_model(model_dir=finetuned_dir, model_label="Finetuned Model")

loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "pad_size": null,
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}

loading configuration file /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco/config.json
Model config DetrConfig {
  "_name_or_path": "facebook/detr-resnet-50",
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,
  "backbone": "resnet50",
  "backbone_conf


Evaluating Finetuned Model from: /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco


All model checkpoint weights were used when initializing DetrForObjectDetection.

All the weights of DetrForObjectDetection were initialized from the model checkpoint at /home/utn/firi22ka/Desktop/jenga/mlp/g10/test/detr_finetuned_coco.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DetrForObjectDetection for predictions without further training.


Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=1.09s).
Accumulating evaluation results...
DONE (t=0.26s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.353
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.525
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.381
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.148
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.409
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.545
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.279
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.398
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.402
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=10

{'mAP': np.float64(0.3532021516871853),
 'mAR': np.float64(0.402203611106347),
 'Manual Precision': 0.7578729994837378,
 'Manual Recall': 0.5689922480620155,
 'Manual F1 Score': 0.6499889307062209,
 'Mean IoU': 0.6813058596162751,
 'Avg Inference Time per Image': 0.5185170247554779}