In [1]:
from pathlib import Path
import tensorflow as tf

2025-10-12 13:21:25.817887: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Print how many physical GPU devices TensorFlow detects on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


# Problem #1: Generating a Normalized Coordinate Grid
**Source** : Gemini

**Context**
In many computer vision tasks, particularly in object detection (e.g., YOLO, SSD), we divide an image into a grid. Each cell in this grid is responsible for predicting objects located within it. To do this, the model needs to know the location of each grid cell. Your task is to generate a tensor that contains the normalized (x, y) coordinates of the center of each cell.

**Your Task**
Write a Python function generate_normalized_grid(grid_size) that takes an integer grid_size and returns a TensorFlow tensor with the following properties:

**Shape**: (grid_size, grid_size, 2)

**Data Type**: tf.float32

**Content**: The tensor should represent a grid where the last dimension [..., 0] holds the normalized x-coordinates and [..., 1] holds the normalized y-coordinates. The coordinates must be normalized to the range [0.0, 1.0).

**Normalization Logic**: For a grid of size N, the center of the cell at (row, col) has coordinates ((col + 0.5) / N, (row + 0.5) / N).

**Example**
If grid_size = 2, the expected output tensor is:
```python
<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
[[[0.25, 0.25],  # Cell at (row=0, col=0) -> (x=0.25, y=0.25)
  [0.75, 0.25]], # Cell at (row=0, col=1) -> (x=0.75, y=0.25)

 [[0.25, 0.75],  # Cell at (row=1, col=0) -> (x=0.25, y=0.75)
  [0.75, 0.75]]]># Cell at (row=1, col=1) -> (x=0.75, y=0.75)
```
Notice how the x-coordinate increases along the columns and the y-coordinate increases along the rows.


## Thoughts
* So here we need a vectorized solution that creates a tensor whose values depend on its indices.
* I think it would help to initialize the grid cells with index values. 
* We can initialize the grid with zeroes and then use `scatter_nd_update` to update the row cells and column cells.

### Update
* Found a simpler way to do this: created 2 columns using tf.range and tf.repeat and combined them to create a tensor grid where each cell value represents its coordinate value.
* After that calculation was as simple as broadcasting addition and division 


## Solution 1

In [3]:
@tf.function
def generate_normalized_grid(grid_size = 2):
    """Return the normalized (x, y) centre of every cell of a square grid.

    Args:
        grid_size: Number of cells along each side of the grid (the notebook
            passes a Python int; defaults to 2).

    Returns:
        A float32 tensor of shape (grid_size, grid_size, 2). The entry at
        [row, col] is ((col + 0.5) / grid_size, (row + 0.5) / grid_size):
        channel 0 is x and grows along the columns, channel 1 is y and grows
        along the rows.
    """
    cell_index = tf.range(grid_size, dtype=tf.float32)
    # x depends only on the column: treat the indices as a single row and
    # copy that row once per grid row -> shape (grid_size, grid_size).
    x_index = tf.repeat(cell_index[tf.newaxis, :], repeats=grid_size, axis=0)
    # y depends only on the row: treat the indices as a single column and
    # copy that column once per grid column -> shape (grid_size, grid_size).
    y_index = tf.repeat(cell_index[:, tf.newaxis], repeats=grid_size, axis=1)
    # Pair the indices up as (x, y) on a new trailing axis.
    index_grid = tf.stack([x_index, y_index], axis=-1)
    # Move from the cell corner to the cell centre, then normalize by grid size.
    return (index_grid + 0.5) / grid_size

In [4]:
# Smoke test: with the default grid_size of 2 this should reproduce the 2x2
# example grid from the problem statement.
generate_normalized_grid()

I0000 00:00:1760300491.093246   63937 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6053 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:2d:00.0, compute capability: 7.5


<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[0.25, 0.25],
        [0.75, 0.25]],

       [[0.25, 0.75],
        [0.75, 0.75]]], dtype=float32)>

## Solution 2 - using tf.meshgrid

In [5]:


def generate_grid_cooridinates(grid_size = 2):
    """Build a square grid whose cells hold their own (col, row) index.

    Args:
        grid_size: Number of cells along each side of the grid.

    Returns:
        A float32 tensor of shape (grid_size, grid_size, 2) where the entry
        at [row, col] is (col, row), i.e. the un-normalized (x, y) index.
    """
    axis_indices = tf.range(grid_size, dtype=tf.float32)
    # With Cartesian ('xy') indexing the first output varies along the
    # columns (x) and the second along the rows (y).
    col_index, row_index = tf.meshgrid(axis_indices, axis_indices, indexing='xy')
    return tf.stack([col_index, row_index], axis=2)
    
def generate_normalized_grid(grid_size = 2):
    """Return the normalized (x, y) centre of every cell of a square grid.

    Meshgrid-based variant; defining it replaces any earlier function of the
    same name in the notebook namespace.

    Args:
        grid_size: Number of cells along each side of the grid.

    Returns:
        A tensor of shape (grid_size, grid_size, 2) whose entry at [row, col]
        is ((col + 0.5) / grid_size, (row + 0.5) / grid_size).
    """
    cell_indices = generate_grid_cooridinates(grid_size=grid_size)
    # Shift each index by half a cell to land on the cell centre.
    cell_centers = cell_indices + 0.5
    return cell_centers / grid_size


In [6]:
# Default 2x2 normalized grid; later cells pass it to the scaling functions.
normalized_grid = generate_normalized_grid()

# Problem #2 : Scaling the Grid to Image Coordinates
**Source**: Gemini

**Context**
In our object detection project, the normalized grid you just created is a generic, resolution-independent representation. However, to actually use it with a specific image, we need to convert those [0.0, 1.0) coordinates into actual pixel coordinates. For example, the center of the top-left cell in a 13x13 grid might be (0.038, 0.038) in normalized space, but on a 416x416 pixel image, that corresponds to pixel (16, 16).

**Your Task**
Write a Python function `scale_grid_to_pixels(normalized_grid, image_shape)` that takes two arguments:

**normalized_grid**: The output tensor from our previous problem, with shape (grid_size, grid_size, 2).

**image_shape**: A 1D TensorFlow tensor or a Python tuple/list of two integers, **`[height, width]`**.

The function should return a new tensor with the same shape as normalized_grid, but where the (x, y) coordinates have been scaled to the pixel space of the image.

**Scaling Logic**:

`pixel_x = normalized_x * width`

`pixel_y = normalized_y * height`

**Example**:
Given the normalized_grid for grid_size = 2:
```
[[[0.25, 0.25], [0.75, 0.25]],
 [[0.25, 0.75], [0.75, 0.75]]]
```
And an image_shape of [416, 416], the expected output is:
```
<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
[[[104., 104.],  # (0.25*416, 0.25*416)
  [312., 104.]], # (0.75*416, 0.25*416)

 [[104., 312.],  # (0.25*416, 0.75*416)
  [312., 312.]]]># (0.75*416, 0.75*416)
```
**Important**: Note the order. The image_shape is (height, width), but our coordinate grid is (x, y). Your solution will need to handle this correctly.

## Thoughts
* First impression of this problem is that it is straightforward, just a simple multiplication. Let's try that and see.

In [7]:
@tf.function
def scale_grid_to_pixels(normalized_grid, image_shape):
    """Scale normalized (x, y) grid coordinates to pixel coordinates.

    Args:
        normalized_grid: Tensor whose last axis holds (x, y) pairs, e.g. the
            (grid_size, grid_size, 2) output of generate_normalized_grid.
        image_shape: [height, width] of the target image; it is cast to
            float32 before use.

    Returns:
        A tensor shaped like normalized_grid with x multiplied by the image
        width and y multiplied by the image height.
    """
    # image_shape is ordered [height, width] but the grid is ordered (x, y),
    # so pick index 1 (width) first and index 0 (height) second.
    width_height = tf.gather(
        params=tf.cast(image_shape, dtype=tf.float32),
        indices=[1, 0],
        axis=0,
    )
    # The 2-element scale vector broadcasts over the grid's trailing axis.
    return tf.multiply(normalized_grid, width_height)

In [8]:
# image_shape is ordered [height, width]; here both are 416.
image_shape = tf.constant(value=[416,416])
# For the 2x2 grid, 0.25 should map to 104 and 0.75 to 312.
scale_grid_to_pixels(normalized_grid=normalized_grid,image_shape=image_shape)

<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[104., 104.],
        [312., 104.]],

       [[104., 312.],
        [312., 312.]]], dtype=float32)>

# Problem #3: Batch-Scaling Grids for Multiple Images

**Context**
To train a neural network efficiently, we process multiple images at once in a "batch". Our grid scaling logic needs to support this. We'll have one common normalized_grid, but a list of different image shapes—one for each image in the batch. Your task is to perform the scaling operation for the entire batch in a single, vectorized call.

**Your Task**
Write a function batch_scale_grids(normalized_grid, batch_image_shapes) that takes:

`normalized_grid: The (grid_size, grid_size, 2) tensor from Problem #1.`

`batch_image_shapes: A 2D tensor of shape (batch_size, 2), where each row is a [height, width] pair.`

The function should return a single tensor of shape (batch_size, grid_size, grid_size, 2) containing the scaled grid for each image.

**Example**
Given normalized_grid (for grid_size=2) and batch_image_shapes:

```python
# A batch of 2 images with different shapes
batch_image_shapes = tf.constant([[416, 416],  # Image 1 is 416x416
                                  [800, 600]], # Image 2 is 800x600 (height x width)
                                 dtype=tf.int32)
```
The goal is to multiply the single (2, 2, 2) normalized grid with the (2, 2) batch of shapes to produce a (2, 2, 2, 2) - (batch_size, grid_size, grid_size, 2) output tensor.

## Thoughts
* This is interesting: the shapes of these two tensors are different, so direct tensor multiplication won't work.
* We can try and use `tf.newaxis` to add axis to normalized grid and then do a tf.reshape to get the desired output. 
* We'll also need to reorder elements of batch image shape like we did before. 

In [9]:
def batch_normalized_grid(normalized_grid, batch_image_shapes):
    """Scale one shared normalized grid to the pixel space of every image.

    Args:
        normalized_grid: (grid_size, grid_size, 2) tensor of normalized
            (x, y) cell centres.
        batch_image_shapes: (batch_size, 2) tensor whose rows are
            [height, width]; it is cast to float32 before use.

    Returns:
        A (batch_size, grid_size, grid_size, 2) tensor in which x is
        multiplied by each image's width and y by each image's height.
    """
    # Flip every [height, width] row to [width, height] so it lines up with
    # the grid's (x, y) channel order.
    width_height = tf.reverse(
        tf.cast(batch_image_shapes, dtype=tf.float32), axis=[-1]
    )
    # (batch_size, 2) -> (batch_size, 1, 1, 2)
    per_image_scale = width_height[:, tf.newaxis, tf.newaxis, :]
    # (grid_size, grid_size, 2) -> (1, grid_size, grid_size, 2)
    shared_grid = normalized_grid[tf.newaxis, :]
    # Broadcasting the two shapes yields (batch_size, grid_size, grid_size, 2).
    return tf.multiply(shared_grid, per_image_scale)

In [10]:
# A batch of two image shapes, each row ordered [height, width].
batch_image_shapes = tf.constant([[416, 416],  # Image 1: height=416, width=416
                                  [800, 600]], # Image 2: height=800, width=600
                                 dtype=tf.int32)

# Pixel-space grids for the whole batch: (batch_size, grid_size, grid_size, 2).
normalized_grids = batch_normalized_grid(normalized_grid=normalized_grid, batch_image_shapes=batch_image_shapes)


# Problem #4: Generating Anchor Box Grids

## **Context**

So far, we have the coordinates for the *center* of each grid cell. In modern object detectors (like YOLO), each grid cell doesn't just predict one object; it's responsible for several "anchor boxes" of different pre-defined shapes and sizes (e.g., a tall box, a wide box, a large square box). The model's job isn't to predict a box from scratch, but rather to predict small *adjustments* to the closest matching anchor box.

Our task is to generate the full set of anchor boxes for every grid cell across every image in our batch.

## **Your Task**

Write a function `generate_anchor_grids(scaled_grids, anchor_boxes)` that takes:

1.  `scaled_grids`: The output from our previous problem—a tensor of pixel coordinates for the grid centers, with shape `(batch_size, grid_size, grid_size, 2)`.
2.  `anchor_boxes`: A 2D tensor of shape `(num_anchors, 2)`, where each row is a `[width, height]` pair for a pre-defined anchor.

The function should return a tensor of shape `(batch_size, grid_size, grid_size, num_anchors, 4)`. This final tensor represents the specific bounding boxes (in `[x_min, y_min, x_max, y_max]` format) for every anchor at every grid location.

**Calculation Logic**:
For each grid center `(cx, cy)` from `scaled_grids` and each anchor size `(w, h)` from `anchor_boxes`:
* `x_min = cx - w / 2`
* `y_min = cy - h / 2`
* `x_max = cx + w / 2`
* `y_max = cy + h / 2`

**Core Challenge**: This is another broadcasting puzzle, but with more dimensions. You'll need to expand both `scaled_grids` and `anchor_boxes` so you can perform the `center +/- size/2` calculation. After calculating the `min` and `max` coordinates, you will need to combine them to form the final `(..., 4)` dimension.

This is the most complex problem yet, but it uses the exact same principles you've already mastered. Good luck!

In [11]:
@tf.function
def generate_anchor_grids(scaled_grids, anchor_boxes):
    """Place every anchor box on every grid-cell centre.

    Args:
        scaled_grids: (batch_size, grid_size, grid_size, 2) tensor of
            (cx, cy) cell centres in pixels.
        anchor_boxes: (num_anchors, 2) tensor of [width, height] anchor sizes.

    Returns:
        A (batch_size, grid_size, grid_size, num_anchors, 4) tensor of boxes
        in [x_min, y_min, x_max, y_max] format.
    """
    # (batch, grid, grid, 2) -> (batch, grid, grid, 1, 2): make room for the
    # anchor axis.
    centers = scaled_grids[:, :, :, tf.newaxis, :]
    # (num_anchors, 2) -> (1, 1, 1, num_anchors, 2), halved so it can be added
    # to and subtracted from the centres directly.
    half_sizes = anchor_boxes[tf.newaxis, tf.newaxis, tf.newaxis, :, :] / 2
    top_left = centers - half_sizes
    bottom_right = centers + half_sizes
    # Join (x_min, y_min) and (x_max, y_max) on the last axis.
    return tf.concat([top_left, bottom_right], axis=-1)

In [12]:

# Shape: (batch_size=1, grid_size=2, grid_size=2, 2)
# i.e. one 2x2 grid => 4 grid centers, each given as (cx, cy) in pixels.
scaled_grids = tf.constant(
    [[[[100., 100.], [300., 100.]],
      [[100., 300.], [300., 300.]]]],
    dtype=tf.float32
)

# Shape: (num_anchors=3, 2)
# i.e. 3 anchors, each defined by its [width, height].
anchor_boxes = tf.constant(
    [[10., 10.],  # Anchor 1: width=10, height=10
     [20., 10.],  # Anchor 2: width=20, height=10
     [10., 20.]], # Anchor 3: width=10, height=20
    dtype=tf.float32
)

# Expected result shape: (1, 2, 2, 3, 4), boxes as [x_min, y_min, x_max, y_max].
anchor_grid = generate_anchor_grids(scaled_grids=scaled_grids,anchor_boxes=anchor_boxes)
anchor_grid
  


<tf.Tensor: shape=(1, 2, 2, 3, 4), dtype=float32, numpy=
array([[[[[ 95.,  95., 105., 105.],
          [ 90.,  95., 110., 105.],
          [ 95.,  90., 105., 110.]],

         [[295.,  95., 305., 105.],
          [290.,  95., 310., 105.],
          [295.,  90., 305., 110.]]],


        [[[ 95., 295., 105., 305.],
          [ 90., 295., 110., 305.],
          [ 95., 290., 105., 310.]],

         [[295., 295., 305., 305.],
          [290., 295., 310., 305.],
          [295., 290., 305., 310.]]]]], dtype=float32)>

In [13]:
# Inspect a single box: batch 0, grid cell (row 0, col 0), anchor index 1
# (width=20, height=10) centred on (100, 100).
print(anchor_grid[0, 0, 0, 1])

tf.Tensor([ 90.  95. 110. 105.], shape=(4,), dtype=float32)


# Problem #5: Calculating Intersection over Union (IoU)

## **Context**

Intersection over Union (IoU) is a number from 0 to 1 that measures how much two bounding boxes overlap. It's the ratio of the area of their intersection to the area of their union.

IoU is critical for two main reasons:

1.  **During Training**: We use IoU to match our generated anchor boxes to the ground truth object boxes. An anchor with a high IoU to a ground truth box is considered a "positive" example responsible for predicting that object.
2.  **During Inference**: We use IoU in a process called Non-Max Suppression (NMS) to eliminate redundant, overlapping predictions for the same object.

Your task is to implement a fully vectorized function that calculates the pairwise IoU for two sets of boxes.

## **Your Task**

Write a function `calculate_iou(boxes1, boxes2)` that takes:

1.  `boxes1`: A tensor of shape `(N, 4)` representing N bounding boxes.
2.  `boxes2`: A tensor of shape `(M, 4)` representing M bounding boxes.

<!-- end list -->

  * The box format for both is `[x_min, y_min, x_max, y_max]`.

The function should return a 2D tensor of shape `(N, M)`, where `output[i, j]` is the IoU score between `boxes1[i]` and `boxes2[j]`.

## **Calculation Logic**

This is a multi-step calculation that will require broadcasting to compare every box from `boxes1` with every box from `boxes2`.

1.  **Expand Dims for Broadcasting**: Reshape `boxes1` to `(N, 1, 4)` and `boxes2` to `(1, M, 4)`.
2.  **Find Intersection Coordinates**:
      * The top-left corner of the intersection is `(max(box1_x_min, box2_x_min), max(box1_y_min, box2_y_min))`. Use `tf.maximum`.
      * The bottom-right corner is `(min(box1_x_max, box2_x_max), min(box1_y_max, box2_y_max))`. Use `tf.minimum`.
3.  **Calculate Intersection Area**:
      * Calculate the width and height of the intersection.
      * **Crucial Edge Case**: If the boxes don't overlap, the width or height can be negative. You must clip them at 0 (`tf.maximum(width, 0)`). The area is then `width * height`.
4.  **Calculate Union Area**:
      * Calculate the area of all boxes in `boxes1` and `boxes2`. Area is `(x_max - x_min) * (y_max - y_min)`.
      * The union area is `area1 + area2 - intersection_area`.
5.  **Calculate IoU**:
      * `IoU = intersection_area / union_area`.
      * **Crucial Edge Case**: To avoid dividing by zero if the union area is 0, add a tiny number (epsilon, e.g., `1e-7`) to the denominator.

-----

### Test Data

```python
# boxes1 has 2 boxes
boxes1 = tf.constant([[0, 0, 10, 10],   # Box A
                       [15, 15, 25, 25]], # Box B
                      dtype=tf.float32)

# boxes2 has 3 boxes
boxes2 = tf.constant([[5, 5, 15, 15],     # Box C (overlaps A)
                       [0, 0, 10, 10],     # Box D (identical to A)
                       [30, 30, 40, 40]],  # Box E (no overlap)
                      dtype=tf.float32)
```

## **Expected Output Shape**: `(2, 3)`

## **Sanity Check**

  * **IoU of Box A and Box D**: They are identical. The intersection is the area of the box (100), and the union is also the area of the box (100). The IoU should be `1.0`.
  * **IoU of Box B and Box E**: They have no overlap. The intersection area is 0. The IoU should be `0.0`.

This one brings everything together: broadcasting, element-wise math, slicing, and handling edge cases. Good luck!

In [31]:
def calculate_iou(boxes1, boxes2):
    """Compute the pairwise Intersection over Union of two sets of boxes.

    Args:
        boxes1: (N, 4) boxes in [x_min, y_min, x_max, y_max] format. May be a
            tensor or anything tf.convert_to_tensor accepts (e.g. nested
            lists). Non-floating dtypes are cast to float32.
        boxes2: (M, 4) boxes in the same format, handled the same way.

    Returns:
        An (N, M) floating-point tensor where entry [i, j] is the IoU of
        boxes1[i] and boxes2[j].
    """
    # Pixel boxes are frequently integer-typed. Without this conversion the
    # float epsilon below cannot be added to an integer tensor, and plain
    # Python lists cannot be indexed with tf.newaxis.
    boxes1 = tf.convert_to_tensor(boxes1)
    boxes2 = tf.convert_to_tensor(boxes2)
    if not boxes1.dtype.is_floating:
        boxes1 = tf.cast(boxes1, tf.float32)
    if not boxes2.dtype.is_floating:
        boxes2 = tf.cast(boxes2, tf.float32)

    # (N, 4) -> (N, 1, 4) and (M, 4) -> (1, M, 4) so every box in boxes1 is
    # compared against every box in boxes2 through broadcasting.
    expanded1 = boxes1[:, tf.newaxis, :]
    expanded2 = boxes2[tf.newaxis, :, :]

    # Corners of the intersection rectangle, each of shape (N, M, 2).
    top_left = tf.maximum(expanded1[:, :, 0:2], expanded2[:, :, 0:2])
    bottom_right = tf.minimum(expanded1[:, :, 2:], expanded2[:, :, 2:])

    # Width and height go negative when the boxes do not overlap; clip both
    # at 0 so that such pairs contribute zero intersection area.
    overlap = tf.maximum(bottom_right - top_left, 0)
    intersection_area = overlap[:, :, 0] * overlap[:, :, 1]

    # Individual box areas, shapes (N, 1) and (1, M).
    boxes1_area = (expanded1[:, :, 2] - expanded1[:, :, 0]) * \
        (expanded1[:, :, 3] - expanded1[:, :, 1])
    boxes2_area = (expanded2[:, :, 2] - expanded2[:, :, 0]) * \
        (expanded2[:, :, 3] - expanded2[:, :, 1])

    union_area = boxes1_area + boxes2_area - intersection_area

    # Epsilon keeps the denominator non-zero when both boxes are degenerate.
    epsilon = 1e-7
    return intersection_area / (union_area + epsilon)

In [32]:
# Test data from the problem statement, boxes as [x_min, y_min, x_max, y_max].
# boxes1 has 2 boxes
boxes1 = tf.constant([[0, 0, 10, 10],   # Box A
                       [15, 15, 25, 25]], # Box B
                      dtype=tf.float32)

# boxes2 has 3 boxes
boxes2 = tf.constant([[5, 5, 15, 15],     # Box C (overlaps A)
                       [0, 0, 10, 10],     # Box D (identical to A)
                       [30, 30, 40, 40]],  # Box E (no overlap)
                      dtype=tf.float32)

# Expected shape (2, 3): IoU(A, C) = 25 / 175 ~= 0.1429, IoU(A, D) = 1.0,
# IoU(B, E) = 0.0.
ious= calculate_iou(boxes1=boxes1,boxes2=boxes2)
ious

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.14285715, 1.        , 0.        ],
       [0.        , 0.        , 0.        ]], dtype=float32)>

In [29]:
# Scratch cell exploring tf.slice(input, begin, size) on a (3, 2, 3) tensor.
# Only the value of the last expression is shown as the cell output; the
# trailing comments record what each call returns.
t = tf.constant([[[1, 1, 1], [2, 2, 2]],
                 [[3, 3, 3], [4, 4, 4]],
                 [[5, 5, 5], [6, 6, 6]]])

print(t.shape)
tf.slice(t, [1, 0, 0], [1, 1, 3])  # [[[3, 3, 3]]]
tf.slice(t, [1, 0, 0], [1, 2, 3])  # [[[3, 3, 3],
                                   #   [4, 4, 4]]]
tf.slice(t, [1, 0, 0], [2, 1, 3])  # [[[3, 3, 3]],
                                   #  [[5, 5, 5]]]

(3, 2, 3)


<tf.Tensor: shape=(2, 1, 3), dtype=int32, numpy=
array([[[3, 3, 3]],

       [[5, 5, 5]]], dtype=int32)>