In [1]:
from pathlib import Path
import tensorflow as tf

2025-10-07 14:45:17.305593: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Ask TensorFlow which physical GPU devices it can see and report the count.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


# Problem #1: Generating a Normalized Coordinate Grid
**Source** : Gemini

**Context**
In many computer vision tasks, particularly in object detection (e.g., YOLO, SSD), we divide an image into a grid. Each cell in this grid is responsible for predicting objects located within it. To do this, the model needs to know the location of each grid cell. Your task is to generate a tensor that contains the normalized (x, y) coordinates of the center of each cell.

**Your Task**
Write a Python function generate_normalized_grid(grid_size) that takes an integer grid_size and returns a TensorFlow tensor with the following properties:

**Shape**: (grid_size, grid_size, 2)

**Data Type**: tf.float32

**Content**: The tensor should represent a grid where the last dimension [..., 0] holds the normalized x-coordinates and [..., 1] holds the normalized y-coordinates. The coordinates must be normalized to the range [0.0, 1.0).

**Normalization Logic**: For a grid of size N, the center of the cell at (row, col) has coordinates ((col + 0.5) / N, (row + 0.5) / N).

**Example**
If grid_size = 2, the expected output tensor is:
```python
<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
[[[0.25, 0.25],  # Cell at (row=0, col=0) -> (x=0.25, y=0.25)
  [0.75, 0.25]], # Cell at (row=0, col=1) -> (x=0.75, y=0.25)

 [[0.25, 0.75],  # Cell at (row=1, col=0) -> (x=0.25, y=0.75)
  [0.75, 0.75]]]># Cell at (row=1, col=1) -> (x=0.75, y=0.75)
```
Notice how the x-coordinate increases along the columns and the y-coordinate increases along the rows.


## Thoughts
* So here we need a vectorized solution that creates a tensor whose values depend on its indices.
* I think it would help to initialize the grid cells with index values. 
* We can initialize the grid with zeroes and then use `scatter_nd_update` to update the row cells and column cells.

### Update
* Found a simpler way to do this: created 2 columns using `tf.range` and `tf.repeat`, then combined them into a tensor grid where each cell holds its own (column, row) index.
* After that, the calculation was as simple as a broadcast addition and division.


## Solution 1

In [3]:
@tf.function
def generate_normalized_grid(grid_size = 2):
    """Build a grid of normalized cell-center coordinates.

    Args:
        grid_size: Number of cells along each side of the square grid.

    Returns:
        A float32 tensor of shape (grid_size, grid_size, 2). For the cell at
        (row, col), channel 0 is the x-coordinate (col + 0.5) / grid_size and
        channel 1 is the y-coordinate (row + 0.5) / grid_size.
    """
    cell_index = tf.range(grid_size, dtype=tf.float32)
    plane_shape = (grid_size, grid_size, 1)

    # Column index of every cell: constant down a column, increasing along a
    # row. For grid_size 2 this is [[0, 1], [0, 1]] -> becomes the x channel.
    x_index = tf.broadcast_to(cell_index[tf.newaxis, :, tf.newaxis], plane_shape)

    # Row index of every cell: constant along a row, increasing down a column.
    # For grid_size 2 this is [[0, 0], [1, 1]] -> becomes the y channel.
    y_index = tf.broadcast_to(cell_index[:, tf.newaxis, tf.newaxis], plane_shape)

    # Last axis is ordered (x, y), i.e. (column index, row index).
    index_grid = tf.concat([x_index, y_index], axis=2)

    # Shift by half a cell to reach the cell center, then normalize.
    return (index_grid + 0.5) / grid_size

In [4]:
# Display the 2x2 grid so it can be compared with the expected example output.
generate_normalized_grid(grid_size=2)

I0000 00:00:1759873521.938794   75287 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6053 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 SUPER, pci bus id: 0000:2e:00.0, compute capability: 7.5


<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[0.25, 0.25],
        [0.75, 0.25]],

       [[0.25, 0.75],
        [0.75, 0.75]]], dtype=float32)>

## Solution 2 - using tf.meshgrid

In [5]:


def generate_grid_cooridinates(grid_size = 2):
    """Return the (column, row) index of every cell in a square grid.

    Args:
        grid_size: Number of cells along each side of the grid.

    Returns:
        A float32 tensor of shape (grid_size, grid_size, 2) where channel 0 is
        the cell's column index (x) and channel 1 is its row index (y).
    """
    cell_indices = tf.range(grid_size, dtype=tf.float32)
    # With 'xy' indexing (meshgrid's default) the first output varies along
    # columns and the second along rows, which matches the (x, y) ordering.
    column_index, row_index = tf.meshgrid(cell_indices, cell_indices, indexing="xy")
    return tf.stack([column_index, row_index], axis=2)
    
def generate_normalized_grid(grid_size = 2):
    """Build normalized cell-center coordinates from the meshgrid index grid.

    Args:
        grid_size: Number of cells along each side of the square grid.

    Returns:
        A tensor of shape (grid_size, grid_size, 2) whose last axis is (x, y),
        computed as (index + 0.5) / grid_size for every cell index.
    """
    cell_index_grid = generate_grid_cooridinates(grid_size=grid_size)
    # Half-cell offset moves from the cell's corner index to its center.
    return (cell_index_grid + 0.5) / grid_size


In [6]:
# Keep the 2x2 normalized grid around; the scaling problems below reuse it.
normalized_grid = generate_normalized_grid(grid_size=2)

# Problem #2 : Scaling the Grid to Image Coordinates
**Source**: Gemini

**Context**
In our object detection project, the normalized grid you just created is a generic, resolution-independent representation. However, to actually use it with a specific image, we need to convert those [0.0, 1.0) coordinates into actual pixel coordinates. For example, the center of the top-left cell in a 13x13 grid might be (0.038, 0.038) in normalized space, but on a 416x416 pixel image, that corresponds to pixel (16, 16).

**Your Task**
Write a Python function `scale_grid_to_pixels(normalized_grid, image_shape)` that takes two arguments:

**normalized_grid**: The output tensor from our previous problem, with shape (grid_size, grid_size, 2).

**image_shape**: A 1D TensorFlow tensor or a Python tuple/list of two integers, **`[height, width]`**.

The function should return a new tensor with the same shape as normalized_grid, but where the (x, y) coordinates have been scaled to the pixel space of the image.

**Scaling Logic**:

`pixel_x = normalized_x * width`

`pixel_y = normalized_y * height`

**Example**:
Given the normalized_grid for grid_size = 2:
```
[[[0.25, 0.25], [0.75, 0.25]],
 [[0.25, 0.75], [0.75, 0.75]]]
```
And an image_shape of [416, 416], the expected output is:
```
<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
[[[104., 104.],  # (0.25*416, 0.25*416)
  [312., 104.]], # (0.75*416, 0.25*416)

 [[104., 312.],  # (0.25*416, 0.75*416)
  [312., 312.]]]># (0.75*416, 0.75*416)
```
**Important**: Note the order. The image_shape is (height, width), but our coordinate grid is (x, y). Your solution will need to handle this correctly.

## Thoughts
* First impression of this problem is that it is straightforward: just a simple multiplication. Let's try that and see.

In [7]:
@tf.function
def scale_grid_to_pixels(normalized_grid, image_shape):
    """Scale normalized (x, y) grid coordinates to pixel coordinates.

    Args:
        normalized_grid: Tensor whose last axis holds normalized (x, y) pairs.
        image_shape: Two values ordered [height, width].

    Returns:
        A float32 tensor shaped like `normalized_grid`, with x multiplied by
        the width and y multiplied by the height.
    """
    height_width = tf.cast(image_shape, dtype=tf.float32)
    # The grid stores (x, y) but the shape arrives as (height, width), so swap
    # the two entries to line width up with x and height with y.
    width_height = tf.gather(height_width, indices=[1, 0], axis=0)
    # The 2-element scale broadcasts across the grid's last axis.
    return tf.multiply(normalized_grid, width_height)

In [8]:
# A square 416x416 image, given as [height, width].
image_shape = tf.constant([416, 416])
scale_grid_to_pixels(normalized_grid, image_shape)

<tf.Tensor: shape=(2, 2, 2), dtype=float32, numpy=
array([[[104., 104.],
        [312., 104.]],

       [[104., 312.],
        [312., 312.]]], dtype=float32)>

# Problem #3: Batch-Scaling Grids for Multiple Images

**Context**
To train a neural network efficiently, we process multiple images at once in a "batch". Our grid scaling logic needs to support this. We'll have one common normalized_grid, but a list of different image shapes—one for each image in the batch. Your task is to perform the scaling operation for the entire batch in a single, vectorized call.

**Your Task**
Write a function batch_scale_grids(normalized_grid, batch_image_shapes) that takes:

`normalized_grid: The (grid_size, grid_size, 2) tensor from Problem #1.`

`batch_image_shapes: A 2D tensor of shape (batch_size, 2), where each row is an [height, width] pair.`

The function should return a single tensor of shape (batch_size, grid_size, grid_size, 2) containing the scaled grid for each image.

**Example**
Given normalized_grid (for grid_size=2) and batch_image_shapes:

```python
# A batch of 2 images with different shapes
batch_image_shapes = tf.constant([[416, 416],  # Image 1 is 416x416
                                  [800, 600]], # Image 2: height=800, width=600 (HxW)
                                 dtype=tf.int32)
```
The goal is to multiply the single (2, 2, 2) normalized grid with the (2, 2) batch of shapes to produce a (2, 2, 2, 2) - (batch_size, grid_size, grid_size, 2) output tensor.

## Thoughts
* This is interesting: the shapes of these two tensors are different, so direct tensor multiplication won't work.
* We can try and use `tf.newaxis` to add axis to normalized grid and then do a tf.reshape to get the desired output. 
* We'll also need to reorder elements of batch image shape like we did before. 

In [83]:
def batch_normalized_grid(normalized_grid, batch_image_shapes):
    """Scale one normalized grid to pixel coordinates for a batch of images.

    The exercise statement refers to this function as `batch_scale_grids`;
    the existing name is kept so current calls keep working.

    Args:
        normalized_grid: Tensor of shape (grid_size, grid_size, 2) whose last
            axis holds normalized (x, y) pairs.
        batch_image_shapes: Tensor of shape (batch_size, 2); each row is
            [height, width].

    Returns:
        A float32 tensor of shape (batch_size, grid_size, grid_size, 2) with
        x scaled by each image's width and y by its height.
    """
    shapes_hw = tf.cast(batch_image_shapes, dtype=tf.float32)
    # Flip each row from (height, width) to (width, height) to match (x, y).
    shapes_wh = tf.reverse(shapes_hw, axis=[-1])

    # (batch_size, 2) -> (batch_size, 1, 1, 2): one scale pair per image.
    per_image_scale = tf.expand_dims(tf.expand_dims(shapes_wh, axis=1), axis=1)
    # (grid_size, grid_size, 2) -> (1, grid_size, grid_size, 2): shared grid.
    shared_grid = tf.expand_dims(normalized_grid, axis=0)

    # Broadcasting the two yields (batch_size, grid_size, grid_size, 2).
    return tf.multiply(shared_grid, per_image_scale)

In [84]:
# Two images of different sizes; every row is [height, width].
batch_image_shapes = tf.constant(
    [
        [416, 416],  # image 1: height=416, width=416
        [800, 600],  # image 2: height=800, width=600
    ],
    dtype=tf.int32,
)

temp = batch_normalized_grid(normalized_grid, batch_image_shapes)
temp

<tf.Tensor: shape=(2, 2, 2, 2), dtype=float32, numpy=
array([[[[104., 104.],
         [312., 104.]],

        [[104., 312.],
         [312., 312.]]],


       [[[150., 200.],
         [450., 200.]],

        [[150., 600.],
         [450., 600.]]]], dtype=float32)>

In [79]:
# Scratch check: how tf.expand_dims changes a tensor's shape.
tensor = tf.constant([[1, 2, 3],
                      [4, 5, 6]])
print("Original tensor shape:", tensor.shape)

# Insert a size-1 axis at position 1, between the two existing dimensions.
expanded_tensor = tf.expand_dims(tensor, 1)
print("Expanded tensor shape (axis=1):", expanded_tensor.shape)


Original tensor shape: (2, 3)
Expanded tensor shape (axis=1): (2, 1, 3)


In [80]:
# Display the expanded tensor to see where the new size-1 axis was inserted.
expanded_tensor

<tf.Tensor: shape=(2, 1, 3), dtype=int32, numpy=
array([[[1, 2, 3]],

       [[4, 5, 6]]], dtype=int32)>

In [82]:
# Scratch check: inserting axes by indexing with tf.newaxis.
tensor = tf.constant([[1, 2, 3],
                      [4, 5, 6]])
print("Original tensor shape:", tensor.shape)

# Insert two size-1 axes (positions 1 and 2) between the existing dimensions.
# The plain ':' slices keep the first and last dimensions unchanged.
expanded_tensor = tensor[:, tf.newaxis, tf.newaxis, :]
print("Expanded tensor shape (tf.newaxis in middle):", expanded_tensor.shape)

Original tensor shape: (2, 3)
Expanded tensor shape (tf.newaxis in middle): (2, 1, 1, 3)
