In [38]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
import torch.nn as nn

In [39]:
mot_dir = "MOT17/train"
print(os.listdir(mot_dir))

['MOT17-11-FRCNN', 'MOT17-02-SDP', 'MOT17-04-FRCNN', 'MOT17-13-DPM', 'MOT17-04-SDP', 'MOT17-09-FRCNN', 'MOT17-02-FRCNN', 'MOT17-04-DPM', 'MOT17-11-DPM', 'MOT17-05-SDP', 'MOT17-11-SDP', 'MOT17-05-FRCNN', 'MOT17-05-DPM', 'MOT17-10-DPM', 'MOT17-13-FRCNN', 'MOT17-09-DPM', 'MOT17-02-DPM', 'MOT17-10-FRCNN', 'MOT17-09-SDP', 'MOT17-13-SDP', 'MOT17-10-SDP']


In [40]:

def get_tracklet(gt_in, object_id):
    """
    Extract the tracklet for a given object ID.
    :param df: DataFrame containing the ground truth data.
    :param object_id: The ID of the object to extract the tracklet for.
    :return: DataFrame containing the tracklet.
    """
    tracklet  = []
    # tracklet.append(i) [for i in gt_in if gt_in[i] == object_id]
    for i in gt_in:
        # print(i)
        if i[1] == object_id:
            tracklet.append(i[2:6])
    return np.array(tracklet)

def tokenizer_frames(tracking, sliding_window):
    
    token  = [] 
    for i in range(sliding_window):
        current_bbox = tracking[i]
        # print("bounding box at {} frame : {}".format(i,current_bbox))
        # print("tracklet at Nth place is : ", tracking[sliding_window])
        delta = np.subtract(tracking[sliding_window], current_bbox)
        # print("delta is : ", delta)
        token.append(delta)
    
    return np.array(token)


for sequence in os.listdir(mot_dir):
    # seq = os.path.join(mot_dir, sequence)
    image_dir = os.path.join(mot_dir, sequence, "img1")
    gt_file = os.path.join(mot_dir, sequence, "gt", "gt.txt")
    # print(gt_file)

    gt_in = np.loadtxt(gt_file, delimiter = ",")

    # print(gt_in)

    ## Lets define 1 tracklet first
    object_id = 2 ## Tracklet with Object ID = 1
    tracklet = get_tracklet(gt_in, object_id)
    # print(" tracklet is : ", gt_in[tracklet])
    print(tracklet.shape)

    sliding_window = 50
    token = tokenizer_frames(tracklet, sliding_window)


    print("token shape is : ", token.shape)
    token = torch.tensor(token, dtype = torch.float32)

    input_dim = 4
    output_dim = 4
    embedding_layer = nn.Linear(input_dim, output_dim)

    embedded_tensor = embedding_layer(token)

    print(embedded_tensor)

    print(" shape of embedded tensot  : ", embedded_tensor.shape)
    
    break



(13, 4)


IndexError: index 50 is out of bounds for axis 0 with size 13

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from mamba_ssm import Mamba

class BiMambaEncoder(nn.Module):
    def __init__(self, d_model, n_state):
        super(BiMambaEncoder, self).__init__()
        self.d_model = d_model
        
        self.mamba = Mamba(d_model, n_state)

        # Norm and feed-forward network layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x):
        # Residual connection of the original input
        residual = x
        
        # Forward Mamba
        x_norm = self.norm1(x)
        mamba_out_forward = self.mamba(x_norm)

        # Backward Mamba
        x_flip = torch.flip(x_norm, dims=[1])  # Flip Sequence
        mamba_out_backward = self.mamba(x_flip)
        mamba_out_backward = torch.flip(mamba_out_backward, dims=[1])  # Flip back

        # Combining forward and backward
        mamba_out = mamba_out_forward + mamba_out_backward
        
        mamba_out = self.norm2(mamba_out)
        ff_out = self.feed_forward(mamba_out)

        output = ff_out + residual
        return output

# Initialize and test the model
d_model = 4
n_state = 64
model = BiMambaEncoder(d_model, n_state).cuda()
x = torch.rand(1, 50, d_model).cuda()  # Analog input data: (batch_size, seq_len, feature_dim)
output = model(x)
print(output.shape)  # Mamba Out: (32, 100, 512)

torch.Size([1, 50, 4])


## Initialize Tracklets

In [42]:
def initialize_tracklets(data):
    first_frame = int(np.min(data[:, 0]))
    first_frame_data = data[data[:, 0] == first_frame]
    # print(" first frame data : ", first_frame_data)

    tracklets = {}
    for row in first_frame_data:
        # print(" row is :", row)
        object_id = int(row[1])
        if object_id not in tracklets:
            tracklets[object_id] = [row[2:6]]
    return tracklets

### Update tracklets

In [43]:
def update_tracklets_for_frame(data, tracklets, frame):
    # unique_frames = np.unique(data[:, 0])


    # for frame in unique_frames:
    #     if frame == np.min(data[:, 0]):
    #         continue
    


    frame_data = data[data[:, 0] == frame]

    for row in frame_data:
        object_id = int(row[1])
        if object_id in tracklets:
            tracklets[object_id].append(row[2:6])
        else:
            ## Initialize a new tracklet if the object ID does not exist
            tracklets[object_id] = [row[2:6]]

    return tracklets
### Convert tracklet to numpy arrays

def convert_tracklets_to_numpy(tracklets):
    for object_id , tracklet_data in tracklets.items():
        tracklets[object_id] = np.array(tracklet_data)
    
    return tracklets



def load_data(filename):
    data = np.loadtxt(filename, delimiter=',', dtype=float)
    return data

In [44]:
filename = "MOT17/train/MOT17-02-SDP/gt/gt.txt"
data = load_data(filename)
tracklets = initialize_tracklets(data)

unique_frames = np.unique(data[:, 0])
for frame in unique_frames:
    if frame == 1:
        continue
    else:
        tracklets = update_tracklets_for_frame(data, tracklets, frame)
    # trackletsupdate_tracklets(data, tracklets) ## Updates all the tracklets at once. Can be changed to sending data per frame and updating the tracklets in a better way
tracklets = convert_tracklets_to_numpy(tracklets)


In [45]:
tracklets.get(2).shape

(56, 4)

## Tracklet Delta - Creating a Tokenizer function


In [47]:
def compute_deltas(tracklet_data):
    # Compute deltas as described
    deltas = []
    for i in range(len(tracklet_data) - 1):
        delta = tracklet_data[i+1] - tracklet_data[i]
        deltas.append(delta)
    return np.array(deltas)


# Example tracklet data: shape (num_frames, 4) with columns [cx, cy, w, h]
# tracklet_data = np.array([
#     [100, 150, 50, 60],
#     [105, 155, 50, 60],
#     [110, 160, 50, 60]
# ])


# Create delta values for input sequence

window_size = 50
tracklet_data = tracklets.get(3)[:window_size+1]

tracklet_deltas = compute_deltas(tracklet_data)
print(tracklet_deltas.shape)

(50, 4)


In [48]:
import torch.nn as nn

class TemporalTokenEmbedding(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(TemporalTokenEmbedding, self).__init__()
        self.embedding = nn.Linear(input_dim, embedding_dim)
    
    def forward(self, x):
        return self.embedding(x)

# Parameters
input_dim = tracklet_deltas.shape[1]  # Number of features in deltas (e.g., 4)
print(input_dim)
# Create and apply embedding layer
# embedding_layer = TemporalTokenEmbedding(input_dim, embedding_dim)
tracklet_delta_tensor = torch.tensor(tracklet_deltas, dtype=torch.float32).cuda()

4


## Steps to implement Bi-Mamba Encdoding Layer

#### Define Forward and Backward Mamba Modules: 
Implement the forward and backward Mamba modules.
#### Create a Bi-Mamba Block: 
Use the forward and backward modules to process input and apply normalization and MLP.
#### Assemble the Bi-Mamba Encoding Layer: 
Stack multiple Bi-Mamba blocks.

In [49]:
import torch
import torch.nn as nn
from mamba_ssm import Mamba



In [66]:

class BiMambaBlock(nn.Module):
    def __init__(self, d_model, n_state):
        super(BiMambaBlock, self).__init__()
        self.d_model = d_model
        
        self.mamba = Mamba(d_model, n_state)

        # Norm and feed-forward network layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x):
        # Residual connection of the original input
        residual = x
        
        # Forward Mamba
        x_norm = self.norm1(x)
        mamba_out_forward = self.mamba(x_norm)

        # Backward Mamba
        x_flip = torch.flip(x_norm, dims=[1])  # Flip Sequence
        mamba_out_backward = self.mamba(x_flip)
        mamba_out_backward = torch.flip(mamba_out_backward, dims=[1])  # Flip back
        # print("mamba out backward shape :", mamba_out_backward.shape)
    
        # Combining forward and backward
        mamba_out = mamba_out_forward + mamba_out_backward
        mamba_out1  = self.norm2(mamba_out)
        # print("mamba out 1 shape :", mamba_out1.shape)
    
        mamba_out2 = self.feed_forward(mamba_out)

        ff_out  = mamba_out1 + mamba_out2
        # output = ff_out + residualstart_index
        return ff_out


In [67]:
class BiMambaEncodingLayer(nn.Module):
    def __init__(self, embedding_dim, num_blocks):
        super(BiMambaEncodingLayer, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_blocks = num_blocks
        self.blocks = nn.ModuleList([BiMambaBlock(input_dim, embedding_dim) for _ in range(num_blocks)])

        self.mamba_block = BiMambaBlock(embedding_dim, embedding_dim)
    def forward(self, x):
        # print(" embedding dimension is : ", self.embedding_dim)
        for block in self.blocks:
            x = block(x)
        # x = self.mamba_block(x)
        # x = self.mamba_block(x)
        
            # x = block(x)
            # print(" x shape in mamba block is : ", x.shape)
        
        return x

In [133]:
class FullModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_blocks, prediction_dim):
        super(FullModel, self).__init__()
        self.temporal_token_embedding = TemporalTokenEmbedding(input_dim, embedding_dim)
        self.bi_mamba_encoding_layer = BiMambaEncodingLayer(embedding_dim, num_blocks)
        self.prediction_head = nn.Linear(embedding_dim, prediction_dim)
    
    def forward(self, x):
        x = self.temporal_token_embedding(x)
        # print(" x shape is : ", x.shape)
        # print(' type of x is : ', type(x))
        # x  =  x.unsqueeze(0)
        # print(" x after reshaping it is : ", x.shape)
        x = self.bi_mamba_encoding_layer(x)
        # print(" x shape after  bimamba encoding layer is : ", x.shape)
        # x = self.prediction_head(x) ## This returns (batch, contect_window, 4) where 4 is the bounind box
        
        # We only want the last element prediction
        x = self.prediction_head(x[:, -1, :]) 
        
        # print(" x shape after  prediction head layer : ", x.shape)
        
        return x

# Example usage
input_dim = tracklet_delta_tensor.shape[1]  # Number of features in deltas (e.g., 4)
embedding_dim = 128 # Example embedding dimension
num_blocks = 4  # Number of Bi-Mamba blocks
prediction_dim = 4  # Number of predicted offsets

# Create and apply model
# model = FullModel(input_dim, embedding_dim, num_blocks, prediction_dim).cuda()
# predictions = model(tracklet_delta_tensor).cuda()
# print(" shape of predictions : ", predictions.shape)

## Dataset class creation -0 copying DIffMOT Class for now

In [134]:

from torch.utils.data import Dataset

import glob
import numpy as np
import os



class MambaMOTDataset(Dataset):
    def __init__(self, path, interval=None):
        # self.config = 
        self.interval = interval + 1 ## Changed config interval to interval only

        self.trackers = {}
        self.images = {}
        self.nframes = {}
        self.ntrackers = {}

        self.nsamples = {}
        self.nS = 0

        self.nds = {}
        self.cds = {}
        if os.path.isdir(path):
            # if 'MOT' in path:
            self.seqs = ['MOT17-02-SDP', 'MOT17-11-SDP', 'MOT17-04-SDP', 'MOT17-05-SDP', 'MOT17-09-DPM', 'MOT17-11-FRCNN', 
                            'MOT17-10-DPM', 'MOT17-10-FRCNN', 'MOT17-09-SDP', 'MOT17-10-SDP', 'MOT17-11-DPM', 'MOT17-13-SDP', 
                            'MOT17-02-DPM', 'MOT17-13-FRCNN', 'MOT17-02-FRCNN', 'MOT17-13-DPM', 'MOT17-04-DPM', 'MOT17-05-FRCNN', 
                            'MOT17-09-FRCNN', 'MOT17-04-FRCNN', 'MOT17-05-DPM']
                
                
            # else:
                # self.seqs = [s for s in os.listdir(path)]
            self.seqs.sort()
            lastindex = 0
            for seq in self.seqs:
                
                # path_yo = os.path.join(path + "/" + seq, "img1/"))
                # print(path_yo)
                seq_path = glob.glob(os.path.join(path, seq, "img1", "*.txt"))
                # print(os.listdir(seq_path))
                # trackerPath = glob.glob(os.path.join(path + "/" + seq, "/img1/*.txt"))
                # print(seq_path)
                # print(" tracker path fiels are : ", seq_path)
                self.trackers[seq] = sorted((seq_path))
                self.ntrackers[seq] = len(self.trackers[seq])
                print(" number of tracks in sequence : {} : {}".format(seq, self.ntrackers[seq]))
                if 'MOT' in seq:
                    # print(" Yes Coming her?")
                    imagePath = os.path.join(path, '../../train', seq, "img1/*.*")
                else:
                    # print(" else")
                    imagePath = os.path.join(path, '../train', seq, "img1/*.*")
                self.images[seq] = sorted(glob.glob(imagePath))
                self.nframes[seq] = len(self.images[seq])
                print(" number of frames in sequence : {} : {}".format(seq, self.nframes[seq]))
                
                # print("images are : ", self.images)

                self.nsamples[seq] = {}
                for i, pa in enumerate(self.trackers[seq]):
                    self.nsamples[seq][i] = len(np.loadtxt(pa, dtype=np.float32).reshape(-1,7)) - self.interval
                    self.nS += self.nsamples[seq][i]


                self.nds[seq] = [x for x in self.nsamples[seq].values()]
                self.cds[seq] = [sum(self.nds[seq][:i]) + lastindex for i in range(len(self.nds[seq]))]
                lastindex = self.cds[seq][-1] + self.nds[seq][-1]

        print('=' * 80)
        print('dataset summary')
        print(self.nS)
        print('=' * 80)

    def __getitem__(self, files_index):

        for i, seq in enumerate(self.cds):
            if files_index >= self.cds[seq][0]:
                ds = seq
                for j, c in enumerate(self.cds[seq]):
                    if files_index >= c:
                        trk = j
                        start_index = c
                    else:
                        break
            else:
                break

        track_path = self.trackers[ds][trk]
        track_gt = np.loadtxt(track_path, dtype=np.float32)

        init_index = files_index - start_index

        cur_index = init_index + self.interval
        cur_gt = track_gt[cur_index]
        cur_bbox = cur_gt[2:6]

        boxes = [track_gt[init_index + tmp_ind][2:6] for tmp_ind in range(self.interval)]
        delt_boxes = [boxes[i+1] - boxes[i] for i in range(self.interval - 1)]
        conds = np.concatenate((np.array(boxes)[1:], np.array(delt_boxes)), axis=1)

        delt = cur_bbox - boxes[-1]
        ret = {"cur_gt": cur_gt, "cur_bbox": cur_bbox, "condition": conds, "delta_bbox": delt}

        return ret

    def __len__(self):
        return self.nS


In [135]:
# data_path = 'MOT17/trackers_gt/train'

# interval = 50
# train_dataset = MambaMOTDataset(data_path, 50)

# train_data_loader = torch.utils.data.DataLoader(
#             train_dataset,
#             batch_size=64,
#             shuffle=True,
#             num_workers= 2,
#             pin_memory=True
#         )

In [136]:
# for batch in train_data_loader:
#     # print(" shape of batch is : ", batch)
#     data = batch
#     current_gt = data['cur_gt']
#     current_bbox = data['cur_bbox']
#     delta_bbox = data['delta_bbox']
#     print("current GT shape :", current_gt.shape)
#     print("delta shape : ", delta_bbox.shape)
#     print("current bbox shape : ", current_bbox.shape)
    

### Trying to create the Dataset class for it to function properly with Mamba/LSTM like model

In [137]:
import os
import numpy as np
from torch.utils.data import Dataset

class MOT20Dataset(Dataset):
    def __init__(self, root_dir, context_window=10):
        self.root_dir = root_dir
        self.context_window = context_window
        self.tracklets = self.load_tracklets()
        # print("tracklets are : ", self.tracklets)

    def load_tracklets(self):
        """
        Load the ground truth data from all the sequences in the MOT20 dataset.
        """
        tracklets = {}
        for seq in sorted(os.listdir(self.root_dir)):
            # print(" sequence is : ", seq)
            gt_path = os.path.join(self.root_dir, seq, 'gt', 'gt.txt')
            if os.path.exists(gt_path):
                data = np.loadtxt(gt_path, delimiter=',')
                # print(" shape of data is : ", data.shape)
                frames = np.unique(data[:, 0]).astype(int)
                # print(" frames list is : ", len(frames))
                # exit(0)
                tracklets[seq] = {frame: data[data[:, 0] == frame, :] for frame in frames}
        return tracklets

    def __len__(self):
        """
        Calculate the number of sliding windows possible for all sequences combined.
        """
        total_windows = 0
        for seq, frames in self.tracklets.items():
            total_windows += max(0, len(frames) - self.context_window)
        return total_windows

    def __getitem__(self, idx):
        """
        Generate a sliding window sample from the dataset.
        """
        current_idx = idx
        for seq, frames in self.tracklets.items():
            num_windows = max(0, len(frames) - self.context_window)
            if current_idx < num_windows:
                frame_indices = list(frames.keys())[current_idx:current_idx + self.context_window + 1]
                input_frames = [frames[frame_idx][:, 2:6] for frame_idx in frame_indices[:-1]]  # (x, y, width, height)
                target_frame = frames[frame_indices[-1]][:, 2:6]  # Next frame's bounding boxes
                
                input_frames = np.stack(input_frames)  # Convert list of arrays to a single array
                return input_frames, target_frame

            current_idx -= num_windows

        raise IndexError(f"Index {idx} out of range.")

# Usage example
root_dir = 'MOT17/train'
dataset = MOT20Dataset(root_dir)

# Access one sample
input_frames, target_frame = dataset[0]
print(input_frames.shape, target_frame.shape)
print("target is : ", target_frame)

(10, 39, 4) (40, 4)
target is :  [[ 9.120e+02  4.840e+02  9.700e+01  1.090e+02]
 [ 1.426e+03  4.090e+02  1.270e+02  3.930e+02]
 [ 5.870e+02  4.450e+02  8.600e+01  2.660e+02]
 [ 1.585e+03 -1.000e+00  3.360e+02  5.780e+02]
 [ 1.163e+03  4.410e+02  3.300e+01  8.900e+01]
 [ 1.308e+03  4.310e+02  3.400e+01  1.180e+02]
 [ 1.525e+03  4.280e+02  1.350e+02  3.490e+02]
 [ 1.055e+03  4.830e+02  3.600e+01  1.100e+02]
 [ 1.090e+03  4.840e+02  3.200e+01  1.140e+02]
 [ 7.300e+02  4.870e+02  3.600e+01  7.300e+01]
 [ 6.790e+02  4.920e+02  5.200e+01  1.050e+02]
 [ 7.350e+02  4.570e+02  2.700e+01  7.500e+01]
 [ 1.256e+03  4.470e+02  3.300e+01  1.000e+02]
 [ 1.015e+03  4.340e+02  4.000e+01  1.160e+02]
 [ 1.100e+03  4.400e+02  3.800e+01  1.080e+02]
 [ 9.340e+02  4.350e+02  4.200e+01  1.140e+02]
 [ 4.420e+02  4.460e+02  1.250e+02  2.820e+02]
 [ 6.400e+02  4.590e+02  6.200e+01  1.870e+02]
 [ 1.378e+03  4.380e+02  6.600e+01  1.240e+02]
 [ 1.504e+03  4.380e+02  5.100e+01  1.230e+02]
 [ 4.800e+02  4.600e+02  8.

##### The previous class was giving me the input tracklets for 10 frames and the target tracklet for 11th frame. Now this class below is going to give me offset format the way it is in the paper

In [163]:
import numpy as np
import os
from torch.utils.data import Dataset

class MOT20Dataset(Dataset):
    def __init__(self, path, window_size=10):
        self.window_size = window_size

        # Initialize data storage
        self.data = []
        self.targets = []

        # Load the dataset
        self._load_data(path)

    def _load_data(self, path):
        """
        Load data from the provided path and compute the bounding box differences.
        """
        sequences = [seq for seq in os.listdir(path) if os.path.isdir(os.path.join(path, seq))]
        # sequences = ["MOT17-02-DPM"]
        sequences.sort()
        
        for seq in sequences:
            gt_path = os.path.join(path, seq, "gt", "gt.txt")  # Path to ground truth file
            if not os.path.exists(gt_path):
                continue
            
            # Load ground truth data for the sequence
            gt_data = np.loadtxt(gt_path, delimiter=',')
            
            # Filter for specific object IDs, sorting by frame number
            for obj_id in np.unique(gt_data[:, 1]):
                obj_data = gt_data[gt_data[:, 1] == obj_id]
                obj_data = obj_data[obj_data[:, 0].argsort()]  # Sort by frame number
                # print(" object data is : ", obj_data)
                # Extract bounding boxes (columns: [frame, id, left, top, width, height, conf, x, y, z])
                bboxes = obj_data[:, 2:6]  # [left, top, width, height]
                
                # Compute differences and form input-target pairs
                for i in range(len(bboxes) - self.window_size):
                    input_diffs = np.diff(bboxes[i:i + self.window_size + 1], axis=0)
                    input_data = input_diffs[:-1]  # Differences for the input window
                    target_data = input_diffs[-1]  # Difference for the target frame
                    
                    self.data.append(input_data)
                    self.targets.append(target_data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Get the input-target pair at index `idx`.
        """
        input_data = self.data[idx]
        target_data = self.targets[idx]
        return torch.from_numpy(input_data.astype(float)), torch.from_numpy(target_data.astype(float))
    


In [164]:

from torch.utils.data import DataLoader

# Initialize the dataset
dataset = MOT20Dataset(path='MOT17/train', window_size=11)
print(" dataset[0] : ", dataset[1])
# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
 

 dataset[0] :  (tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], dtype=torch.float64), tensor([0., 0., 0., 0.], dtype=torch.float64))


In [165]:

# Iterate through the DataLoader
for data, targets in dataloader:
    # Your training code here
    print("input shape if : ", data.shape)

    
    # pass

input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 10, 4])
input shape if :  torch.Size([64, 

In [166]:
class WarmupScheduler:
    def __init__(self, optimizer, warmup_steps, initial_lr, warmup_lr):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.initial_lr = initial_lr
        self.warmup_lr = warmup_lr
        self.current_step = 0

    def step(self):
        # Linear warmup
        if self.current_step < self.warmup_steps:
            lr = self.warmup_lr + (self.initial_lr - self.warmup_lr) * (self.current_step / self.warmup_steps)
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr
            self.current_step += 1

    def get_lr(self):
        return self.optimizer.param_groups[0]['lr']
    
    

# model = modenum_blocksl.float()
criterion = nn.SmoothL1Loss()  # Mean squared error loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas = (0.9, 0.98), )

    
from torch.optim.lr_scheduler import StepLR

# Define the warmup scheduler
warmup_scheduler = WarmupScheduler(optimizer, warmup_steps=4000, initial_lr=0.001, warmup_lr=1e-6)

# Define a standard scheduler to use after warmup
scheduler_after_warmup = StepLR(optimizer, step_size=30, gamma=0.1)

#### Creating a basic LSTM Model for now with Adam optimizer and MSE Loss to do the prediction of offsets of bounding boxes. 

In [167]:
import torch
import torch.nn as nn
from mamba_ssm import Mamba

class BBoxLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(BBoxLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        # self.mamba = Mamba(input_size, hidden_size, num_layers, batch_first=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        # LSTM forward pass
        out, _ = self.lstm(x, (h0, c0))
        
        # Take the output from the last time step
        out = self.fc(out[:, -1, :]).float()
        

        return out

In [169]:
from warmup_scheduler_pytorch import WarmUpScheduler
from torch.optim.lr_scheduler import StepLR  # example


# Model parameters
input_size = 4  # Bounding box has 4 coordinates: [x, y, width, height]
hidden_size = 64
output_size = 4  # Output also has 4 coordinates
num_layers = 1
embedding_dim = 256
num_blocks = 3
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Initialize model
# model = BBoxLSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
model = FullModel(input_size, embedding_dim, num_blocks, output_size).to(device)

# Load data
dataset = MOT20Dataset(path='MOT17/train', window_size=10)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


scheduler_after_warmup = StepLR(optimizer, step_size=30, gamma=0.1)

warmup_scheduler = WarmupScheduler(optimizer, warmup_steps=4000, initial_lr=0.001, warmup_lr=1e-6)

import time


# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    start_time = time.time()
    for inputs, targets in dataloader:
        # Move tensors to the configured device
        inputs, targets = inputs.to(device), targets.to(device)
        # print("shape of inputs is : ", inputs.shape)
        targets = targets.float()
        # Forward pass
        outputs = model(inputs.float())
        
        # print(" shape of outputs is : ", outputs.shape)
        # print(" shape of targets is : ", targets.shape)
        loss = criterion(outputs, targets)

        # print("output is : ", outputs)
        # print("target is : ", targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Step the warmup scheduler
        # if warmup_scheduler.current_step < warmup_scheduler.warmup_steps:
        #     warmup_scheduler.step()
        # else:
        #     # Step the standard scheduler after warmup
        #     scheduler_after_warmup.step()
    end_time = time.time()
    time_taken = end_time - start_time
    print('Epoch [{}/{}], Loss: {} , Time Taken : {}'.format(epoch+1, num_epochs, loss.item(), time_taken))

Epoch [1/50], Loss: 0.5826888680458069 , Time Taken : 149.2920160293579
Epoch [2/50], Loss: 1.0610642433166504 , Time Taken : 148.22449612617493


In [143]:
bboxes = np.random.randint(50, size = (12, 4))
bboxes

array([[35, 33, 27, 16],
       [19, 46, 15, 39],
       [32, 12, 17,  9],
       [ 2, 23, 32, 27],
       [33, 34, 21, 19],
       [39, 24, 20, 20],
       [ 1, 34,  4, 48],
       [18, 16, 34, 32],
       [44, 27, 44, 19],
       [36, 36,  1, 22],
       [37, 28,  0,  2],
       [39,  2,  4, 35]])

In [144]:

window_size = 4
data = []
hello = []

for i in range(len(bboxes) - window_size):
    print(' i is : ', i)
    input_diffs = np.diff(bboxes[i:i + window_size + 1], axis=0)
    # print(" input difference are :", input_diffs)
    input_data = input_diffs[:-1]  # Differences for the input window
    target_data = input_diffs[-1]  # Difference for the target frame
    # print(" target data is : ", target_data)
    data.append(input_data)
    hello.append(target_data)


# print(bboxes)

print(np.shape(data))
for i in range(len(data)):
    print(" input is : ", data[i])
    
    
# for j in range(len(hello)):
    
print("target data is : ", hello)

# print(" input data is : ", data)

 i is :  0
 i is :  1
 i is :  2
 i is :  3
 i is :  4
 i is :  5
 i is :  6
 i is :  7
(8, 3, 4)
 input is :  [[-16  13 -12  23]
 [ 13 -34   2 -30]
 [-30  11  15  18]]
 input is :  [[ 13 -34   2 -30]
 [-30  11  15  18]
 [ 31  11 -11  -8]]
 input is :  [[-30  11  15  18]
 [ 31  11 -11  -8]
 [  6 -10  -1   1]]
 input is :  [[ 31  11 -11  -8]
 [  6 -10  -1   1]
 [-38  10 -16  28]]
 input is :  [[  6 -10  -1   1]
 [-38  10 -16  28]
 [ 17 -18  30 -16]]
 input is :  [[-38  10 -16  28]
 [ 17 -18  30 -16]
 [ 26  11  10 -13]]
 input is :  [[ 17 -18  30 -16]
 [ 26  11  10 -13]
 [ -8   9 -43   3]]
 input is :  [[ 26  11  10 -13]
 [ -8   9 -43   3]
 [  1  -8  -1 -20]]
target data is :  [array([ 31,  11, -11,  -8]), array([  6, -10,  -1,   1]), array([-38,  10, -16,  28]), array([ 17, -18,  30, -16]), array([ 26,  11,  10, -13]), array([ -8,   9, -43,   3]), array([  1,  -8,  -1, -20]), array([  2, -26,   4,  33])]
