In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
import torch.nn as nn

In [3]:
mot_dir = "MOT17/train"
print(os.listdir(mot_dir))

['MOT17-11-FRCNN', 'MOT17-02-SDP', 'MOT17-04-FRCNN', 'MOT17-13-DPM', 'MOT17-04-SDP', 'MOT17-09-FRCNN', 'MOT17-02-FRCNN', 'MOT17-04-DPM', 'MOT17-11-DPM', 'MOT17-05-SDP', 'MOT17-11-SDP', 'MOT17-05-FRCNN', 'MOT17-05-DPM', 'MOT17-10-DPM', 'MOT17-13-FRCNN', 'MOT17-09-DPM', 'MOT17-02-DPM', 'MOT17-10-FRCNN', 'MOT17-09-SDP', 'MOT17-13-SDP', 'MOT17-10-SDP']


In [4]:

def get_tracklet(gt_in, object_id):
    """
    Extract the tracklet for a given object ID.
    :param df: DataFrame containing the ground truth data.
    :param object_id: The ID of the object to extract the tracklet for.
    :return: DataFrame containing the tracklet.
    """
    tracklet  = []
    # tracklet.append(i) [for i in gt_in if gt_in[i] == object_id]
    for i in gt_in:
        # print(i)
        if i[1] == object_id:
            tracklet.append(i[2:6])
    return np.array(tracklet)

def tokenizer_frames(tracking, sliding_window):
    
    token  = [] 
    for i in range(sliding_window):
        current_bbox = tracking[i]
        # print("bounding box at {} frame : {}".format(i,current_bbox))
        # print("tracklet at Nth place is : ", tracking[sliding_window])
        delta = np.subtract(tracking[sliding_window], current_bbox)
        # print("delta is : ", delta)
        token.append(delta)
    
    return np.array(token)


for sequence in os.listdir(mot_dir):
    # seq = os.path.join(mot_dir, sequence)
    image_dir = os.path.join(mot_dir, sequence, "img1")
    gt_file = os.path.join(mot_dir, sequence, "gt", "gt.txt")
    # print(gt_file)

    gt_in = np.loadtxt(gt_file, delimiter = ",")

    # print(gt_in)

    ## Lets define 1 tracklet first
    object_id = 2 ## Tracklet with Object ID = 1
    tracklet = get_tracklet(gt_in, object_id)
    # print(" tracklet is : ", gt_in[tracklet])
    print(tracklet.shape)

    sliding_window = 50
    token = tokenizer_frames(tracklet, sliding_window)


    print("token shape is : ", token.shape)
    token = torch.tensor(token, dtype = torch.float32)

    input_dim = 4
    output_dim = 4
    embedding_layer = nn.Linear(input_dim, output_dim)

    embedded_tensor = embedding_layer(token)

    print(embedded_tensor)

    print(" shape of embedded tensot  : ", embedded_tensor.shape)
    
    break



(13, 4)


IndexError: index 50 is out of bounds for axis 0 with size 13

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from mamba_ssm import Mamba

class BiMambaEncoder(nn.Module):
    def __init__(self, d_model, n_state):
        super(BiMambaEncoder, self).__init__()
        self.d_model = d_model
        
        self.mamba = Mamba(d_model, n_state)

        # Norm and feed-forward network layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x):
        # Residual connection of the original input
        residual = x
        
        # Forward Mamba
        x_norm = self.norm1(x)
        mamba_out_forward = self.mamba(x_norm)

        # Backward Mamba
        x_flip = torch.flip(x_norm, dims=[1])  # Flip Sequence
        mamba_out_backward = self.mamba(x_flip)
        mamba_out_backward = torch.flip(mamba_out_backward, dims=[1])  # Flip back

        # Combining forward and backward
        mamba_out = mamba_out_forward + mamba_out_backward
        
        mamba_out = self.norm2(mamba_out)
        ff_out = self.feed_forward(mamba_out)

        output = ff_out + residual
        return output

# Initialize and test the model
d_model = 4
n_state = 64
model = BiMambaEncoder(d_model, n_state).cuda()
x = torch.rand(1, 50, d_model).cuda()  # Analog input data: (batch_size, seq_len, feature_dim)
output = model(x)
print(output.shape)  # Mamba Out: (32, 100, 512)

torch.Size([1, 50, 4])


## Initialize Tracklets

In [6]:
def initialize_tracklets(data):
    first_frame = int(np.min(data[:, 0]))
    first_frame_data = data[data[:, 0] == first_frame]
    # print(" first frame data : ", first_frame_data)

    tracklets = {}
    for row in first_frame_data:
        # print(" row is :", row)
        object_id = int(row[1])
        if object_id not in tracklets:
            tracklets[object_id] = [row[2:6]]
    return tracklets

### Update tracklets

In [7]:
def update_tracklets_for_frame(data, tracklets, frame):
    # unique_frames = np.unique(data[:, 0])


    # for frame in unique_frames:
    #     if frame == np.min(data[:, 0]):
    #         continue
    


    frame_data = data[data[:, 0] == frame]

    for row in frame_data:
        object_id = int(row[1])
        if object_id in tracklets:
            tracklets[object_id].append(row[2:6])
        else:
            ## Initialize a new tracklet if the object ID does not exist
            tracklets[object_id] = [row[2:6]]

    return tracklets
### Convert tracklet to numpy arrays

def convert_tracklets_to_numpy(tracklets):
    for object_id , tracklet_data in tracklets.items():
        tracklets[object_id] = np.array(tracklet_data)
    
    return tracklets



def load_data(filename):
    data = np.loadtxt(filename, delimiter=',', dtype=float)
    return data

In [8]:
filename = "MOT17/train/MOT17-02-SDP/gt/gt.txt"
data = load_data(filename)
tracklets = initialize_tracklets(data)

unique_frames = np.unique(data[:, 0])
for frame in unique_frames:
    if frame == 1:
        continue
    else:
        tracklets = update_tracklets_for_frame(data, tracklets, frame)
    # trackletsupdate_tracklets(data, tracklets) ## Updates all the tracklets at once. Can be changed to sending data per frame and updating the tracklets in a better way
tracklets = convert_tracklets_to_numpy(tracklets)


In [9]:
tracklets.get(2).shape

(56, 4)

## Tracklet Delta - Creating a Tokenizer function


In [10]:
def compute_deltas(tracklet_data):
    # Compute deltas as described
    deltas = []
    for i in range(len(tracklet_data) - 1):
        delta = tracklet_data[i+1] - tracklet_data[i]
        deltas.append(delta)
    return np.array(deltas)


# Example tracklet data: shape (num_frames, 4) with columns [cx, cy, w, h]
# tracklet_data = np.array([
#     [100, 150, 50, 60],
#     [105, 155, 50, 60],
#     [110, 160, 50, 60]
# ])


# Create delta values for input sequence

window_size = 50
tracklet_data = tracklets.get(3)[:window_size+1]

tracklet_deltas = compute_deltas(tracklet_data)
print(tracklet_deltas.shape)

(50, 4)


In [11]:

# Parameters
input_dim = tracklet_deltas.shape[1]  # Number of features in deltas (e.g., 4)
print(input_dim)
# Create and apply embedding layer
# embedding_layer = TemporalTokenEmbedding(input_dim, embedding_dim)
tracklet_delta_tensor = torch.tensor(tracklet_deltas, dtype=torch.float32).cuda()

4


## Steps to implement Bi-Mamba Encdoding Layer

#### Define Forward and Backward Mamba Modules: 
Implement the forward and backward Mamba modules.
#### Create a Bi-Mamba Block: 
Use the forward and backward modules to process input and apply normalization and MLP.
#### Assemble the Bi-Mamba Encoding Layer: 
Stack multiple Bi-Mamba blocks.

In [12]:
import torch
import torch.nn as nn
from mamba_ssm import Mamba



In [13]:


# Example usage
input_dim = tracklet_delta_tensor.shape[1]  # Number of features in deltas (e.g., 4)
embedding_dim = 128 # Example embedding dimension
num_blocks = 4  # Number of Bi-Mamba blocks
prediction_dim = 4  # Number of predicted offsets

# Create and apply model
# model = FullModel(input_dim, embedding_dim, num_blocks, prediction_dim).cuda()
# predictions = model(tracklet_delta_tensor).cuda()
# print(" shape of predictions : ", predictions.shape)

## Dataset class creation -0 copying DIffMOT Class for now

In [14]:

from torch.utils.data import Dataset

import glob
import numpy as np
import os


In [15]:
# data_path = 'MOT17/trackers_gt/train'

# interval = 50
# train_dataset = MambaMOTDataset(data_path, 50)

# train_data_loader = torch.utils.data.DataLoader(
#             train_dataset,
#             batch_size=64,
#             shuffle=True,
#             num_workers= 2,
#             pin_memory=True
#         )

In [16]:
# for batch in train_data_loader:
#     # print(" shape of batch is : ", batch)
#     data = batch
#     current_gt = data['cur_gt']
#     current_bbox = data['cur_bbox']
#     delta_bbox = data['delta_bbox']
#     print("current GT shape :", current_gt.shape)
#     print("delta shape : ", delta_bbox.shape)
#     print("current bbox shape : ", current_bbox.shape)
    

### Trying to create the Dataset class for it to function properly with Mamba/LSTM like model

In [17]:
import os
import numpy as np
from torch.utils.data import Dataset


##### The previous class was giving me the input tracklets for 10 frames and the target tracklet for 11th frame. Now this class below is going to give me offset format the way it is in the paper

In [18]:

# import sys
# sys.path.append('/home/dheerajk/Research_DK/Mamba-MOT')

from datasets import MOT20DatasetOffset
from torch.utils.data import DataLoader
# Initialize the dataset
dataset = MOT20DatasetOffset(path='MOT17/train', window_size=11)
print(" dataset[0] : ", dataset[0])
# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

 dataset[0] :  (tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], dtype=torch.float64), tensor([0., 0., 0., 0.], dtype=torch.float64))


In [19]:

# Iterate through the DataLoader
# for data, targets in dataloader:
#     # Your training code here
#     print("data shape is : ", data.shape)
#     print("targets shape is : ", targets.shape )

    
    # pass

In [32]:

# Model parameters
input_size = 4  # Bounding box has 4 coordinates: [x, y, width, height]
hidden_size = 64 ## This one is used for LSTM NEtwork which I tried
output_size = 4  # Output also has 4 coordinates
num_layers = 1 ## For LSTM
embedding_dim = 128 ## For Mamba
num_blocks = 1 ## For Mamba
num_epochs = 20

warmup_steps = 4000 ## This is for custom warmuo schedular

In [33]:
criterion = nn.SmoothL1Loss()  # Mean squared error loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas = (0.9, 0.98), )


In [35]:
from warmup_scheduler_pytorch import WarmUpScheduler
from torch.optim.lr_scheduler import StepLR  # example
from models_mamba import FullModel, BBoxLSTMModel
from schedulars import CustomWarmupScheduler




device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Initialize model

# Load data

model_used = "Mamba" ## LSTM

if model_used == "Mamba":
    model = FullModel(input_size, embedding_dim, num_blocks, output_size).to(device)
elif model_used == "LSTM":
    model = BBoxLSTMModel(input_size, hidden_size, output_size, num_layers).to(device)

    
dataset = MOT20DatasetOffset(path='MOT17/train', window_size=10)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
scheduler = CustomWarmupScheduler(optimizer, d_model = embedding_dim, warmup_steps = warmup_steps)


# scheduler_after_warmup = StepLR(optimizer, step_size=30, gamma=0.1)

# warmup_scheduler = WarmupScheduler(optimizer, warmup_steps=4000, initial_lr=0.001, warmup_lr=1e-6)

import time
print(" dataloader length is :", len(dataloader))
# exit(0)
# Training loop
num_epochs = 100
print(" Model used to training: ", model_used) ## This is just a sanity printing check so that I dont have to see which loss came from which model later on or re-train it
for epoch in range(num_epochs):
    start_time = time.time()
    epoch_loss = 0.0  # Initialize epoch_loss

    for inputs, targets in dataloader:
        # Move tensors to the configured device
        inputs, targets = inputs.to(device), targets.to(device)
        # print("shape of inputs is : ", inputs.shape)
        targets = targets.float()
        # Forward pass
        outputs = model(inputs.float())
        
        # print(" shape of outputs is : ", outputs.shape)
        # print(" shape of targets is : ", targets.shape)
        loss = criterion(outputs, targets)
        epoch_loss += loss.item()  # Accumulate loss
        # print(" loss is :", loss.item())

        # print("output is : ", outputs)
        # print("target is : ", targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Step the warmup scheduler
        # if warmup_scheduler.current_step < warmup_scheduler.warmup_steps:
        #     warmup_scheduler.step()
        # else:
        #     # Step the standard scheduler after warmup
        #     scheduler_after_warmup.step()
        
        # Update the learning rate
        scheduler.step()
        
        
    print("Accumulated loss is : ", epoch_loss)
    avg_loss = epoch_loss / len(dataloader)  # Calculate average loss for the epoch

    end_time = time.time()
    time_taken = end_time - start_time
    print('Epoch [{}/{}], Loss: {} , Time Taken : {}'.format(epoch+1, num_epochs, avg_loss, time_taken))

 dataloader length is : 9236
Accumulated loss is :  2.3095174357149517
Epoch [1/100], Loss: 0.0002500560237889727 , Time Taken : 44.247742652893066
Accumulated loss is :  2.309519380243728
Epoch [2/100], Loss: 0.0002500562343269519 , Time Taken : 44.23379588127136
Accumulated loss is :  2.3095168538420694
Epoch [3/100], Loss: 0.0002500559607884441 , Time Taken : 43.9530029296875
Accumulated loss is :  2.3095174262998626
Epoch [4/100], Loss: 0.00025005602276958235 , Time Taken : 44.157885789871216
Accumulated loss is :  2.3095159332006006
Epoch [5/100], Loss: 0.00025005586110877007 , Time Taken : 44.57202625274658


In [None]:
bboxes = np.random.randint(50, size = (12, 4))
bboxes

array([[ 4, 29, 49, 10],
       [41, 27, 48,  4],
       [30, 39, 23, 11],
       [ 4,  8, 15, 19],
       [ 0, 12, 32, 44],
       [15, 17, 18, 35],
       [38, 12, 39, 15],
       [ 2, 19, 47, 41],
       [18, 41, 28, 35],
       [41, 35,  0, 31],
       [20, 42, 49, 26],
       [17, 16, 28, 39]])

In [None]:

window_size = 4
data = []
hello = []

for i in range(len(bboxes) - window_size):
    print(' i is : ', i)
    input_diffs = np.diff(bboxes[i:i + window_size + 1], axis=0)
    # print(" input difference are :", input_diffs)
    input_data = input_diffs[:-1]  # Differences for the input window
    target_data = input_diffs[-1]  # Difference for the target frame
    # print(" target data is : ", target_data)
    data.append(input_data)
    hello.append(target_data)


# print(bboxes)

print(np.shape(data))
for i in range(len(data)):
    print(" input is : ", data[i])
    
    
# for j in range(len(hello)):
    
print("target data is : ", hello)

# print(" input data is : ", data)

 i is :  0
 i is :  1
 i is :  2
 i is :  3
 i is :  4
 i is :  5
 i is :  6
 i is :  7
(8, 3, 4)
 input is :  [[ 37  -2  -1  -6]
 [-11  12 -25   7]
 [-26 -31  -8   8]]
 input is :  [[-11  12 -25   7]
 [-26 -31  -8   8]
 [ -4   4  17  25]]
 input is :  [[-26 -31  -8   8]
 [ -4   4  17  25]
 [ 15   5 -14  -9]]
 input is :  [[ -4   4  17  25]
 [ 15   5 -14  -9]
 [ 23  -5  21 -20]]
 input is :  [[ 15   5 -14  -9]
 [ 23  -5  21 -20]
 [-36   7   8  26]]
 input is :  [[ 23  -5  21 -20]
 [-36   7   8  26]
 [ 16  22 -19  -6]]
 input is :  [[-36   7   8  26]
 [ 16  22 -19  -6]
 [ 23  -6 -28  -4]]
 input is :  [[ 16  22 -19  -6]
 [ 23  -6 -28  -4]
 [-21   7  49  -5]]
target data is :  [array([-4,  4, 17, 25]), array([ 15,   5, -14,  -9]), array([ 23,  -5,  21, -20]), array([-36,   7,   8,  26]), array([ 16,  22, -19,  -6]), array([ 23,  -6, -28,  -4]), array([-21,   7,  49,  -5]), array([ -3, -26, -21,  13])]
