In [1]:
import os

os.chdir('../ImageBind')

from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

from torch.utils.data import DataLoader


import pandas as pd
import torch

import torch.nn as nn
import torch.optim as optim


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=False)
model.train()
model.to(device)

ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [44]:
os.chdir("../Temporal-ImageBind")
df = pd.read_csv("data/Temporal-Imagebind/imu/imu.csv")

window_size = 100
step_size = 50

tensors = []
images = []

for start_row in range(0, len(df), step_size):
    
    if start_row + window_size > len(df):
        break
  
    window_df = df.iloc[start_row:start_row + window_size]
    median_row = window_df.iloc[49, -7:]
    images.append(int(median_row[0]/10**8))
    window_data = window_df.iloc[:, -6:]
    tensor = torch.tensor(window_data.values, dtype=torch.float)
    tensors.append(tensor.T)
   

print(images)
print(len(images))

[15341091825, 15341091875, 15341091925, 15341091975, 15341092025, 15341092075, 15341092125, 15341092175, 15341092225, 15341092275, 15341092325, 15341092375, 15341092425, 15341092475, 15341092525, 15341092575, 15341092625, 15341092675, 15341092725, 15341092775, 15341092825, 15341092875, 15341092925, 15341092975, 15341093025, 15341093075, 15341093125, 15341093175, 15341093225, 15341093275, 15341093325, 15341093375, 15341093425, 15341093475, 15341093525, 15341093575, 15341093625, 15341093675, 15341093725, 15341093775, 15341093825, 15341093875, 15341093925, 15341093975, 15341094025, 15341094075, 15341094125, 15341094175, 15341094225, 15341094275, 15341094325, 15341094375]
52


In [53]:

import os
import shutil


source_dir = "data/Temporal-Imagebind/images/"
target_dir = "data/Temporal-Imagebind/selected_images/"


if not os.path.exists(target_dir):
    os.makedirs(target_dir)

all_files = os.listdir(source_dir)


filtered_files = [file for file in all_files if file.endswith('.jpg') and any(file.startswith(str(number)) for number in images)]
filtered_files = sorted(filtered_files)

for file in filtered_files:
    shutil.copy(os.path.join(source_dir, file), os.path.join(target_dir, file))




for i in range(0, len(filtered_files)):
    filtered_files[i] = "data/Temporal-Imagebind/selected_images/"+filtered_files[i]
    
print(len(filtered_files))
print(filtered_files)


52
['data/Temporal-Imagebind/selected_images/1534109182500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109187500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109192500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109197500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109202500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109207500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109212500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109217500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109222500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109227500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109232500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109237500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109242500000000.jpg', 'data/Temporal-Imagebind/selected_images/1534109247500000000.jpg', 'data/Temporal-Imagebind/selected_images/15341092525000000

In [54]:


inputs = {
    ModalityType.VISION: data.load_and_transform_vision_data(filtered_files, device),
    ModalityType.IMU: tensors
}


In [56]:
from torch.utils.data import Dataset

class IMUImageDataset(Dataset):
    def __init__(self, imu_data, image_data):
        assert len(imu_data) == len(image_data), "IMU and image data must be of the same length"
        self.imu_data = imu_data
        self.image_data = image_data

    def __len__(self):
        return len(self.imu_data)

    def __getitem__(self, idx):
        imu_item = self.imu_data[idx]
        image_item = self.image_data[idx]
        return imu_item, image_item


In [57]:
from torch.utils.data import random_split


dataset = IMUImageDataset(inputs[ModalityType.VISION], inputs[ModalityType.IMU])


total_size = len(dataset)
train_size = int(0.8 * total_size)  
test_size = total_size - train_size  


train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [58]:
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.2):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature

    def forward(self, q, k):
        similarity = torch.matmul(q, k.t()) / self.temperature
        
        numerator = torch.exp(torch.diag(similarity))
        denominator = torch.sum(torch.exp(similarity), dim=1) + numerator
        
        # Calculate the loss
        loss = -torch.mean(torch.log(numerator / denominator))
        
        return loss


In [62]:
loss_function = ContrastiveLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01) 
num_epochs = 100

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs1,  labels in train_loader:
        optimizer.zero_grad()  # Zero the gradients
        model_inputs = {'imu': inputs1}
        outputs1 = model(model_inputs)
       
        loss = loss_function(outputs1, labels)
        
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print average loss per epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 5 is not equal to len(dims) = 4