In [1]:
import torch
import torch.nn as nn

# Transformer encoder over batch-first feature sequences (default embed_dim = 1024)
class ObjectEncoder(nn.Module):
    """Stack of Transformer encoder layers applied to batch-first sequences.

    Args:
        embed_dim: Feature size of every token (``d_model``).
        num_heads: Number of attention heads; must divide ``embed_dim``.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, embed_dim=1024, num_heads=8, num_layers=2):
        super(ObjectEncoder, self).__init__()
        # batch_first=True is required because forward() receives (B, seq_len, embed_dim).
        # With the default (batch_first=False) dim 0 is treated as the sequence axis,
        # so self-attention would mix different samples of the same batch.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        # nn.TransformerEncoder clones the layer; the attribute is kept so that
        # previously saved state_dict keys still load.
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def forward(self, x):
        """Encode ``x`` of shape (B, seq_len, embed_dim); the output has the same shape."""
        return self.transformer(x)  # (B, seq_len, embed_dim)


# VideoFeatureExtractor: fuses context and motion features into one video vector
class VideoFeatureExtractor(nn.Module):
    """Summarise a clip as a single vector from its context and motion features.

    The two streams are concatenated on the feature axis, linearly projected,
    encoded by ``ObjectEncoder`` and max-pooled over the frame axis.
    """

    def __init__(self, embed_dim=1024):
        super(VideoFeatureExtractor, self).__init__()
        # Input width is 1024 because C and M (512 each) are concatenated.
        self.input_proj = nn.Linear(1024, embed_dim)
        self.transformer = ObjectEncoder(embed_dim=embed_dim)
        self.max_pool = nn.AdaptiveMaxPool1d(1)  # collapses the pooled axis to length 1

    def forward(self, C, M):
        """
        C: (B, N, 512) - spatial/context features
        M: (B, N, 512) - motion features
        Returns: (B, embed_dim) video content vector
        """
        projected = self.input_proj(torch.cat((C, M), dim=-1))  # (B, N, embed_dim)
        encoded = self.transformer(projected.to(torch.float32))  # (B, N, embed_dim)

        # AdaptiveMaxPool1d pools the last axis, so move the frame axis there first.
        pooled = self.max_pool(encoded.transpose(1, 2))  # (B, embed_dim, 1)
        return pooled.squeeze(-1)  # (B, embed_dim)

# Smoke test
B, N = 4, 10  # batch size and number of key frames
C = torch.randn(B, N, 512)  # (B, N, 512) spatial/context feature vectors
M = torch.randn(B, N, 512)  # (B, N, 512) motion feature vectors

model = VideoFeatureExtractor(embed_dim=1024)
video_vector = model(C, M)
print("Video content vector shape:", video_vector.shape)  # Expect (B, 1024)

Video content vector shape: torch.Size([4, 1024])


In [2]:
import torch
import torch.nn as nn

class ObjectDecoder(nn.Module):
    """Decode a fixed set of learned object queries against per-frame object features.

    Args:
        embed_dim: Width of the queries, of the projected object features and of
            the video vector ``v``.
        num_queries: Number of learned queries P (one output vector per query).
        num_heads: Number of attention heads; must divide ``embed_dim``.
    """

    def __init__(self, embed_dim=1024, num_queries=8, num_heads=8):
        super(ObjectDecoder, self).__init__()
        self.num_queries = num_queries  # P
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Learned queries, shared across the batch: (1, P, embed_dim)
        self.query_embed = nn.Parameter(torch.randn(1, num_queries, embed_dim))

        # Lifts the 512-d object features to embed_dim
        self.encoder_proj = nn.Linear(512, embed_dim)

        # Single multi-head cross-attention used in place of a full Transformer decoder
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

    def forward(self, O, v):
        """
        O: encoder output / object features, (B, N, 512)
        v: video content vector, (B, embed_dim)
        Returns: (B, P, embed_dim), one vector per query
        """
        # Project O to embed_dim
        O = self.encoder_proj(O)  # (B, N, embed_dim)

        # Condition the learned queries on the video content vector.
        # Previously `v` was tiled N times and passed as the attention *value*; since
        # attention weights sum to 1 and every value row was identical, the output was
        # the same for all P queries and did not depend on O or on the queries at all.
        Q = self.query_embed + v.unsqueeze(1)  # (1, P, D) + (B, 1, D) -> (B, P, D)

        # Cross-attention: content-aware queries attend over the object features.
        xi, _ = self.multihead_attn(Q, O, O)  # (B, P, embed_dim)

        return xi  # (B, P, embed_dim)

# Fully connected layer mapping ξ into the semantic space e ∈ R^d
class SemanticMapper(nn.Module):
    """Single linear projection from ``embed_dim`` to ``semantic_dim``."""

    def __init__(self, embed_dim=1024, semantic_dim=1024):
        super(SemanticMapper, self).__init__()
        self.fc = nn.Linear(in_features=embed_dim, out_features=semantic_dim)

    def forward(self, ξ):
        """Project ``ξ`` of shape (..., embed_dim) to (..., semantic_dim)."""
        return self.fc(ξ)

# ====================== #
# 🔹 Model smoke test 🔹 #
# ====================== #
B = 4   # Batch size
N = 10  # number of key frames
P = 8   # number of queries
O = torch.randn(B, N, 512)  # encoder output
v = torch.randn(B, 1024)  # video content vector

decoder = ObjectDecoder()
ξ = decoder(O, v)

semantic_mapper = SemanticMapper()
ξ_mapped = semantic_mapper(ξ)

print("Output shape của ξ:", ξ.shape)  # (B, P, 1024)
print("Shape of ξ_mapped:", ξ_mapped.shape)  # (B, P, 1024)


Output shape của ξ: torch.Size([4, 8, 1024])
Shape of ξ_mapped: torch.Size([4, 8, 1024])


In [3]:
import torch
import torch.nn as nn

class ObjectEncoderDecoder(nn.Module):
    """Object head: video encoder -> query decoder -> semantic mapper.

    Args:
        vid_embed_dim: Width of the video content vector produced by the encoder.
            The decoder is built with the same width because it consumes that vector.
        objects_embed_dim: Input width of the semantic mapper; it receives the decoder
            output, so it must equal ``vid_embed_dim``.
        num_object_queries: Number of object queries P used by the decoder.
        num_heads: Number of attention heads used by the decoder.
        semantic_dim: Output width of the semantic mapper.
        num_decoder_layers, ff_dim, decoder_embed_dim: Accepted for backward
            compatibility; not used by this module.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(ObjectEncoderDecoder, self).__init__()
        self.encoder = VideoFeatureExtractor(embed_dim=vid_embed_dim)
        # Forward the constructor arguments instead of hard-coding 1024 / 8 / 8, so that
        # num_object_queries and num_heads actually take effect. Defaults are unchanged.
        self.decoder = ObjectDecoder(
            embed_dim=vid_embed_dim,
            num_queries=num_object_queries,
            num_heads=num_heads,
        )
        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """
        temporal_features: (B, N, feature_dim) motion features
        object_features: (B, N, feature_dim)
        context_features: (B, N, feature_dim)
        Returns: (ξ, ξ_mapped) - raw and semantically mapped object vectors,
        each with one row per object query.
        """
        C = context_features.to(torch.float32)  # (B, N, feature_dim)
        M = temporal_features.to(torch.float32)  # (B, N, feature_dim)
        O = object_features.to(torch.float32)  # (B, N, feature_dim)

        v = self.encoder(C, M)  # (B, vid_embed_dim)

        ξ = self.decoder(O, v)  # (B, num_object_queries, vid_embed_dim)
        ξ_mapped = self.semantic_mapper(ξ)  # (B, num_object_queries, semantic_dim)

        return ξ, ξ_mapped



In [None]:
!pip install spacy
!pip install peft==0.10.0
!pip install sentence-transformers

In [10]:
# Download the small English spaCy pipeline that a later cell loads with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
from sentence_transformers import SentenceTransformer

# Load spaCy; its part-of-speech tags are used to pick nouns and verbs out of captions
nlp = spacy.load("en_core_web_sm")

# Load SBERT to produce embeddings
sbert_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")

# NOTE: `torch` is not imported in this cell; it relies on an earlier cell having been run.
device = "cuda" if torch.cuda.is_available() else "cpu"
sbert_model = sbert_model.to(device)

def extract_noun_embeddings(caption):
    """Embed every noun / proper noun that spaCy finds in ``caption`` with SBERT.

    When no noun is tagged, the placeholder word "object" is embedded instead,
    so the result is never empty.
    """
    nouns = []
    for token in nlp(caption):
        if token.pos_ in ["NOUN", "PROPN"]:
            nouns.append(token.text)

    # Placeholder keeps the output non-empty for captions without nouns.
    if len(nouns) == 0:
        nouns = ["object"]

    return sbert_model.encode(nouns)  # (num_objects, 1024)

# Forms of "to be"; verbs whose lemma appears here are ignored.
tobe_verbs = {"be", "is", "am", "are", "was", "were", "been", "being"}


def extract_verb_embeddings(caption):
    """Embed the first verb of ``caption`` (ignoring 'to be') with SBERT.

    When no such verb is tagged, the placeholder word "action" is embedded instead.
    """
    verbs = []
    for token in nlp(caption):
        if token.pos_ != "VERB":
            continue
        if token.lemma_ in tobe_verbs:
            continue
        verbs.append(token.text)

    # Only the first verb is used; a string input yields a single vector.
    first_verb = verbs[0] if verbs else "action"
    return sbert_model.encode(first_verb)  # (1024)




In [7]:
caption = "A woman is explaining the features of a tomato."
# NOTE: despite its name, `noun_embeddings` holds the embedding of the caption's first verb.
noun_embeddings = extract_verb_embeddings(caption)
print("Embedding Shape:", noun_embeddings.shape)  # (1024,) - a single vector, not (num_objects, 1024)


Embedding Shape: (1024,)


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
import random

import torch
import os

def load_video_features(video_id, feature_dir="./video_features"):
    """
    Read the ``.pt`` feature file of one video.

    Args:
        video_id (str): ID of the video; the file is ``<feature_dir>/<video_id>.pt``.
        feature_dir (str): Directory containing the ``.pt`` files.

    Returns:
        The object stored in the file - expected by callers to be the tuple
        (temporal_features, object_features, context_features) - or None if the
        file does not exist.
    """
    file_path = os.path.join(feature_dir, f"{video_id}.pt")

    if not os.path.exists(file_path):
        print(f"File {file_path} không tồn tại!")
        return None

    # NOTE(security): torch.load unpickles the file; only load feature files you trust.
    return torch.load(file_path)


class VideoDataset(Dataset):
    """Dataset of (caption, video_id) pairs backed by precomputed feature files."""

    def __init__(self, video_data):
        """
        video_data: list of videos as [(caption, video_id), ...]
        """
        self.video_data = video_data

    def preprocess(self, video_id):
        """Return (temporal_features, object_features, context_features) for ``video_id``.

        Raises:
            FileNotFoundError: if no feature file exists for ``video_id``.
        """
        features = load_video_features(video_id)
        if features is None:
            # Fail with an explicit error instead of the "cannot unpack NoneType"
            # TypeError that unpacking None would raise.
            raise FileNotFoundError(f"No precomputed features found for video {video_id!r}")
        temporal_features, object_features, context_features = features
        return temporal_features, object_features, context_features

    def __len__(self):
        return len(self.video_data)

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus noun, verb and caption embeddings.

        Only the noun embeddings are wrapped in a tensor; the verb and caption
        embeddings are returned exactly as produced by the SBERT encoder.
        """
        caption, video_id = self.video_data[idx]
        temporal_features, object_features, context_features = self.preprocess(video_id)

        # Extract nouns / first verb and embed them, together with the full caption
        label_embeddings = extract_noun_embeddings(caption)
        action_feature = extract_verb_embeddings(caption)
        caption_embedding = sbert_model.encode(caption)

        return temporal_features, object_features, context_features, torch.tensor(label_embeddings), action_feature, caption_embedding


import json
# Load the MSR-VTT captions from the annotation JSON file.
# Adjust the path below to point at your copy of the file.
with open('./MSRVTT/annotation/MSR_VTT.json', 'r') as file:
    msrvtt_data = json.load(file)

# One [caption, video_id] pair per annotation.
train_list = []

for annotation in msrvtt_data['annotations']:
    video_id = annotation['image_id']
    caption = annotation['caption']
    train_list.append([caption, video_id])

# 80/20 split performed per *video*, not per caption. Shuffling caption pairs directly
# puts captions of the same video into both train and validation whenever several
# annotations share an image_id, which leaks validation videos into training.
# Assumes image_id values are hashable (e.g. strings) - TODO confirm.
SPLIT_SEED = 42  # local RNG: reproducible split without touching the global random state
video_data = train_list
video_ids = list(dict.fromkeys(pair[1] for pair in video_data))  # unique ids, file order
random.Random(SPLIT_SEED).shuffle(video_ids)
split = int(0.8 * len(video_ids))
train_ids = set(video_ids[:split])
train_data = [pair for pair in video_data if pair[1] in train_ids]
val_data = [pair for pair in video_data if pair[1] not in train_ids]

# Build the datasets and DataLoaders.
# NOTE: these loaders use the default collate function; samples carry a variable number
# of noun embeddings, so pass a padding collate_fn before iterating over them.
batch_size = 8
train_dataset = VideoDataset(train_data)
val_dataset = VideoDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [9]:
from scipy.optimize import linear_sum_assignment
import torch
import torch.nn.functional as F

def hungarian_loss(pred, target):
    """Set-prediction loss: optimally match predictions to targets, then take the MSE.

    Args:
        pred: (batch_size, num_queries, feature_dim) predicted vectors.
        target: per-sample target vectors; rows that are entirely zero are
            treated as padding and dropped.

    Returns:
        The per-sample MSE after matching, averaged over the batch.
    """
    batch_size, num_queries, feature_dim = pred.shape
    total_loss = 0

    for b in range(batch_size):
        queries = pred[b]

        # Drop padding rows (all-zero vectors) from the target
        is_padding = (target[b] == 0).all(dim=1)
        valid_target = target[b][~is_padding]
        num_target = valid_target.shape[0]

        # No real target: pull every prediction towards zero
        if num_target == 0:
            total_loss += F.mse_loss(queries, torch.zeros_like(queries))
            continue

        # Zero-pad the shorter side so the assignment problem is square
        surplus = num_queries - num_target
        if surplus > 0:  # more predictions than targets
            filler = torch.zeros(surplus, feature_dim, device=valid_target.device)
            padded_pred = queries
            padded_target = torch.cat([valid_target, filler], dim=0)
        else:  # at least as many targets as predictions
            filler = torch.zeros(-surplus, feature_dim, device=pred.device)
            padded_pred = torch.cat([queries, filler], dim=0)
            padded_target = valid_target

        # Cost = negative dot product, so the assignment maximises total similarity
        similarity = torch.matmul(padded_pred, padded_target.T)
        cost_matrix = -similarity.cpu().detach().numpy()
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Gather the matched pairs and accumulate their MSE
        total_loss += F.mse_loss(padded_pred[row_ind], padded_target[col_ind])

    return total_loss / batch_size


In [8]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Collate samples for the object head.

    Each sample is a 6-tuple (temporal, object, context, label embeddings,
    action embedding, caption embedding); the last two are dropped here.
    Label embeddings are zero-padded to the longest sample in the batch.
    """
    temporal, objects, context, labels, _actions, _captions = zip(*batch)

    # Zero-pad the variable-length noun embeddings
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=0)  # (batch_size, max_num_objects, 1024)

    return (
        torch.stack(temporal),  # (batch_size, seq_len, 512)
        torch.stack(objects),   # (batch_size, seq_len, 512)
        torch.stack(context),   # (batch_size, seq_len, 512)
        padded_labels,
    )



# Train Object Head

In [9]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = ObjectEncoderDecoder().to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [11]:
# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [None]:
# from tqdm import tqdm

# num_epochs = 10

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

#     for temporal, object_feats, context, label_emb in progress_bar:
#         temporal, object_feats, context, label_emb = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device)

#         Xi, pred = model(context, temporal, object_feats)
#         loss = hungarian_loss(pred, label_emb)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         batch_loss = loss.item()
#         total_loss += batch_loss

#         progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / len(train_loader)})

#     avg_train_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

#     # Validation phase
#     model.eval()
#     val_loss = 0

#     with torch.no_grad():
#         val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

#         for temporal, object_feats, context, label_emb in val_progress_bar:
#             temporal, object_feats, context, label_emb = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device)

#             pred = model(context, temporal, object_feats)
#             loss = hungarian_loss(pred, label_emb)

#             val_loss += loss.item()
#             val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / len(val_loader)})

#     avg_val_loss = val_loss / len(val_loader)
#     print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


# Train Action Head

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActionHead(nn.Module):
    """Predict an action representation from motion features and object vectors.

    Args:
        embed_dim: Width of both the motion features and the object vectors;
            must be divisible by 8 (the number of attention heads).
        num_objects: Stored as an attribute only; the number of objects actually
            used is read from the input in ``forward``.
        action_dim: Width of the returned action representation.
    """

    def __init__(self, embed_dim=1024, num_objects=10, action_dim=1024):
        super(ActionHead, self).__init__()
        self.embed_dim = embed_dim
        self.num_objects = num_objects  # P
        self.action_dim = action_dim

        # Learnable attention parameters: Wα, Uα, bα
        self.W_alpha = nn.Linear(embed_dim, embed_dim, bias=False)
        self.U_alpha = nn.Linear(embed_dim, embed_dim, bias=False)
        self.b_alpha = nn.Parameter(torch.zeros(embed_dim))

        # Transformer encoder layer over the frame axis.
        # batch_first=True is required because forward() feeds (B, N, embed_dim); with the
        # default layout dim 0 is the sequence axis and attention would mix batch samples.
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8, batch_first=True)

        # Fully connected layer mapping to the semantic space
        self.fc = nn.Linear(embed_dim, action_dim)

    def forward(self, M, ξ):
        """
        M: motion features - (B, N, embed_dim)
        ξ: object representations - (B, P, embed_dim)
        Returns: (B, action_dim)
        """
        # Attention scores α_{i,k} between every frame i and every object k
        M_exp = self.W_alpha(M).unsqueeze(2)  # (B, N, 1, embed_dim)
        ξ_exp = self.U_alpha(ξ).unsqueeze(1)  # (B, 1, P, embed_dim)
        attn_scores = torch.tanh(M_exp + ξ_exp + self.b_alpha)  # (B, N, P, embed_dim)

        # Reduce the feature axis to one score per (frame, object) pair, then normalise over objects
        attn_scores = attn_scores.mean(dim=-1)  # (B, N, P)
        attn_weights = F.softmax(attn_scores, dim=-1)  # (B, N, P)

        # Object-aware motion features M^e
        M_e = torch.bmm(attn_weights, ξ)  # (B, N, embed_dim)

        # Fuse M and M^e by element-wise averaging (same result as concatenating them,
        # reshaping to (B, N, 2, embed_dim) and taking the mean over the pair axis).
        M_combined = (M + M_e) / 2  # (B, N, embed_dim)

        Attn = self.transformer(M_combined)  # (B, N, embed_dim)

        # Max pooling over the time axis (N)
        A_pooled, _ = torch.max(Attn, dim=1)  # (B, embed_dim)

        # Map to the semantic space
        a = self.fc(A_pooled)  # (B, action_dim)

        return a


# =============================
# 🔹 Model test
batch_size = 4  # number of videos in the batch
embed_dim = 1024
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # action semantic space

# Instantiate the model
action_head = ActionHead(embed_dim=embed_dim, num_objects=num_objects, action_dim=action_dim)

# Dummy inputs
M = torch.randn(batch_size, num_motions, embed_dim)  # Motion features
ξ = torch.randn(batch_size, num_objects, embed_dim)  # Object representations

# Compute the action representation
a = action_head(M, ξ)
print("Shape of action representation:", a.shape)  # (batch_size, action_dim)


Shape of action representation: torch.Size([4, 1024])


In [11]:
import torch
import torch.nn as nn

class ActionEncoderDecoder(nn.Module):
    """Action head wrapper: project 512-d features, run ``ActionHead``, map to semantics.

    ``object_encoder_decoder`` is created as a sub-module but is not called by
    ``forward``; the object vectors ``ξ`` must be supplied by the caller.
    Only ``objects_embed_dim`` and ``semantic_dim`` of the constructor arguments
    are used; the others are accepted but ignored.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(ActionEncoderDecoder, self).__init__()
        self.object_encoder_decoder = ObjectEncoderDecoder()
        self.action_head = ActionHead()
        self.linear_layer = nn.Linear(512, 1024)  # lifts 512-d features to the head width
        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, context_features, ξ):
        """
        context_features: (B, N, 512) per-frame features fed to the action head
        ξ: (B, P, 1024) object representations
        Returns: (B, semantic_dim) mapped action embedding
        """
        projected = self.linear_layer(context_features.to(torch.float32))  # (B, N, 1024)
        action_embed = self.action_head(projected, ξ)  # (B, 1024)
        return self.semantic_mapper(action_embed)  # (B, semantic_dim)


# =============================
# 🔹 Model test
batch_size = 4  # number of videos in the batch
embed_dim = 512
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # action semantic space

# Instantiate the model
action_encoder_decoder = ActionEncoderDecoder()


In [12]:
# Euclidean (L2) distance loss
def euclidean_loss(pred, target):
    """Mean L2 distance between ``pred`` and ``target``, measured along the last axis."""
    distances = torch.linalg.vector_norm(pred - target, ord=2, dim=-1)
    return distances.mean()

In [12]:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_action_fn(batch):
    """
    Collate function for the DataLoader.

    Each sample is a 6-tuple (temporal, object, context, label embeddings,
    action embedding, caption embedding). Feature tensors are stacked, label
    embeddings are zero-padded to the longest sample in the batch, and the
    action / caption embeddings are stacked into (batch_size, dim) tensors.

    Returns:
        (temporal_features, object_features, context_features,
         padded_labels, action_embeddings, caption_embeddings)
    """
    temporal_features, object_features, context_features, label_embeddings, action_embeddings, caption_embedding = zip(*batch)

    # Stack the per-sample features along a new batch axis
    temporal_features = torch.stack(temporal_features)  # (batch_size, seq_len, 512)
    object_features = torch.stack(object_features)      # (batch_size, seq_len, 512)
    context_features = torch.stack(context_features)    # (batch_size, seq_len, 512)

    # torch.as_tensor accepts both numpy arrays and tensors. torch.tensor() on an
    # existing tensor emits a "copy construct" UserWarning for every sample.
    # 🔹 Zero-pad `label_embeddings` -> (batch_size, max_num_objects_in_batch, feature_dim)
    padded_labels = pad_sequence(
        [torch.as_tensor(lbl, dtype=torch.float32) for lbl in label_embeddings],
        batch_first=True,
        padding_value=0,
    )

    # 🔹 Stack the action and caption embeddings into (batch_size, dim) float tensors
    action_embeddings = torch.stack([torch.as_tensor(a, dtype=torch.float32) for a in action_embeddings])
    caption_embeddings = torch.stack([torch.as_tensor(a, dtype=torch.float32) for a in caption_embedding])

    return temporal_features, object_features, context_features, padded_labels, action_embeddings, caption_embeddings


In [12]:
# Train the action head.
# NOTE: requires the cells defining ActionEncoderDecoder, collate_action_fn,
# euclidean_loss and train_dataset / val_dataset to have been run first.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ActionEncoderDecoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_action_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_action_fn)



from tqdm import tqdm

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

    # collate_action_fn yields six items per batch; all six must be unpacked here.
    for step, (temporal, object_feats, context, label_emb, action_emb, caption_embed) in enumerate(progress_bar, start=1):
        temporal, object_feats, context, action_emb = temporal.to(device), object_feats.to(device), context.to(device), action_emb.to(device)

        # ActionEncoderDecoder.forward takes (features, ξ), so the object vectors are
        # produced first by the object sub-module.
        # NOTE(review): the motion (temporal) features are passed as the first argument,
        # matching ActionHead's "motion features" input - confirm this is intended
        # rather than the context features.
        ξ, _ = model.object_encoder_decoder(temporal, object_feats, context)
        pred = model(temporal, ξ)

        loss = euclidean_loss(pred, action_emb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Running average over the batches seen so far
        progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / step})

    avg_train_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0

    with torch.no_grad():
        val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

        for step, (temporal, object_feats, context, label_emb, action_emb, caption_embed) in enumerate(val_progress_bar, start=1):
            temporal, object_feats, context, action_emb = temporal.to(device), object_feats.to(device), context.to(device), action_emb.to(device)

            ξ, _ = model.object_encoder_decoder(temporal, object_feats, context)
            pred = model(temporal, ξ)
            loss = euclidean_loss(pred, action_emb)

            val_loss += loss.item()
            val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / step})

    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


NameError: name 'collate_action_fn' is not defined

# Train global head

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GlobalHead(nn.Module):
    """Fuse context, action and object features into one global representation.

    Args:
        embed_dim: Width of the incoming context / action / object features.
        model_dim: Width of the attention projections and of the output. The einsum
            and the concatenation in this module require ``model_dim == embed_dim``.
        num_layers: Number of Transformer encoder layers.
        nhead: Number of attention heads; must divide ``3 * model_dim``.
    """

    def __init__(self, embed_dim=1024, model_dim=1024, num_layers=6, nhead=8):
        super(GlobalHead, self).__init__()

        # Projection layers for attention mechanism
        self.Wg = nn.Linear(embed_dim, model_dim)
        self.Ug = nn.Linear(embed_dim, model_dim)
        self.bg = nn.Parameter(torch.zeros(model_dim))

        # Transformer Encoder for global representation.
        # batch_first=True is required because forward() feeds (B, N, 3*model_dim); with the
        # default layout dim 0 is the sequence axis and attention would mix batch samples.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim * 3, nhead=nhead, dim_feedforward=2048, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final MLP to predict linguistic representation
        self.fc = nn.Linear(model_dim * 3, model_dim)

    def compute_weighted_representation(self, context, features):
        """
        Attend from each context step over ``features`` (action or object).

        context: (B, N, D)
        features: (B, P, D)
        Returns: (B, N, D) - for every context step, a per-dimension weighted sum of features
        """
        # Attention scores β_{i,k}, one per (context step, feature, dimension)
        beta = torch.tanh(self.Wg(context).unsqueeze(2) + self.Ug(features).unsqueeze(1) + self.bg)  # (B, N, P, D)
        beta = F.softmax(beta, dim=2)  # normalised over the P features

        # Aggregate the features with the weights β
        weighted_features = torch.einsum("bnpd,bpd->bnd", beta, features)  # (B, N, D)
        return weighted_features

    def forward(self, obj_feat, action_feat, context):
        """
        obj_feat: (B, P, D)  # Object features
        action_feat: (B, N, D)  # Action features
        context: (B, N, D)  # Context representations
        Returns: (B, model_dim)
        """
        # Ca: context-aware action representation
        Ca = self.compute_weighted_representation(context, action_feat)

        # Ce: context-aware object representation
        Ce = self.compute_weighted_representation(context, obj_feat)

        # Concatenate everything as input to the Transformer encoder
        fusion_input = torch.cat([context, Ca, Ce], dim=-1)  # (B, N, 3*D)

        # Transformer encoder extracts the global features
        G = self.transformer_encoder(fusion_input)  # (B, N, 3*D)

        # Max pooling over the N axis
        g = torch.max(G, dim=1)[0]  # (B, 3*D)

        # Predict the linguistic representation
        g = self.fc(g)  # (B, model_dim)

        return g


# ===========================
# 🔹 Global Head batch smoke test 🔹
# ===========================

B = 4  # Batch size
P = 10  # number of objects
N = 8  # number of action/context steps

# Create dummy inputs
obj_feat = torch.randn(B, P, 1024)  # (B, P, 1024)
action_feat = torch.randn(B, N, 1024)  # (B, N, 1024)
context = torch.randn(B, N, 1024)  # (B, N, 1024)

# Instantiate the Global Head
global_head = GlobalHead()

# Compute the global representation
g = global_head(obj_feat, action_feat, context)
print("Global linguistic representation shape:", g.shape)  # (B, 1024)


Global linguistic representation shape: torch.Size([4, 1024])


In [13]:
import torch
import torch.nn as nn

class GlobalEncoderDecoder(nn.Module):
    """Global head wrapper: object head + ``GlobalHead`` over projected features.

    ``semantic_mapper`` is created but not applied in ``forward``. Only
    ``objects_embed_dim`` and ``semantic_dim`` of the constructor arguments are
    used; the others are accepted but ignored.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(GlobalEncoderDecoder, self).__init__()
        self.object_encoder_decoder = ObjectEncoderDecoder()
        self.global_head = GlobalHead()
        self.linear_layer = nn.Linear(512, 1024)  # shared projection for context and motion

        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """
        temporal_features: (B, N, 512) motion features
        object_features: (B, N, 512)
        context_features: (B, N, 512)
        Returns: (B, 1024) global representation from ``GlobalHead``
        """
        # The same linear layer lifts both context and motion features to 1024-d.
        context = self.linear_layer(context_features.to(torch.float32))
        action_feat = self.linear_layer(temporal_features.to(torch.float32))

        # Only the unmapped object vectors ξ are consumed by the global head.
        obj_feat, _ = self.object_encoder_decoder(temporal_features, object_features, context_features)

        return self.global_head(obj_feat, action_feat, context)  # (B, 1024)


# =============================
# 🔹 Model test
batch_size = 4  # number of videos in the batch
embed_dim = 512
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # action semantic space

# Instantiate the model
global_encoder_decoder = GlobalEncoderDecoder()

# Dummy inputs
M = torch.randn(batch_size, num_motions, embed_dim)  # Motion features
O = torch.randn(batch_size, num_motions, embed_dim)  # Object features
C = torch.randn(batch_size, num_motions, embed_dim)  # Context features

# Compute the global representation (the variable and message still say "action")
a = global_encoder_decoder(M, O, C)
print("Shape of action representation:", a.shape)  # (batch_size, 1024)

Shape of action representation: torch.Size([4, 1024])


In [16]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = GlobalEncoderDecoder().to(device)

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_action_fn)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_action_fn)


# from tqdm import tqdm

# num_epochs = 10

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

#     for temporal, object_feats, context, label_emb, action_emb, caption_embed in progress_bar:
#         temporal, object_feats, context, label_emb, action_emb, caption_embed = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device), action_emb.to(device), caption_embed.to(device)

#         pred = model(temporal, object_feats, context)

#         loss = euclidean_loss(pred, caption_embed)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         batch_loss = loss.item()
#         total_loss += batch_loss

#         progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / len(train_loader)})

#     avg_train_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

#     # Validation phase
#     model.eval()
#     val_loss = 0

#     with torch.no_grad():
#         val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

#         for temporal, object_feats, context, label_emb, action_emb, caption_embed in val_progress_bar:
#             temporal, object_feats, context, label_emb, action_emb, caption_embed = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device), action_emb.to(device), caption_embed.to(device)

#             pred = model(temporal, object_feats, context)
#             loss = euclidean_loss(pred, caption_embed)

#             val_loss += loss.item()
#             val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / len(val_loader)})

#     avg_val_loss = val_loss / len(val_loader)
#     print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import OPTForCausalLM, AutoTokenizer
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images


# specify the path to the model





class QFormer(nn.Module):
    """Turn object, action and global features into tokens for the language model.

    Args:
        embed_dim: Width of the incoming features; must be divisible by 8
            (the number of attention heads).
        num_queries: Number of learned query embeddings. They are registered as a
            parameter but are not used by ``forward``.
        output_dim: Width of the produced video tokens.
    """

    def __init__(self, embed_dim=1024, num_queries=8, output_dim=768):
        super(QFormer, self).__init__()
        self.num_queries = num_queries
        self.output_dim = output_dim

        # Learned query embeddings (kept so existing checkpoints still load)
        self.query_embed = nn.Parameter(torch.randn(1, num_queries, embed_dim))

        # Transformer encoder over the video tokens.
        # batch_first=True is required because forward() feeds (B, N, embed_dim); with the
        # default layout dim 0 is the sequence axis and attention would mix batch samples.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=8, dim_feedforward=2048, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)

        # Projection into the video query token space
        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, obj_feat, action_feat, global_feat):
        """
        obj_feat: (B, P, embed_dim)
        action_feat: (B, embed_dim)
        global_feat: (B, embed_dim)
        Returns: (B, P + 2, output_dim) video tokens
        """
        # Build the token sequence: P object tokens + 1 action token + 1 global token
        video_features = torch.cat([obj_feat, action_feat.unsqueeze(1), global_feat.unsqueeze(1)], dim=1)

        # Transformer encoder
        video_embeddings = self.transformer_encoder(video_features)  # (B, P + 2, embed_dim)

        # Map the outputs to output_dim for the LLM
        video_query_tokens = self.fc(video_embeddings)  # (B, P + 2, output_dim)

        return video_query_tokens


 

class CaptionHead(nn.Module):
    """Caption generator that pairs a Q-Former with a DeepSeek-VL2 chat model.

    For every video in the batch, three fixed prompts (subject, initial state,
    final state) are sent to the language model and the decoded answers are
    returned.

    Args:
        llm_model: Model id or path handed to ``from_pretrained`` for both the
            chat processor and the causal LM.
    """

    def __init__(self, llm_model="deepseek-ai/deepseek-vl2-tiny"):
        super(CaptionHead, self).__init__()
        self.qformer = QFormer()
        self.vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(llm_model)
        self.tokenizer = self.vl_chat_processor.tokenizer

        self.vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(llm_model, trust_remote_code=True)
        # Use the GPU when one is present instead of crashing on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): eval() only switches layer modes; the LLM weights still
        # require grad and are registered as parameters of this module.
        self.vl_gpt = self.vl_gpt.to(torch.bfloat16).to(device).eval()

    def generate_prompt(self, video_tokens):
        """Build the caption prompts for every video in the batch.

        Args:
            video_tokens: Tensor whose first dimension is the batch size; only
                that dimension is read.

        Returns:
            A list with one entry per batch item, each an independent list of
            the same three prompt strings.
        """
        batch_size = video_tokens.shape[0]
        xa_options = ["subject", "initial state", "final state"]
        xb_options = ["The subject", "state initial change", "state final change"]

        # The prompts do not depend on the video, so build them once.
        video_prompts = [
            f"Video: <video>. This video tells the {xa}. {xb} is: "
            for xa, xb in zip(xa_options, xb_options)
        ]
        # Hand each batch item its own copy so callers may mutate them freely.
        return [list(video_prompts) for _ in range(batch_size)]

    def forward(self, obj_feat, action_feat, global_feat, max_length=50):
        """Generate captions for a batch of video features.

        Args:
            obj_feat: Object features forwarded to the Q-Former.
            action_feat: Action feature forwarded to the Q-Former.
            global_feat: Global feature forwarded to the Q-Former.
            max_length: Maximum number of new tokens generated per caption.

        Returns:
            A list with one entry per batch item, each a list of decoded
            answers (special tokens included), one per prompt.
        """
        video_tokens = self.qformer(obj_feat, action_feat, global_feat)  # (B, N, 768)
        batch_prompts = self.generate_prompt(video_tokens)

        captions = []
        for prompts in batch_prompts:
            video_captions = []
            for prompt in prompts:
                conversation = [
                    {"role": "<|User|>", "content": prompt},
                    {"role": "<|Assistant|>", "content": ""},
                ]

                # Presumably collects the images referenced by the conversation;
                # none are referenced here -- verify against deepseek_vl2.
                pil_images = load_pil_images(conversation)

                # TODO(review): `video` is a random placeholder, so the Q-Former
                # output (`video_tokens`) never reaches the language model and
                # the captions do not depend on the input features. Wire the
                # real tokens in once the processor's expected shape is confirmed.
                prepare_inputs = self.vl_chat_processor(
                    conversations=conversation,
                    images=pil_images,
                    video=torch.rand(4, 1280, dtype=torch.bfloat16),
                    force_batchify=True,
                    system_prompt="",
                ).to(self.vl_gpt.device)

                # Turn the processed inputs into embeddings for the language model.
                inputs_embeds = self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

                # Greedy decoding; the generation budget honours `max_length`
                # (it was previously ignored in favour of a fixed 512).
                outputs = self.vl_gpt.language.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=prepare_inputs.attention_mask,
                    pad_token_id=self.tokenizer.eos_token_id,
                    bos_token_id=self.tokenizer.bos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    max_new_tokens=max_length,
                    do_sample=False,
                    use_cache=True,
                )

                answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
                video_captions.append(answer)
            captions.append(video_captions)

        return captions  # (B, num_prompts)


 

Python version is above 3.10, patching the collections module.


In [None]:
import torch
import torch.nn as nn

class VidCapModel(nn.Module):
    """Video captioning model that chains the object, action, global and caption heads.

    Args:
        objects_embed_dim: First positional argument given to ``SemanticMapper``.
        semantic_dim: ``semantic_dim`` keyword given to ``SemanticMapper``.
    """

    def __init__(self, objects_embed_dim=1024, semantic_dim=1024):
        super().__init__()
        # Construction order is kept as-is so parameter registration and
        # random initialisation order do not change.
        # NOTE(review): video_content_encoder and semantic_mapper are built
        # here but never called in forward().
        self.video_content_encoder = VideoFeatureExtractor()
        self.object_head = ObjectEncoderDecoder()
        self.global_head = GlobalHead()
        self.action_head = ActionHead()
        self.caption_head = CaptionHead()
        # A single 512 -> 1024 projection shared by all three input streams.
        self.linear_layer = nn.Linear(in_features=512, out_features=1024)

        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """Run every head and return their outputs.

        Args:
            temporal_features: Motion features; last dimension must be 512 for
                the shared projection.
            object_features: Object features; last dimension must be 512.
            context_features: Context features; last dimension must be 512.

        Returns:
            Tuple ``(object_head_output, action_head_output,
            global_head_output, caption_head_output)``.
        """
        project = self.linear_layer
        context_proj = project(context_features)
        motion_proj = project(temporal_features)
        # NOTE(review): this projection is computed but never consumed; the
        # object head below receives the unprojected inputs -- confirm intent.
        _unused_object_proj = project(object_features)

        # Object head returns a pair; the first item feeds the other heads, the
        # second is reported as the object head output
        # (expected (B, P, dim) -- TODO confirm).
        xi, object_out = self.object_head(temporal_features, object_features, context_features)

        # Action head output (expected (B, dim) -- TODO confirm).
        action_out = self.action_head(motion_proj, xi)

        # Global head output (expected (B, dim) -- TODO confirm).
        global_out = self.global_head(xi, motion_proj, context_proj)

        caption_out = self.caption_head(object_out, action_out, global_out)

        return object_out, action_out, global_out, caption_out


# =============================
# 🔹 Model smoke test: build VidCapModel and run one forward pass on random inputs.
batch_size = 4  # Number of videos in the batch
embed_dim = 512  # Feature size of every random input below
num_objects = 8  # P = 8 (defined but not used below)
num_motions = 5  # N = 5
action_dim = 1024  # Action semantic space size (defined but not used below)

# Instantiate the model
video_model = VidCapModel()

# Simulated inputs; all three share the shape (batch_size, num_motions, embed_dim)
M = torch.randn(batch_size, num_motions, embed_dim)  # Motion features (passed as temporal_features)
O = torch.randn(batch_size, num_motions, embed_dim)  # Object features (passed as object_features)
C = torch.randn(batch_size, num_motions, embed_dim)  # Context features (passed as context_features)

# Run the full model and report what each head returned
object_head_output, action_head_output, global_head_output, caption_head_output  = video_model(M, O, C)
print("Shape of object_head_output representation:",  object_head_output.shape)  # expected (batch_size, P, dim) -- TODO confirm
print("Shape of action_head :",  action_head_output.shape)  # expected (batch_size, dim) -- TODO confirm
print("Shape of global_head_output representation:",  global_head_output.shape)  # expected (batch_size, dim) -- TODO confirm
print("caption_head_output", caption_head_output)
