In [1]:
import torch
import torch.nn as nn

# Transformer encoder stack (default embed_dim = 1024)
class ObjectEncoder(nn.Module):
    """Stack of Transformer encoder layers operating on batch-first sequences.

    Args:
        embed_dim: Feature size of every token (``d_model``).
        num_heads: Number of attention heads; must divide ``embed_dim``.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, embed_dim=1024, num_heads=8, num_layers=2):
        super(ObjectEncoder, self).__init__()
        # batch_first=True because callers pass (B, seq_len, embed_dim).
        # With the PyTorch default (batch_first=False) dim 0 is interpreted as
        # the sequence axis, so self-attention would mix different samples of
        # the batch instead of the tokens of one sample.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def forward(self, x):
        """Encode ``x`` of shape (B, seq_len, embed_dim); the output has the same shape."""
        return self.transformer(x)


# Video-level feature extractor (default embed_dim = 1024)
class VideoFeatureExtractor(nn.Module):
    """Fuse per-frame spatial and motion features into one video content vector.

    The two inputs are concatenated along the feature axis, linearly projected,
    passed through an ``ObjectEncoder`` and max-pooled over the frame axis.

    Args:
        embed_dim: Width of the projected features and of the returned vector.
    """

    def __init__(self, embed_dim=1024):
        super(VideoFeatureExtractor, self).__init__()
        # The projection input is fixed at 1024 = 512 (C) + 512 (M).
        self.input_proj = nn.Linear(1024, embed_dim)
        self.transformer = ObjectEncoder(embed_dim=embed_dim)
        self.max_pool = nn.AdaptiveMaxPool1d(1)  # max pooling over the sequence axis

    def forward(self, C, M):
        """
        Args:
            C: (B, N, 512) spatial features.
            M: (B, N, 512) motion features.

        Returns:
            Tensor of shape (B, embed_dim).
        """
        # Cast BEFORE the projection: the Linear layer itself rejects inputs
        # whose dtype differs from its weights, so casting afterwards (as the
        # previous version did) came too late for e.g. float64 features.
        fused_features = torch.cat([C, M], dim=-1).to(torch.float32)  # (B, N, 1024)
        fused_features = self.input_proj(fused_features)  # (B, N, embed_dim)

        R = self.transformer(fused_features)  # (B, N, embed_dim)

        # AdaptiveMaxPool1d pools the last axis, hence the permute to (B, embed_dim, N).
        R_pooled = self.max_pool(R.permute(0, 2, 1))  # (B, embed_dim, 1)
        v = R_pooled.squeeze(-1)  # (B, embed_dim)

        return v

# Smoke test: run the extractor once on random inputs
B, N = 4, 10  # batch size and number of key frames
C = torch.randn(B, N, 512)  # (B, N, 512) spatial feature vectors
M = torch.randn(B, N, 512)  # (B, N, 512) motion feature vectors

model = VideoFeatureExtractor(embed_dim=1024)
video_vector = model(C, M)
print("Video content vector shape:", video_vector.shape)  # Expect (B, 1024)

Video content vector shape: torch.Size([4, 1024])


In [2]:
import torch
import torch.nn as nn

class ObjectDecoder(nn.Module):
    """Query-based object decoder built on a single cross-attention layer.

    A set of learned queries, conditioned on the video content vector, attends
    over the projected per-frame object features.

    Args:
        embed_dim: Width of the queries, of ``v`` and of the output.
        num_queries: Number of learned object queries (P).
        num_heads: Number of attention heads; must divide ``embed_dim``.
        input_dim: Feature size of the incoming object features ``O``.
    """

    def __init__(self, embed_dim=1024, num_queries=8, num_heads=8, input_dim=512):
        super(ObjectDecoder, self).__init__()
        self.num_queries = num_queries  # P
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Learned queries, shared by all samples: (1, P, embed_dim)
        self.query_embed = nn.Parameter(torch.randn(1, num_queries, embed_dim))

        # Projects O from input_dim up to embed_dim
        self.encoder_proj = nn.Linear(input_dim, embed_dim)

        # Multi-head attention used in place of a full Transformer decoder
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

    def forward(self, O, v):
        """
        Args:
            O: (B, N, input_dim) encoder output / object features.
            v: (B, embed_dim) video content vector.

        Returns:
            Tensor of shape (B, P, embed_dim), one row per object query.
        """
        B = O.shape[0]

        O = self.encoder_proj(O)  # (B, N, embed_dim)

        # Condition every query on the video content vector.
        # The previous version passed `v` (repeated N times) as the attention
        # VALUE. Because all value rows were identical and the attention
        # weights sum to one, the output collapsed to the same vector for every
        # query and did not depend on O at all.
        Q = self.query_embed.expand(B, -1, -1) + v.unsqueeze(1)  # (B, P, embed_dim)

        # Cross-attention: queries read from the object features.
        xi, _ = self.multihead_attn(Q, O, O)  # (B, P, embed_dim)

        return xi

# Fully connected layer mapping the object representations into the semantic space R^d
class SemanticMapper(nn.Module):
    """Single linear projection from ``embed_dim`` to ``semantic_dim``.

    The projection acts on the last axis, so any leading axes are preserved.
    """

    def __init__(self, embed_dim=1024, semantic_dim=1024):
        super().__init__()
        self.fc = nn.Linear(embed_dim, semantic_dim)

    def forward(self, Œæ):
        """Return ``fc`` applied to the input: (..., embed_dim) -> (..., semantic_dim)."""
        return self.fc(Œæ)

# ====================== #
# Model smoke test       #
# ====================== #
B = 4   # Batch size
N = 10  # Number of key frames
P = 8   # Number of queries
O = torch.randn(B, N, 512)  # Encoder output
v = torch.randn(B, 1024)  # Video content vector

decoder = ObjectDecoder()
Œæ = decoder(O, v)

semantic_mapper = SemanticMapper()
Œæ_mapped = semantic_mapper(Œæ)

print("Output shape c·ªßa Œæ:", Œæ.shape)  # (B, P, 1024)
print("Shape of Œæ_mapped:", Œæ_mapped.shape)  # (B, P, 1024)


Output shape c·ªßa Œæ: torch.Size([4, 8, 1024])
Shape of Œæ_mapped: torch.Size([4, 8, 1024])


In [3]:
import torch
import torch.nn as nn

class ObjectEncoderDecoder(nn.Module):
    """Object head: video encoder, query-based object decoder and semantic mapper.

    Args:
        vid_embed_dim: Width of the video content vector produced by the encoder.
        objects_embed_dim: Input width of the semantic mapper.
        num_object_queries: Number of object queries used by the decoder.
        num_decoder_layers: Currently unused; kept for interface compatibility.
        num_heads: Number of attention heads of the decoder.
        ff_dim: Currently unused; kept for interface compatibility.
        decoder_embed_dim: Width of the decoder queries/output. The decoder
            consumes the encoder's video vector and feeds the semantic mapper,
            so this should equal ``vid_embed_dim`` and ``objects_embed_dim``.
        semantic_dim: Output width of the semantic mapper.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(ObjectEncoderDecoder, self).__init__()
        self.encoder = VideoFeatureExtractor(embed_dim=vid_embed_dim)
        # Forward the constructor arguments instead of hard-coded literals so
        # that non-default settings are no longer silently ignored.
        self.decoder = ObjectDecoder(
            embed_dim=decoder_embed_dim,
            num_queries=num_object_queries,
            num_heads=num_heads,
        )
        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """
        Args:
            temporal_features: (B, N, feature_dim) motion features.
            object_features: (B, N, feature_dim) object features.
            context_features: (B, N, feature_dim) context features.

        Returns:
            Tuple ``(xi, xi_mapped)``: the decoder output of shape
            (B, num_object_queries, decoder_embed_dim) and its projection of
            shape (B, num_object_queries, semantic_dim).
        """
        context = context_features.to(torch.float32)
        motion = temporal_features.to(torch.float32)
        objects = object_features.to(torch.float32)

        video_vector = self.encoder(context, motion)  # (B, vid_embed_dim)

        xi = self.decoder(objects, video_vector)
        xi_mapped = self.semantic_mapper(xi)

        return xi, xi_mapped



In [None]:
# Install the NLP dependencies used by the following cells (only peft is version-pinned).
!pip install spacy
!pip install peft==0.10.0
!pip install sentence-transformers

In [10]:
# Download the small English spaCy pipeline that is loaded below with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
from sentence_transformers import SentenceTransformer

# Load spaCy; its part-of-speech tags are used to pick nouns and verbs from captions
nlp = spacy.load("en_core_web_sm")

# Load SBERT to create the embeddings
sbert_model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")

# Move the sentence encoder to the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
sbert_model = sbert_model.to(device)

def extract_noun_embeddings(caption):
    """Embed every noun and proper noun of ``caption`` with the SBERT model.

    When the caption contains no noun at all, the placeholder word "object"
    is embedded instead so that callers always receive at least one row.

    Returns:
        The result of ``sbert_model.encode`` for the list of words (one
        embedding per word).
    """
    noun_tags = ["NOUN", "PROPN"]

    nouns = []
    for token in nlp(caption):
        if token.pos_ in noun_tags:
            nouns.append(token.text)

    if len(nouns) == 0:
        nouns = ["object"]  # placeholder so that encode() never gets an empty list

    return sbert_model.encode(nouns)

# Forms of "to be"; verbs whose lemma appears here are skipped.
tobe_verbs = {"be", "is", "am", "are", "was", "were", "been", "being"}


def extract_verb_embeddings(caption):
    """Embed the first verb of ``caption`` (ignoring forms of "to be") with SBERT.

    When no such verb exists, the placeholder word "action" is embedded.

    Returns:
        The result of ``sbert_model.encode`` for that single word.
    """
    verbs = []
    for token in nlp(caption):
        if token.pos_ != "VERB":
            continue
        if token.lemma_ in tobe_verbs:
            continue
        verbs.append(token.text)

    # Only the first verb is used; fall back to a placeholder when there is none.
    first_verb = verbs[0] if verbs else "action"
    return sbert_model.encode(first_verb)




In [7]:
caption = "A woman is explaining the features of a tomato."
# NOTE: despite its name, `noun_embeddings` holds the VERB embedding returned by extract_verb_embeddings.
noun_embeddings = extract_verb_embeddings(caption)
print("Embedding Shape:", noun_embeddings.shape)  # a single vector; the recorded output is (1024,)


Embedding Shape: (1024,)


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
import random

import torch
import os

def load_video_features(video_id, feature_dir="./video_features"):
    """
    Load the saved ``.pt`` feature file of one video.

    Args:
        video_id (str): ID of the video; the file read is ``<feature_dir>/<video_id>.pt``.
        feature_dir (str): Directory containing the ``.pt`` files.

    Returns:
        Whatever ``torch.load`` returns for that file (callers unpack it as
        ``(temporal_features, object_features, context_features)``), or None,
        after printing a message, when the file does not exist.
    """
    file_path = os.path.join(feature_dir, f"{video_id}.pt")

    if os.path.exists(file_path):
        return torch.load(file_path)

    # Missing file: report it and let the caller deal with the None.
    print(f"File {file_path} kh√¥ng t·ªìn t·∫°i!")
    return None
 

class VideoDataset(Dataset):
    """Dataset over (caption, video_id) pairs yielding video features and text embeddings."""

    def __init__(self, video_data):
        """
        Args:
            video_data: List of samples in the form [(caption, video_id), ...].
        """
        self.video_data = video_data

    def preprocess(self, video_id):
        """Load and return (temporal_features, object_features, context_features) of a video."""
        loaded = load_video_features(video_id)
        temporal_features, object_features, context_features = loaded
        return temporal_features, object_features, context_features

    def __len__(self):
        return len(self.video_data)

    def __getitem__(self, idx):
        """Return the six-element sample for index ``idx``.

        The tuple is (temporal_features, object_features, context_features,
        noun embeddings as a tensor, first-verb embedding, caption embedding).
        """
        caption, video_id = self.video_data[idx]
        video_features = self.preprocess(video_id)
        temporal_features, object_features, context_features = video_features

        # Text targets: nouns (objects), first verb (action), whole caption (global)
        label_embeddings = extract_noun_embeddings(caption)
        action_feature = extract_verb_embeddings(caption)
        caption_embedding = sbert_model.encode(caption)

        label_tensor = torch.tensor(label_embeddings)
        return (
            temporal_features,
            object_features,
            context_features,
            label_tensor,
            action_feature,
            caption_embedding,
        )


import json
# Load the MSR-VTT annotations from the JSON file below; the code reads the
# top-level 'annotations' list, whose entries carry 'image_id' and 'caption'.
with open('./MSRVTT/annotation/MSR_VTT.json', 'r') as file:
    msrvtt_data = json.load(file)

train_list = []

# One [caption, video_id] sample per annotation entry
for annotation in  msrvtt_data['annotations']:
    video_id = annotation['image_id']
    caption = annotation['caption']
    train_list.append([caption, video_id])

# 80/20 train/validation split
# NOTE(review): `video_data` is the same list object as `train_list`, so the shuffle reorders both.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
# NOTE(review): the split is per caption; if several annotations share an image_id the same
#               video can land in both train and validation — confirm whether that is intended.
video_data = train_list
random.shuffle(video_data)
split = int(0.8 * len(video_data))
train_data, val_data = video_data[:split], video_data[split:]

# Build the datasets and DataLoaders
# NOTE(review): no collate_fn is passed here; the dataset returns one label row per noun, so
#               default batching presumably fails for captions with different noun counts.
#               Later cells rebuild the loaders with a custom collate function.
batch_size = 8
train_dataset = VideoDataset(train_data)
val_dataset = VideoDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [9]:
from scipy.optimize import linear_sum_assignment
import torch
import torch.nn.functional as F

def hungarian_loss(pred, target):
    """Set-prediction loss: optimally match predictions to targets, then take the MSE.

    For every sample, all-zero target rows are treated as padding and dropped.
    The shorter of the two sets (predictions / remaining targets) is padded
    with zero rows so both have equal length, a one-to-one assignment that
    maximises the summed dot product is computed with the Hungarian algorithm,
    and the MSE between the matched rows is accumulated. A sample without any
    valid target contributes the MSE between its predictions and zeros.

    Args:
        pred: Tensor of shape (batch_size, num_queries, feature_dim).
        target: Zero-padded targets; ``target[i]`` is (num_targets, feature_dim).

    Returns:
        The sum of the per-sample losses divided by ``batch_size``.
    """
    batch_size, num_queries, feature_dim = pred.shape
    sample_losses = []

    for idx in range(batch_size):
        sample_pred = pred[idx]
        sample_target = target[idx]

        # A row is a real target as soon as one of its components is non-zero.
        keep_rows = (sample_target != 0).any(dim=1)
        valid_target = sample_target[keep_rows]
        num_target = valid_target.shape[0]

        if num_target == 0:
            # Nothing to match: pull all predictions towards zero.
            sample_losses.append(F.mse_loss(sample_pred, torch.zeros_like(sample_pred)))
            continue

        # Positive: more predictions than targets; otherwise pad the predictions.
        missing_targets = num_queries - num_target
        if missing_targets > 0:
            filler = torch.zeros(missing_targets, feature_dim, device=valid_target.device)
            padded_pred = sample_pred
            padded_target = torch.cat([valid_target, filler], dim=0)
        else:
            filler = torch.zeros(-missing_targets, feature_dim, device=pred.device)
            padded_pred = torch.cat([sample_pred, filler], dim=0)
            padded_target = valid_target

        # The solver minimises cost, so negate the dot-product similarity.
        similarity = torch.matmul(padded_pred, padded_target.T).cpu().detach().numpy()
        row_ind, col_ind = linear_sum_assignment(-similarity)

        sample_losses.append(F.mse_loss(padded_pred[row_ind], padded_target[col_ind]))

    return sum(sample_losses) / batch_size


In [8]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Collate six-element dataset samples into batched tensors for the object head.

    The three video feature sequences are stacked along a new batch axis and
    the per-sample noun embeddings are zero-padded to the longest sample. The
    action and caption embeddings of the samples are dropped.

    Returns:
        (temporal_features, object_features, context_features, padded_labels)
    """
    temporal_seq, object_seq, context_seq, label_seq, _action_seq, _caption_seq = zip(*batch)

    # Each stacked tensor gains a leading batch axis.
    temporal_features, object_features, context_features = (
        torch.stack(group) for group in (temporal_seq, object_seq, context_seq)
    )

    # Zero rows mark the padding of samples with fewer nouns.
    padded_labels = pad_sequence(label_seq, batch_first=True, padding_value=0)

    return temporal_features, object_features, context_features, padded_labels



# Train Object Head

In [9]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = ObjectEncoderDecoder().to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [11]:
# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [None]:
# from tqdm import tqdm

# num_epochs = 10

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

#     for temporal, object_feats, context, label_emb in progress_bar:
#         temporal, object_feats, context, label_emb = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device)

#         Xi, pred = model(context, temporal, object_feats)
#         loss = hungarian_loss(pred, label_emb)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         batch_loss = loss.item()
#         total_loss += batch_loss

#         progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / len(train_loader)})

#     avg_train_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

#     # Validation phase
#     model.eval()
#     val_loss = 0

#     with torch.no_grad():
#         val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

#         for temporal, object_feats, context, label_emb in val_progress_bar:
#             temporal, object_feats, context, label_emb = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device)

#             pred = model(context, temporal, object_feats)
#             loss = hungarian_loss(pred, label_emb)

#             val_loss += loss.item()
#             val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / len(val_loader)})

#     avg_val_loss = val_loss / len(val_loader)
#     print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


# Train Action Head

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActionHead(nn.Module):
    """Action head: attend from motion features to object representations.

    Every motion feature is fused with an attention-weighted summary of the
    object representations, the fused sequence is passed through one
    Transformer encoder layer, max-pooled over time and projected into the
    action semantic space.

    Args:
        embed_dim: Width of the motion and object features.
        num_objects: Stored on the module but not used in the computation.
        action_dim: Width of the returned action representation.
    """

    def __init__(self, embed_dim=1024, num_objects=10, action_dim=1024):
        super(ActionHead, self).__init__()
        self.embed_dim = embed_dim
        self.num_objects = num_objects  # P
        self.action_dim = action_dim

        # Learnable attention parameters: W_alpha, U_alpha, b_alpha
        self.W_alpha = nn.Linear(embed_dim, embed_dim, bias=False)
        self.U_alpha = nn.Linear(embed_dim, embed_dim, bias=False)
        self.b_alpha = nn.Parameter(torch.zeros(embed_dim))

        # Transformer encoder layer for the fused motion sequence.
        # batch_first=True because forward() feeds (B, N, embed_dim); with the
        # default layout the layer would attend across the batch axis instead
        # of across the N time steps.
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=8, batch_first=True)

        # Fully connected layer mapping into the action semantic space
        self.fc = nn.Linear(embed_dim, action_dim)

    def forward(self, M, Œæ):
        """
        Args:
            M: (B, N, embed_dim) motion features.
            Œæ: (B, P, embed_dim) object representations.

        Returns:
            Tensor of shape (B, action_dim).
        """
        objects = Œæ

        # Additive attention scores between every motion step and every object
        motion_term = self.W_alpha(M).unsqueeze(2)  # (B, N, 1, embed_dim)
        object_term = self.U_alpha(objects).unsqueeze(1)  # (B, 1, P, embed_dim)
        attn_scores = torch.tanh(motion_term + object_term + self.b_alpha)  # (B, N, P, embed_dim)

        # Reduce the feature axis to one score per (motion step, object) pair
        attn_scores = attn_scores.mean(dim=-1)  # (B, N, P)
        attn_weights = F.softmax(attn_scores, dim=-1)  # (B, N, P)

        # Object summary for every motion step
        M_e = torch.bmm(attn_weights, objects)  # (B, N, embed_dim)

        # Average of M and M_e (same result as concatenating and averaging the two halves)
        M_combined = (M + M_e) / 2  # (B, N, embed_dim)

        encoded = self.transformer(M_combined)  # (B, N, embed_dim)

        # Max pooling over the time axis N
        A_pooled, _ = torch.max(encoded, dim=1)  # (B, embed_dim)

        a = self.fc(A_pooled)  # (B, action_dim)

        return a


# =============================
# Model smoke test
batch_size = 4  # number of videos in the batch
embed_dim = 1024
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # size of the action semantic space

# Instantiate the model
action_head = ActionHead(embed_dim=embed_dim, num_objects=num_objects, action_dim=action_dim)

# Random stand-in inputs
M = torch.randn(batch_size, num_motions, embed_dim)  # Motion features
Œæ = torch.randn(batch_size, num_objects, embed_dim)  # Object representations

# Compute the action representation
a = action_head(M, Œæ)
print("Shape of action representation:", a.shape)  # (batch_size, action_dim)


Shape of action representation: torch.Size([4, 1024])


In [11]:
import torch
import torch.nn as nn

class ActionEncoderDecoder(nn.Module):
    """Action model: project frame features, run the action head, map to the semantic space.

    The module also owns an ``ObjectEncoderDecoder`` (``object_encoder_decoder``).
    ``forward`` does not call it; the object representations must be passed in.

    Args:
        vid_embed_dim, num_object_queries, num_decoder_layers, num_heads, ff_dim,
        decoder_embed_dim: Forwarded to the owned ``ObjectEncoderDecoder``.
        objects_embed_dim: Forwarded to ``ObjectEncoderDecoder`` and used as the
            input width of the semantic mapper.
        semantic_dim: Forwarded to ``ObjectEncoderDecoder`` and used as the
            output width of the semantic mapper.

    The action head and the 512 -> 1024 input projection use fixed sizes.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(ActionEncoderDecoder, self).__init__()
        # Forward the constructor arguments; previously the object model was
        # always built with its defaults and these arguments were ignored.
        self.object_encoder_decoder = ObjectEncoderDecoder(
            vid_embed_dim=vid_embed_dim,
            objects_embed_dim=objects_embed_dim,
            num_object_queries=num_object_queries,
            num_decoder_layers=num_decoder_layers,
            num_heads=num_heads,
            ff_dim=ff_dim,
            decoder_embed_dim=decoder_embed_dim,
            semantic_dim=semantic_dim,
        )
        self.action_head = ActionHead()
        self.linear_layer = nn.Linear(512, 1024)
        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, context_features, Œæ):
        """
        Args:
            context_features: (B, N, 512) per-frame features; they are projected
                to 1024 dimensions and used as the action head's sequence input.
            Œæ: (B, P, 1024) object representations, e.g. the first output of
                ``self.object_encoder_decoder``.

        Returns:
            Tensor of shape (B, semantic_dim).
        """
        frame_features = context_features.to(torch.float32)
        projected = self.linear_layer(frame_features)  # (B, N, 1024)

        action_embed = self.action_head(projected, Œæ)  # (B, 1024)

        return self.semantic_mapper(action_embed)


# =============================
# Model smoke test
batch_size = 4  # number of videos in the batch
embed_dim = 512
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # size of the action semantic space

# Instantiate the model
action_encoder_decoder = ActionEncoderDecoder()


In [12]:
# Euclidean loss function
def euclidean_loss(pred, target):
    """Mean L2 distance between ``pred`` and ``target``, measured along the last axis."""
    residual = pred - target
    per_item_distance = residual.norm(p=2, dim=-1)
    return per_item_distance.mean()

In [12]:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_action_fn(batch):
    """
    Collate six-element dataset samples into batched tensors.

    The video features are stacked, the per-sample noun embeddings are
    zero-padded to the longest sample, and the action / caption embeddings are
    stacked into (batch_size, embedding_dim) float32 tensors.

    Returns:
        (temporal_features, object_features, context_features,
         padded_labels, action_embeddings, caption_embeddings)
    """
    temporal_features, object_features, context_features, label_embeddings, action_embeddings, caption_embedding = zip(*batch)

    # Stack the video features along a new batch axis
    temporal_features = torch.stack(temporal_features)  # (batch_size, seq_len, 512)
    object_features = torch.stack(object_features)      # (batch_size, seq_len, 512)
    context_features = torch.stack(context_features)    # (batch_size, seq_len, 512)

    # torch.as_tensor accepts both tensors and NumPy arrays. The previous
    # torch.tensor(...) call re-copied labels that are already tensors and
    # emitted a "copy construct from a tensor" UserWarning for every sample.
    padded_labels = pad_sequence(
        [torch.as_tensor(lbl, dtype=torch.float32) for lbl in label_embeddings],
        batch_first=True,
        padding_value=0,
    )  # (batch_size, max_num_objects_in_batch, feature_dim)

    action_embeddings = torch.stack(
        [torch.as_tensor(a, dtype=torch.float32) for a in action_embeddings]
    )  # (batch_size, embedding_dim)
    caption_embeddings = torch.stack(
        [torch.as_tensor(a, dtype=torch.float32) for a in caption_embedding]
    )  # (batch_size, embedding_dim)

    return temporal_features, object_features, context_features, padded_labels, action_embeddings, caption_embeddings


In [12]:
# Train the action model: predict the caption's verb embedding from the video features.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ActionEncoderDecoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_action_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_action_fn)



from tqdm import tqdm

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

    # collate_action_fn yields SIX tensors per batch; the previous loop unpacked
    # five and then referenced `caption_embed` before it was defined.
    for step, (temporal, object_feats, context, label_emb, action_emb, caption_embed) in enumerate(progress_bar, start=1):
        temporal = temporal.to(device)
        object_feats = object_feats.to(device)
        context = context.to(device)
        action_emb = action_emb.to(device)

        # ActionEncoderDecoder.forward takes (frame features, object representations),
        # so the object representations are computed first with the model's own
        # object encoder-decoder.
        # NOTE(review): the temporal (motion) features are passed as the frame
        # features, matching the first argument of the original call — confirm.
        xi, _ = model.object_encoder_decoder(temporal, object_feats, context)
        pred = model(temporal, xi)

        loss = euclidean_loss(pred, action_emb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Running average over the batches seen so far in this epoch
        progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / step})

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0

    with torch.no_grad():
        val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

        for step, (temporal, object_feats, context, label_emb, action_emb, caption_embed) in enumerate(val_progress_bar, start=1):
            temporal = temporal.to(device)
            object_feats = object_feats.to(device)
            context = context.to(device)
            action_emb = action_emb.to(device)

            xi, _ = model.object_encoder_decoder(temporal, object_feats, context)
            pred = model(temporal, xi)
            loss = euclidean_loss(pred, action_emb)

            val_loss += loss.item()
            val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / step})

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


NameError: name 'collate_action_fn' is not defined

# Train global head

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GlobalHead(nn.Module):
    """Global head: fuse context with context-aware action and object summaries.

    Args:
        embed_dim: Width of the incoming context / action / object features.
        model_dim: Width of the attention projections and of the output. The
            encoder runs on the concatenation [context, Ca, Ce] with
            ``d_model = 3 * model_dim``, so ``embed_dim`` has to equal
            ``model_dim`` for the shapes to line up.
        num_layers: Number of Transformer encoder layers.
        nhead: Number of attention heads; must divide ``3 * model_dim``.
    """

    def __init__(self, embed_dim=1024, model_dim=1024, num_layers=6, nhead=8):
        super(GlobalHead, self).__init__()

        # Projection layers for the attention mechanism
        self.Wg = nn.Linear(embed_dim, model_dim)
        self.Ug = nn.Linear(embed_dim, model_dim)
        self.bg = nn.Parameter(torch.zeros(model_dim))

        # Transformer encoder for the global representation.
        # batch_first=True because forward() feeds (B, N, 3 * model_dim); with
        # the default layout attention would run across the batch axis.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim * 3, nhead=nhead, dim_feedforward=2048, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Final layer predicting the linguistic representation
        self.fc = nn.Linear(model_dim * 3, model_dim)

    def compute_weighted_representation(self, context, features):
        """
        Summarise ``features`` (actions or objects) for every context step.

        Args:
            context: (B, N, embed_dim)
            features: (B, P, embed_dim); P may equal or differ from N.

        Returns:
            Tensor of shape (B, N, embed_dim).
        """
        # Additive scores for every (context step, feature) pair, kept per channel
        beta = torch.tanh(self.Wg(context).unsqueeze(2) + self.Ug(features).unsqueeze(1) + self.bg)  # (B, N, P, D)
        # Normalise over the P features, separately for every channel
        beta = F.softmax(beta, dim=2)  # (B, N, P, D)

        # Channel-wise weighted sum of the features
        weighted_features = torch.einsum("bnpd,bpd->bnd", beta, features)  # (B, N, D)
        return weighted_features

    def forward(self, obj_feat, action_feat, context):
        """
        Args:
            obj_feat: (B, P, embed_dim) object features.
            action_feat: (B, N, embed_dim) action features.
            context: (B, N, embed_dim) context representations.

        Returns:
            Tensor of shape (B, model_dim).
        """
        # Context-aware action representation
        Ca = self.compute_weighted_representation(context, action_feat)

        # Context-aware object representation
        Ce = self.compute_weighted_representation(context, obj_feat)

        # Concatenate everything as input to the Transformer encoder
        fusion_input = torch.cat([context, Ca, Ce], dim=-1)  # (B, N, 3 * D)

        G = self.transformer_encoder(fusion_input)  # (B, N, 3 * D)

        # Max pooling over the N context steps
        g = torch.max(G, dim=1)[0]  # (B, 3 * D)

        # Predict the linguistic feature
        g = self.fc(g)  # (B, model_dim)

        return g


# ===========================
# Smoke test of the global head on a batch
# ===========================

B = 4  # Batch size
P = 10  # number of objects
N = 8  # number of actions / context steps

# Random stand-in inputs
obj_feat = torch.randn(B, P, 1024)  # (B, P, 1024)
action_feat = torch.randn(B, N, 1024)  # (B, N, 1024)
context = torch.randn(B, N, 1024)  # (B, N, 1024)

# Instantiate the global head
global_head = GlobalHead()

# Compute the global representation
g = global_head(obj_feat, action_feat, context)
print("Global linguistic representation shape:", g.shape)  # (B, 1024)


Global linguistic representation shape: torch.Size([4, 1024])


In [13]:
import torch
import torch.nn as nn

class GlobalEncoderDecoder(nn.Module):
    """Global model: object encoder-decoder followed by the global head.

    Args:
        vid_embed_dim, num_object_queries, num_decoder_layers, num_heads, ff_dim,
        decoder_embed_dim: Forwarded to the owned ``ObjectEncoderDecoder``.
        objects_embed_dim: Forwarded to ``ObjectEncoderDecoder`` and used as the
            input width of ``semantic_mapper``.
        semantic_dim: Forwarded to ``ObjectEncoderDecoder`` and used as the
            output width of ``semantic_mapper``.

    The global head and the 512 -> 1024 input projection use fixed sizes.
    ``semantic_mapper`` is created but not applied in ``forward``.
    """

    def __init__(self, vid_embed_dim=1024, objects_embed_dim=1024, num_object_queries=8, num_decoder_layers=6, num_heads=8, ff_dim=2048, decoder_embed_dim=1024, semantic_dim=1024):
        super(GlobalEncoderDecoder, self).__init__()
        # Forward the constructor arguments; previously the object model was
        # always built with its defaults and these arguments were ignored.
        self.object_encoder_decoder = ObjectEncoderDecoder(
            vid_embed_dim=vid_embed_dim,
            objects_embed_dim=objects_embed_dim,
            num_object_queries=num_object_queries,
            num_decoder_layers=num_decoder_layers,
            num_heads=num_heads,
            ff_dim=ff_dim,
            decoder_embed_dim=decoder_embed_dim,
            semantic_dim=semantic_dim,
        )
        self.global_head = GlobalHead()
        self.linear_layer = nn.Linear(512, 1024)

        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """
        Args:
            temporal_features: (B, N, 512) motion features.
            object_features: (B, N, 512) object features.
            context_features: (B, N, 512) context features.

        Returns:
            Tensor of shape (B, 1024): the output of the global head.
        """
        context = context_features.to(torch.float32)
        motion = temporal_features.to(torch.float32)

        # The same projection layer is applied to both feature streams.
        context_projected = self.linear_layer(context)  # (B, N, 1024)
        motion_projected = self.linear_layer(motion)  # (B, N, 1024)

        # The object model casts its inputs to float32 itself.
        xi, _ = self.object_encoder_decoder(temporal_features, object_features, context_features)

        # Arguments: object features, action features, context
        return self.global_head(xi, motion_projected, context_projected)


# =============================
# Model smoke test
batch_size = 4  # number of videos in the batch
embed_dim = 512
num_objects = 8  # P = 8
num_motions = 5  # N = 5
action_dim = 1024  # size of the action semantic space

# Instantiate the model
global_encoder_decoder = GlobalEncoderDecoder()

# Random stand-in inputs
M = torch.randn(batch_size, num_motions, embed_dim)  # Motion (temporal) features
O = torch.randn(batch_size, num_motions, embed_dim)  # Object features
C = torch.randn(batch_size, num_motions, embed_dim)  # Context features

# Compute the global representation (arguments: temporal, object, context)
a = global_encoder_decoder(M, O, C)
print("Shape of action representation:", a.shape)  # (batch_size, action_dim)

Shape of action representation: torch.Size([4, 1024])


In [16]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = GlobalEncoderDecoder().to(device)

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# batch_size = 16
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_action_fn)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_action_fn)


# from tqdm import tqdm

# num_epochs = 10

# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0

#     progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

#     for temporal, object_feats, context, label_emb, action_emb, caption_embed in progress_bar:
#         temporal, object_feats, context, label_emb, action_emb, caption_embed = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device), action_emb.to(device), caption_embed.to(device)

#         pred = model(temporal, object_feats, context)

#         loss = euclidean_loss(pred, caption_embed)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         batch_loss = loss.item()
#         total_loss += batch_loss

#         progress_bar.set_postfix({"Batch Loss": batch_loss, "Avg Loss": total_loss / len(train_loader)})

#     avg_train_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

#     # Validation phase
#     model.eval()
#     val_loss = 0

#     with torch.no_grad():
#         val_progress_bar = tqdm(val_loader, desc="Validating", leave=True)

#         for temporal, object_feats, context, label_emb, action_emb, caption_embed in val_progress_bar:
#             temporal, object_feats, context, label_emb, action_emb, caption_embed = temporal.to(device), object_feats.to(device), context.to(device), label_emb.to(device), action_emb.to(device), caption_embed.to(device)

#             pred = model(temporal, object_feats, context)
#             loss = euclidean_loss(pred, caption_embed)

#             val_loss += loss.item()
#             val_progress_bar.set_postfix({"Batch Loss": loss.item(), "Avg Val Loss": val_loss / len(val_loader)})

#     avg_val_loss = val_loss / len(val_loader)
#     print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import OPTForCausalLM, AutoTokenizer
import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images


# specify the path to the model





class QFormer(nn.Module):
    """Fuse object, action and global video features into LLM-sized tokens.

    The three feature groups are concatenated along the token axis, passed
    through a Transformer encoder, and linearly projected from ``embed_dim``
    to ``output_dim``.

    Args:
        embed_dim: Channel size of every incoming feature token.
        num_queries: Number of learnable query embeddings to allocate.
        output_dim: Channel size of the returned tokens.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder layer's feed-forward net.
    """

    def __init__(self, embed_dim=1024, num_queries=8, output_dim=768,
                 num_heads=8, num_layers=6, dim_feedforward=2048):
        super(QFormer, self).__init__()
        self.num_queries = num_queries
        self.output_dim = output_dim

        # Learnable query embeddings.
        # NOTE(review): forward() never reads this parameter; it is kept so that
        # existing checkpoints (state_dict keys) continue to load.
        self.query_embed = nn.Parameter(torch.randn(1, num_queries, embed_dim))

        # Transformer encoder that contextualises the video tokens.
        # batch_first=True is required because forward() feeds (B, N, E) tensors;
        # with the default (sequence-first) layout attention would run across the
        # batch axis and mix information between different videos.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # Projection into the video-query-token space consumed downstream.
        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, obj_feat, action_feat, global_feat):
        """Encode the concatenated features and project them.

        Args:
            obj_feat: (B, P, embed_dim) per-object features.
            action_feat: (B, embed_dim) action feature, appended as one token.
            global_feat: (B, embed_dim) global feature, appended as one token.

        Returns:
            Tensor of shape (B, P + 2, output_dim).

        Raises:
            ValueError: if ``obj_feat`` is not a 3-D tensor.
        """
        if obj_feat.dim() != 3:
            raise ValueError(
                f"obj_feat must be 3-D (B, P, embed_dim), got shape {tuple(obj_feat.shape)}"
            )

        # Stack everything into a single token sequence of length P + 2.
        video_features = torch.cat(
            [obj_feat, action_feat.unsqueeze(1), global_feat.unsqueeze(1)], dim=1
        )

        # Self-attention over the tokens of each video independently.
        video_embeddings = self.transformer_encoder(video_features)  # (B, P + 2, embed_dim)

        # Map to the output width expected by the language-model side.
        video_query_tokens = self.fc(video_embeddings)  # (B, P + 2, output_dim)

        return video_query_tokens


 

class CaptionHead(nn.Module):
    """Caption generator built from a QFormer and a DeepSeek-VL2 language model.

    ``forward`` runs the QFormer on the video features, builds three fixed text
    prompts per batch item, and asks the language model to complete each one.

    NOTE(review): the QFormer output is currently only used to read the batch
    size; the tokens themselves are never handed to the language model, and the
    processor is given a random ``video`` tensor instead. Confirm whether the
    processor consumes that keyword at all and wire the real tokens in.

    Args:
        llm_model: Hugging Face model id / path of the DeepSeek-VL2 checkpoint.
    """

    def __init__(self, llm_model="deepseek-ai/deepseek-vl2-tiny"):
        super(CaptionHead, self).__init__()
        self.qformer = QFormer()
        self.vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(llm_model)
        self.tokenizer = self.vl_chat_processor.tokenizer

        self.vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(llm_model, trust_remote_code=True)
        # Use the GPU when there is one, but do not crash on CPU-only hosts
        # (an unconditional .cuda() raises when CUDA is unavailable).
        llm_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.vl_gpt = self.vl_gpt.to(torch.bfloat16).to(llm_device).eval()

    def generate_prompt(self, video_tokens):
        """Build the text prompts for every item in the batch.

        Args:
            video_tokens: tensor whose first dimension is the batch size B;
                only ``shape[0]`` is read.

        Returns:
            A list of B lists, each holding the same three prompt strings
            (subject, initial state, final state).
        """
        batch_size = video_tokens.shape[0]
        Xa_options = ["subject", "initial state", "final state"]
        Xb_options = ["The subject", "state initial change", "state final change"]

        # A fresh list per batch item so callers can mutate one without
        # affecting the others.
        return [
            [f"Video: <video>. This video tells the {Xa}. {Xb} is: "
             for Xa, Xb in zip(Xa_options, Xb_options)]
            for _ in range(batch_size)
        ]  # (B, num_prompts)

    def _generate_caption(self, prompt, max_new_tokens):
        """Run one prompt through the language model and return the decoded text."""
        conversation = [
            {
                "role": "<|User|>",
                "content": prompt,
            },
            {"role": "<|Assistant|>", "content": ""},
        ]

        # Collect any images referenced by the conversation for the processor.
        pil_images = load_pil_images(conversation)

        # The LLM is frozen (eval mode) and only text is returned, so no
        # autograd graph is needed for the embedding/generation steps.
        with torch.no_grad():
            # NOTE(review): `video` is a random placeholder, not the QFormer
            # output — verify what the processor does with this keyword.
            prepare_inputs = self.vl_chat_processor(
                conversations=conversation,
                images=pil_images,
                video = torch.rand(4, 1280, dtype=torch.bfloat16),
                force_batchify=True,
                system_prompt=""
            ).to(self.vl_gpt.device)

            # Turn the processed inputs into input embeddings for the LLM.
            inputs_embeds = self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

            # Greedy decoding, bounded by the caller-supplied token budget.
            outputs = self.vl_gpt.language.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=prepare_inputs.attention_mask,
                pad_token_id=self.tokenizer.eos_token_id,
                bos_token_id=self.tokenizer.bos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                use_cache=True
            )

        return self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)

    def forward(self, obj_feat, action_feat, global_feat, max_length=50):
        """Generate captions for a batch of video features.

        Args:
            obj_feat, action_feat, global_feat: inputs forwarded to the QFormer.
            max_length: maximum number of new tokens generated per prompt.
                (Previously this argument was ignored and 512 was hard-coded.)

        Returns:
            A list of B lists; each inner list holds one decoded string per
            prompt returned by ``generate_prompt``.
        """
        video_tokens = self.qformer(obj_feat, action_feat, global_feat)  # (B, N, 768)
        batch_prompts = self.generate_prompt(video_tokens)  # (B, num_prompts)

        captions = [
            [self._generate_caption(prompt, max_length) for prompt in prompts]
            for prompts in batch_prompts
        ]
        return captions  # (B, num_prompts)


 

Python version is above 3.10, patching the collections module.


In [None]:
import torch
import torch.nn as nn

class VidCapModel(nn.Module):
    """Top-level video captioning model wiring the individual heads together.

    ``forward`` projects the context and temporal features with one shared
    linear layer, runs the object head on the raw inputs, feeds its first
    output to the action and global heads, and finally passes the three head
    outputs to the caption head.

    NOTE(review): ``video_content_encoder`` and ``semantic_mapper`` are created
    here but are not used by ``forward``.

    Args:
        objects_embed_dim: first positional argument given to ``SemanticMapper``.
        semantic_dim: ``semantic_dim`` keyword given to ``SemanticMapper``.
    """

    def __init__(self, objects_embed_dim=1024, semantic_dim=1024):
        super(VidCapModel, self).__init__()
        self.video_content_encoder = VideoFeatureExtractor()
        self.object_head = ObjectEncoderDecoder()
        self.global_head = GlobalHead()
        self.action_head = ActionHead()
        self.caption_head = CaptionHead()
        # Shared 512 -> 1024 projection applied to context and temporal features.
        self.linear_layer = nn.Linear(512, 1024)

        self.semantic_mapper = SemanticMapper(objects_embed_dim, semantic_dim=semantic_dim)

    def forward(self, temporal_features, object_features, context_features):
        """Run every head and return their outputs.

        Args:
            temporal_features: motion/temporal features; last dim must be 512
                to match ``linear_layer``.
            object_features: object features, passed unprojected to the object head.
            context_features: context features; last dim must be 512.

        Returns:
            Tuple ``(object_head_output, action_head_output,
            global_head_output, caption_head_output)``.
        """
        context_proj = self.linear_layer(context_features)
        motion_proj = self.linear_layer(temporal_features)

        # Object head works on the raw (unprojected) inputs and returns a pair;
        # its first element is reused by the action and global heads.
        # (Author's note: object head output is (B, P, dim) — not verifiable here.)
        xi, object_head_output = self.object_head(
            temporal_features, object_features, context_features
        )

        # Action head output — (B, dim) per the author's note.
        action_head_output = self.action_head(motion_proj, xi)

        # Global head output — (B, dim) per the author's note.
        global_head_output = self.global_head(xi, motion_proj, context_proj)

        caption_head_output = self.caption_head(
            object_head_output, action_head_output, global_head_output
        )

        return object_head_output, action_head_output, global_head_output, caption_head_output


# =============================
# Smoke test: build the full model and run one forward pass on random inputs.
batch_size = 4  # Number of videos in the batch
embed_dim = 512  # Must match the input width of VidCapModel.linear_layer (512)
num_objects = 8  # P = 8 (NOTE(review): not used below)
num_motions = 5  # N = 5
action_dim = 1024  # Action semantic space size (NOTE(review): not used below)

# Instantiate the model
video_model = VidCapModel()

# Simulated inputs; all three share the shape (batch_size, num_motions, embed_dim)
M = torch.randn(batch_size, num_motions, embed_dim)  # Temporal / motion features
O = torch.randn(batch_size, num_motions, embed_dim)  # Object features (2nd forward argument)
C = torch.randn(batch_size, num_motions, embed_dim)  # Context features (3rd forward argument)

# Run the forward pass and report each head's output
object_head_output, action_head_output, global_head_output, caption_head_output  = video_model(M, O, C)
print("Shape of object_head_output representation:",  object_head_output.shape)  # presumably (batch_size, P, dim) — TODO confirm
print("Shape of action_head :",  action_head_output.shape)  # presumably (batch_size, action_dim) — TODO confirm
print("Shape of global_head_output representation:",  global_head_output.shape)  # presumably (batch_size, action_dim) — TODO confirm
print("caption_head_output", caption_head_output)
