In [1]:
import numpy as np
import os
import torch
import timm
import torch.nn as nn
from transformers import Wav2Vec2Processor, HubertModel
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import time
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
! nvidia-smi

Thu Apr 25 23:34:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     Off | 00000000:81:00.0 Off |                    0 |
|  0%   47C    P0              74W / 300W |      7MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
device

'cuda:0'

In [4]:
# pwd

In [5]:
# ! ls /project/msoleyma_1026/Aff-Wild2/

In [6]:
# audio_feature = torch.load("/project/msoleyma_1026/Aff-Wild2/audio_feature/103-30-384x480.pt", map_location=torch.device(device)).to(device)
# video_feature = torch.tensor(np.load("/project/msoleyma_1026/Aff-Wild2/video_feature/103-30-384x480.npy")).to(device)

In [7]:
def seq_feature_generation(video_feature, audio_feature, seq_len, pooling = "mean"):
    #video_feature : (771, 1, 197, 768)
    #audio_feature : [1, 773, 1024]
    video_feature = torch.tensor(video_feature, dtype=torch.float32).to(device)
    audio_feature = audio_feature.to(dtype=torch.float32)

    video_feature = video_feature.permute(1,0,2,3)
    
    if pooling == "mean":
        video_feature = torch.mean(video_feature, dim = 2, keepdim=False)
    elif pooling == "max":
        video_feature = torch.max(video_feature, dim = 2, keepdim=False)[0]

    max_seq = min(video_feature.shape[1], audio_feature.shape[1])
    video_feature = video_feature[:, :max_seq, :]
    audio_feature = audio_feature[:, :max_seq, :]
    combined_feature = torch.cat([video_feature, audio_feature], dim = -1)
    #[1, max_seq, 1024 + 768]
    
    if max_seq < seq_len:
        # Pad both features to seq_len along the sequence dimension
        combined_sequences = F.pad(combined_feature, (0, 0, 0, seq_len - max_seq))
    else:
        num_complete_seqs = max_seq // seq_len
        combined_sequences = combined_feature[:,:num_complete_seqs*seq_len, :].view(-1, seq_len, combined_feature.shape[-1])
    #[-1, seq_len, combined_feature_size]
    return combined_sequences

In [8]:
# sequence = seq_feature_generation(video_feature, audio_feature, seq_len = 10)

In [9]:
# sequence.shape

In [10]:
class ViTHuBERTTransformer(nn.Module):
    def __init__(self, vit_base_model,
                 hubert_base_model,
                 num_classes,
                 nhead,
                 num_layers,
                small_dataset = True):
        super().__init__()

        self.vit = timm.create_model(vit_base_model, pretrained=True)

        #self.processor = Wav2Vec2Processor.from_pretrained(hubert_base_model)
        self.hubert = HubertModel.from_pretrained(hubert_base_model)

        if small_dataset:
            for param in self.vit.parameters():
                param.requires_grad = False
        
            for param in self.hubert.parameters():
                param.requires_grad = False
            

        encoder_layer = nn.TransformerEncoderLayer(d_model = self.vit.num_features + self.hubert.config.hidden_size,
                                                  nhead = nhead,
                                                  dim_feedforward = (self.vit.num_features + self.hubert.config.hidden_size)//2,
                                                  batch_first = True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)

        # Classifier
        self.classifier = nn.Linear(self.vit.num_features + self.hubert.config.hidden_size, num_classes)
    def forward(self, video_feature_raw, audio_feature_raw):

        vit_feature = self.vit.forward_features(video_feature_raw)
        audio_feature = self.hubert(audio_feature_raw).last_hidden_state
        
        # Combine features
        combined_features = torch.cat((vit_feature, audio_feature), dim=1)

        transformer_output = self.transformer_encoder(combined_features)

        logits = self.classifier(transformer_output.squeeze(1))
        return logits

In [11]:
class ViTHuBERTTransformer_prepossed(nn.Module):
    def __init__(self, vit_base_model,
                 hubert_base_model,
                 num_classes,
                 nhead,
                 num_layers,
                small_dataset = True):
        super().__init__()

        self.vit = timm.create_model(vit_base_model, pretrained=True)

        #self.processor = Wav2Vec2Processor.from_pretrained(hubert_base_model)
        self.hubert = HubertModel.from_pretrained(hubert_base_model)

        if small_dataset:
            for param in self.vit.parameters():
                param.requires_grad = False
        
            for param in self.hubert.parameters():
                param.requires_grad = False
            
        print(self.vit.num_features + self.hubert.config.hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model = self.vit.num_features + self.hubert.config.hidden_size,
                                                  nhead = nhead,
                                                  dim_feedforward = (self.vit.num_features + self.hubert.config.hidden_size)//2,
                                                  batch_first = True)
        # print(encoder_layer.head_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)

        # Classifier
        self.classifier = nn.Linear(self.vit.num_features + self.hubert.config.hidden_size, num_classes)
        self._init_transformer_weights()

    def _init_transformer_weights(self):
        # Initialize only the transformer encoder layers
        for layer in self.transformer_encoder.layers:
            for module in layer.modules():
                if isinstance(module, nn.Linear):
                    nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.constant_(module.bias, 0)
                # Optionally initialize other components like multi-head attention within the transformer layer
                # Depending on the PyTorch implementation, you may need to access sub-components directly

    def forward(self, combined_feature):

        transformer_output = self.transformer_encoder(combined_feature)

        logits = self.classifier(transformer_output.squeeze(1))
        return logits

In [12]:
model = ViTHuBERTTransformer_prepossed(
    vit_base_model = 'vit_base_patch16_224',
    hubert_base_model = "facebook/hubert-large-ls960-ft",
    num_classes = 12,
    nhead = 16,
    num_layers = 18,
    small_dataset = True
)

1792


In [13]:
model.to(device)

ViTHuBERTTransformer_prepossed(
  (vit): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(app

In [14]:
# for param in model.parameters():
#     print(param.requires_grad)

In [15]:
def evaluate(model, dataloader, loss_fn, device):
    model.eval()
    loss_cumulative = 0.
    eval_labels = []
    eval_preds = []
    sub_batch_count = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluate"):
            for inputs, labels in batch:
                inputs, labels = inputs.squeeze().to(device), labels.squeeze().to(device)
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                loss_cumulative += loss.item()
                # print(loss_cumulative)
                preds = torch.sigmoid(outputs) if not hasattr(loss_fn, 'activation') else outputs
                preds = (preds > 0.5).int()
                eval_labels.append(labels.cpu().numpy().reshape(-1,12))
                eval_preds.append(preds.cpu().numpy().reshape(-1,12))
                sub_batch_count+=1
    eval_labels = np.concatenate(eval_labels, axis=0)
    eval_preds = np.concatenate(eval_preds, axis=0)
    # print(eval_labels.shape, eval_preds.shape)
    eval_f1_score = f1_score(eval_labels, eval_preds, average='macro')
    eval_recall_score = recall_score(eval_labels, eval_preds, average='macro')
    eval_precision_score = precision_score(eval_labels, eval_preds, average='macro')

    return loss_cumulative / sub_batch_count, eval_f1_score, eval_recall_score, eval_precision_score


In [16]:
def loglinspace(rate, step, end=None):
    t = 0
    while end is None or t <= end:
        yield t
        t = int(t + 1 + step * (1 - math.exp(-t * rate / step)))

In [17]:
def train(model, optimizer, dataloader_train, dataloader_valid, loss_fn,
             max_iter=101, scheduler=None, device="cpu"):
    model.to(device = device, dtype=torch.float32)
    print(device)
    checkpoint_generator = loglinspace(0.3, 5)
    checkpoint = next(checkpoint_generator)
    start_time = time.time()
    run_name = "vithubertformer"
    try:
        model.load_state_dict(torch.load(run_name + '.torch')['state'])
    except:
        results = {}
        history = []
        s0 = 0
    else:
        print('read torch history')
        results = torch.load(run_name + '.torch')
        history = results['history']
        s0 = history[-1]['step'] + 1
        
    
    for step in range(max_iter):
        if step == 0:
            for i in range(1):
                valid_avg_loss, val_f1_score, val_recall_score, val_precision_score= evaluate(model, dataloader_valid, loss_fn, device)
                print(f'{i}',f'valid_avg_loss before training: {valid_avg_loss:8.4f}', f'val_f1_score before training: , {val_f1_score:8.4f}')
                print(f'{i}',f'val_recall_score before training: {val_recall_score:8.4f}', f'val_precision_score before training: , {val_precision_score:8.4f}')

        
        model.train()
        loss_cumulative = 0.

        for batch in tqdm(dataloader_train, desc="Training"):
            for inputs, labels in batch:
                inputs, labels = inputs.squeeze().to(device), labels.squeeze().to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()
                loss_cumulative += loss.item()
    
            wall = time.time() - start_time
        if step == checkpoint:
            checkpoint = next(checkpoint_generator)
            assert checkpoint > step

            valid_avg_loss, val_f1_score, val_recall_score, val_precision_score = evaluate(model, dataloader_valid, loss_fn, device)

            history.append({
                'step': s0 + step,
                'wall': wall,
                'batch': {
                    'loss': loss.item(),
                },
                'valid': {
                    'loss': valid_avg_loss,
                },
            })

            results = {
                'history': history,
                'state': model.state_dict()
            }

            print(f"epoch {step + 1:4d}   " +
                  f"valid loss mse= {valid_avg_loss:8.4f}   " +
                  f"wall = {time.strftime('%H:%M:%S', time.gmtime(wall))}  " +
                  f"val_f1_score = {val_f1_score:8.4f}  \n" +
                  f"val_recall_score = {val_recall_score:8.4f}  " +
                  f"val_precision_score = {val_precision_score:8.4f}  "
                 )
            print('Training loss: ',loss_cumulative)

            with open(run_name + '.torch', 'wb') as f:
                torch.save(results, f)

        if scheduler is not None:
            scheduler.step()
            print_learning_rates(optimizer)

In [18]:
class AudioVideoDataset(Dataset):
    def __init__(self, video_dir, audio_dir, label_dir):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.label_dir = label_dir

        # Collect all label files, and construct corresponding video and audio file paths
        self.entries = []
        for label_file in sorted(os.listdir(label_dir)):
            if label_file.endswith('.txt'):
                base_name = os.path.splitext(label_file)[0]
                video_file = os.path.join(video_dir, f"{base_name}.npy")
                audio_file = os.path.join(audio_dir, f"{base_name}.pt")
                label_file_path = os.path.join(label_dir, label_file)
                
                # Add entry only if corresponding video and audio files exist
                if os.path.exists(video_file) and os.path.exists(audio_file):
                    self.entries.append((video_file, audio_file, label_file_path))
                else:
                    print(f"Missing video or audio file for {label_file}")

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        video_file, audio_file, label_file = self.entries[idx]
        video_feature = np.load(video_file)
        audio_feature = torch.load(audio_file)
        labels = np.loadtxt(label_file, skiprows=1, delimiter=',')


        seq_len = 10  # Define the desired sequence length
        min_len = min(len(labels), video_feature.shape[0], audio_feature.shape[1])

        # Further truncate data to the minimum length across modalities
        labels = labels[:min_len, :]
        video_feature = video_feature[:min_len, :, :, :]
        audio_feature = audio_feature[:, :min_len, :]

        # Find indices where all labels are binary
        binary_indices = np.all(np.isin(labels, [0, 1]), axis=1)
        # Filter out non-binary frames
        labels = labels[binary_indices, :]
        video_feature = video_feature[binary_indices, :, :, :]
        audio_feature = audio_feature[:, binary_indices, :]  # Adjust this if necessary
        
        combined_features = seq_feature_generation(video_feature, audio_feature, seq_len)  # Adjust device as needed

        label_sequences = labels[:combined_features.shape[0] * seq_len].reshape(-1, seq_len, 12)

        return combined_features, label_sequences


In [19]:
class AudioVideoDataset2(Dataset):
    def __init__(self, video_dir, audio_dir, label_dir, device='cpu'):
        print(device)
        self.device = device
        self.entries = []
        
        # Load all data into memory
        for label_file in sorted(os.listdir(label_dir)):
            if label_file.endswith('.txt'):
                base_name = os.path.splitext(label_file)[0]
                video_file = os.path.join(video_dir, f"{base_name}.pt")
                audio_file = os.path.join(audio_dir, f"{base_name}.pt")
                label_file_path = os.path.join(label_dir, label_file)
                
                if os.path.exists(video_file) and os.path.exists(audio_file):
                    # Load features and labels
                    video_feature = torch.load(video_file).permute(1,0,2)
                    audio_feature = torch.load(audio_file)
                    labels = np.loadtxt(label_file_path, skiprows=1, delimiter=',')

                    min_len = min(len(labels), video_feature.shape[1], audio_feature.shape[1])
                    labels = labels[:min_len, :]
                    # print(video_feature.shape, audio_feature.shape)
                    video_feature = video_feature[:, :min_len, :]
                    audio_feature = audio_feature[:, :min_len, :]

                    binary_indices = np.all(np.isin(labels, [0, 1]), axis=1)
                    # Filter out non-binary frames
                    labels = labels[binary_indices, :]
                    video_feature = video_feature[:, binary_indices, :]
                    audio_feature = audio_feature[:, binary_indices, :]
                    
                    # Move data to the specified device
                    video_feature = video_feature.to(self.device)
                    audio_feature = audio_feature.to(self.device)
                    labels = torch.tensor(labels, dtype=torch.float32).to(self.device)

                    self.entries.append((video_feature, audio_feature, labels))
                else:
                    print(f"Missing video or audio file for {base_name}")

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        video_feature, audio_feature, labels = self.entries[idx]

        seq_len = 10  # Desired sequence length for analysis
        sub_batch_size = 128  # Desired number of sequences per sub-batch

        combined_feature = torch.cat([video_feature, audio_feature], dim=-1)
        total_seqs = labels.shape[0] // seq_len * seq_len  # Discard incomplete final sequence

        # Collect full sub-batches only
        smaller_batches = []
        for start_idx in range(0, total_seqs, sub_batch_size * seq_len):
            end_idx = start_idx + sub_batch_size * seq_len
            if end_idx > total_seqs:
                break  # Discard last smaller batch if it doesn't fill the complete sub_batch_size
            batch_features = combined_feature[:, start_idx:end_idx, :].view(-1, seq_len, combined_feature.shape[-1])
            batch_labels = labels[start_idx:end_idx, :].view(-1, seq_len, 12)
            smaller_batches.append((batch_features, batch_labels))

        return smaller_batches

In [20]:
class AudioVideoDataset2(Dataset):
    def __init__(self, video_dir, audio_dir, label_dir, device='cpu'):
        print(device)
        self.device = device
        self.entries = []
        
        # Load all data into memory
        for label_file in sorted(os.listdir(label_dir)):
            if label_file.endswith('.txt'):
                base_name = os.path.splitext(label_file)[0]
                video_file = os.path.join(video_dir, f"{base_name}.pt")
                audio_file = os.path.join(audio_dir, f"{base_name}.pt")
                label_file_path = os.path.join(label_dir, label_file)
                
                if os.path.exists(video_file) and os.path.exists(audio_file):
                    # Load features and labels
                    video_feature = torch.load(video_file).permute(1,0,2)
                    audio_feature = torch.load(audio_file)
                    labels = np.loadtxt(label_file_path, skiprows=1, delimiter=',')

                    min_len = min(len(labels), video_feature.shape[1], audio_feature.shape[1])
                    labels = labels[:min_len, :]
                    # print(video_feature.shape, audio_feature.shape)
                    video_feature = video_feature[:, :min_len, :]
                    audio_feature = audio_feature[:, :min_len, :]

                    binary_indices = np.all(np.isin(labels, [0, 1]), axis=1)
                    # Filter out non-binary frames
                    labels = labels[binary_indices, :]
                    video_feature = video_feature[:, binary_indices, :]
                    audio_feature = audio_feature[:, binary_indices, :]
                    
                    # Move data to the specified device
                    video_feature = video_feature.to(self.device)
                    audio_feature = audio_feature.to(self.device)
                    labels = torch.tensor(labels, dtype=torch.float32).to(self.device)

                    self.entries.append((video_feature, audio_feature, labels))
                else:
                    print(f"Missing video or audio file for {base_name}")

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        video_feature, audio_feature, labels = self.entries[idx]

        seq_len = 10  # Desired sequence length for analysis
        sub_batch_size = 128  # Desired number of sequences per sub-batch

        combined_feature = torch.cat([video_feature, audio_feature], dim=-1)
        total_seqs = labels.shape[0] // seq_len * seq_len  # Discard incomplete final sequence

        # Collect full sub-batches only
        smaller_batches = []
        for start_idx in range(0, total_seqs, sub_batch_size * seq_len):
            end_idx = start_idx + sub_batch_size * seq_len
            if end_idx > total_seqs:
                break  # Discard last smaller batch if it doesn't fill the complete sub_batch_size
            batch_features = combined_feature[:, start_idx:end_idx, :].view(-1, seq_len, combined_feature.shape[-1])
            batch_labels = labels[start_idx:end_idx, :].view(-1, seq_len, 12)
            smaller_batches.append((batch_features, batch_labels))

        return smaller_batches

In [132]:
class AudioVideoDataset3(Dataset):
    def __init__(self, video_dir, audio_dir, label_dir, device):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.label_dir = label_dir
        self.entries = []

        # Collect all the filenames without loading data into memory
        for label_file in sorted(os.listdir(label_dir)):
            if label_file.endswith('.txt'):
                base_name = os.path.splitext(label_file)[0]
                video_file = os.path.join(video_dir, f"{base_name}.pt")
                audio_file = os.path.join(audio_dir, f"{base_name}.pt")
                label_file_path = os.path.join(label_dir, label_file)
                
                if os.path.exists(video_file) and os.path.exists(audio_file):
                    self.entries.append((video_file, audio_file, label_file_path))
                else:
                    print(f"Missing video or audio file for {base_name}")

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        video_file, audio_file, label_file = self.entries[idx]

        # Load features and labels
        video_feature = torch.load(video_file).permute(1, 0, 2)
        audio_feature = torch.load(audio_file)
        labels = np.loadtxt(label_file, skiprows=1, delimiter=',')
        labels = torch.tensor(labels)

        # Ensure consistent lengths across modalities
        min_len = min(len(labels), video_feature.shape[1], audio_feature.shape[1])
        labels = labels[:min_len, :]
        video_feature = video_feature[:, :min_len, :]
        audio_feature = audio_feature[:, :min_len, :]

        # Filter non-binary frames
        binary_indices = np.all(np.isin(labels, [0, 1]), axis=1)
        labels = labels[binary_indices, :]
        video_feature = video_feature[:, binary_indices, :]
        audio_feature = audio_feature[:, binary_indices, :]

        # Define the desired sequence length and sub-batch size
        seq_len = 10
        sub_batch_size = 128

        # Concatenate and prepare batches
        combined_feature = torch.cat([video_feature.to(device), audio_feature.to(device)], dim=-1)
        total_seqs = len(labels) // seq_len * seq_len

        smaller_batches = []
        for start_idx in range(0, total_seqs, sub_batch_size * seq_len):
            end_idx = start_idx + sub_batch_size * seq_len
            if end_idx > total_seqs:
                break  # Discard last smaller batch if incomplete
            # print(start_idx, end_idx, labels.shape)
            batch_features = combined_feature[:, start_idx:end_idx, :].view(-1, seq_len, combined_feature.shape[-1])
            batch_labels = labels[start_idx:end_idx, :].view(-1, seq_len, 12)
            smaller_batches.append((batch_features, batch_labels))

        return smaller_batches



In [133]:
device

'cuda:0'

In [134]:
# train_label_dir = '/project/msoleyma_1026/Aff-Wild2/test/test1/Train_Set/label_30'
train_label_dir = '/project/msoleyma_1026/Aff-Wild2/labels/AU_Detection_Challenge/Train_Set'
audio_feature_dir = '/project/msoleyma_1026/Aff-Wild2/features4'
video_feature_dir = '/project/msoleyma_1026/Aff-Wild2/video_feature_pooled'
dataset_train = AudioVideoDataset3(video_feature_dir, audio_feature_dir, train_label_dir, device)
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)

Missing video or audio file for 10-60-1280x720_right
Missing video or audio file for 135-24-1920x1080_left
Missing video or audio file for 135-24-1920x1080_right
Missing video or audio file for 46-30-484x360_left
Missing video or audio file for 46-30-484x360_right


In [135]:
dataset_train.__len__()

290

In [136]:
768+1024

1792

In [137]:
# ! ls /project/msoleyma_1026/Aff-Wild2/test/Validation_Set/label_10

In [138]:
# ! ls /project/msoleyma_1026/Aff-Wild2/test/Validation_Set/video_feature_10

In [139]:
# ! ls /project/msoleyma_1026/Aff-Wild2/test/Validation_Set/audio_feature_10

In [140]:
# val_label_dir = '/project/msoleyma_1026/Aff-Wild2/test/test1/Validation_Set/label_10'
val_label_dir = '/project/msoleyma_1026/Aff-Wild2/labels/AU_Detection_Challenge/Validation_Set'
val_video_feature_dir = "/project/msoleyma_1026/Aff-Wild2/video_feature_pooled"
val_audio_feature_dir='/project/msoleyma_1026/Aff-Wild2/features4'
dataset_val = AudioVideoDataset3(val_video_feature_dir, val_audio_feature_dir, val_label_dir, device)
dataloader_val = DataLoader(dataset_val, batch_size=1, shuffle=False)


In [141]:
dataloader_val.__len__()

105

In [142]:
# a = torch.randint(0, 10, (1287, 10))

In [143]:
# print(a.shape)

In [144]:
# a[0:1280, :].view(-1, 128, 10)

In [145]:
# # Extract one batch from the dataloader
# count=0
# for batch in dataloader_val:
#     for features, labels in batch:
#         print(count, "Features shape:", features.shape, "Labels shape:", labels.shape)
#         # print("Labels shape:", labels.shape)
#         # print("Features example (one batch):", features)
#         # print("Labels example (one batch):", labels)
#         # count+=1
#         break  # Break after the first batch to only print one batch of data


In [146]:
! rm vithubertformer.torch

rm: cannot remove 'vithubertformer.torch': No such file or directory


In [147]:
def print_learning_rates(optimizer):
    for idx, param_group in enumerate(optimizer.param_groups):
        print(f"Learning rate for parameter group {idx}: {param_group['lr']}")


In [None]:
loss_function = torch.nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=0.96)
train(model, opt, dataloader_train, dataloader_val, loss_function,
             max_iter=10, scheduler=scheduler, device=device)


cuda:0


Evaluate: 100%|██████████| 105/105 [00:18<00:00,  5.64it/s]


0 valid_avg_loss before training:   4.6662 val_f1_score before training: ,   0.1551
0 val_recall_score before training:   0.3486 val_precision_score before training: ,   0.1841


Training:  19%|█▉        | 55/290 [00:44<03:24,  1.15it/s]

In [None]:
# label_dir = '/project/msoleyma_1026/Aff-Wild2/labels/AU_Detection_Challenge/Train_Set'
# audio_feature_dir = '/project/msoleyma_1026/Aff-Wild2/features4'
# video_feature_dir = '/project/msoleyma_1026/Aff-Wild2/video_feature'

In [None]:
# label_names = [f for f in os.listdir(label_dir) if os.path.isfile(os.path.join(label_dir, f))]
# audio_names = [f for f in os.listdir(audio_feature_dir) if os.path.isfile(os.path.join(audio_feature_dir, f))]
# video_names = [f for f in os.listdir(video_feature_dir) if os.path.isfile(os.path.join(video_feature_dir, f))]

In [None]:
# label_names

In [None]:
# audio_names

In [None]:
# share_names = []
# for name in label_names:
#     if os.path.basename(name).replace('.txt', '.npy') in video_names:
#         share_names.append(name)
#     # if os.path.basename(name).replace('.txt', '.pt') in audio_names:
#     #     share_names.append(name)

    

In [None]:
# share_names

In [None]:
# len(share_names)

In [None]:
# len(label_names)