# preparation

## imports

In [1]:
# environment imports
import sys, os, shutil, glob, random
import warnings
warnings.filterwarnings('ignore')

# visualization imports
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('classic')

import seaborn as sns
sns.set_style("whitegrid")

In [2]:
import numpy as np
import math, timm, time
import pandas as pd

# PyTorch import block
import torch
import torch.nn as nn
import torch.nn.functional as F
from   torch.utils.data import Dataset, DataLoader, TensorDataset
from   torch.utils.data.dataloader import default_collate
from   torch.optim.optimizer import Optimizer
from   torch.nn import TransformerEncoderLayer

# Torchvision import block
import torchvision
import torchvision.transforms as transforms

# einops import block
import einops
from   einops import rearrange, repeat
from   einops.layers.torch import Rearrange

from tqdm.notebook import tqdm
from scipy import stats

from typing import Optional
import copy

In [3]:
# Report the installed versions of torch, timm and torchvision for reproducibility.
print(f'current packages version:\n* torch: {torch.__version__};' \
      + f'\n* timm: {timm.__version__};\n* torchvision: {torchvision.__version__}')

current packages version:
* torch: 2.2.1;
* timm: 0.6.12;
* torchvision: 0.17.1


## variables

In [4]:
# Root locations used throughout the notebook.
DATA_DIR = './data'
LOCAL_DIR = './'
FE_MODELS_DIR = './models/feature_extractors'

# Raw media files, one directory per split and modality.
TRAIN_MP4_DIR = f'{DATA_DIR}/train/mp4'
TRAIN_WAV_DIR = f'{DATA_DIR}/train/wav'
VALID_MP4_DIR = f'{DATA_DIR}/val/mp4'
VALID_WAV_DIR = f'{DATA_DIR}/val/wav'

# Pre-extracted feature directories, one per split and modality.
TRAIN_MP4_FEATURES_DIR = f'{DATA_DIR}/train/mp4_features'
TRAIN_WAV_FEATURES_DIR = f'{DATA_DIR}/train/wav_features'
VALID_MP4_FEATURES_DIR = f'{DATA_DIR}/val/mp4_features'
VALID_WAV_FEATURES_DIR = f'{DATA_DIR}/val/wav_features'

In [5]:
# Directory that the training cell scans for existing checkpoints and passes to the evaluation step for saving.
ABAW5_MODELS_CHECKPOINTS = './models/abaw_checkpoints'

In [6]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Current connected device is {DEVICE}')

Current connected device is cuda


## classes

In [7]:
def sinusoidal_embedding(n_channels: int=None, dim: int=None):
    '''
    Build a fixed sinusoidal positional-encoding table.

            Parameters:
                n_channels (int): Number of positions (rows of the table) to encode;
                dim (int): Size of the encoding vector produced for every position.
            Returns:
                torch.Tensor (float32) of shape (1, n_channels, dim); even columns hold sines,
                odd columns hold cosines.
            Raises:
                TypeError: If n_channels or dim is not provided.
    '''
    if n_channels is None or dim is None:
        raise TypeError('sinusoidal_embedding requires both n_channels and dim')

    # The per-column denominators do not depend on the position, so compute them once
    # instead of once per row.
    denominators = [10000 ** (2 * (i // 2) / dim) for i in range(dim)]
    angles = [[p / denominator for denominator in denominators] for p in range(n_channels)]

    # reshape keeps the table two-dimensional even when n_channels == 0
    # (an empty list would otherwise become a 1-D tensor and break the slicing below).
    pe = torch.tensor(angles, dtype=torch.float32).reshape(n_channels, dim)
    pe[:, 0::2] = torch.sin(pe[:, 0::2])
    pe[:, 1::2] = torch.cos(pe[:, 1::2])

    # Add the leading batch axis.
    return pe.unsqueeze(0)

class PreNorm(nn.Module):
    """Wrapper that applies LayerNorm to its input before delegating to the wrapped callable"""
    def __init__(self, dim: int=None, fn: nn.Sequential=None):
        super(PreNorm, self).__init__()
        # Registration order (norm, then fn) determines the state_dict key order.
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x: torch.Tensor=None, **kwargs):
        normalized = self.norm(x)
        # Extra keyword arguments are forwarded untouched to the wrapped callable.
        return self.fn(normalized, **kwargs)
    
class FeedForward(nn.Module):
    """Position-wise MLP (Linear -> GELU -> Dropout -> Linear -> Dropout) used by the Transformer class"""
    def __init__(self, dim: int=None, hidden_dim: int=None, dropout: float=0.):
        super(FeedForward, self).__init__()
        # Create the two projections in the same order as they appear in the stack
        # so parameter initialisation consumes the RNG identically.
        expand = nn.Linear(dim, hidden_dim)
        activation = nn.GELU()
        inner_dropout = nn.Dropout(dropout)
        project = nn.Linear(hidden_dim, dim)
        outer_dropout = nn.Dropout(dropout)
        self.net = nn.Sequential(expand, activation, inner_dropout, project, outer_dropout)

    def forward(self, x: torch.Tensor=None):
        output = self.net(x)
        return output
    
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention"""
    def __init__(self, dim: int=None, heads: int=8, dim_head: int=64, dropout: float=0.):
        super(Attention, self).__init__()
        inner_dim = heads * dim_head
        # A single head whose width already equals `dim` needs no output projection.
        needs_projection = heads != 1 or dim_head != dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One bias-free linear layer produces queries, keys and values side by side.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        if needs_projection:
            self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
        else:
            self.to_out = nn.Identity()

    def forward(self, x: torch.Tensor=None):
        # Split the fused projection into three parts and move the head axis forward.
        q, k, v = (
            rearrange(part, 'b n (h d) -> b h n d', h = self.heads)
            for part in self.to_qkv(x).chunk(3, dim = -1)
        )

        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.dropout(self.attend(scores))

        # Weighted sum of the values, then merge the heads back into one feature axis.
        merged = rearrange(torch.matmul(weights, v), 'b h n d -> b n (h d)')

        return self.to_out(merged)
    
class Transformer(nn.Module):
    """Stack of pre-norm attention and feed-forward sub-layers, each wrapped in a residual connection"""
    def __init__(
        self, dim: int=None, depth: int=None, heads: int=None, 
        dim_head: int=None, mlp_dim: int=None, dropout: float=0.
    ):
        super(Transformer, self).__init__()
        # Each entry of `layers` is an (attention, feed-forward) pair.
        self.layers = nn.ModuleList([
            nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ])
            for _ in range(depth)
        ])

    def forward(self, x: torch.Tensor=None):
        for attention_block, feed_forward_block in self.layers:
            attended = attention_block(x)
            x = attended + x
            transformed = feed_forward_block(x)
            x = transformed + x

        return x

In [8]:
def get_prob(
    features:np.ndarray=None, classifier_weights:np.ndarray=None, classifier_bias:np.ndarray=None, 
    logits:bool=True
) -> np.ndarray:
    '''
    Apply a linear classifier to extracted features and return logits or class probabilities.

            Parameters:
                features (np.ndarray): Extracted features; the last axis is the feature axis, so both a
                    single vector and a batch of vectors are accepted;
                classifier_weights (np.ndarray): Classifier weights of shape (n_classes, n_features);
                classifier_bias (np.ndarray): Classifier bias of shape (n_classes,);
                logits (bool): If True return the raw scores, otherwise their softmax.
            Returns:
                np.ndarray whose last axis indexes the classes.
    '''
    xs = np.dot(features, np.transpose(classifier_weights)) + classifier_bias

    if logits:
        return xs

    # Softmax over the class (last) axis. Subtracting the maximum first keeps np.exp from
    # overflowing; keepdims lets the same code serve 1-D and batched inputs.
    shifted = xs - np.max(xs, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

def stack_npy_files(directory: str=None):
    """
    Load every .npy file in the given directory and concatenate them along the first axis.

    Files are processed in sorted name order: os.listdir returns entries in arbitrary order,
    which would otherwise make the row order of the result differ between runs and machines.
    
            Parameters:
                directory (str): The directory containing the .npy files.
            Returns:
                np.ndarray with all arrays concatenated along axis 0, or None when the directory
                holds no .npy files.
    """
    npy_files = sorted(file for file in os.listdir(directory) if file.endswith('.npy'))
    if not npy_files:
        print(f"No .npy files found in the directory {directory}.")
        return None

    return np.concatenate([np.load(os.path.join(directory, file)) for file in npy_files], axis=0)

class AcousticModalityDataset(Dataset):
    """
    Dataset of pre-extracted acoustic features for one split.

    Every row of the data-info CSV whose 'Split' column matches `split` is looked up on disk at
    `<acoustic_features_dir_path>/<zero-padded ID>_features/feature-extraction-model_features.npy`.
    All features are loaded into memory at construction time.

            Parameters:
                data_info_path (str): Path to the CSV with 'Split', 'ID', 'Age', 'Country' and label columns;
                split (str): Value of the 'Split' column to keep;
                acoustic_features_dir_path (str): Directory holding one `<ID>_features` folder per sample;
                use_mean (bool): If True, average the (padded or truncated) sequence over the time axis;
                max_seq_length (int): Number of time steps every sequence is padded or truncated to.
    """
    
    def __init__(
        self, data_info_path: str=None, split: str='Train', acoustic_features_dir_path: str=None,
        use_mean: bool=False, max_seq_length: int=400
    ):
        super(AcousticModalityDataset, self).__init__()
        self.data_info = pd.read_csv(data_info_path)
        self.split, self.acoustic_features_dir_path = split, acoustic_features_dir_path
        self.data_info = self.data_info[self.data_info['Split'] == self.split]
        # Sample IDs are left-padded with zeros to this width to form the directory name.
        self.file_name_padding, self.target_labels = 5, [
            'Adoration', 'Amusement', 'Anxiety', 'Disgust','Empathic-Pain', 'Fear', 'Surprise'
        ]
        
        self.use_mean, self.max_seq_length = use_mean, max_seq_length
        
        self.__init_inputs_labels()

    @staticmethod
    def _fit_to_length(features: np.ndarray, target_length: int) -> np.ndarray:
        """Truncate or zero-pad a (time, feature) array so that it has exactly `target_length` rows."""
        seq_length = features.shape[0]
        if seq_length > target_length:
            return features[:target_length]
        return np.pad(features, pad_width=((0, target_length - seq_length), (0, 0)))
        
    def __init_inputs_labels(self):
        """Load features, labels and meta information of every sample of the split into memory."""
        print(f'Configure dataset from directory {self.acoustic_features_dir_path}')
        self.inputs, self.meta, self.labels = [], [], []
        n_skipped = 0
        
        for index, row_values in tqdm(self.data_info.iterrows(), total=self.data_info.shape[0]):
            current_file_ID = str(row_values['ID']).rjust(self.file_name_padding, '0')
            
            feature_dir_path = f'{self.acoustic_features_dir_path}/{current_file_ID}_features'
            feature_file_path = f'{feature_dir_path}/feature-extraction-model_features.npy'
            # Samples without extracted features are skipped. Checking the file itself also covers
            # a missing directory and a directory that lacks this particular file, both of which
            # previously raised.
            if not os.path.isfile(feature_file_path):
                n_skipped += 1
                continue
            current_acoustic_features = self._fit_to_length(np.load(feature_file_path), self.max_seq_length)
            
            # NOTE(review): the mean is taken after zero-padding, so short clips are averaged
            # together with their padding rows - confirm that this is intended.
            if self.use_mean:
                current_acoustic_features = np.mean(current_acoustic_features, axis=0)
            current_labels = row_values[self.target_labels].values
            # Meta information: age and a binary "country is United States" flag.
            current_meta_info = [row_values['Age'], int(row_values['Country'] in ['United States'])]
            
            self.inputs.append(current_acoustic_features)
            self.labels.append(current_labels)
            self.meta.append(current_meta_info)

        if n_skipped:
            print(f'Skipped {n_skipped} samples without extracted features')
            
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index: int=None):
        """Return (features, labels, meta_info) of one sample as float32 tensors."""
        inputs, labels, meta_info = self.inputs[index], self.labels[index], self.meta[index]
        
        inputs = torch.tensor(inputs.astype(np.float32), dtype=torch.float)
        labels = torch.tensor(labels.astype(np.float32), dtype=torch.float)
        meta_info = torch.tensor(np.array(meta_info).astype(np.float32), dtype=torch.float)
        
        return inputs, labels, meta_info

## functions

In [9]:
def seed_everything(seed:int=None) -> None:
    '''
    Seed every random number generator used in the notebook and make cuDNN deterministic.

            Parameters:
                seed (int): The seed number.
    '''
    # Python, NumPy, then PyTorch (CPU, current CUDA device, all CUDA devices).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
def configure_feature_extraction_model_visual(
    feature_extractor_model_path: str=None, device: torch.device=None, return_initial: bool=True
) -> 'nn.Module | tuple[nn.Module, nn.Module]':
    '''
    Load a pickled model and turn it into a feature extractor by replacing its classifier with Identity.
    
            Parameters:
                feature_extractor_model_path (str): Path to the pickled feature-extraction model;
                device (torch.device): Device the model is loaded onto (None keeps the device stored
                    in the checkpoint);
                return_initial (bool): Also return an untouched copy of the model (classifier included).
            Returns:
                The feature extractor in eval mode, or the tuple (feature extractor, initial model)
                when return_initial is True.
    '''
    def _load_model():
        # SECURITY: a whole-model checkpoint is a pickle and executes arbitrary code on load;
        # only open files from a trusted source. weights_only=False is stated explicitly because
        # newer PyTorch releases default to weights-only loading, which rejects full-model pickles.
        # map_location lets a checkpoint saved on GPU be opened on a CPU-only machine.
        return torch.load(feature_extractor_model_path, map_location=device, weights_only=False)

    feature_extractor_model = _load_model()
    feature_extractor_model.classifier = torch.nn.Identity()
    feature_extractor_model.to(device)
    feature_extractor_model.eval()

    if return_initial:
        # A second, independent load keeps the original classifier available to the caller.
        return feature_extractor_model, _load_model()
    else:
        return feature_extractor_model
    
def calc_pearsons(predictions:np.array=None, ground_truth:np.array=None):
    '''
    Return the Pearson correlation coefficient between two one-dimensional series.
    
            Parameters:
                predictions (np.array): Model's forecasts;
                ground_truth (np.array): The fact.
    '''
    # pearsonr yields (coefficient, p-value); only the coefficient is needed here.
    coefficient, _p_value = stats.pearsonr(predictions, ground_truth)
    return coefficient

def mean_pearsons(predictions:np.array=None, ground_truth:np.array=None, n_classes:int=7):
    '''
    Calculate the class-wise and the mean Pearson correlation between predictions and ground truth.
    
            Parameters:
                predictions (torch.Tensor | np.array): Model's forecasts of shape (n_samples, n_classes);
                ground_truth (torch.Tensor | np.array): The fact, same shape as predictions;
                n_classes (int): Number of leading columns to evaluate.
            Returns:
                (mean_pcc, class_wise_pcc): the average over classes and the array of per-class coefficients.
    '''
    def _as_numpy(values):
        # Tensors may live on the GPU and carry gradients; NumPy input (as the signature
        # advertises) is passed through instead of failing on the missing `.detach`.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    predictions, ground_truth = _as_numpy(predictions), _as_numpy(ground_truth)
    class_wise_pcc = np.array([calc_pearsons(predictions[:, i], ground_truth[:, i]) for i in range(n_classes)])
    mean_pcc = np.mean(class_wise_pcc)
    
    return mean_pcc, class_wise_pcc

In [10]:
def train_one_epoch_visual(
    model:nn.Module=None, train_dataloader:DataLoader=None, criterion:nn.Module=None, 
    optimizer:torch.optim.Optimizer=None, scheduler:torch.optim.lr_scheduler=None, device=None, mode:str=None,
    use_meta: bool=False
):
    '''
    Run one training epoch and return the batch-averaged loss and mean PCC.
    
            Parameters:
                model (nn.Module): Current model;
                train_dataloader (DataLoader): Train dataloader yielding (features, labels, meta_info) batches;
                criterion (nn.Module): Loss-function, called as criterion(outputs, labels);
                optimizer (torch.optim.Optimizer): Current optimization function;
                scheduler (torch.optim.Scheduler): Accepted for signature parity with the evaluation
                    function; not used here;
                device: Current device;
                mode (str): Current mode ('only-visual', 'only-audio', 'multi-modal'); not used here;
                use_meta (bool): If True, append the meta information to every time step of the features.
            Returns:
                (train_loss, train_mean_pcc): both averaged over the batches of the epoch.
    '''
    train_loss, train_mean_pcc = 0.0, 0.0
    model.train()
    
    for features, labels, meta_info in tqdm(train_dataloader):
        # as_tensor accepts both NumPy arrays and tensors and moves/casts in a single step.
        features = torch.as_tensor(features, dtype=torch.float32, device=device)
        labels = torch.as_tensor(labels, dtype=torch.float32, device=device)
        meta_info = torch.as_tensor(meta_info, dtype=torch.float32, device=device)
        
        if use_meta:
            # Repeat the per-sample meta vector along the actual sequence length of the batch
            # (a fixed length only works when the features happen to have that many steps).
            seq_length = features.shape[1]
            features = torch.concat((features, meta_info.unsqueeze(1).expand(-1, seq_length, -1)), dim=-1)
        
        optimizer.zero_grad()
        
        outputs = model(features)
        
        # PyTorch criteria expect (input, target); the order matters for non-symmetric losses.
        train_batch_loss = criterion(outputs, labels)
        train_batch_loss.backward()
        optimizer.step()

        train_loss += train_batch_loss.item()
        
        batch_mean_pcc, _ = mean_pearsons(outputs, labels)
        train_mean_pcc += batch_mean_pcc
            
    train_loss = train_loss / len(train_dataloader)
    train_mean_pcc = train_mean_pcc / len(train_dataloader)
    
    return train_loss, train_mean_pcc

In [11]:
def eval_one_epoch_visual(
    model:nn.Module=None, valid_dataloader:DataLoader=None, criterion:nn.Module=None, 
    scheduler:torch.optim.lr_scheduler=None, device=None, mode:str=None,
    use_meta: bool=False, model_checkpoint_dir: str=None, best_validation_score: float=None,
    model_suffix: str=None
):
    '''
    Run one validation epoch, step the scheduler and save a checkpoint when the score qualifies.
    
            Parameters:
                model (nn.Module): Current model;
                valid_dataloader (DataLoader): Validation dataloader yielding (features, labels, meta_info) batches;
                criterion (nn.Module): Loss-function, called as criterion(outputs, labels);
                scheduler (torch.optim.Scheduler): Learning-rate scheduler stepped once per call (optional);
                device: Current device;
                mode (str): Current mode ('only-visual', 'only-audio', 'multi-modal'); not used here;
                use_meta (bool): If True, append the meta information to every time step of the features;
                model_checkpoint_dir (str): Directory for checkpoints; nothing is saved when None;
                best_validation_score (float): Score a checkpoint has to reach to be saved; None means
                    there is no previous score, so the model is saved;
                model_suffix (str): Prefix of the checkpoint file name.
            Returns:
                (valid_loss, valid_mean_pcc): both averaged over the batches of the epoch.
    '''
    valid_loss, valid_mean_pcc = 0.0, 0.0
    model.eval()
    
    with torch.no_grad():
        for features, labels, meta_info in tqdm(valid_dataloader):
            # as_tensor accepts both NumPy arrays and tensors and moves/casts in a single step.
            features = torch.as_tensor(features, dtype=torch.float32, device=device)
            labels = torch.as_tensor(labels, dtype=torch.float32, device=device)
            meta_info = torch.as_tensor(meta_info, dtype=torch.float32, device=device)

            if use_meta:
                # Repeat the per-sample meta vector along the actual sequence length of the batch.
                seq_length = features.shape[1]
                features = torch.concat((features, meta_info.unsqueeze(1).expand(-1, seq_length, -1)), dim=-1)

            outputs = model(features)

            # PyTorch criteria expect (input, target); the order matters for non-symmetric losses.
            valid_batch_loss = criterion(outputs, labels)
            valid_loss += valid_batch_loss.item()

            batch_mean_pcc, _ = mean_pearsons(outputs, labels)
            valid_mean_pcc += batch_mean_pcc

    valid_loss = valid_loss / len(valid_dataloader)
    valid_mean_pcc = valid_mean_pcc / len(valid_dataloader)

    if scheduler is not None:
        scheduler.step()

    # A missing reference score no longer raises on the comparison, and a missing checkpoint
    # directory no longer produces a path that starts with the literal 'None'.
    is_best = best_validation_score is None or valid_mean_pcc >= best_validation_score
    if is_best and model_checkpoint_dir is not None:
        os.makedirs(model_checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), f'{model_checkpoint_dir}/{model_suffix}_{valid_mean_pcc:.4f}.pt')
        
    return valid_loss, valid_mean_pcc

# Model

## dataset

In [12]:
# Fix all random seeds before the datasets, dataloaders and model are created.
seed_everything(2002)

In [13]:
# Time how long it takes to load both splits into memory.
start_time_extraction = time.time()

# Both splits read the same data-info CSV; the rows are filtered by the `split` value.
# The feature directories are the WAV feature paths with a '-feature-extraction-model' suffix.
train_dataset = AcousticModalityDataset(
    data_info_path=f'{DATA_DIR}/preprocessed_data_info.csv', split='Train', 
    acoustic_features_dir_path=TRAIN_WAV_FEATURES_DIR + '-feature-extraction-model'
)
valid_dataset = AcousticModalityDataset(
    data_info_path=f'{DATA_DIR}/preprocessed_data_info.csv', split='Val', 
    acoustic_features_dir_path=VALID_WAV_FEATURES_DIR + '-feature-extraction-model'
)

# Elapsed time is reported in whole minutes.
elapsed_time = time.time() - start_time_extraction
print(f'Dataset were configured in {np.round((elapsed_time/60), 0)} minutes')

Configure dataset from directory ./data/train/wav_features-feature-extraction-model


  0%|          | 0/15806 [00:00<?, ?it/s]

Configure dataset from directory ./data/val/wav_features-feature-extraction-model


  0%|          | 0/4657 [00:00<?, ?it/s]

Dataset were configured in 4.0 minutes


## dataloader

In [14]:
class Collator(object):
    """
    collate_fn that stacks (features, labels, meta) samples into three float64 NumPy batches.

            Parameters:
                imgRandomLen (int): Stored on the instance but not used when collating.
    """
    def __init__(self, imgRandomLen: int=64):
        super().__init__()
        self.imgRandomLen = imgRandomLen

    def __call__(self, batch: torch.Tensor=None):
        '''
        Stack the samples of a batch along a new leading axis.
        
                Parameters:
                    batch: Sequence of (features, labels, meta) samples with matching shapes.
                Returns:
                    (audio, labels, meta): three float64 np.ndarray batches.
        '''
        # np.float64 is what the removed `np.float` alias (NumPy >= 1.24) resolved to.
        audio  = np.stack([sample[0] for sample in batch], axis=0).astype(np.float64)
        labels = np.stack([sample[1] for sample in batch], axis=0).astype(np.float64)
        meta   = np.stack([sample[2] for sample in batch], axis=0).astype(np.float64)
        
        return audio, labels, meta

In [15]:
# A single Collator instance is shared by both loaders.
collate_fn = Collator()
# Only the training loader shuffles. Both loaders use batches of 64 and drop the last incomplete batch.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn, drop_last=True)
# NOTE(review): drop_last=True also discards the final partial validation batch, so the validation
# metrics do not cover every validation sample - confirm that this is intended.
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn, drop_last=True)

## model

In [43]:
class AcousticModalityModel(nn.Module):
    """
    Sequence model for acoustic features: 2-layer LSTM -> Transformer -> MLP head -> sigmoid,
    averaged over the time axis.

            Parameters:
                input_size (int): Size of every input feature vector (768 when not given);
                n_classes (int): Number of predicted scores per sample.
    """
    def __init__(self, input_size:int=None, n_classes:int=7):
        super(AcousticModalityModel, self).__init__()
        # Both arguments used to be ignored in favour of hard-coded 768 / 7. The defaults keep
        # exactly that architecture, so existing checkpoints still load.
        input_size = 768 if input_size is None else input_size

        self.rnn_aud = nn.LSTM(input_size, 128, 2, batch_first=True)
        self.transformer_aud = Transformer(128, 4, 4, dim_head=128, mlp_dim=256, dropout=0.2)
        self.dropout = nn.Dropout(p=0.3)
        
        self.head = nn.Sequential(
            nn.Linear(128, 256),
            nn.Dropout(0.3),
            nn.GELU(),
            nn.Linear(256, n_classes),
        )
    
    def forward(self, acoustic_input: torch.Tensor=None):
        """Map a (batch, time, input_size) tensor to (batch, n_classes) scores in [0, 1]."""
        # Only the per-step LSTM outputs are used; the final hidden state is discarded.
        acoustic_input, hidden_state = self.rnn_aud(acoustic_input)
        acoustic_input = self.transformer_aud(self.dropout(acoustic_input))
        
        # Sigmoid is applied per time step and the results are averaged over time.
        return torch.sigmoid(self.head(acoustic_input)).mean(dim=1)

In [57]:
# Model on the selected device; 768 is passed as the size of the acoustic feature vectors.
model = AcousticModalityModel(input_size=768).to(DEVICE)
# Mean squared error between the predicted and the target scores.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# 150 is the cosine-annealing period (T_max); it equals the number of epochs set in the training cell.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 150)

In [None]:
n_epochs, model_suffix = 150, 'acoustic'

# Recover the best validation score from the checkpoint file names ('<suffix>_<score>.pt').
# On a fresh run the directory may be missing or empty, so create it and fall back to -inf
# instead of failing on max([]).
os.makedirs(ABAW5_MODELS_CHECKPOINTS, exist_ok=True)
all_checkpoints = [
    float(checkpoint.split('_')[-1].replace('.pt', '')) for checkpoint 
    in os.listdir(ABAW5_MODELS_CHECKPOINTS) if checkpoint.endswith('.pt') and model_suffix in checkpoint
]
best_val_pcc = max(all_checkpoints, default=float('-inf'))

for epoch in range(n_epochs):
    print(f"Epoch {epoch + 1:02d}/{n_epochs}")
        
    train_loss, train_mean_pcc = train_one_epoch_visual(
        model=model, train_dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, 
        device=DEVICE, use_meta=False
    )
    print(f'training results. mean_pcc: {train_mean_pcc:.4f}; loss: {train_loss:.4f}.')
    
    # The scheduler is handed to the evaluation step, which advances it once per epoch;
    # without this the cosine schedule configured above never changes the learning rate.
    valid_loss, valid_mean_pcc = eval_one_epoch_visual(
        model=model, valid_dataloader=valid_dataloader, device=DEVICE, use_meta=False, criterion=criterion,
        scheduler=scheduler, model_checkpoint_dir=ABAW5_MODELS_CHECKPOINTS, best_validation_score=best_val_pcc,
        model_suffix=model_suffix
    )
    # Raise the bar after every improvement so that later, worse epochs are not saved as "best".
    best_val_pcc = max(best_val_pcc, valid_mean_pcc)
    print(f'validating results. mean_pcc: {valid_mean_pcc:.4f}; loss: {valid_loss:.4f}. best result: {best_val_pcc:.4f}') 