In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [12]:
# Load the dataset index; each row references per-sample feature files by
# relative path (see the column listing displayed below).
df = pd.read_csv('../data/csv/dataset.csv')
print(df.shape[0])  # number of rows in the index
df.columns

1336


Index(['file_name_list', 'speakers', 'visual_features', 'acoustic_features',
       'lexical_features', 'emotion_labels'],
      dtype='object')

In [3]:
# Sanity check: print the array shape of every modality for the first two rows,
# in the order visual -> acoustic -> lexical.
for row in range(2):
    for column in ('visual_features', 'acoustic_features', 'lexical_features'):
        print(np.load(f"../data/{df[column].loc[row]}").shape)


(41, 2048)
(1, 128)
(768,)
(218, 2048)
(7, 128)
(768,)


In [4]:
def collate_fn(batch):
    """Zero-pad a batch of variable-length sequences into a single tensor.

    Args:
        batch: iterable of tensors whose first dimension is the sequence
            length (each item is expected to be ``(seq_len, features)``).

    Returns:
        Tensor of shape ``(batch_size, max_seq_len, ...)``; shorter sequences
        are right-padded with zeros.
    """
    sequences = list(batch)
    return pad_sequence(sequences, batch_first=True)

In [5]:
class VisualFeatureDataset(Dataset):
    """Dataset that lazily loads one visual-feature array per dataframe row.

    Args:
        dataframe: table with a ``'visual_features'`` column holding the path
            of a ``.npy`` file for each sample, relative to ``base_path``.
        base_path: directory the relative paths are resolved against. A
            trailing path separator is optional.
    """

    def __init__(self, dataframe, base_path="../data/"):
        self.dataframe = dataframe
        self.base_path = base_path

    def __len__(self):
        """Return the number of samples (rows of the dataframe)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the feature file of the ``idx``-th row (positional index).

        Returns:
            ``torch.float`` tensor with the same shape as the stored array.
        """
        # Read the single cell rather than building a Series for the whole row.
        relative_path = self.dataframe['visual_features'].iloc[idx]
        # os.path.join inserts a separator only when one is missing, so
        # base_path works both with and without a trailing slash.
        visual_features = np.load(os.path.join(self.base_path, relative_path))
        return torch.tensor(visual_features, dtype=torch.float)


In [6]:
class LSTMFeatureExtractor(nn.Module):
    """LSTM wrapper that returns the per-timestep outputs of the last layer.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: size of the LSTM hidden state (and of each output vector).
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Run the LSTM over ``x`` of shape ``(batch, seq_len, features)``.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_dim)``; the final
            hidden and cell states are discarded.
        """
        outputs, _final_state = self.lstm(x)
        return outputs


In [7]:
# Model hyper-parameters. The input sizes must equal the last dimension of the
# corresponding feature arrays (see the shapes printed by the inspection cell:
# visual (seq_len, 2048), acoustic (seq_len, 128)).
input_dim_vis = 2048
input_dim_aud = 128  # was 2048 (copied from the visual size); acoustic arrays are 128-wide
hidden_dim = 128     # size of each LSTM output vector
num_layers = 1

# Visual-stream extractor; weights are randomly initialised here.
feature_extractor_vis = LSTMFeatureExtractor(input_dim_vis, hidden_dim, num_layers)

### Retrieve the LSTM output features for each data point

In [16]:
dataset = VisualFeatureDataset(df)
# shuffle must stay False: the collected features are later attached to `df`
# by position, so batches have to arrive in dataframe row order.
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
features = []

# Inference only: no_grad avoids building an autograd graph for every sample.
with torch.no_grad():
    for batch_features in tqdm(dataloader):
        # batch_features: (batch_size, seq_len, input_dim)
        lstm_features = feature_extractor_vis(batch_features)  # (batch_size, seq_len, hidden_dim)
        features.append(lstm_features.cpu().numpy())

    
    


100%|██████████| 1336/1336 [00:08<00:00, 158.78it/s]


In [17]:
# Attach one extracted-feature array per row as a new column (mutates `df` in place).
# The list is matched to rows by position, so `features` must be in the same
# order as the rows of `df`.
# NOTE(review): `features` comes from a DataLoader loop — confirm that loader
# iterates in row order (shuffling disabled), otherwise rows get mismatched features.
df['extracted_visual_labels'] = features