In [2]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import torch
from PIL import Image
from numpy import asarray

import argparse
from collections import namedtuple, OrderedDict
import itertools
import os
import numpy as np
from typing import Tuple
from typing import List
from typing import Dict
import random
from itertools import product
import copy
import re
import random
import hashlib
import pathlib
import json

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
)

import logging

def _single_column(column_name):
    """Return an extractor that selects the column ``column_name`` of a frame."""
    def extract(df):
        return df.loc[:, column_name]
    return extract


def _column_span(first_column, last_column):
    """Return an extractor selecting the label slice ``first_column:last_column``."""
    def extract(df):
        return df.loc[:, first_column:last_column]
    return extract


# Maps a logical field name to the callable that pulls the matching
# column(s) out of the DataFrame loaded for that modality.
# Several column labels carry a leading space; they are used verbatim.
preprocess = dict(
    acoustic=_column_span(' F0semitoneFrom27.5Hz_sma3nz_amean', ' equivalentSoundLevel_dBp'),
    acoustic_timer=_single_column(' frameTime'),
    linguistic=_single_column('word'),
    linguistic_timer=_single_column('time-offset'),
    target=_single_column(' rating'),
    target_timer=_single_column('time'),
)

class InputFeature:
    """Plain container for the features and labels of one video.

    Attributes:
        video_id: Identifier of the video.
        acoustic_feature: Acoustic features of the video.
        linguistic_feature: Linguistic features of the video.
        visual_feature: Visual features of the video.
        labels: Labels of the video.
    """

    def __init__(
        self, video_id="",
        acoustic_feature=None,
        linguistic_feature=None,
        visual_feature=None,
        labels=None,
    ):
        """Store the given values; every omitted feature becomes a fresh empty list.

        ``None`` is used as the default instead of ``[]`` because a list
        literal in the signature is created once and would be shared (and
        mutated) by every instance built without that argument.
        """
        self.video_id = video_id
        self.acoustic_feature = [] if acoustic_feature is None else acoustic_feature
        self.linguistic_feature = [] if linguistic_feature is None else linguistic_feature
        self.visual_feature = [] if visual_feature is None else visual_feature
        self.labels = [] if labels is None else labels
        
def preprocess_SEND_files(
    data_dir, # Multimodal X
    target_data_dir, # Y
    time_window_in_sec=5.0,
    modality_dir_map=None,
    linguistic_tokenizer=None,
    pad_symbol=0,
):
    """Load the SEND feature files and bucket every modality into time windows.

    For every video the rating timeline is cut into windows of
    ``time_window_in_sec`` seconds. Acoustic rows, words and image frames are
    assigned to the window their timestamp falls into, each window is collated
    into one value per modality, and all videos are finally padded to the
    length of the longest one and converted to tensors.

    Args:
        data_dir: Directory holding one sub-directory per input modality.
        target_data_dir: Directory holding the rating sub-directory.
        time_window_in_sec: Width of one window, in the unit of the timestamps.
        modality_dir_map: Mapping from "acoustic", "linguistic", "visual" and
            "target" to the sub-directory names. ``None`` selects the defaults
            ("acoustic-egemaps", "linguistic-word-level", "image-raw",
            "target").
        linguistic_tokenizer: Object providing ``convert_tokens_to_ids`` and
            ``pad_token_id``; it is required although it defaults to ``None``.
        pad_symbol: Token id used to fill the all-padding word windows that
            are appended to short videos.

    Returns:
        A list with one dict per video with the keys ``video_id``,
        ``a_feature``, ``l_feature``, ``v_feature`` and ``rating`` (tensors),
        ``l_inwindow_length`` (list with the unpadded token count of each
        window) and ``seq_len`` (number of windows before padding).

    Raises:
        AssertionError: If the modalities do not list the same video ids or
            the per-file row counts are inconsistent.
    """
    if modality_dir_map is None:
        # Built per call so the default mapping is never shared between calls.
        modality_dir_map = {
            "acoustic": "acoustic-egemaps",
            "linguistic": "linguistic-word-level", # we don't load features
            "visual": "image-raw", # image is nested,
            "target": "target",
        }
    # Resolved locally so the function does not depend on a module-level
    # ``logger`` having been created by another cell first.
    logger = logging.getLogger(__name__)
    SEND_videos = []

    # Collect the video ids ("<part0>_<part1>" of each entry name) per modality.
    a_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["acoustic"])) 
             if isfile(os.path.join(data_dir, modality_dir_map["acoustic"], f))]
    l_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["linguistic"])) 
             if isfile(os.path.join(data_dir, modality_dir_map["linguistic"], f))]
    v_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["visual"])) 
             if f != ".DS_Store"]
    # Every modality must describe exactly the same set of videos.
    assert len(a_ids) == len(l_ids) and len(l_ids) == len(v_ids)
    assert len(set(a_ids).intersection(set(l_ids))) == len(l_ids)
    assert len(set(a_ids).intersection(set(v_ids))) == len(v_ids)

    # First pass over the linguistic modality: find the largest number of
    # words in one window so that every window can be padded to one length.
    max_window_l_length = -1
    for video_id in a_ids: # pick any one!
        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))
        # sample based on interval
        current_time = 0.0
        keep_first = True
        sampled_l_words = [] # different from other modality, it is essentially a list of list!
        tmp_words = []
        for i in range(0, l_timestamps.shape[0]):
            if keep_first:
                sampled_l_words += [[]]
                keep_first = False
            if l_timestamps[i] >= current_time+time_window_in_sec:
                sampled_l_words.append(tmp_words)
                tmp_words = [l_words[i]] # reinit the buffer
                current_time += time_window_in_sec
                continue
            tmp_words += [l_words[i]]
        # overflow
        if len(tmp_words) > 0:
            sampled_l_words.append(tmp_words)
        for window_words in sampled_l_words:
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(window_words)
            if len(token_ids) > max_window_l_length:
                max_window_l_length = len(token_ids)
    max_window_l_length += 2 # the start and the end token

    max_seq_len = -1
    video_count = 0
    for video_id in a_ids: # pick any one!
        if video_count > 1 and video_count%100 == 0:
            logger.info(f"Processed #{len(SEND_videos)} videos.")
            # Log a short summary only; the full struct holds raw image arrays.
            logger.info(
                f"Last processed video: {SEND_videos[-1]['video_id']} "
                f"(seq_len={SEND_videos[-1]['seq_len']})"
            )

        # Step 1: Load rating data, and we can get window partitioned according to our interval.
        target_id = video_id.split("_")[0][2:] + "_" + video_id.split("_")[1][3:]
        target_file = os.path.join(target_data_dir, modality_dir_map["target"], f"target_{target_id}_normal.csv")
        target_df = pd.read_csv(target_file)
        target_ratings = np.array(preprocess["target"](target_df))
        target_timestamps = np.array(preprocess["target_timer"](target_df))
        assert target_ratings.shape[0] == target_timestamps.shape[0]
        last_target_time = max(target_timestamps)
        number_of_window = int(last_target_time//time_window_in_sec)
        windows = []
        for i in range(0, number_of_window):
            windows += [(i*time_window_in_sec, (i+1)*time_window_in_sec)]
        # The trailing (possibly partial) window starts where the last full
        # one ended. It is derived from number_of_window, not from the loop
        # variable, which is not rebound when there is no full window.
        windows += [(number_of_window*time_window_in_sec, last_target_time)]
        # [(0, 5], (5, 10], ...]

        # acoustic features process
        a_file = os.path.join(data_dir, modality_dir_map["acoustic"], f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))
        a_feature_dim = a_features.shape[1]
        assert a_features.shape[0] == a_timestamps.shape[0]
        sampled_a_features_raw = [[] for i in range(len(windows))]
        for i in range(0, a_timestamps.shape[0]):
            # floor-divide the timestamp to get the index of its window.
            hash_in_window = int(a_timestamps[i]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_a_features_raw[hash_in_window].append(a_features[i])
        sampled_a_features = []
        for window in sampled_a_features_raw:
            # a window without any acoustic row becomes a zero vector,
            # otherwise the rows of the window are averaged.
            if len(window) == 0:
                collate_window = np.zeros(a_feature_dim)
            else:
                collate_window = np.mean(np.array(window), axis=0)
            sampled_a_features.append(collate_window)

        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))
        assert len(l_words) == l_timestamps.shape[0]
        sampled_l_features_raw = [[] for i in range(len(windows))]
        for i in range(0, l_timestamps.shape[0]):
            # floor-divide the timestamp to get the index of its window.
            hash_in_window = int(l_timestamps[i]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_l_features_raw[hash_in_window].append(l_words[i])
        sampled_l_features = []
        sampled_l_window_length = []
        for window in sampled_l_features_raw:
            complete_window_word = ["[CLS]"] + window + ["[SEP]"]
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(complete_window_word)
            sampled_l_window_length += [len(token_ids)]
            # right-pad the window up to the length found in the first pass.
            for _ in range(0, max_window_l_length-len(token_ids)):
                token_ids.append(linguistic_tokenizer.pad_token_id)
            sampled_l_features += [token_ids]

        # visual features process
        # for visual, we need to actively control which images we load; we
        # cannot just load all images, it would blow up memory.
        fps=30 # We may need to dynamically figure out this number?
        frame_names = []
        for f in listdir(os.path.join(data_dir, modality_dir_map["visual"], video_id)):
            if ".jpg" in f:
                # frame index (parsed from the file name) -> timestamp.
                frame_names += [(int(f.split("_")[0][5:])*(1.0/fps), f)]
        frame_names.sort(key=lambda x:x[0])
        sampled_v_features_raw = [[] for i in range(len(windows))]
        for f in frame_names:
            # floor-divide the timestamp to get the index of its window.
            hash_in_window = int(f[0]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_v_features_raw[hash_in_window].append(f)

        sampled_v_features = []
        for window in sampled_v_features_raw:
            if len(window) == 0:
                # NOTE(review): assumes the frames on disk are 224x224x3 - confirm.
                f_data = np.zeros((224,224,3))
            else:
                # we collate by using the last frame in the time window.
                f = window[-1]
                f_path = os.path.join(data_dir, modality_dir_map["visual"], video_id, f[1])
                f_image = Image.open(f_path)
                f_data = asarray(f_image)
            sampled_v_features.append(f_data)

        # ratings (target): reuse the arrays loaded in step 1 and average
        # the ratings that fall into each window.
        sampled_ratings_raw = [[] for i in range(len(windows))]
        for i in range(0, target_timestamps.shape[0]):
            hash_in_window = int(target_timestamps[i]//time_window_in_sec)
            sampled_ratings_raw[hash_in_window].append(target_ratings[i])
        sampled_ratings = []
        for window in sampled_ratings_raw:
            collate_window = np.mean(np.array(window), axis=0)
            sampled_ratings.append(collate_window)

        # we truncate features based on the availability of every modality.
        assert len(sampled_a_features) == len(sampled_l_features)
        assert len(sampled_a_features) == len(sampled_v_features)

        max_window_cutoff_l = int(max(l_timestamps)//time_window_in_sec)
        max_window_cutoff_a = int(max(a_timestamps)//time_window_in_sec)
        max_window_cutoff_v = int(frame_names[-1][0]//time_window_in_sec)
        # Only windows with an index below the smallest cutoff are kept.
        max_window_cutoff = min([max_window_cutoff_l, max_window_cutoff_a, max_window_cutoff_v])
        sampled_a_features = sampled_a_features[:max_window_cutoff]
        sampled_l_features = sampled_l_features[:max_window_cutoff]
        sampled_v_features = sampled_v_features[:max_window_cutoff]
        sampled_ratings = sampled_ratings[:max_window_cutoff]
        sampled_l_window_length = sampled_l_window_length[:max_window_cutoff]

        video_struct = {
            "video_id": video_id,
            "a_feature": sampled_a_features,
            "l_feature": sampled_l_features,
            "v_feature": sampled_v_features,
            "rating": sampled_ratings,
            "l_inwindow_length": sampled_l_window_length,
            "seq_len": len(sampled_a_features)
        }
        video_count += 1
        SEND_videos += [video_struct]
        if len(sampled_a_features) > max_seq_len:
            max_seq_len = len(sampled_a_features)

    # padding based on length
    for video_struct in SEND_videos:
        for _ in range(max_seq_len-video_struct["seq_len"]):
            video_struct["a_feature"].append(np.zeros(a_feature_dim))
            # An integer row keeps the token-id lists homogeneous for LongTensor.
            video_struct["l_feature"].append([pad_symbol] * max_window_l_length)
            video_struct["v_feature"].append(np.zeros((224,224,3)))
            video_struct["rating"].append(0.0)
            video_struct["l_inwindow_length"].append(0)
        video_struct["a_feature"] = torch.tensor(video_struct["a_feature"])
        video_struct["l_feature"] = torch.LongTensor(video_struct["l_feature"])
        video_struct["v_feature"] = torch.tensor(video_struct["v_feature"])
        video_struct["rating"] = torch.tensor(video_struct["rating"])

    return SEND_videos

In [2]:
def arg_parse():
    """Build the argument parser and return the parsed arguments.

    Inside an IPython/Jupyter session (``get_ipython`` is defined) the inline
    matplotlib backend is requested and the defaults are returned, because
    ``sys.argv`` then belongs to the kernel and not to this program. In a
    plain Python process the real command line is parsed.

    Returns:
        argparse.Namespace with ``train_batch_size``, ``eval_batch_size``,
        ``lr``, ``seed``, ``resumed_from_file_path``, ``data_dir``,
        ``output_dir``, ``is_tensorboard`` and ``eval_only``.
    """
    parser = argparse.ArgumentParser(description='multimodal emotion analysis argparse.')
    # Experiment management:

    parser.add_argument('--train_batch_size', type=int, default=6,
                        help='Training batch size.')
    parser.add_argument('--eval_batch_size', type=int, default=12,
                        help='Evaluation batch size.')
    parser.add_argument('--lr', type=float, default=1e-4,
                        help='Learning rate.')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed.')

    parser.add_argument('--resumed_from_file_path', type=str, default="",
                        help='Whether to resume for this file.')
    parser.add_argument('--data_dir', type=str, default="../../SENDv1-data/",
                        help='Root directory of the SEND dataset.')
    parser.add_argument('--output_dir', type=str, default="../default_output_log/",
                        help='Directory where logs and outputs are written.')
    parser.add_argument("--is_tensorboard",
                        default=False,
                        action='store_true',
                        help="Whether to use tensorboard.")
    parser.add_argument("--eval_only",
                        default=False,
                        action='store_true',
                        help="Whether to run evaluation only.")

    try:
        shell = get_ipython()
    except NameError:
        # Not running under IPython: honour the real command line.
        return parser.parse_args()
    if shell is None:
        return parser.parse_args()

    try:
        shell.run_line_magic('matplotlib', 'inline')
    except Exception as error:
        # A missing plotting backend must not change how arguments are
        # parsed; report it and keep going with the notebook defaults.
        logging.getLogger(__name__).warning(
            "Could not enable inline matplotlib: %s", error)
    return parser.parse_args([])

In [3]:
if __name__ == "__main__":
    import sys

    # Loading arguments
    args = arg_parse()

    # get_ipython is only defined inside an IPython/Jupyter session, so a
    # NameError is the signal for a plain Python process. The inline
    # matplotlib magic is already requested by arg_parse().
    try:
        get_ipython()
        is_jupyter = True
    except NameError:
        is_jupyter = False

    if is_jupyter:
        # Experiment management: values used for interactive runs.
        args.train_batch_size=6
        args.eval_batch_size=12
        args.lr=1e-4
        args.seed=42
        args.is_tensorboard=True # Let us try this!
        args.output_dir="../default_output_log/"

    # Create output directory if not exists.
    pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True) 

    logging.basicConfig(
        level=logging.INFO, 
        format='%(asctime)s %(levelname)-8s %(message)s', 
        datefmt='%a, %d %b %Y %H:%M:%S', 
        filename=os.path.join(args.output_dir, "training.log"),
    )
    logger = logging.getLogger(__name__)
    # Mirror the log on stdout, but only once: re-running this cell must not
    # stack another handler and duplicate every message.
    root_logger = logging.getLogger()
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is sys.stdout
        for handler in root_logger.handlers
    )
    if not has_stdout_handler:
        root_logger.addHandler(logging.StreamHandler(sys.stdout))

    logger.info("Training the model with the following parameters: ")
    logger.info(args)

    # We don't allow flexibility here..
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-uncased",
        use_fast=False,
        cache_dir="../.huggingface_cache/"
    )

    train_SEND_features = None
    test_SEND_features = None
    # NOTE(review): this condition looks inverted - with eval_only set the
    # Train and Valid splits are loaded, otherwise only Test. Left unchanged
    # because later cells rely on the current behaviour; confirm the intent.
    if args.eval_only:
        # Training data loading 
        train_modalities_data_dir = os.path.join(args.data_dir, "features/Train/")
        train_target_data_dir = os.path.join(args.data_dir, "ratings/Train")

        test_modalities_data_dir = os.path.join(args.data_dir, "features/Valid/")
        test_target_data_dir = os.path.join(args.data_dir, "ratings/Valid")

        train_SEND_features = preprocess_SEND_files(
            train_modalities_data_dir,
            train_target_data_dir,
            linguistic_tokenizer=tokenizer,
        )
        test_SEND_features = preprocess_SEND_files(
            test_modalities_data_dir,
            test_target_data_dir,
            linguistic_tokenizer=tokenizer,
        )

    else:
        test_modalities_data_dir = os.path.join(args.data_dir, "features/Test/")
        test_target_data_dir = os.path.join(args.data_dir, "ratings/Test")

        test_SEND_features = preprocess_SEND_files(
            test_modalities_data_dir,
            test_target_data_dir,
            linguistic_tokenizer=tokenizer,
        )
    logger.info("Finish Loading Datasets...")
        
        

Training the model with the following parameters: 
Namespace(data_dir='../../SENDv1-data/', eval_batch_size=12, eval_only=False, is_tensorboard=True, lr=0.0001, output_dir='../default_output_log/', resumed_from_file_path='', seed=42, train_batch_size=6)
Finish Loading Datasets...


In [4]:
# Collate the per-video structs of the test split into batch-level tensors:
# one stacked tensor per feature field, in the order listed below.
(
    test_input_a_feature,
    test_input_l_feature,
    test_input_v_feature,
    test_rating_labels,
) = (
    torch.stack([video[field] for video in test_SEND_features])
    for field in ("a_feature", "l_feature", "v_feature", "rating")
)
# Unpadded token count of every window, one row per video.
test_l_seq_lens = torch.tensor(
    [video["l_inwindow_length"] for video in test_SEND_features],
    dtype=torch.long,
)
# Number of windows of every video before padding, kept as a column vector.
test_seq_lens = torch.tensor(
    [[video["seq_len"]] for video in test_SEND_features]
)

In [6]:
# Building up our SEND model.
from models.BERT import *
from models.VGGFace2 import *

In [14]:
class MultimodalEmotionPrediction(torch.nn.Module):
    """Bundle of the linguistic, visual and acoustic encoders.

    The constructor only builds the three encoders; no ``forward`` is
    defined in this block.

    Args:
        linguistic_model: Name or path passed to ``from_pretrained`` for both
            the config and the linguistic encoder.
        visual_model: Currently unused by the constructor.
        visual_model_path: Path of the state dict loaded into the visual
            encoder.
        acoustic_model: Currently unused by the constructor.
        cache_dir: Cache directory passed to ``from_pretrained``.
        acoustic_dim: Input size of the acoustic linear layer.
        acoustic_hidden_dim: Output size of the acoustic linear layer.
    """

    def __init__(
        self, 
        linguistic_model="bert-base-uncased",
        visual_model="vggface-2",
        visual_model_path="../saved-models/resnet50_scratch_dag.pth",
        acoustic_model="mlp",
        cache_dir="../.huggingface_cache/",
        acoustic_dim=88,
        acoustic_hidden_dim=128,
    ):
        super().__init__()

        # Linguistic encoder initialised from the pretrained checkpoint.
        linguistic_config = AutoConfig.from_pretrained(
            linguistic_model,
            cache_dir=cache_dir
        )
        self.linguistic_encoder = LinguisticEncoderBERT.from_pretrained(
            linguistic_model,
            from_tf=False,
            config=linguistic_config,
            cache_dir=cache_dir
        )

        # Visual encoder with weights read from visual_model_path.
        self.visual_encoder = Resnet50_scratch_dag()
        # map_location="cpu" lets a checkpoint that was saved from GPU
        # tensors load on a machine without CUDA; load_state_dict then copies
        # the values into the parameters of the module.
        state_dict = torch.load(visual_model_path, map_location="cpu")
        self.visual_encoder.load_state_dict(state_dict)

        # Acoustic encoder: a single linear projection.
        self.acoustic_encoder = torch.nn.Linear(acoustic_dim, acoustic_hidden_dim)

In [15]:
# Build the model with its default arguments. The constructor loads the
# pretrained linguistic encoder and reads the visual checkpoint from disk.
model = MultimodalEmotionPrediction()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing LinguisticEncoderBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.word_embeddings.weight']
- This IS expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LinguisticEncoderBERT were not initialized from the model che

In [16]:
# Calls the linguistic encoder without passing any arguments.
# NOTE(review): the output recorded below this cell looks like the module
# repr, which a call with no inputs would presumably not produce - this may
# be an unfinished call left over from inspecting the module; confirm and
# either supply the inputs or drop the parentheses.
model.linguistic_encoder(
    
)

LinguisticEncoderBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (pretrain_word_embeddings): Embedding(31160, 300, padding_idx=0)
      (study_abroad_transformation_layer): Linear(in_features=300, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=