#### Evaluation Script for Trained Models

In [54]:
from run_multimodal_time_series import *
from collections import OrderedDict
import csv

import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import torch
from PIL import Image
from numpy import asarray

import argparse
from collections import namedtuple, OrderedDict
import itertools
import os
import numpy as np
from typing import Tuple
from typing import List
from typing import Dict
import random
from itertools import product
import copy
import re
import random
import hashlib
import pathlib
import json
import torch.nn.functional as F
from scipy.stats import pearsonr
import wandb

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
)

import logging

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from tqdm import tqdm, trange

# Building up our SEND model.
from models.BERT import *
from models.VGGFace2 import *
from models.optimization import *

def query_index_time_inout(timestamps, time_inout):
    """Return the slice bounds of `timestamps` for an (in, out) time pair.

    Each bound is the number of leading timestamps that are strictly below
    the corresponding entry of `time_inout`, i.e. the position of the first
    timestamp that reaches it, or the number of timestamps when none does.

    Args:
        timestamps: iterable of comparable time values.
        time_inout: indexable pair `(time_in, time_out)`.

    Returns:
        Tuple `(start_index, end_index)`.
    """
    def leading_count(bound_position):
        # count the entries seen before the first one that reaches the bound.
        seen = 0
        for stamp in timestamps:
            if stamp >= time_inout[bound_position]:
                return seen
            seen += 1
        return seen

    begin = leading_count(0)
    finish = leading_count(1)
    return begin, finish

class InputFeature:
    """Plain container for one video's per-modality features and labels."""

    def __init__(
        self, video_id="",
        acoustic_feature=None,
        linguistic_feature=None,
        visual_feature=None,
        labels=None,
    ):
        """Store the given features.

        Every omitted feature/label argument becomes a fresh empty list, so
        instances built with the defaults never share list objects.

        Args:
            video_id: identifier of the video.
            acoustic_feature: acoustic features (defaults to an empty list).
            linguistic_feature: linguistic features (defaults to an empty list).
            visual_feature: visual features (defaults to an empty list).
            labels: labels (defaults to an empty list).
        """
        self.video_id = video_id
        self.acoustic_feature = [] if acoustic_feature is None else acoustic_feature
        self.linguistic_feature = [] if linguistic_feature is None else linguistic_feature
        self.visual_feature = [] if visual_feature is None else visual_feature
        self.labels = [] if labels is None else labels
        
def _load_aligned_words(l_file, time_inout):
    """Read a word-aligned TSV and keep the words inside the in/out interval.

    The first line of the file is skipped as a header. For every other line,
    tab-separated field 2 is taken as the word (lower-cased and stripped) and
    field 1 as its timestamp.

    Args:
        l_file: path to the TSV file.
        time_inout: `(time_in, time_out)` pair passed to
            `query_index_time_inout`.

    Returns:
        Tuple `(words, timestamps)`: the kept words as a list and their
        timestamps as a numpy array shifted so that the smallest one is 0.
    """
    words = []
    timestamps = []
    with open(l_file) as fp:
        next(fp, None)  # skip the header row
        for line in fp:
            fields = line.strip().split("\t")
            words.append(fields[2].lower().strip())
            timestamps.append(float(fields[1]))
    timestamps = np.array(timestamps)
    s_idx, e_idx = query_index_time_inout(timestamps, time_inout)
    words = words[s_idx:e_idx]
    timestamps = timestamps[s_idx:e_idx]
    timestamps = np.array([t-min(timestamps) for t in timestamps])
    return words, timestamps


def preprocess_SEND_files(
    data_dir, # Multitmodal X
    target_data_dir, # Y
    use_target_ratings,
    time_window_in_sec=4.0,
    modality_dir_map = {"acoustic": "acoustic-egemaps",
                        "linguistic": "linguistic-word-level", # we don't load features
                        "visual": "image-raw", # image is nested,
                        "target": "observer_EWE",
                       },
    preprocess= {'acoustic': lambda df : df.loc[:,' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'],
                 'acoustic_timer': lambda df : df.loc[:,' frameTime'],
                 'linguistic': lambda df : df.loc[:,'word'],
                 'linguistic_timer': lambda df : df.loc[:,'time-offset'],
                 'target': lambda df : ((df.loc[:,'evaluatorWeightedEstimate'] / 50.0) - 1.0),
                 'target_timer': lambda df : df.loc[:,'time'],
                },
    linguistic_tokenizer=None,
    pad_symbol=0,
    max_number_of_file=-1,
    time_inout_map=None,
):
    """Turn the per-video modality files into fixed-length windowed tensors.

    Every video is cut to its `(time_in, time_out)` interval, its timestamps
    are re-based to start at 0 and its samples are bucketed into windows of
    `time_window_in_sec` seconds. Per window, acoustic rows are averaged, words
    are joined and tokenized, the last image frame is loaded, and ratings are
    averaged. Sequences are then padded up to 60 windows.

    Args:
        data_dir: directory holding the acoustic, linguistic and visual
            sub-directories named by `modality_dir_map`.
        target_data_dir: directory holding the ratings sub-directory.
        use_target_ratings: when true read `target_<id>_normal.csv`, otherwise
            `results_<id>.csv`.
        time_window_in_sec: window length in seconds.
        modality_dir_map: sub-directory name for each modality.
        preprocess: column selectors applied to the loaded data frames.
        linguistic_tokenizer: tokenizer providing `tokenize`,
            `convert_tokens_to_ids` and `pad_token_id`.
        pad_symbol: unused.
        max_number_of_file: maximum number of videos to process, -1 for all.
        time_inout_map: dict from video id to `(time_in, time_out)`. Videos
            missing from it are skipped. Required in practice: the default
            `None` cannot be queried.

    Returns:
        List with one dict per processed video, holding the keys `video_id`,
        `a_feature`, `l_feature`, `l_mask`, `l_segment_ids`, `v_feature`,
        `rating`, `seq_len` and `input_mask`.
    """
    import time

    start = time.time()
    SEND_videos = []

    # Collect the video ids available for each modality and check they agree.
    a_ids = [f.split("_")[0]+"_"+f.split("_")[1]
             for f in listdir(os.path.join(data_dir, modality_dir_map["acoustic"]))
             if isfile(os.path.join(data_dir, modality_dir_map["acoustic"], f))]
    l_ids = [f.split("_")[0]+"_"+f.split("_")[1]
             for f in listdir(os.path.join(data_dir, modality_dir_map["linguistic"]))
             if isfile(os.path.join(data_dir, modality_dir_map["linguistic"], f))]
    v_ids = [f.split("_")[0]+"_"+f.split("_")[1]
             for f in listdir(os.path.join(data_dir, modality_dir_map["visual"]))
             if f != ".DS_Store"]
    assert len(a_ids) == len(l_ids) and len(l_ids) == len(v_ids)
    assert len(set(a_ids).intersection(set(l_ids))) == len(l_ids)
    assert len(set(a_ids).intersection(set(v_ids))) == len(v_ids)

    # First pass: find the longest tokenized window so that every window can
    # be padded to one common length.
    max_window_l_length = -1
    for video_id in a_ids: # pick any one!
        if video_id not in time_inout_map:
            # Same skip rule as the second pass; without it the lookup below
            # raises KeyError before the second pass can report the skip.
            continue
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_words, l_timestamps = _load_aligned_words(l_file, time_inout_map[video_id])

        # sample based on interval
        current_time = 0.0
        # different from other modality, it is essentially a list of list!
        # A leading empty window is emitted whenever there is at least one word.
        sampled_l_words = [[]] if l_timestamps.shape[0] > 0 else []
        tmp_words = []
        for i in range(0, l_timestamps.shape[0]):
            if l_timestamps[i] >= current_time+time_window_in_sec:
                sampled_l_words.append(tmp_words)
                tmp_words = [l_words[i]] # reinit the buffer
                current_time += time_window_in_sec
            else:
                tmp_words += [l_words[i]]
        # overflow
        if len(tmp_words) > 0:
            sampled_l_words.append(tmp_words)
        for window_words in sampled_l_words:
            window_str = " ".join(window_words)
            window_tokens = linguistic_tokenizer.tokenize(window_str)
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(window_tokens)
            if len(token_ids) > max_window_l_length:
                max_window_l_length = len(token_ids)

    max_window_l_length += 2 # the start and the end token

    if max_number_of_file != -1:
        logger.info(f"WARNING: Only loading #{max_number_of_file} videos.")
    max_seq_len = 60 # every sequence is padded up to this many windows
    video_count = 0
    for video_id in a_ids: # pick any one!
        if video_id not in time_inout_map:
            print(f"WARNING: skipping video {video_id} as no fMRI related data recorded.")
            continue
        if max_number_of_file != -1 and video_count >= max_number_of_file:
            break # we enforce!
        if video_count > 1 and video_count%100 == 0:
            logger.info(f"Processed #{len(SEND_videos)} videos.")
        time_inout = time_inout_map[video_id]

        # Step 1: Load rating data, and we can get window partitioned according to our interval.
        target_id = video_id.split("_")[0][2:] + "_" + video_id.split("_")[1][3:]
        if use_target_ratings:
            target_file = os.path.join(target_data_dir, modality_dir_map["target"], f"target_{target_id}_normal.csv")
        else:
            target_file = os.path.join(target_data_dir, modality_dir_map["target"], f"results_{target_id}.csv")
        target_df = pd.read_csv(target_file)
        target_ratings = np.array(preprocess["target"](target_df))
        target_timestamps = np.array(preprocess["target_timer"](target_df))
        s_idx, e_idx = query_index_time_inout(target_timestamps, time_inout)
        target_ratings = target_ratings[s_idx:e_idx]
        target_timestamps = target_timestamps[s_idx:e_idx]
        target_timestamps = np.array([t-min(target_timestamps) for t in target_timestamps])

        assert target_ratings.shape[0] == target_timestamps.shape[0]
        number_of_window = int(max(target_timestamps)//time_window_in_sec)
        windows = [(i*time_window_in_sec, (i+1)*time_window_in_sec)
                   for i in range(0, number_of_window)]
        # Tail window; computed from number_of_window so that it is also
        # correct when there is no full window at all.
        windows += [(number_of_window*time_window_in_sec, max(target_timestamps))]
        # [(0, 5], (5, 10], ...]

        # acoustic features process
        a_file = os.path.join(data_dir, modality_dir_map["acoustic"], f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))
        s_idx, e_idx = query_index_time_inout(a_timestamps, time_inout)
        a_features = a_features[s_idx:e_idx]
        a_timestamps = a_timestamps[s_idx:e_idx]
        a_timestamps = np.array([t-min(a_timestamps) for t in a_timestamps])

        a_feature_dim = a_features.shape[1]
        assert a_features.shape[0] == a_timestamps.shape[0]
        sampled_a_features_raw = [[] for _ in range(len(windows))]
        for i in range(0, a_timestamps.shape[0]):
            # floor division by the window length gives the bucket index.
            hash_in_window = int(a_timestamps[i]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_a_features_raw[hash_in_window].append(a_features[i])
        sampled_a_features = []
        for window in sampled_a_features_raw:
            # only acoustic need to consider this I think.
            if len(window) == 0:
                collate_window = np.zeros(a_feature_dim)
            else:
                collate_window = np.mean(np.array(window), axis=0)
            sampled_a_features.append(collate_window)

        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_words, l_timestamps = _load_aligned_words(l_file, time_inout)

        assert len(l_words) == l_timestamps.shape[0]

        sampled_l_features_raw = [[] for _ in range(len(windows))]
        for i in range(0, l_timestamps.shape[0]):
            hash_in_window = int(l_timestamps[i]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_l_features_raw[hash_in_window].append(l_words[i])

        sampled_l_features = []
        sampled_l_mask = []
        sampled_l_segment_ids = []
        for window in sampled_l_features_raw:
            window_str = " ".join(window)
            window = linguistic_tokenizer.tokenize(window_str)
            complete_window_word = ["[CLS]"] + window + ["[SEP]"]
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(complete_window_word)
            token_mask = [1 for _ in range(len(token_ids))]
            # pad every window up to the longest one found in the first pass.
            for _ in range(0, max_window_l_length-len(token_ids)):
                token_ids.append(linguistic_tokenizer.pad_token_id)
                token_mask.append(0)
            segment_ids = [0] * len(token_ids)
            sampled_l_features += [token_ids]
            sampled_l_mask += [token_mask]
            sampled_l_segment_ids += [segment_ids]

        # visual features process
        # for visual, we need to actively control which images we load; loading
        # all of them would exhaust memory.
        fps=30 # We may need to dynamically figure out this number?
        frame_names = []
        for f in listdir(os.path.join(data_dir, modality_dir_map["visual"], video_id)):
            if ".jpg" in f:
                frame_names += [(int(f.split("_")[0][5:])*(1.0/fps), f)]
        frame_names.sort(key=lambda x:x[0])

        v_timestamps = [f[0] for f in frame_names]
        v_frame_names = [f[1] for f in frame_names]
        s_idx, e_idx = query_index_time_inout(v_timestamps, time_inout)
        v_frame_names = v_frame_names[s_idx:e_idx]
        v_timestamps = v_timestamps[s_idx:e_idx]
        v_timestamps = np.array([t-min(v_timestamps) for t in v_timestamps])
        frame_names = [(v_timestamps[i], v_frame_names[i]) for i in range(len(v_timestamps))]

        sampled_v_features_raw = [[] for _ in range(len(windows))]
        for f in frame_names:
            hash_in_window = int(f[0]//time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # we cannot predict after ratings max.
            sampled_v_features_raw[hash_in_window].append(f)

        sampled_v_features = []
        for window in sampled_v_features_raw:
            if len(window) == 0:
                f_data = np.zeros((224,224,3))
            else:
                # we collate by using the last frame in the time window.
                f = window[-1]
                f_path = os.path.join(data_dir, modality_dir_map["visual"], video_id, f[1])
                f_image = Image.open(f_path)
                f_data = asarray(f_image)
                f_data = f_data[...,::-1] # reverse the order.
            sampled_v_features.append(f_data)

        # ratings (target): reuse the ratings loaded in step 1 instead of
        # reading and slicing the same file a second time.
        sampled_ratings_raw = [[] for _ in range(len(windows))]
        for i in range(0, target_timestamps.shape[0]):
            hash_in_window = int(target_timestamps[i]//time_window_in_sec)
            sampled_ratings_raw[hash_in_window].append(target_ratings[i])
        sampled_ratings = []
        for window in sampled_ratings_raw:
            collate_window = np.mean(np.array(window), axis=0)
            sampled_ratings.append(collate_window)

        # we truncate features to the windows covered by every modality.
        assert len(sampled_a_features) == len(sampled_l_features)
        assert len(sampled_a_features) == len(sampled_v_features)

        max_window_cutoff_l = int(max(l_timestamps)//time_window_in_sec)
        max_window_cutoff_a = int(max(a_timestamps)//time_window_in_sec)
        max_window_cutoff_v = int(frame_names[-1][0]//time_window_in_sec)
        max_window_cutoff = min([max_window_cutoff_l, max_window_cutoff_a, max_window_cutoff_v])
        sampled_a_features = sampled_a_features[:max_window_cutoff]
        sampled_l_features = sampled_l_features[:max_window_cutoff]
        sampled_v_features = sampled_v_features[:max_window_cutoff]
        sampled_ratings = sampled_ratings[:max_window_cutoff]
        sampled_l_mask = sampled_l_mask[:max_window_cutoff]
        sampled_l_segment_ids = sampled_l_segment_ids[:max_window_cutoff]
        input_mask = np.ones(len(sampled_a_features)).tolist()
        seq_len = len(sampled_a_features)
        # pad every modality (and the mask) up to max_seq_len windows.
        for _ in range(max_seq_len-len(sampled_a_features)):
            sampled_a_features.append(np.zeros(a_feature_dim))
            sampled_l_features.append(np.zeros(max_window_l_length))
            sampled_l_mask.append(np.zeros(max_window_l_length))
            sampled_l_segment_ids.append(np.zeros(max_window_l_length))
            sampled_v_features.append(np.zeros((224,224,3)))
            sampled_ratings.append(0.0)
            input_mask.append(0)

        sampled_a_features = torch.tensor(sampled_a_features)
        sampled_l_features = torch.LongTensor(sampled_l_features)
        sampled_l_mask = torch.LongTensor(sampled_l_mask)
        sampled_l_segment_ids = torch.LongTensor(sampled_l_segment_ids)
        # subtract a fixed offset from each of the three image channels.
        processed_tensor = torch.tensor(sampled_v_features).float()
        processed_tensor[..., 0] -= 91.4953
        processed_tensor[..., 1] -= 103.8827
        processed_tensor[..., 2] -= 131.0912
        sampled_v_features = processed_tensor
        sampled_ratings = torch.tensor(sampled_ratings)
        input_mask = torch.LongTensor(input_mask)

        video_struct = {
            "video_id": video_id,
            "a_feature": sampled_a_features,
            "l_feature": sampled_l_features,
            "l_mask": sampled_l_mask,
            "l_segment_ids": sampled_l_segment_ids,
            "v_feature": sampled_v_features,
            "rating": sampled_ratings,
            "seq_len": seq_len,
            "input_mask": input_mask
        }

        video_count += 1
        SEND_videos += [video_struct]

    end = time.time()
    elapsed = end - start
    logger.info(f"Time elapsed for first-pass: {elapsed}")

    return SEND_videos

In [55]:
# Load the trained model weights from a saved checkpoint (on CPU).
model = MultimodalEmotionPrediction()
new_state_dict = OrderedDict()
DEVICE = torch.device('cpu')   # 'cpu' in this case
model_path = "../fMRI/best_ccc_pytorch_model.bin"
print("loading the model from: ", model_path)
state_dict = torch.load(model_path, map_location=DEVICE)["model"]
for k, v in state_dict.items():
    # Remove the `module.` prefix only when it is really there, so keys saved
    # without it are not truncated.
    # NOTE(review): presumably the prefix comes from saving a model wrapped in
    # (Distributed)DataParallel — confirm against the training script.
    name = k[len("module."):] if k.startswith("module.") else k
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)
_ = model.eval()  # switch to evaluation mode; the return value is discarded

Some weights of the model checkpoint at bert-base-uncased were not used when initializing LinguisticEncoderBERT: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading the model from:  ../fMRI/best_ccc_pytorch_model.bin


In [56]:
# Sub-directory name used for each modality.
modality_dir_map = dict(
    acoustic="acoustic-egemaps",
    linguistic="linguistic-word-level",  # we don't load features
    visual="image-raw",  # image is nested
    target="observer_EWE",
)

# Column selectors applied to the data frame loaded for each modality; the
# `*_timer` entries pick the matching timestamp column.
preprocess = dict(
    acoustic=lambda frame: frame.loc[
        :, ' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'
    ],
    acoustic_timer=lambda frame: frame.loc[:, ' frameTime'],
    linguistic=lambda frame: frame.loc[:, 'word'],
    linguistic_timer=lambda frame: frame.loc[:, 'time-offset'],
    # divide by 50 and subtract 1, so a rating of 50 maps to 0.
    target=lambda frame: (frame.loc[:, 'evaluatorWeightedEstimate'] / 50.0) - 1.0,
    target_timer=lambda frame: frame.loc[:, 'time'],
)

In [57]:
# Directory that receives the per-video prediction CSV files written below.
output_dir = "../data-files/fMRI/"

In [58]:
# Load the `bert-base-uncased` tokenizer (non-fast implementation), keeping
# downloaded files under the local cache directory.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=False,
    cache_dir="../.huggingface_cache/"
)

In [59]:
# Build a map from video id to its (start, end) time in seconds, read from a
# CSV whose rows are `<video id>,<in timecode>,<out timecode>`.
eval_inout_path = "../data-files/fMRI/eval_inout.csv"
time_inout_map = {}


def _timecode_to_seconds(timecode):
    """Convert a colon-separated timecode string into seconds.

    Field 1 is read as minutes, field 2 as seconds and field 3 as hundredths
    of a second; field 0 is ignored.
    NOTE(review): if field 3 is really a frame count, the 0.01 factor is
    wrong — confirm against how the CSV was produced.
    """
    fields = timecode.split(":")
    return int(fields[1])*60 + int(fields[2]) + int(fields[3])*0.01


# newline="" is what the csv module asks for when opening files for a reader.
with open(eval_inout_path, newline="") as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        time_inout_map[row[0]] = (
            _timecode_to_seconds(row[1]),
            _timecode_to_seconds(row[2]),
        )

In [60]:
# Preprocess the "Valid" partition: features and ratings live in sibling
# directories under the data root.
data_dir = "../../SENDv1-data-fMRI/"
dev_modalities_data_dir = os.path.join(data_dir, "features/Valid/")
dev_target_data_dir = os.path.join(data_dir, "ratings/Valid")
dev_SEND_features = preprocess_SEND_files(
    data_dir=dev_modalities_data_dir,
    target_data_dir=dev_target_data_dir,
    use_target_ratings=False,
    modality_dir_map=modality_dir_map,
    preprocess=preprocess,
    linguistic_tokenizer=tokenizer,
    max_number_of_file=-1,
    time_inout_map=time_inout_map,
)

In [61]:
def _stack_dev(field):
    """Stack the tensors stored under `field` across all dev videos."""
    return torch.stack([video_struct[field] for video_struct in dev_SEND_features])


# Video ids, kept in the same order as the rows of the stacked tensors.
dev_video_id = [video_struct["video_id"] for video_struct in dev_SEND_features]

dev_input_a_feature = _stack_dev("a_feature").float()
dev_input_l_feature = _stack_dev("l_feature")
dev_input_l_mask = _stack_dev("l_mask")
dev_input_l_segment_ids = _stack_dev("l_segment_ids")
dev_input_v_feature = _stack_dev("v_feature").float()
dev_rating_labels = _stack_dev("rating").float()
# seq_len is a plain number per video, so it is wrapped instead of stacked.
dev_seq_lens = torch.tensor(
    [[video_struct["seq_len"]] for video_struct in dev_SEND_features]
).float()
dev_input_mask = _stack_dev("input_mask")

dev_data = TensorDataset(
    dev_input_a_feature,
    dev_input_l_feature,
    dev_input_l_mask,
    dev_input_l_segment_ids,
    dev_input_v_feature,
    dev_rating_labels,
    dev_seq_lens,
    dev_input_mask,
)
# One video per batch, in the original order.
dev_dataloader = DataLoader(dev_data, batch_size=1, shuffle=False)

In [66]:
def evaluate_ablation(
    video_id, dataloader, model, condition="A,V,L"
):
    """Run the model on every batch with only the listed modalities enabled.

    A modality whose letter ("A" acoustic, "V" visual, "L" linguistic) does
    not occur in `condition` has its feature tensor replaced by zeros before
    the forward pass.

    Only the first sample of each batch is evaluated, and `video_id` is
    indexed by batch position, so the dataloader is expected to yield one
    video per batch in the same order as `video_id`.

    Args:
        video_id: sequence of video ids, one per batch.
        dataloader: iterable of 8-tuples `(a_feature, l_feature, l_mask,
            l_segment_ids, v_feature, rating_labels, seq_lens, input_mask)`.
        model: callable returning a pair whose second item is the prediction.
        condition: string naming the modalities to keep.

    Returns:
        Tuple `(ret, ret_ccc)`: `ret` maps each video id to a dict with the
        `pred` and `true` arrays cut to the sequence length; `ret_ccc` lists
        the `eval_ccc` score of each video.
    """
    ret = {}
    ret_ccc = []
    pbar = tqdm(dataloader, desc="videos")
    # Pure inference: disable gradient tracking so no autograd graph is built.
    with torch.no_grad():
        for step, batch in enumerate(pbar):
            vid_id = video_id[step]
            ret[vid_id] = {}
            input_a_feature, input_l_feature, input_l_mask, input_l_segment_ids, \
                input_v_feature, rating_labels, seq_lens, input_mask = batch
            # based on the condition, we need to mask out some channels!
            if "A" not in condition:
                input_a_feature = torch.zeros_like(input_a_feature)
            if "V" not in condition:
                input_v_feature = torch.zeros_like(input_v_feature)
            if "L" not in condition:
                input_l_feature = torch.zeros_like(input_l_feature)
            _, output = \
                model(input_a_feature, input_l_feature, input_l_mask, input_l_segment_ids,
                      input_v_feature, rating_labels, input_mask)
            # keep only the leading seq_l entries (the rest is padding).
            seq_l = int(seq_lens[0].tolist()[0])
            pred = output[0][:seq_l].cpu().detach().numpy()
            true = rating_labels[0][:seq_l].cpu().detach().numpy()
            ret_ccc += [eval_ccc(pred, true)]
            ret[vid_id]["pred"] = pred
            ret[vid_id]["true"] = true
    return ret, ret_ccc

In [67]:
# Modality subsets to evaluate: A = acoustic, V = visual, L = linguistic.
conditions = ["A,V,L", "A,V", "A,L", "V,L", "A", "V", "L"]
# condition -> {video id -> {"pred": ..., "true": ...}}
mega_results = {}
for condition in conditions:
    print("analyzing results for condition: ", condition)
    dev_results, ret_ccc = evaluate_ablation(
        dev_video_id,
        dev_dataloader,
        model,
        condition=condition,
    )
    # shallow copy of the per-video results for this condition.
    mega_results[condition] = dict(dev_results)
    # average the per-video scores into a single number.
    ret_ccc = sum(ret_ccc)/len(ret_ccc)
    print(f"condition={condition}; ccc={ret_ccc}")

videos:   0%|          | 0/23 [00:00<?, ?it/s]

analyzing results for condition:  A,V,L


videos: 100%|██████████| 23/23 [00:32<00:00,  1.43s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=A,V,L; ccc=0.3708349489829502
analyzing results for condition:  A,V


videos: 100%|██████████| 23/23 [00:33<00:00,  1.47s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=A,V; ccc=0.07197026611342432
analyzing results for condition:  A,L


videos: 100%|██████████| 23/23 [00:33<00:00,  1.47s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=A,L; ccc=0.3708313459776084
analyzing results for condition:  V,L


videos: 100%|██████████| 23/23 [00:34<00:00,  1.49s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=V,L; ccc=0.08231625583360447
analyzing results for condition:  A


videos: 100%|██████████| 23/23 [00:32<00:00,  1.42s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=A; ccc=0.07194071184540089
analyzing results for condition:  V


videos: 100%|██████████| 23/23 [00:33<00:00,  1.44s/it]
videos:   0%|          | 0/23 [00:00<?, ?it/s]

condition=V; ccc=-0.09604260306237272
analyzing results for condition:  L


videos: 100%|██████████| 23/23 [00:33<00:00,  1.48s/it]

condition=L; ccc=0.08232090024541795





In [64]:
# Show where the per-video CSV files will be written.
print("output dir: ", output_dir)

output dir:  ../data-files/fMRI/


In [65]:
# For each video, write one CSV with a column of predictions per condition
# followed by an "actual" column holding the true ratings.
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists
for video, full_result in mega_results["A,V,L"].items():
    # newline="" is what the csv module asks for when opening files for a writer.
    with open(os.path.join(output_dir, f"{video}.csv"), "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(list(conditions) + ["actual"])
        for i in range(len(full_result["pred"])): # write line by line.
            # (x + 1) / 2 rescales each value before it is written.
            row = [
                (mega_results[condition][video]["pred"][i]+1.0)/2.0
                for condition in conditions
            ]
            # Read the true rating from the full condition explicitly instead
            # of relying on a loop variable left over from iterating conditions.
            row.append((full_result["true"][i]+1.0)/2.0)
            writer.writerow(row)

In [44]:
# Display the collected per-condition, per-video predictions and true ratings.
mega_results

{'A,V,L': {'ID120_vid4': {'pred': array([ 0.00204343,  0.03048068,  0.05848294,  0.2607689 ,  0.3158558 ,
           0.20077902,  0.04205656,  0.05471766,  0.16307807,  0.31644154,
           0.4691913 ,  0.5639667 ,  0.57912236,  0.51974314,  0.20055723,
          -0.01477933, -0.20519435, -0.2481538 , -0.3204354 , -0.33551264,
          -0.31058693,  0.00210041,  0.07723832,  0.13520497,  0.32631218,
           0.5231671 ,  0.5950488 ,  0.6421465 ,  0.5857694 ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
         dtype=float32),
   'true': array([0.4037694 , 0.48145318, 0

In [52]:
def _write_id_csv(path, ids):
    """Write `ids` to `path` as a one-column CSV under a `vid_id` header."""
    # newline="" is what the csv module asks for when opening files for a writer.
    with open(path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["vid_id"])
        writer.writerows([vid_id] for vid_id in ids)


# Save the video ids of each data partition.
# NOTE(review): `train_video_id` and `test_video_id` are not defined in the
# visible cells — confirm they exist before running this cell.
_write_id_csv("../data-files/train_ids.csv", train_video_id)
_write_id_csv("../data-files/dev_ids.csv", dev_video_id)
_write_id_csv("../data-files/test_ids.csv", test_video_id)

#### Evaluate with Hebrew Videos

In [58]:
# Switch used below to pick between the "target" and "observer" model
# checkpoint and output directory.
use_target_ratings = True

In [59]:
# Load the trained model weights from a saved checkpoint (on CPU); the
# checkpoint is chosen by `use_target_ratings`.
model = MultimodalEmotionPrediction()
new_state_dict = OrderedDict()
DEVICE = torch.device('cpu')   # 'cpu' in this case
if use_target_ratings:
    model_path = "../target/best_ccc_pytorch_model.bin"
else:
    model_path = "../observer/best_ccc_pytorch_model.bin"
print("loading the model from: ", model_path)
state_dict = torch.load(model_path, map_location=DEVICE)["model"]
for k, v in state_dict.items():
    # Remove the `module.` prefix only when it is really there, so keys saved
    # without it are not truncated.
    # NOTE(review): presumably the prefix comes from saving a model wrapped in
    # (Distributed)DataParallel — confirm against the training script.
    name = k[len("module."):] if k.startswith("module.") else k
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)
_ = model.eval()  # switch to evaluation mode; the return value is discarded

Some weights of the model checkpoint at bert-base-uncased were not used when initializing LinguisticEncoderBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading the model from:  ../target/best_ccc_pytorch_model.bin


In [60]:
# Pick the output directory that matches the selected ratings type.
output_dir = (
    "../data-files/target_hebrew/"
    if use_target_ratings
    else "../data-files/observer_hebrew/"
)

In [61]:
def preprocess_HEBREW_files(
    data_dir, # Multitmodal X
    time_window_in_sec=4.0,
    modality_dir_map = {"acoustic": "acoustic-egemaps",  
                        "linguistic": "linguistic-word-level", # we don't load features
                        "visual": "image-raw", # image is nested,
                        "target": "observer_EWE",
                       },
    preprocess= {'acoustic': lambda df : df.loc[:,' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'],
                 'acoustic_timer': lambda df : df.loc[:,' frameTime'],
                 'linguistic': lambda df : df.loc[:,'word'],
                 'linguistic_timer': lambda df : df.loc[:,'time-offset'],
                 'target': lambda df : ((df.loc[:,'evaluatorWeightedEstimate'] / 50.0) - 1.0),
                 'target_timer': lambda df : df.loc[:,'time'],
                },
    pad_symbol=0,
    max_number_of_file=-1
):
    """Load acoustic and visual features for unlabeled videos, bucketed by time window.

    Every video is cut into consecutive windows of ``time_window_in_sec``
    seconds. Acoustic frames falling in a window are averaged; for the visual
    stream the last frame (by time) of each window is loaded as an image
    array. Only windows fully covered by both streams are kept, and all videos
    are then zero-padded to the longest sequence.

    Args:
        data_dir: Root folder holding one sub-folder per modality.
        time_window_in_sec: Window length in seconds.
        modality_dir_map: Modality name -> sub-folder name. Only the
            "acoustic" and "visual" entries are read here.
        preprocess: Callables applied to the acoustic dataframe. Only
            "acoustic" (feature columns) and "acoustic_timer" (timestamps)
            are used here. The default dicts are read, never mutated.
        pad_symbol: Unused; kept for signature compatibility.
        max_number_of_file: Stop after this many videos; -1 loads all.

    Returns:
        A list with one dict per video, with keys "video_id", "a_feature"
        (tensor, windows x acoustic dim), "v_feature" (tensor, windows x
        224 x 224 x 3 for padded/empty windows), "seq_len" (number of real
        windows) and "input_mask" (LongTensor, 1 for real windows, 0 for
        padding).
    """
    SEND_videos = []

    acoustic_dir = os.path.join(data_dir, modality_dir_map["acoustic"])
    visual_dir = os.path.join(data_dir, modality_dir_map["visual"])

    # A video id is the first two "_"-separated tokens of an acoustic file name.
    a_ids = []
    for f in listdir(acoustic_dir):
        if isfile(os.path.join(acoustic_dir, f)):
            tokens = f.split("_")
            a_ids.append(tokens[0] + "_" + tokens[1])

    if max_number_of_file != -1:
        logger.info(f"WARNING: Only loading #{max_number_of_file} videos.")
    max_seq_len = -1
    video_count = 0
    for video_id in a_ids:
        if max_number_of_file != -1 and video_count >= max_number_of_file:
            break # we enforce!
        if video_count > 1 and video_count%100 == 0:
            logger.info(f"Processed #{len(SEND_videos)} videos.")

        # Step 1: read the acoustic file once; its timestamps also define the
        # window partition for this video.
        a_file = os.path.join(acoustic_dir, f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))
        a_feature_dim = a_features.shape[1]
        assert a_features.shape[0] == a_timestamps.shape[0]

        last_timestamp = max(a_timestamps)
        number_of_window = int(last_timestamp // time_window_in_sec)
        windows = [
            (k * time_window_in_sec, (k + 1) * time_window_in_sec)
            for k in range(number_of_window)
        ]
        # Trailing partial window. Computed from number_of_window rather than
        # a loop index so that clips shorter than one window work as well.
        if last_timestamp > number_of_window * time_window_in_sec:
            windows.append((number_of_window * time_window_in_sec, last_timestamp))

        # Step 2: acoustic features -> mean over the frames of each window.
        sampled_a_features_raw = [[] for _ in windows]
        for timestamp, feature in zip(a_timestamps, a_features):
            hash_in_window = int(timestamp // time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # beyond the last window
            sampled_a_features_raw[hash_in_window].append(feature)
        sampled_a_features = [
            np.mean(np.array(bucket), axis=0) if bucket else np.zeros(a_feature_dim)
            for bucket in sampled_a_features_raw
        ]

        # Step 3: visual features. Only one image per window is opened, since
        # loading every frame would not fit in memory.
        fps = 30 # We may need to dynamically figure out this number?
        video_frame_dir = os.path.join(visual_dir, video_id)
        # Frame time = frame index (the digits after the 5-character prefix of
        # the first "_" token) divided by fps.
        frame_names = [
            (int(f.split("_")[0][5:]) * (1.0 / fps), f)
            for f in listdir(video_frame_dir)
            if ".jpg" in f
        ]
        frame_names.sort(key=lambda x: x[0])
        # Frames are visited in time order, so the value left in each slot is
        # the last frame of that window.
        last_frame_per_window = [None] * len(windows)
        for frame_time, frame_file in frame_names:
            hash_in_window = int(frame_time // time_window_in_sec)
            if hash_in_window >= len(windows):
                continue # beyond the last window
            last_frame_per_window[hash_in_window] = frame_file

        sampled_v_features = []
        for frame_file in last_frame_per_window:
            if frame_file is None:
                f_data = np.zeros((224,224,3))
            else:
                # The context manager releases the file handle once the pixel
                # data has been copied into the array.
                with Image.open(os.path.join(video_frame_dir, frame_file)) as f_image:
                    f_data = asarray(f_image)
            sampled_v_features.append(f_data)

        # Step 4: keep only the windows fully covered by both modalities.
        max_window_cutoff_a = number_of_window
        max_window_cutoff_v = int(frame_names[-1][0]//time_window_in_sec)
        max_window_cutoff = min(max_window_cutoff_a, max_window_cutoff_v)
        sampled_a_features = sampled_a_features[:max_window_cutoff]
        sampled_v_features = sampled_v_features[:max_window_cutoff]

        seq_len = len(sampled_a_features)
        video_struct = {
            "video_id": video_id,
            "a_feature": sampled_a_features,
            "v_feature": sampled_v_features,
            "seq_len": seq_len,
            "input_mask": [1] * seq_len,
        }
        video_count += 1
        SEND_videos.append(video_struct)
        max_seq_len = max(max_seq_len, seq_len)

    # Zero-pad every video to the longest sequence, then convert to tensors.
    for video_struct in SEND_videos:
        for _ in range(max_seq_len - video_struct["seq_len"]):
            video_struct["a_feature"].append(np.zeros(a_feature_dim))
            video_struct["v_feature"].append(np.zeros((224,224,3)))
            video_struct["input_mask"].append(0)

        # Stack into a single ndarray first: building a tensor straight from
        # a Python list of ndarrays is very slow.
        video_struct["a_feature"] = torch.tensor(np.array(video_struct["a_feature"]))
        video_struct["v_feature"] = torch.tensor(np.array(video_struct["v_feature"]))
        video_struct["input_mask"] = torch.LongTensor(video_struct["input_mask"])

    return SEND_videos

In [16]:
# Loading all the data partitions.
# Preprocess the Hebrew test partition (no cap on the number of videos).
data_dir = "../../SENDv1-data/"
test_modalities_data_dir = os.path.join(data_dir, "features/Test-Hebrew/")
test_HEBREW_features = preprocess_HEBREW_files(
    data_dir=test_modalities_data_dir,
    max_number_of_file=-1,
    preprocess=preprocess,
    modality_dir_map=modality_dir_map,
)

In [18]:
# Collate the per-video structs into stacked tensors, one row per video, and
# wrap them in a DataLoader that yields a single video at a time, in order.
test_video_id = [entry["video_id"] for entry in test_HEBREW_features]
test_input_a_feature = torch.stack(
    [entry["a_feature"] for entry in test_HEBREW_features]
).float()
test_input_v_feature = torch.stack(
    [entry["v_feature"] for entry in test_HEBREW_features]
).float()
test_seq_lens = torch.tensor(
    [[entry["seq_len"]] for entry in test_HEBREW_features]
).float()
test_input_mask = torch.stack(
    [entry["input_mask"] for entry in test_HEBREW_features]
)
test_data = TensorDataset(
    test_input_a_feature,
    test_input_v_feature,
    test_seq_lens,
    test_input_mask,
)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=1)

In [62]:
def evaluate_ablation(
    video_id, dataloader, model, condition="A,V"
):
    """Run the model over every video with only the requested modalities enabled.

    Args:
        video_id: Video ids, in the same order the dataloader yields videos.
        dataloader: Yields ``(a_feature, v_feature, seq_lens, input_mask)``
            batches.
        model: Called as ``model(a, l, l_mask, l_segment_ids, v, labels,
            input_mask)``; its second return value is used as the prediction.
        condition: String naming the active channels. Acoustic input is
            zeroed unless it contains "A", visual input unless it contains "V".

    Returns:
        Dict mapping each video id to ``{"pred": ndarray, "true": ndarray}``,
        both truncated to the video's real sequence length. "true" is an
        all-zero placeholder because no ratings are available here.
    """
    ret = {}
    video_index = 0
    pbar = tqdm(dataloader, desc="videos")
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch in pbar:
            input_a_feature, input_v_feature, seq_lens, input_mask = batch

            # Based on the condition, mask out the disabled channels.
            if "A" not in condition:
                input_a_feature = torch.zeros_like(input_a_feature)
            if "V" not in condition:
                input_v_feature = torch.zeros_like(input_v_feature)

            # Mock linguistic and rating inputs; the model signature requires
            # them even though this data has neither.
            batch_size = input_a_feature.shape[0]
            padded_len = input_a_feature.shape[1]
            input_l_feature = torch.zeros((batch_size, padded_len, 3)).long()
            input_l_mask = torch.ones((batch_size, padded_len, 3)).long()
            input_l_segment_ids = torch.zeros((batch_size, padded_len, 3)).long()
            rating_labels = torch.zeros((batch_size, padded_len))

            _, output = \
                model(input_a_feature, input_l_feature, input_l_mask, input_l_segment_ids,
                      input_v_feature, rating_labels, input_mask)

            # Record every item of the batch so ids stay aligned with videos
            # for any batch size (assumes the first axis of `output` is the
            # batch axis, as the per-video indexing implies).
            for row in range(batch_size):
                vid_id = video_id[video_index]
                seq_l = int(seq_lens[row].tolist()[0])
                ret[vid_id] = {
                    "pred": output[row][:seq_l].cpu().detach().numpy(),
                    "true": rating_labels[row][:seq_l].cpu().detach().numpy(),
                }
                video_index += 1
    return ret

In [63]:
# Collects the ablation outputs, filled in below as
# mega_results[condition][video_id] -> {"pred": ..., "true": ...}.
mega_results = {}

In [64]:
# Evaluate the model once per modality condition and store a per-condition
# copy of the per-video results.
conditions = ["A,V", "A", "V",]
for condition in conditions:
    print("analyzing results for condition: ", condition)

    test_results = evaluate_ablation(
        test_video_id,
        test_dataloader,
        model,
        condition=condition,
    )
    mega_results[condition] = dict(test_results)

videos:   0%|          | 0/9 [00:00<?, ?it/s]

analyzing results for condition:  A,V


videos: 100%|██████████| 9/9 [03:04<00:00, 20.50s/it]
videos:   0%|          | 0/9 [00:00<?, ?it/s]

analyzing results for condition:  A


videos: 100%|██████████| 9/9 [03:16<00:00, 21.80s/it]
videos:   0%|          | 0/9 [00:00<?, ?it/s]

analyzing results for condition:  V


videos: 100%|██████████| 9/9 [03:04<00:00, 20.47s/it]


In [65]:
# Show the folder the per-video prediction CSVs are written to in the next cell.
print("output dir: ", output_dir)

output dir:  ../data-files/target_hebrew/


In [57]:
# for each video, we are creating a file to save ratings for all conditions.
conditions = ["A,V", "A", "V",]
for video in mega_results["A,V"].keys():
    with open(os.path.join(output_dir, f"{video}.csv"), "w") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        headers = [c for c in conditions]
        writer.writerow(headers)
        s_len = len(mega_results["A,V"][video]["pred"])
        for i in range(s_len): # write line by line.
            row = []
            for condition in conditions:
                norm_r = (mega_results[condition][video]["pred"][i]+1.0)/2.0
                row.append(norm_r)
            writer.writerow(row)