In [53]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import torch
from PIL import Image
from numpy import asarray

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
)

import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [2]:
# after process data structure for each modality
# acoustic = [[[feature at time 1], [feature at time 2], [...]], [...]]
# linguistic = [[[token_1, token_2, ... at time 1], [token_1, token_2, ... at time 2], [...]], [...]]
# visual = [[[feature at time 1], [feature at time 2], [...]], [...]]

In [8]:
modalities_data_dir = "../../SENDv1-data/features/Train/"
target_data_dir = "../../SENDv1-data/ratings/Train/"

In [32]:
preprocess = {
    'acoustic': lambda df : df.loc[:,' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'],
    'acoustic_timer': lambda df : df.loc[:,' frameTime'],
    'linguistic': lambda df : df.loc[:,'word'],
    'linguistic_timer': lambda df : df.loc[:,'time-offset'],
    'target': lambda df : df.loc[:,' rating'],
    'target_timer': lambda df : df.loc[:,'time'],
}

class InputFeature:
    
    def __init__(
        self, video_id="",
        acoustic_feature=[],
        linguistic_feature=[],
        visual_feature=[],
        labels=[],
    ):
        self.video_id = video_id
        self.acoustic_feature = acoustic_feature
        self.linguistic_feature = linguistic_feature
        self.visual_feature = visual_feature
        self.labels = labels

In [72]:
def preprocess_SEND_files(
    data_dir, # Multitmodal X
    target_dir, # Y
    time_window_in_sec=5.0,
    modality_dir_map = {"acoustic": "acoustic-egemaps",  
                        "linguistic": "linguistic-word-level", # we don't load features
                        "visual": "image-raw", # image is nested,
                        "target": "target",
                       },
    linguistic_tokenizer=None,
):
    SEND_videos = []
    
    # basically, let us gett all the video ids?
    a_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["acoustic"])) 
             if isfile(os.path.join(data_dir, modality_dir_map["acoustic"], f))]
    l_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["linguistic"])) 
             if isfile(os.path.join(data_dir, modality_dir_map["linguistic"], f))]
    v_ids = [f.split("_")[0]+"_"+f.split("_")[1] 
             for f in listdir(os.path.join(data_dir, modality_dir_map["visual"])) 
             if f != ".DS_Store"]
    assert len(a_ids) == len(l_ids) and len(l_ids) == len(v_ids)
    assert len(set(a_ids).intersection(set(l_ids))) == len(l_ids)
    assert len(set(a_ids).intersection(set(v_ids))) == len(v_ids)
    
    # We need the first pass for linguistic modality process?
    max_window_l_length = -1
    for video_id in a_ids: # pick any one!
        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))
        # sample based on interval
        current_time = 0.0
        keep_first = True
        sampled_l_words = [] # different from other modality, it is essentially a list of list!
        tmp_words = []
        for i in range(0, l_timestamps.shape[0]):
            if keep_first:
                sampled_l_words += [[]]
                keep_first = False
            if l_timestamps[i] >= current_time+time_window_in_sec:
                sampled_l_words.append(tmp_words)
                tmp_words = [l_words[i]] # reinit the buffer
                current_time += time_window_in_sec
                continue
            tmp_words += [l_words[i]]
        # overflow
        if len(tmp_words) > 0:
            sampled_l_words.append(tmp_words)
        for window_words in sampled_l_words:
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(window_words)
            if len(token_ids) > max_window_l_length:
                max_window_l_length = len(token_ids)
    max_window_l_length += 2 # the start and the end token
    
    video_count = 0
    for video_id in a_ids: # pick any one!
        if video_count > 1 and video_count%100 == 0:
            logger.info(f"Processed #{len(SEND_videos)} videos.")
            logger.info(SEND_videos[-1])
        
        # we need to fix this to get features aligned.
        
        # Step 1: Load rating data, and we can get window partitioned according to our interval.
        target_id = video_id.split("_")[0][2:] + "_" + video_id.split("_")[1][3:]
        target_file = os.path.join(target_data_dir, modality_dir_map["target"], f"target_{target_id}_normal.csv")
        target_df = pd.read_csv(target_file)
        target_ratings = np.array(preprocess["target"](target_df))
        target_timestamps = np.array(preprocess["target_timer"](target_df))
        assert target_ratings.shape[0] == target_timestamps.shape[0]
        
        
        
        
        # acoustic features process
        a_file = os.path.join(data_dir, modality_dir_map["acoustic"], f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))
        assert a_features.shape[0] == a_timestamps.shape[0]
        # sample based on interval
        current_time = 0.0
        keep_first = True
        sampled_a_features = []
        sampled_a_timestamps  = []
        tmp_a_features = []
        for i in range(0, a_timestamps.shape[0]):
            if keep_first:
                sampled_a_features += [a_features[i]]
                sampled_a_timestamps += [a_timestamps[i]]
                keep_first = False
            if a_timestamps[i] >= current_time+time_window_in_sec:
                tmp_a_features = np.mean(np.array(tmp_a_features), axis=0)
                sampled_a_features += [tmp_a_features]
                this_timestamp = current_time+time_window_in_sec
                sampled_a_timestamps += [this_timestamp]
                current_time += time_window_in_sec
                tmp_a_features = [a_features[i]]
                continue
            tmp_a_features += [a_features[i]]
        sampled_a_features = np.array(sampled_a_features)
        sampled_a_timestamps = np.array(sampled_a_timestamps)
        assert sampled_a_features.shape[0] == sampled_a_timestamps.shape[0]
        
        
        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))
        assert len(l_words) == l_timestamps.shape[0]
        # sample based on interval
        current_time = 0.0
        keep_first = True
        sampled_l_words = [] # different from other modality, it is essentially a list of list!
        sampled_l_timestamps = []
        tmp_words = []
        for i in range(0, l_timestamps.shape[0]):
            if keep_first:
                sampled_l_words += [[]]
                sampled_l_timestamps += [0.0]
                keep_first = False
            if l_timestamps[i] >= current_time+time_window_in_sec:
                sampled_l_words.append(tmp_words)
                this_timestampe = current_time+time_window_in_sec
                sampled_l_timestamps += [this_timestampe]
                tmp_words = [l_words[i]] # reinit the buffer
                current_time += time_window_in_sec
                continue
            tmp_words += [l_words[i]]
        # overflow
        if len(tmp_words) > 0:
            sampled_l_words.append(tmp_words)
            this_timestampe = current_time+time_window_in_sec
            sampled_l_timestamps += [this_timestampe]
        sampled_l_timestamps = np.array(sampled_l_timestamps)
        sampled_l_token_ids = []
        sampled_l_window_length = []
        for window_words in sampled_l_words:
            complete_window_word = ["[CLS]"] + window_words + ["[SEP]"]
            token_ids = linguistic_tokenizer.convert_tokens_to_ids(complete_window_word)
            sampled_l_window_length += [len(token_ids)]
            for _ in range(0, max_window_l_length-len(token_ids)):
                token_ids.append(linguistic_tokenizer.pad_token_id)
            sampled_l_token_ids += [token_ids]
        assert len(sampled_l_words) == sampled_l_timestamps.shape[0]
        assert len(sampled_l_token_ids) == sampled_l_timestamps.shape[0]
        
        
        # visual features process
        # for visual, we actually need to active control what image we load, we
        # cannot just load all images, it will below memory.
        fps=30 # We may need to dynamically figure out this number?
        frame_names = []
        for f in listdir(os.path.join(data_dir, modality_dir_map["visual"], video_id)):
            if ".jpg" in f:
                frame_names += [(int(f.split("_")[0][5:])*(1.0/fps), f)]
        frame_names.sort(key=lambda x:x[0])
        sampled_frames = []
        current_time = 0.0
        keep_first = True
        for f in frame_names:
            if keep_first:
                sampled_frames += [f]
                keep_first = False
            if f[0] >= current_time+time_window_in_sec:
                sampled_frames += [f]
                current_time += time_window_in_sec
        v_images = []
        v_timestamps = []
        for f in sampled_frames:
            f_path = os.path.join(data_dir, modality_dir_map["visual"], video_id, f[1])
            f_image = Image.open(f_path)
            f_data = asarray(f_image)
            v_images += [f_data] 
            v_timestamps += [f[0]]
        v_images = np.array(v_images)
        v_timestamps = np.array(v_timestamps)
        
        
        # ratings (target)
        target_id = video_id.split("_")[0][2:] + "_" + video_id.split("_")[1][3:]
        target_file = os.path.join(target_data_dir, modality_dir_map["target"], f"target_{target_id}_normal.csv")
        target_df = pd.read_csv(target_file)
        target_ratings = np.array(preprocess["target"](target_df))
        target_timestamps = np.array(preprocess["target_timer"](target_df))
        assert target_ratings.shape[0] == target_timestamps.shape[0]
        # sample based on interval
        current_time = 0.0
        keep_first = True
        sampled_target_ratings = []
        sampled_target_timestamps  = []
        tmp_target_ratings = []
        for i in range(0, target_timestamps.shape[0]):
            if keep_first:
                sampled_target_ratings += [0.5]
                sampled_target_timestamps += [0.0]
                keep_first = False
            if target_timestamps[i] >= current_time+time_window_in_sec:
                this_rating = sum(tmp_target_ratings)/len(tmp_target_ratings)
                sampled_target_ratings += [this_rating]
                this_timestampe = current_time+time_window_in_sec
                sampled_target_timestamps += [this_timestampe]
                tmp_target_ratings = [target_ratings[i]]
                current_time += time_window_in_sec
                continue
            tmp_target_ratings += [target_ratings[i]]
        # overflow
        if len(tmp_target_ratings) > 0:
            this_rating = sum(tmp_target_ratings)/len(tmp_target_ratings)
            sampled_target_ratings.append(this_rating)
            this_timestampe = current_time+time_window_in_sec
            sampled_target_timestamps += [this_timestampe]
        sampled_target_ratings = np.array(sampled_target_ratings)
        sampled_target_timestamps = np.array(sampled_target_timestamps)
        assert sampled_target_ratings.shape[0] == sampled_target_timestamps.shape[0]
        
        video_struct = {
            "video_id": video_id,
            "a_feature": sampled_a_features,
            "a_timer": sampled_a_timestamps,
            "l_feature": sampled_l_token_ids,
            "l_inwindow_length": sampled_l_window_length,
            "l_timer": sampled_l_timestamps,
            "v_feature": v_images,
            "v_timer": v_timestamps,
            "r": sampled_target_ratings,
            "r_timer": sampled_target_timestamps,
        }
        video_count += 1
        SEND_videos += [video_struct]
        print(video_struct)
        
        break
        
    return SEND_videos

In [48]:
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=False,
    cache_dir="../.huggingface_cache/"
)

In [73]:
SEND_features = preprocess_SEND_files(
    modalities_data_dir,
    target_data_dir,
    linguistic_tokenizer=tokenizer,
)

{'video_id': 'ID162_vid4', 'a_feature': array([[ 1.96395800e+01,  9.25502700e-02,  1.79369000e+01, ...,
         5.00000000e-02,  3.00000000e-02, -3.19951500e+01],
       [ 2.33612200e+01,  1.36073739e-01,  2.04801220e+01, ...,
         6.71666660e-02,  1.81246690e-02, -3.00094960e+01],
       [ 2.00474110e+01,  1.68061494e-01,  1.73047470e+01, ...,
         6.30000000e-02,  3.00000000e-03, -3.51375060e+01],
       ...,
       [ 1.94123810e+01,  1.45793940e-01,  1.71546900e+01, ...,
         6.96666660e-02,  1.30912060e-02, -3.61036850e+01],
       [ 1.90082020e+01,  1.58153399e-01,  1.66785640e+01, ...,
         8.81666670e-02,  1.61996730e-02, -3.75135450e+01],
       [ 1.90940720e+01,  1.35960025e-01,  1.67032110e+01, ...,
         4.73333330e-02,  1.45042950e-02, -3.53600390e+01]]), 'a_timer': array([  0.,   5.,  10.,  15.,  20.,  25.,  30.,  35.,  40.,  45.,  50.,
        55.,  60.,  65.,  70.,  75.,  80.,  85.,  90.,  95., 100., 105.,
       110., 115., 120., 125., 130., 135., 14