In [12]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import torch
from PIL import Image
from numpy import asarray

import logging
# Module-level logger, named after the current module via __name__.
logger = logging.getLogger(__name__)

In [2]:
# Data structure of each modality after preprocessing:
# acoustic = [[[feature at time 1], [feature at time 2], [...]], [...]]
# linguistic = [[[token_1, token_2, ... at time 1], [token_1, token_2, ... at time 2], [...]], [...]]
# visual = [[[feature at time 1], [feature at time 2], [...]], [...]]

In [3]:
# Relative path to the feature directory that is handed to preprocess_SEND_files();
# that function looks for one sub-directory per modality underneath it.
modalities_data_dir = "../../SENDv1-data/features/Train/"

In [4]:
# Column selectors, keyed by modality. Each entry takes the DataFrame read from
# that modality's file and returns the columns of interest:
#   - 'acoustic'          -> the contiguous, label-inclusive column range of features
#   - 'acoustic_timer'    -> the timestamp column of the acoustic table
#   - 'linguistic'        -> the word column
#   - 'linguistic_timer'  -> the timestamp column of the word table
# Note that the acoustic column labels start with a space.
preprocess = dict(
    acoustic=lambda frame: frame.loc[
        :, slice(' F0semitoneFrom27.5Hz_sma3nz_amean', ' equivalentSoundLevel_dBp')
    ],
    acoustic_timer=lambda frame: frame.loc[:, ' frameTime'],
    linguistic=lambda frame: frame.loc[:, 'word'],
    linguistic_timer=lambda frame: frame.loc[:, 'time-offset'],
)

class InputFeature:
    """Plain container bundling the features of one video across modalities.

    Attributes:
        video_id: Identifier of the video the features belong to.
        acoustic_feature: Acoustic features supplied by the caller.
        linguistic_feature: Linguistic features supplied by the caller.
        visual_feature: Visual features supplied by the caller.
        labels: Labels supplied by the caller.
    """

    def __init__(
        self, video_id="",
        acoustic_feature=None,
        linguistic_feature=None,
        visual_feature=None,
        labels=None,
    ):
        """Store the given values; any omitted (``None``) field becomes a new empty list.

        ``None`` sentinels are used instead of ``[]`` defaults because a list
        default is created once at definition time and would be shared by
        every instance constructed without that argument.
        """
        self.video_id = video_id
        # An explicitly passed object is stored as-is (no copy), as before.
        self.acoustic_feature = [] if acoustic_feature is None else acoustic_feature
        self.linguistic_feature = [] if linguistic_feature is None else linguistic_feature
        self.visual_feature = [] if visual_feature is None else visual_feature
        self.labels = [] if labels is None else labels

In [23]:
def preprocess_SEND_files(
    data_dir,
    time_window_in_sec=5.0,
    modality_dir_map = {"acoustic": "acoustic-egemaps",  
                        "linguistic": "linguistic-word-level", # we don't load features
                        "visual": "image-raw", # image is nested
                       },
    keep_first=True,
    fps=30,
):
    """Load the raw per-modality data of a video and sample its frames by time window.

    The video ids are discovered from the entries of each modality folder and
    checked for consistency. For a video, the acoustic CSV and the word-level
    TSV are read in full, and the image frames are sub-sampled: optionally the
    first frame, then the first frame at or past each successive
    ``time_window_in_sec`` boundary. Only the sampled frames are opened.

    NOTE(review): this function is still work in progress. It stops after the
    first video (see the ``break``), does no window sampling for the acoustic
    and linguistic modalities yet, and discards everything it loads.

    Args:
        data_dir: Directory containing one sub-directory per modality.
        time_window_in_sec: Width of the sampling window, in seconds.
        modality_dir_map: Maps "acoustic" / "linguistic" / "visual" to the
            sub-directory name of that modality. It is only read, never mutated.
        keep_first: If true, the earliest frame of a video is always sampled.
        fps: Frame rate used to convert a frame index into a timestamp.

    Returns:
        None.

    Raises:
        AssertionError: If the modalities do not list the same video ids.
    """

    def _list_video_ids(modality, files_only):
        # Build "<part0>_<part1>" from every entry of the modality folder.
        # Hidden entries (".DS_Store", "._*") are skipped: they are not data
        # and would either break the split below or yield a bogus id.
        root = os.path.join(data_dir, modality_dir_map[modality])
        ids = []
        for name in listdir(root):
            if name.startswith("."):
                continue
            if files_only and not isfile(os.path.join(root, name)):
                continue
            parts = name.split("_")
            ids.append(parts[0] + "_" + parts[1])
        return ids

    # Collect the video ids seen by each modality; visual entries are
    # directories (one per video), the other two are files.
    a_ids = _list_video_ids("acoustic", files_only=True)
    l_ids = _list_video_ids("linguistic", files_only=True)
    v_ids = _list_video_ids("visual", files_only=False)
    assert len(a_ids) == len(l_ids) and len(l_ids) == len(v_ids), \
        "modalities list a different number of videos"
    assert len(set(a_ids).intersection(set(l_ids))) == len(l_ids), \
        "acoustic and linguistic video ids differ"
    assert len(set(a_ids).intersection(set(v_ids))) == len(v_ids), \
        "acoustic and visual video ids differ"

    for video_id in a_ids: # pick any one!

        # acoustic features process
        a_file = os.path.join(data_dir, modality_dir_map["acoustic"], f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))
        # TODO: sample based on interval (not implemented yet)
        sampled_a_features = []
        sampled_a_timestamps = []

        # linguistic features process
        l_file = os.path.join(data_dir, modality_dir_map["linguistic"], f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))
        # TODO: sample based on interval (not implemented yet)
        sampled_l_words = [] # different from other modality, it is essentially a list of list!
        sampled_l_timestamps = []

        # visual features process
        # For visual, we must actively control which images are loaded; loading
        # every frame would exhaust memory.
        frame_dir = os.path.join(data_dir, modality_dir_map["visual"], video_id)
        frame_names = []
        for f in listdir(frame_dir):
            if ".jpg" in f:
                # The frame index is whatever follows the first five characters
                # of the name's first "_"-separated part.
                # presumably names look like "<5-char prefix><index>_....jpg" — TODO confirm
                frame_names += [(int(f.split("_")[0][5:])*(1.0/fps), f)]
        frame_names.sort(key=lambda x:x[0])

        sampled_frames = []
        current_time = 0.0
        # Per-video copy of the flag: consuming the parameter itself would
        # silently disable keep_first for every video after the first one.
        take_first = keep_first
        for f in frame_names:
            if take_first:
                sampled_frames += [f]
                take_first = False
            if f[0] >= current_time+time_window_in_sec:
                sampled_frames += [f]
                current_time += time_window_in_sec

        v_images = []
        v_timestamps = []
        for f in sampled_frames:
            f_path = os.path.join(frame_dir, f[1])
            # Close the file handle as soon as the pixels are copied out, so
            # handles do not pile up across frames.
            with Image.open(f_path) as f_image:
                f_data = asarray(f_image)
            v_images += [f_data]
            v_timestamps += [f[0]]
        v_images = np.array(v_images)
        v_timestamps = np.array(v_timestamps)

        # NOTE(review): only the first video is processed for now.
        break

In [24]:
# Run the preprocessing on the configured feature directory.
# NOTE(review): the function as defined above has no print and no return value,
# so the saved output "(308,)" under this cell presumably comes from an earlier
# version of the function — re-run on a fresh kernel to confirm.
preprocess_SEND_files(modalities_data_dir)

(308,)
