In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import torch

import logging
logger = logging.getLogger(__name__)

In [2]:
# Data structure of each modality after preprocessing:
# acoustic = [[[feature at time 1], [feature at time 2], [...]], [...]]
# linguistic = [[[token_1, token_2, ... at time 1], [token_1, token_2, ... at time 2], [...]], [...]]
# visual = [[[feature at time 1], [feature at time 2], [...]], [...]]

In [3]:
# Root directory of the training features, given relative to the notebook's
# working directory; passed as `data_dir` to preprocess_SEND_files below.
modalities_data_dir = "../../SENDv1-data/features/Train/"

In [21]:
# Column selectors applied to the raw per-video DataFrames.
# Each entry maps a name to a callable that takes a DataFrame and returns the
# selected column(s). The leading spaces in the acoustic column labels are part
# of the labels and must be kept.
preprocess = dict(
    # contiguous block of acoustic feature columns (label slice, both ends included)
    acoustic=lambda frame: frame.loc[
        :, ' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'
    ],
    # timestamp column of the acoustic table
    acoustic_timer=lambda frame: frame.loc[:, ' frameTime'],
    # word column of the linguistic table
    linguistic=lambda frame: frame.loc[:, 'word'],
    # timestamp column of the linguistic table
    linguistic_timer=lambda frame: frame.loc[:, 'time-offset'],
)

class InputFeature:
    """Plain container bundling one video's features and labels.

    Attributes:
        video_id: identifier of the video the features belong to.
        acoustic_feature: acoustic features for the video.
        linguistic_feature: linguistic features for the video.
        visual_feature: visual features for the video.
        labels: labels for the video.
    """

    def __init__(
        self, video_id="",
        acoustic_feature=None,
        linguistic_feature=None,
        visual_feature=None,
        labels=None,
    ):
        """Store the given values; any feature/label left as None becomes a new empty list.

        The defaults are None rather than `[]` because a list literal in the
        signature is created once and would be shared (and mutated) across
        every instance constructed without that argument.
        """
        self.video_id = video_id
        self.acoustic_feature = [] if acoustic_feature is None else acoustic_feature
        self.linguistic_feature = [] if linguistic_feature is None else linguistic_feature
        self.visual_feature = [] if visual_feature is None else visual_feature
        self.labels = [] if labels is None else labels

In [41]:
def _list_video_ids(directory, files_only=True):
    """Return the "<part0>_<part1>" name prefix of every entry in `directory`.

    `.DS_Store` entries are always skipped. When `files_only` is true, entries
    that are not regular files are skipped as well.
    """
    video_ids = []
    for name in listdir(directory):
        if name == ".DS_Store":
            continue
        if files_only and not isfile(os.path.join(directory, name)):
            continue
        parts = name.split("_")
        video_ids.append(parts[0] + "_" + parts[1])
    return video_ids


def preprocess_SEND_files(
    data_dir,
    time_window_in_sec=5,
    modality_dir_map = {"acoustic": "acoustic-egemaps",  
                        "linguistic": "linguistic-word-level", # we don't load features
                        "visual": "image-raw", # image is nested
                       },
):
    """Load the raw per-video data of each modality found under `data_dir`.

    Work in progress: only the first video is processed, its sorted
    (timestamp, frame file name) list is printed, and nothing is returned.

    Args:
        data_dir: directory containing one sub-directory per modality.
        time_window_in_sec: intended window length in seconds; not used yet.
        modality_dir_map: maps "acoustic" / "linguistic" / "visual" to the
            sub-directory name of that modality inside `data_dir`. The default
            dict is only read, never mutated.

    Raises:
        AssertionError: if the three modalities do not list the same number of
            video ids, or if the acoustic ids do not cover the others.
    """
    acoustic_dir = os.path.join(data_dir, modality_dir_map["acoustic"])
    linguistic_dir = os.path.join(data_dir, modality_dir_map["linguistic"])
    visual_dir = os.path.join(data_dir, modality_dir_map["visual"])

    # Collect the video ids seen by each modality. Visual entries are
    # per-video directories, so the regular-file filter is not applied there.
    a_ids = _list_video_ids(acoustic_dir)
    l_ids = _list_video_ids(linguistic_dir)
    v_ids = _list_video_ids(visual_dir, files_only=False)
    assert len(a_ids) == len(l_ids) and len(l_ids) == len(v_ids)
    assert len(set(a_ids).intersection(set(l_ids))) == len(l_ids)
    assert len(set(a_ids).intersection(set(v_ids))) == len(v_ids)

    for video_id in a_ids: # all three id lists were checked above, so any one works

        # acoustic features process
        a_file = os.path.join(acoustic_dir, f"{video_id}_acousticFeatures.csv")
        a_df = pd.read_csv(a_file)
        a_features = np.array(preprocess["acoustic"](a_df))
        a_timestamps = np.array(preprocess["acoustic_timer"](a_df))

        # linguistic features process
        l_file = os.path.join(linguistic_dir, f"{video_id}_aligned.tsv")
        l_df = pd.read_csv(l_file, sep='\t')
        l_words = np.array(preprocess["linguistic"](l_df))
        l_words = [w.strip().lower() for w in l_words]
        l_timestamps = np.array(preprocess["linguistic_timer"](l_df))

        # visual features process
        # For visual, we need to actively control which images we load; we
        # cannot just load all of them, it would blow up memory. So only the
        # frame file names and their timestamps are collected here.
        fps = 30 # We may need to dynamically figure out this number?
        frame_names = []
        for f in listdir(os.path.join(visual_dir, video_id)):
            if ".jpg" in f:
                # file names look like "frame<index>_...": index / fps = seconds
                frame_names.append((int(f.split("_")[0][5:]) * (1.0 / fps), f))
        frame_names.sort(key=lambda x: x[0])

        # TODO: walk frame_names and group the frames into time windows
        # (presumably of `time_window_in_sec` seconds); not implemented yet.

        print(frame_names)
        break

In [42]:
# Run the preprocessing on the directory stored in `modalities_data_dir`.
preprocess_SEND_files(modalities_data_dir)

[(0.0, 'frame0_resized.jpg'), (0.1, 'frame3_resized.jpg'), (0.2, 'frame6_resized.jpg'), (0.3, 'frame9_resized.jpg'), (0.4, 'frame12_resized.jpg'), (0.5, 'frame15_resized.jpg'), (0.6, 'frame18_resized.jpg'), (0.7, 'frame21_resized.jpg'), (0.8, 'frame24_resized.jpg'), (0.9, 'frame27_resized.jpg'), (1.0, 'frame30_resized.jpg'), (1.1, 'frame33_resized.jpg'), (1.2, 'frame36_resized.jpg'), (1.3, 'frame39_resized.jpg'), (1.4, 'frame42_resized.jpg'), (1.5, 'frame45_resized.jpg'), (1.6, 'frame48_resized.jpg'), (1.7, 'frame51_resized.jpg'), (1.8, 'frame54_resized.jpg'), (1.9, 'frame57_resized.jpg'), (2.0, 'frame60_resized.jpg'), (2.1, 'frame63_resized.jpg'), (2.2, 'frame66_resized.jpg'), (2.3, 'frame69_resized.jpg'), (2.4, 'frame72_resized.jpg'), (2.5, 'frame75_resized.jpg'), (2.6, 'frame78_resized.jpg'), (2.7, 'frame81_resized.jpg'), (2.8, 'frame84_resized.jpg'), (2.9, 'frame87_resized.jpg'), (3.0, 'frame90_resized.jpg'), (3.1, 'frame93_resized.jpg'), (3.2, 'frame96_resized.jpg'), (3.3, 'frame9