In [1]:
!pip3 -q install h5py validators tqdm numpy argparse requests colorama

In [None]:
!git clone https://github.com/CMU-MultiComp-Lab/CMU-MultimodalSDK.git
!cp -r CMU-MultimodalSDK/mmsdk ./
!rm -rf CMU-MultimodalSDK

In [13]:
import mmsdk
import os
import re
import numpy as np
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

DATA_PATH = './cmu_mosi/'
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)
    
# download highlevel features, low-level (raw) data and labels for the dataset MOSI
# if the files are already present, instead of downloading it you just load it yourself.
# here we use CMU_MOSI dataset as example.

DATASET = md.cmu_mosi
# obtain the train/dev/test splits - these splits are based on video IDs
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

try:
    md.mmdataset(DATASET.highlevel, DATA_PATH)
except RuntimeError:
    print("High-level features have been downloaded previously.")

try:
    md.mmdataset(DATASET.raw, DATA_PATH)
except RuntimeError:
    print("Raw data have been downloaded previously.")
    
try:
    md.mmdataset(DATASET.labels, DATA_PATH)
except RuntimeError:
    print("Labels have been downloaded previously.")

[91m[1m[2024-01-20 06:01:21.704] | Error   | [0m./cmu_mosi/CMU_MOSI_TimestampedWordVectors.csd file already exists ...
High-level features have been downloaded previously.
[91m[1m[2024-01-20 06:01:21.705] | Error   | [0m./cmu_mosi/CMU_MOSI_TimestampedWords.csd file already exists ...
Raw data have been downloaded previously.
[91m[1m[2024-01-20 06:01:21.705] | Error   | [0m./cmu_mosi/CMU_MOSI_Opinion_Labels.csd file already exists ...
Labels have been downloaded previously.


In [14]:
data_files = os.listdir(DATA_PATH)
print('\n'.join(data_files))

CMU_MOSI_TimestampedWordVectors.csd
CMU_MOSI_Visual_Facet_42.csd
CMU_MOSI_Opinion_Labels.csd
CMU_MOSI_TimestampedWords.csd
CMU_MOSI_TimestampedPhones.csd
CMU_MOSI_openSMILE_IS09.csd
CMU_MOSI_OpenSmile_EB10.csd
CMU_MOSI_Visual_Facet_41.csd


In [15]:
# define your different modalities - refer to the filenames of the CSD files
visual_field = 'CMU_MOSI_Visual_Facet_42'
acoustic_field = 'CMU_MOSI_OpenSmile_EB10'
text_field = 'CMU_MOSI_TimestampedWords'
label_field = 'CMU_MOSI_Opinion_Labels'

features = [
    text_field, 
    visual_field, 
    acoustic_field,
]

recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
dataset = md.mmdataset(recipe)

[92m[1m[2024-01-20 06:01:27.656] | Success | [0mComputational sequence read from file ./cmu_mosi/CMU_MOSI_TimestampedWords.csd ...
[94m[1m[2024-01-20 06:01:27.662] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-01-20 06:01:27.662] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                   

[92m[1m[2024-01-20 06:01:27.693] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-01-20 06:01:27.693] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-01-20 06:01:27.693] | Success | [0mComputational sequence read from file ./cmu_mosi/CMU_MOSI_Visual_Facet_42.csd ...
[94m[1m[2024-01-20 06:01:27.698] | Status  | [0mChecking the integrity of the <FACET_4.2> computational sequence ...
[94m[1m[2024-01-20 06:01:27.698] | Status  | [0mChecking the format of the data in <FACET_4.2> computational sequence ...


                                                                   

[92m[1m[2024-01-20 06:01:27.727] | Success | [0m<FACET_4.2> computational sequence data in correct format.
[94m[1m[2024-01-20 06:01:27.727] | Status  | [0mChecking the format of the metadata in <FACET_4.2> computational sequence ...
[92m[1m[2024-01-20 06:01:27.728] | Success | [0mComputational sequence read from file ./cmu_mosi/CMU_MOSI_OpenSmile_EB10.csd ...
[94m[1m[2024-01-20 06:01:27.731] | Status  | [0mChecking the integrity of the <OpenSmile_emobase2010> computational sequence ...
[94m[1m[2024-01-20 06:01:27.731] | Status  | [0mChecking the format of the data in <OpenSmile_emobase2010> computational sequence ...


                                                                   

[92m[1m[2024-01-20 06:01:27.777] | Success | [0m<OpenSmile_emobase2010> computational sequence data in correct format.
[94m[1m[2024-01-20 06:01:27.777] | Status  | [0mChecking the format of the metadata in <OpenSmile_emobase2010> computational sequence ...
[92m[1m[2024-01-20 06:01:27.777] | Success | [0mDataset initialized successfully ... 




In [16]:
print(list(dataset.keys()))
print("=" * 80)

print(list(dataset[visual_field].keys())[:10])
print("=" * 80)

some_id = list(dataset[visual_field].keys())[15]
print(list(dataset[visual_field][some_id].keys()))
print("=" * 80)

print(dataset[visual_field][some_id]['features'].shape, dataset[visual_field][some_id]['intervals'].shape)
print(dataset[text_field][some_id]['features'].shape, dataset[text_field][some_id]['intervals'].shape)
print(dataset[acoustic_field][some_id]['features'].shape, dataset[acoustic_field][some_id]['intervals'].shape)

print("Different modalities have different number of time steps!")

['CMU_MOSI_TimestampedWords', 'CMU_MOSI_Visual_Facet_42', 'CMU_MOSI_OpenSmile_EB10']
['03bSnISJMiM', '0h-zjBukYpk', '1DmNV9C1hbY', '1iG0909rllw', '2WGyTLYerpo', '2iD-tVS8NPw', '5W7Z1C_fDaE', '6Egk_28TtTM', '6_0THN4chvY', '73jzhE8R1TQ']
['features', 'intervals']
(5403, 35) (5403, 2)
(645, 1) (645, 2)
(25, 1585) (25, 2)
Different modalities have different number of time steps!


In [5]:
print(f"intervals -> \n{dataset[visual_field][some_id]['intervals'][:]}")
print(f"\nfeatures -> \n{dataset[visual_field][some_id]['features'][:]}")

intervals -> 
[[0.00000e+00 3.33333e-02]
 [3.33333e-02 6.66667e-02]
 [6.66667e-02 1.00000e-01]
 ...
 [1.80000e+02 1.80033e+02]
 [1.80033e+02 1.80067e+02]
 [1.80067e+02 1.80100e+02]]

features -> 
[[-2.25762    0.206646  -1.12043   ... -2.705     -6.46052    4.66611  ]
 [-2.26915    0.287491  -1.13806   ... -2.69984   -6.46135    4.66685  ]
 [-2.91823   -0.35709   -0.474069  ... -0.0619697 -4.37588    3.71145  ]
 ...
 [-1.80459   -0.191162  -0.915293  ...  4.56445   -0.172315   3.0596   ]
 [-1.78493   -0.190438  -0.689186  ...  4.20761    0.0100678  3.15838  ]
 [-1.83387   -0.167352  -0.693682  ...  4.16486    0.0650193  3.12331  ]]


In [11]:
!pip3 uninstall gensim -y

Found existing installation: gensim 4.3.2
Uninstalling gensim-4.3.2:
  Successfully uninstalled gensim-4.3.2


In [24]:
from collections import defaultdict
# load the embeddings
embed_dim = 300
unk = '<UNK>'
pad = '<PAD>'
wordset = set([unk, pad])

for id in train_split:
    for w in dataset[text_field][id]['features']:
        word = w[0].decode()
        if word != 'sp':
            wordset.add(word)

word2id = defaultdict()
id2word = defaultdict()
for i, word in enumerate(sorted(list(wordset))):
    word2id[word] = i
    id2word[i] = word
    
unk_id = word2id[unk]
word2id.default_factory = lambda:unk_id
id2word.default_factory = lambda:unk

def load_glove():
    file = f'./nlpword2vecembeddingspretrained/glove.6B.{embed_dim}d.txt'
    embeddings_index = dict()
    f = open(file)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = np.mean(all_embs), np.std(all_embs)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(wordset), embed_dim))
    embeddedCount = 0
    not_found = []
    for word, idx in word2id.items():
        embedding_vector = embeddings_index.get(word.lower())
        if word == pad:
            embedding_vector = np.zeros_like(emb_mean)
        if embedding_vector is not None: 
            embedding_matrix[idx] = embedding_vector
            embeddedCount += 1
    print('total embedded:',embeddedCount,'common words')
    return embedding_matrix
    
def load_word2vec():
    file = './nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin'
    word2vec = KeyedVectors.load_word2vec_format(file, binary=True)
    embedding_matrix = np.random.normal(size=(len(wordset), embed_dim))
    for word, idx in word2id.items():
        if word in word2vec.vocab: 
            embedding_matrix[idx] = embedding_vector
        elif word == pad:
            embedding_matrix[idx] = np.zeros_like(emb_mean)
    return embedding_matrix

emb = load_glove()

Loaded 400000 word vectors.
total embedded: 2577 common words


In [25]:
# we define a simple averaging function that does not depend on intervals
def avg(intervals: np.array, features: np.array) -> np.array:
    try:
        return np.average(features, axis=0)
    except:
        return features

# first we align to words with averaging, collapse_function receives a list of functions
dataset.align(text_field, collapse_functions=[avg])

[94m[1m[2024-01-20 06:11:42.108] | Status  | [0mUnify was called ...
[92m[1m[2024-01-20 06:11:42.109] | Success | [0mUnify completed ...
[94m[1m[2024-01-20 06:11:42.109] | Status  | [0mPre-alignment based on <CMU_MOSI_TimestampedWords> computational sequence started ...
[94m[1m[2024-01-20 06:11:42.235] | Status  | [0mPre-alignment done for <CMU_MOSI_OpenSmile_EB10> ...
[94m[1m[2024-01-20 06:11:42.748] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_42> ...
[94m[1m[2024-01-20 06:11:42.765] | Status  | [0mAlignment starting ...


                                                                                              

[92m[1m[2024-01-20 06:12:21.025] | Success | [0mAlignment to <CMU_MOSI_TimestampedWords> complete.
[94m[1m[2024-01-20 06:12:21.025] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-01-20 06:12:21.027] | Success | [0mInitialized empty <CMU_MOSI_TimestampedWords> computational sequence.
[94m[1m[2024-01-20 06:12:21.027] | Status  | [0mChecking the format of the data in <CMU_MOSI_TimestampedWords> computational sequence ...


                                                                      

[92m[1m[2024-01-20 06:12:21.093] | Success | [0m<CMU_MOSI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:21.093] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_TimestampedWords> computational sequence ...
[92m[1m[2024-01-20 06:12:21.094] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_42> computational sequence.
[94m[1m[2024-01-20 06:12:21.094] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_42> computational sequence ...


                                                                      

[92m[1m[2024-01-20 06:12:21.158] | Success | [0m<CMU_MOSI_Visual_Facet_42> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:21.159] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_42> computational sequence ...
[92m[1m[2024-01-20 06:12:21.159] | Success | [0mInitialized empty <CMU_MOSI_OpenSmile_EB10> computational sequence.
[94m[1m[2024-01-20 06:12:21.159] | Status  | [0mChecking the format of the data in <CMU_MOSI_OpenSmile_EB10> computational sequence ...


                                                                      

[92m[1m[2024-01-20 06:12:21.204] | Success | [0m<CMU_MOSI_OpenSmile_EB10> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:21.204] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_OpenSmile_EB10> computational sequence ...




In [26]:
# we add and align to lables to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2024-01-20 06:12:25.119] | Success | [0mComputational sequence read from file ./cmu_mosi/CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2024-01-20 06:12:25.132] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2024-01-20 06:12:25.132] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2024-01-20 06:12:25.199] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:25.199] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[94m[1m[2024-01-20 06:12:25.199] | Status  | [0mUnify was called ...
[92m[1m[2024-01-20 06:12:25.281] | Success | [0mUnify completed ...
[94m[1m[2024-01-20 06:12:25.283] | Status  | [0mPre-alignment based on <CMU_MOSI_Opinion_Labels> computational sequence started ...




[94m[1m[2024-01-20 06:12:25.391] | Status  | [0mPre-alignment done for <CMU_MOSI_TimestampedWords> ...
[94m[1m[2024-01-20 06:12:25.516] | Status  | [0mPre-alignment done for <CMU_MOSI_OpenSmile_EB10> ...
[94m[1m[2024-01-20 06:12:25.607] | Status  | [0mPre-alignment done for <CMU_MOSI_Visual_Facet_42> ...
[94m[1m[2024-01-20 06:12:25.610] | Status  | [0mAlignment starting ...


                                                                                              

[92m[1m[2024-01-20 06:12:27.535] | Success | [0mAlignment to <CMU_MOSI_Opinion_Labels> complete.
[94m[1m[2024-01-20 06:12:27.535] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-01-20 06:12:27.666] | Success | [0mInitialized empty <CMU_MOSI_TimestampedWords> computational sequence.
[94m[1m[2024-01-20 06:12:27.667] | Status  | [0mChecking the format of the data in <CMU_MOSI_TimestampedWords> computational sequence ...


                                                                     

[92m[1m[2024-01-20 06:12:27.674] | Success | [0m<CMU_MOSI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:27.676] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_TimestampedWords> computational sequence ...
[92m[1m[2024-01-20 06:12:27.679] | Success | [0mInitialized empty <CMU_MOSI_Visual_Facet_42> computational sequence.
[94m[1m[2024-01-20 06:12:27.680] | Status  | [0mChecking the format of the data in <CMU_MOSI_Visual_Facet_42> computational sequence ...


                                                                     

[92m[1m[2024-01-20 06:12:27.688] | Success | [0m<CMU_MOSI_Visual_Facet_42> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:27.689] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Visual_Facet_42> computational sequence ...
[92m[1m[2024-01-20 06:12:27.693] | Success | [0mInitialized empty <CMU_MOSI_OpenSmile_EB10> computational sequence.
[94m[1m[2024-01-20 06:12:27.694] | Status  | [0mChecking the format of the data in <CMU_MOSI_OpenSmile_EB10> computational sequence ...


                                                                     

[92m[1m[2024-01-20 06:12:27.702] | Success | [0m<CMU_MOSI_OpenSmile_EB10> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:27.703] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_OpenSmile_EB10> computational sequence ...
[92m[1m[2024-01-20 06:12:27.704] | Success | [0mInitialized empty <CMU_MOSI_Opinion_Labels> computational sequence.
[94m[1m[2024-01-20 06:12:27.704] | Status  | [0mChecking the format of the data in <CMU_MOSI_Opinion_Labels> computational sequence ...


                                                                     

[92m[1m[2024-01-20 06:12:27.708] | Success | [0m<CMU_MOSI_Opinion_Labels> computational sequence data in correct format.
[94m[1m[2024-01-20 06:12:27.708] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Opinion_Labels> computational sequence ...




In [27]:
print(list(dataset[label_field].keys())[:10])
print("=" * 80)

some_id = np.random.choice(list(dataset[label_field].keys()))

print(dataset[visual_field][some_id]['features'].shape, dataset[visual_field][some_id]['intervals'].shape)
print(dataset[text_field][some_id]['features'].shape, dataset[text_field][some_id]['intervals'].shape)
print(dataset[acoustic_field][some_id]['features'].shape, dataset[acoustic_field][some_id]['intervals'].shape)
print(dataset[label_field][some_id]['features'].shape, dataset[label_field][some_id]['intervals'].shape)

['03bSnISJMiM[0]', '03bSnISJMiM[1]', '03bSnISJMiM[2]', '03bSnISJMiM[3]', '03bSnISJMiM[4]', '03bSnISJMiM[5]', '03bSnISJMiM[6]', '03bSnISJMiM[7]', '03bSnISJMiM[8]', '03bSnISJMiM[9]']
(15, 35) (15, 2)
(15, 1) (15, 2)
(15, 1585) (15, 2)
(1, 1) (1, 2)


In [29]:
def padder(instance, length, value=0):
    n = instance.shape[0]
    q, ex = n // length, n % length
    ret = []
    for i in range(q):
        l = i * length
        r = l + length
        ret.append((instance[l:r], np.ones(length, dtype=np.int32)))
    if ex != 0:
        mask = np.zeros(length, dtype=np.int32)
        mask[:ex] = 1
        cur = np.pad(instance[-ex:], pad_width=((0,length-ex),(0,0)), mode='constant', constant_values=value)
        ret.append((cur, mask))
    return ret

In [30]:
EPS = 0
# place holders for the final train/dev/test dataset
train = []
dev = []
test = []

# define a regular expression to extract the video ID out of the keys
pattern = re.compile('(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues
maxlen = 50

my_data = {}
for typ in ['train', 'test', 'valid']:
    my_data[typ] = {'vision':[], 'audio':[], 'text':[], 'labels':[], 'mask':[]}
                       
for segment in dataset[label_field].keys():
    # get the video ID and the features out of the aligned dataset
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[text_field][segment]['features']
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)
    
    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    words = []
    visual = []
    acoustic = []
    for i, word in enumerate(_words):
        w = word[0].decode().lower()
        if w != 'sp':
            words.append(emb[word2id[w]]) # SDK stores strings as bytes, decode into strings here
            visual.append(_visual[i])
            acoustic.append(_acoustic[i])

    words = np.asarray(words)
    visual = np.asarray(visual)
    acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
    acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
    
    if vid in dev_split:
        typ = 'valid'
    elif vid in test_split:
        typ = 'test'
    else:
        typ = 'train'

    for i in range(words.shape[0]):
        my_data[typ]['vision'].append(visual[i])
        my_data[typ]['audio'].append(acoustic[i])
        my_data[typ]['text'].append(words[i])
        my_data[typ]['labels'].append(label[0])
        
    mask = None
    for vision, mask in padder(visual, maxlen, 0):
        my_data[typ]['vision'].append(vision)
        my_data[typ]['mask'].append(mask)
        my_data[typ]['labels'].append(label)
    for audio, mask in padder(acoustic, maxlen, 0):
        my_data[typ]['audio'].append(audio)
    for text, mask in padder(words, maxlen, word2id[pad]):
        my_data[typ]['text'].append(text)

print(f"Total number of {num_drop} datapoints have been dropped.")

  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
  visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))


Total number of 0 datapoints have been dropped.


In [33]:
import pickle

with open('custom_mosi.pickle', 'wb') as fp:
    pickle.dump(my_data, fp, protocol=pickle.HIGHEST_PROTOCOL)