In [10]:
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

In [2]:
with open('./custom_mosi.pickle', 'rb') as fp:
    data = pickle.load(fp)

In [5]:
# Model Definition

# Modality Specific Module
class ModalitySpecificModule:
    @staticmethod
    def extract_modality_specific_interactions(input_layer):
        bi_gru_output = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))(input_layer)
        msl_l1 = tf.keras.layers.Dense(128, activation='relu')(bi_gru_output)
        msl_output = tf.keras.layers.Dense(64, activation='relu')(msl_l1)
        return msl_output

    # Input Definition
    text_input = tf.keras.Input(shape=(None, 300))
    audio_input = tf.keras.Input(shape=(None, 1585))
    visual_input = tf.keras.Input(shape=(None, 35))

    @staticmethod
    def compute(text_input, audio_input, visual_input):
        # Extract Modality Specific Interactions
        text_msm_output = ModalitySpecificModule.extract_modality_specific_interactions(text_input)
        audio_msm_output = ModalitySpecificModule.extract_modality_specific_interactions(audio_input)
        visual_msm_output = ModalitySpecificModule.extract_modality_specific_interactions(visual_input)
        return text_msm_output, audio_msm_output, visual_msm_output


# Dense Multimodal Fusion Module
class DenseMultimodalFusionModule:
    residual_features = []
    @staticmethod
    def df_block(df_input1, df_input2, df_input3):
        df_output1 = tf.keras.layers.Dense(64, activation='relu')(df_input1)
        df_output2 = tf.keras.layers.Dense(64, activation='relu')(df_input2)
        df_output3 = tf.keras.layers.Dense(64, activation='relu')(df_input3)
        return df_output1, df_output2, df_output3
        
    @staticmethod
    def dense_fusion_layer(f1, f2, f3):
        df_input1 = tf.keras.layers.Concatenate()([f1, f3])
        df_input2 = tf.keras.layers.Concatenate()([f1, f2])
        df_input3 = tf.keras.layers.Concatenate()([f2, f3])
        r = tf.add(tf.add(f1, f2), f3)
        DenseMultimodalFusionModule.residual_features.append(r)
        fusion_output = DenseMultimodalFusionModule.df_block(df_input1, df_input2, df_input3)
        return fusion_output
    
    @staticmethod
    def compute(f1_0, f2_0, f3_0):
        f1_1, f2_1, f3_1 = DenseMultimodalFusionModule.dense_fusion_layer(f1_0, f2_0, f3_0)
        f1_2, f2_2, f3_2 = DenseMultimodalFusionModule.dense_fusion_layer(f1_1, f2_1, f3_1)
        f1_3, f2_3, f3_3 = DenseMultimodalFusionModule.dense_fusion_layer(f1_2, f2_2, f3_2)
        r = tf.add(tf.add(f1_3, f2_3), f3_3)
        DenseMultimodalFusionModule.residual_features.append(r)
        return DenseMultimodalFusionModule.residual_features


# Multimodal Residual Module
class MultimodalResidualModule:
    @staticmethod
    def compute(residual_features):
        final_residual_feature = residual_features[0]
        for r in residual_features[1:]:
            final_residual_feature = tf.add(final_residual_feature, r)
        return final_residual_feature


# Sentiment Classification Module
class SentimentClassificationModule:
    @staticmethod
    def compute(residual_feature):
        output_l1 = tf.keras.layers.Dense(64, activation='relu')(residual_feature)
        output_l2 = tf.keras.layers.Dense(32, activation='relu')(output_l1)
        output_l3 = tf.keras.layers.Dense(16, activation='relu')(output_l2)
        output_l4 = tf.keras.layers.Dense(8, activation='relu')(output_l3)

        # Modify the final layer to use tanh activation function
        output = tf.keras.layers.Dense(1, activation='tanh')(output_l4)
        # Scale and shift the output to be in the range [-3, 3]
        sentiment_output = 3 * output

        return sentiment_output

text_input, audio_input, visual_input = ModalitySpecificModule.text_input, ModalitySpecificModule.audio_input, ModalitySpecificModule.visual_input
text_msm_output, audio_msm_output, visual_msm_output = ModalitySpecificModule.compute(text_input, audio_input, visual_input)
residual_features = DenseMultimodalFusionModule.compute(text_msm_output, audio_msm_output, visual_msm_output)
final_residual_feature = MultimodalResidualModule.compute(residual_features)
output = SentimentClassificationModule.compute(final_residual_feature)


model = tf.keras.Model(inputs=[text_input, audio_input, visual_input], outputs=output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, None, 300)]          0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, None, 1585)]         0         []                            
                                                                                                  
 input_9 (InputLayer)        [(None, None, 35)]           0         []                            
                                                                                                  
 bidirectional_4 (Bidirecti  (None, 128)                  140544    ['input_7[0][0]']             
 onal)                                                                                        

In [None]:
train_labels = []
for i in data['train']['labels']:
    if type(i[0]).__name__ == 'ndarray':
      train_labels.append(i[0][0])
      continue
    else:
      train_labels.append(i[0])

train_text_input = data['train']['text']
train_audio_input = data['train']['audio']
train_visual_input = data['train']['vision']

valid_labels = []
for i in data['train']['labels']:
    if type(i[0]).__name__ == 'ndarray':
      valid_labels.append(i[0][0])
      continue
    else:
      valid_labels.append(i[0])

valid_text_input = data['train']['text']
valid_audio_input = data['train']['audio']
valid_visual_input = data['train']['vision']

model.compile(
  optimizer='adam', 
  loss=tf.keras.losses.MeanAbsoluteError(), 
  metrics=[
    tf.keras.metrics.Accuracy(),
    tf.keras.metrics.F1Score()
  ]
)


In [11]:

# train_dataset = tf.data.Dataset.from_tensor_slices(
#     ({'text_input': train_text_input, 'audio_input': train_audio_input, 'visual_input': train_visual_input}, train_labels)
# )

# valid_dataset = tf.data.Dataset.from_tensor_slices(
#     ({'text_input': valid_text_input, 'audio_input': valid_audio_input, 'visual_input': valid_visual_input}, valid_labels)
# )

# # Assuming your model has input names 'text_input', 'audio_input', 'visual_input'
# model.fit(train_dataset, validation_data=valid_dataset, epochs=100)

# model.fit([train_text_input, train_audio_input, train_visual_input], train_labels, validation_data=([valid_text_input, valid_audio_input, valid_visual_input], valid_labels), epochs=100)



# Assuming you have text_input, audio_input, visual_input, train_labels
# Convert lists to numpy arrays
text_input = np.array(train_text_input)
audio_input = np.array(train_audio_input)
visual_input = np.array(train_visual_input)
train_labels = np.array(train_labels)

# Create a tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    (
        {'text_input': text_input, 'audio_input': audio_input, 'visual_input': visual_input},
        train_labels
    )
)

# Assuming you have valid_text_input, valid_audio_input, valid_visual_input, valid_labels
valid_text_input = np.array(valid_text_input)
valid_audio_input = np.array(valid_audio_input)
valid_visual_input = np.array(valid_visual_input)
valid_labels = np.array(valid_labels)

valid_dataset = tf.data.Dataset.from_tensor_slices(
    (
        {'text_input': valid_text_input, 'audio_input': valid_audio_input, 'visual_input': valid_visual_input},
        valid_labels
    )
)

# Assuming your model has input names 'text_input', 'audio_input', 'visual_input'
model.fit(train_dataset, validation_data=valid_dataset, epochs=100)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16347,) + inhomogeneous part.