#### Evaluation Script for Trained Models

In [28]:
from run_multimodal_time_series import *
from collections import OrderedDict
import csv

In [37]:
# Master switch for this notebook: True evaluates the model stored under
# "../target/" against the "target" ratings, False evaluates the model stored
# under "../observer/" against the "observer_EWE" ratings. The same flag also
# selects the output directory for the exported CSV files.
use_target_ratings=True

In [38]:
# Load the best-CCC checkpoint for the selected rating type into a fresh model.
model = MultimodalEmotionPrediction()
DEVICE = torch.device('cpu')   # 'cpu' in this case
if use_target_ratings:
    model_path = "../target/best_ccc_pytorch_model.bin"
else:
    model_path = "../observer/best_ccc_pytorch_model.bin"
print("loading the model from: ", model_path)
state_dict = torch.load(model_path, map_location=DEVICE)["model"]

# The checkpoint keys carry a leading `module.` (presumably because the model
# was saved while wrapped in torch.nn.DataParallel -- confirm against the
# training script). Strip that prefix only when it is actually present, so a
# checkpoint saved without the wrapper does not get its key names truncated.
WRAPPER_PREFIX = "module."
new_state_dict = OrderedDict(
    (
        key[len(WRAPPER_PREFIX):] if key.startswith(WRAPPER_PREFIX) else key,
        value,
    )
    for key, value in state_dict.items()
)
model.load_state_dict(new_state_dict)
# Switch to evaluation mode; the assignment to `_` suppresses the cell's repr output.
_ = model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing LinguisticEncoderBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LinguisticEncoderBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading the model from:  ../target/best_ccc_pytorch_model.bin


In [43]:
# Directory names and per-modality column selectors handed to
# preprocess_SEND_files. Only the "target" entries depend on which kind of
# rating is being evaluated; everything else is shared by both settings.
if use_target_ratings:
    print("WARNING: use_target_ratings is setting to TRUE.")

modality_dir_map = {
    "acoustic": "acoustic-egemaps",
    "linguistic": "linguistic-word-level", # we don't load features
    "visual": "image-raw", # image is nested,
    "target": "target" if use_target_ratings else "observer_EWE",
}

preprocess = {
    'acoustic': lambda df : df.loc[:,' F0semitoneFrom27.5Hz_sma3nz_amean':' equivalentSoundLevel_dBp'],
    'acoustic_timer': lambda df : df.loc[:,' frameTime'],
    'linguistic': lambda df : df.loc[:,'word'],
    'linguistic_timer': lambda df : df.loc[:,'time-offset'],
    # Rescale the rating column linearly: target ratings map 0 -> -1 and
    # 1 -> +1, observer ratings map 0 -> -1 and 100 -> +1.
    'target': (
        (lambda df : ((df.loc[:,' rating'] / 0.5) - 1.0))
        if use_target_ratings
        else (lambda df : ((df.loc[:,'evaluatorWeightedEstimate'] / 50.0) - 1.0))
    ),
    'target_timer': lambda df : df.loc[:,'time'],
}



In [44]:
# Directory that receives the exported per-video CSV files for the selected rating type.
output_dir = "../data-files/target/" if use_target_ratings else "../data-files/observer/"

In [45]:
# Tokenizer for the linguistic modality; it is passed to preprocess_SEND_files
# as `linguistic_tokenizer` when the data partitions are loaded. It uses the
# "bert-base-uncased" vocabulary, the same checkpoint name that appears in the
# model-loading log output. Downloaded files are cached under
# "../.huggingface_cache/".
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=False,
    cache_dir="../.huggingface_cache/"
)

In [46]:
# Load and preprocess every data partition (Train / Valid / Test).
data_dir = "../../SENDv1-data/"


def _load_send_partition(partition):
    """Preprocess one partition of the SEND data.

    Args:
        partition: name of the partition sub-directory ("Train", "Valid" or
            "Test") under both "features/" and "ratings/".

    Returns:
        A tuple ``(modalities_dir, target_dir, features)`` holding the two
        directories that were read and whatever preprocess_SEND_files returns
        for them.
    """
    modalities_dir = os.path.join(data_dir, f"features/{partition}/")
    target_dir = os.path.join(data_dir, f"ratings/{partition}")
    features = preprocess_SEND_files(
        modalities_dir,
        target_dir,
        use_target_ratings,
        modality_dir_map=modality_dir_map,
        preprocess=preprocess,
        linguistic_tokenizer=tokenizer,
        max_number_of_file=-1,  # presumably "no limit" -- confirm in preprocess_SEND_files
    )
    return modalities_dir, target_dir, features


(train_modalities_data_dir,
 train_target_data_dir,
 train_SEND_features) = _load_send_partition("Train")

(dev_modalities_data_dir,
 dev_target_data_dir,
 dev_SEND_features) = _load_send_partition("Valid")

(test_modalities_data_dir,
 test_target_data_dir,
 test_SEND_features) = _load_send_partition("Test")

In [47]:
# Put every partition into the tensor layout consumed by the model.
def _build_partition_tensors(send_features):
    """Stack the per-video entries of one partition into batch tensors.

    Args:
        send_features: sequence of per-video dicts as produced by
            preprocess_SEND_files; the keys read here are "video_id",
            "a_feature", "l_feature", "l_mask", "l_segment_ids", "v_feature",
            "rating", "seq_len" and "input_mask".

    Returns:
        ``(video_ids, tensors)`` where ``video_ids`` is a list in input order
        and ``tensors`` is the 8-tuple (acoustic, linguistic, linguistic mask,
        linguistic segment ids, visual, ratings, sequence lengths, input mask)
        in the order expected by TensorDataset below.
    """
    def stacked(key):
        return torch.stack([video_struct[key] for video_struct in send_features])

    video_ids = [video_struct["video_id"] for video_struct in send_features]
    tensors = (
        stacked("a_feature").float(),
        stacked("l_feature"),
        stacked("l_mask"),
        stacked("l_segment_ids"),
        stacked("v_feature").float(),
        stacked("rating").float(),
        # seq_len is a plain value per video, so it is wrapped into a column of shape (N, 1).
        torch.tensor([[video_struct["seq_len"]] for video_struct in send_features]).float(),
        stacked("input_mask"),
    )
    return video_ids, tensors


train_video_id, _train_tensors = _build_partition_tensors(train_SEND_features)
(train_input_a_feature,
 train_input_l_feature, train_input_l_mask, train_input_l_segment_ids,
 train_input_v_feature, train_rating_labels, train_seq_lens, train_input_mask) = _train_tensors
train_data = TensorDataset(*_train_tensors)
# batch_size=1 and shuffle=False keep batches aligned one-to-one with train_video_id.
train_dataloader = DataLoader(train_data, batch_size=1, shuffle=False)

dev_video_id, _dev_tensors = _build_partition_tensors(dev_SEND_features)
(dev_input_a_feature,
 dev_input_l_feature, dev_input_l_mask, dev_input_l_segment_ids,
 dev_input_v_feature, dev_rating_labels, dev_seq_lens, dev_input_mask) = _dev_tensors
dev_data = TensorDataset(*_dev_tensors)
dev_dataloader = DataLoader(dev_data, batch_size=1, shuffle=False)

test_video_id, _test_tensors = _build_partition_tensors(test_SEND_features)
(test_input_a_feature,
 test_input_l_feature, test_input_l_mask, test_input_l_segment_ids,
 test_input_v_feature, test_rating_labels, test_seq_lens, test_input_mask) = _test_tensors
test_data = TensorDataset(*_test_tensors)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)

In [48]:
def evaluate_ablation(
    video_id, dataloader, model, condition="A,V,L"
):
    """Run the model over every video with only the selected modalities active.

    Modalities missing from ``condition`` are ablated by replacing their input
    feature tensor with zeros. For the linguistic channel only
    ``input_l_feature`` is zeroed; ``input_l_mask`` and
    ``input_l_segment_ids`` are passed to the model unchanged.

    The function assumes one video per batch (``batch_size=1``, unshuffled):
    only element 0 of each batch is read, and the i-th batch is attributed to
    ``video_id[i]``.

    Args:
        video_id: sequence of video ids, aligned with the order of ``dataloader``.
        dataloader: iterable yielding 8-tuples (acoustic, linguistic,
            linguistic mask, linguistic segment ids, visual, rating labels,
            sequence lengths, input mask).
        model: callable taking the seven model inputs and returning a pair
            whose second element is the per-timestep prediction.
        condition: string naming the active modalities with the letters
            "A" (acoustic), "V" (visual) and "L" (linguistic), e.g. "A,V".

    Returns:
        dict mapping each video id to ``{"pred": ndarray, "true": ndarray,
        "ccc": value}`` where both arrays are truncated to the video's
        sequence length and "ccc" is ``eval_ccc(pred, true)``.
    """
    ret = {}
    pbar = tqdm(dataloader, desc="videos")
    for step, batch in enumerate(pbar):
        vid_id = video_id[step]
        input_a_feature, input_l_feature, input_l_mask, input_l_segment_ids, \
            input_v_feature, rating_labels, seq_lens, input_mask = batch
        # Based on the condition, mask out the channels that are not listed.
        if "A" not in condition:
            input_a_feature = torch.zeros_like(input_a_feature)
        if "V" not in condition:
            input_v_feature = torch.zeros_like(input_v_feature)
        if "L" not in condition:
            input_l_feature = torch.zeros_like(input_l_feature)
        # Pure inference: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            _, output = \
                model(input_a_feature, input_l_feature, input_l_mask, input_l_segment_ids,
                      input_v_feature, rating_labels, input_mask)
        # Drop the padded tail so only the real timesteps are kept.
        seq_l = int(seq_lens[0].tolist()[0])
        pred = output[0][:seq_l].cpu().detach().numpy()
        true = rating_labels[0][:seq_l].cpu().detach().numpy()
        ret[vid_id] = {
            "pred": pred,
            "true": true,
            # Previously computed and thrown away; kept so callers can use it.
            "ccc": eval_ccc(pred, true),
        }
    return ret

In [49]:
# Run the ablation for every modality combination and pool the predictions of
# all three partitions into one {condition: {video_id: result}} mapping.
conditions = ["A,V,L", "A,V", "A,L", "V,L", "A", "V", "L"]
mega_results = {}
for condition in conditions:
    print("analyzing results for condition: ", condition)
    train_results = evaluate_ablation(
        train_video_id, train_dataloader, model, condition=condition
    )
    dev_results = evaluate_ablation(
        dev_video_id, dev_dataloader, model, condition=condition
    )
    test_results = evaluate_ablation(
        test_video_id, test_dataloader, model, condition=condition
    )

    # Merge in train -> dev -> test order so key order matches the partitions.
    pooled = {}
    for partition_results in (train_results, dev_results, test_results):
        pooled.update(partition_results)
    mega_results[condition] = pooled

videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  A,V,L


videos: 100%|██████████| 114/114 [04:25<00:00,  2.33s/it]
videos: 100%|██████████| 40/40 [01:07<00:00,  1.70s/it]
videos: 100%|██████████| 39/39 [01:15<00:00,  1.93s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  A,V


videos: 100%|██████████| 114/114 [04:17<00:00,  2.26s/it]
videos: 100%|██████████| 40/40 [00:59<00:00,  1.49s/it]
videos: 100%|██████████| 39/39 [00:58<00:00,  1.51s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  A,L


videos: 100%|██████████| 114/114 [04:06<00:00,  2.16s/it]
videos: 100%|██████████| 40/40 [01:11<00:00,  1.78s/it]
videos: 100%|██████████| 39/39 [01:04<00:00,  1.65s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  V,L


videos: 100%|██████████| 114/114 [04:12<00:00,  2.22s/it]
videos: 100%|██████████| 40/40 [01:08<00:00,  1.70s/it]
videos: 100%|██████████| 39/39 [01:15<00:00,  1.93s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  A


videos: 100%|██████████| 114/114 [05:44<00:00,  3.02s/it]
videos: 100%|██████████| 40/40 [01:33<00:00,  2.35s/it]
videos: 100%|██████████| 39/39 [01:43<00:00,  2.65s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  V


videos: 100%|██████████| 114/114 [07:04<00:00,  3.72s/it]
videos: 100%|██████████| 40/40 [01:46<00:00,  2.66s/it]
videos: 100%|██████████| 39/39 [01:35<00:00,  2.44s/it]
videos:   0%|          | 0/114 [00:00<?, ?it/s]

analyzing results for condition:  L


videos: 100%|██████████| 114/114 [05:38<00:00,  2.97s/it]
videos: 100%|██████████| 40/40 [01:31<00:00,  2.28s/it]
videos: 100%|██████████| 39/39 [01:40<00:00,  2.58s/it]


In [50]:
# Show which directory the per-video CSV files will be written to.
print("output dir: ", output_dir)

output dir:  ../data-files/target/


In [51]:
# For each video, create one CSV file holding the ratings predicted under every
# condition (one column per condition) plus the actual rating in the last column.
# Values are mapped back from [-1, 1] to [0, 1] via (x + 1) / 2.
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists

# The ground-truth ratings do not depend on the ablation condition, so they
# (and the sequence length) are read from one fixed reference condition rather
# than from whatever `condition` happened to be left over from a loop.
reference_condition = "A,V,L"
for video, reference in mega_results[reference_condition].items():
    # newline="" is what the csv module requires so it controls line endings itself.
    with open(os.path.join(output_dir, f"{video}.csv"), "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(list(conditions) + ["actual"])
        s_len = len(reference["pred"])
        for i in range(s_len): # write line by line.
            row = [
                (mega_results[condition][video]["pred"][i] + 1.0) / 2.0
                for condition in conditions
            ]
            row.append((reference["true"][i] + 1.0) / 2.0)
            writer.writerow(row)

In [52]:
# Save the video ids of each partition to its own single-column CSV file
# (header "vid_id"), so the train/dev/test split can be recovered later.
for ids_path, partition_video_ids in (
    ("../data-files/train_ids.csv", train_video_id),
    ("../data-files/dev_ids.csv", dev_video_id),
    ("../data-files/test_ids.csv", test_video_id),
):
    # newline="" is what the csv module requires so it controls line endings itself.
    with open(ids_path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["vid_id"])
        writer.writerows([vid_id] for vid_id in partition_video_ids)