In [1]:
import os
import random
import re

import cv2
import numpy as np
import pandas as pd
import torch
#import torchvision
from sklearn.model_selection import train_test_split

# Load dataset

In [3]:
# Load the annotation table. Later cells read its SHOW, SCENE, KEY, SENTENCE
# and Sarcasm columns.
df = pd.read_csv('mustard++_text.csv')

# Fetch labeled data instances

In [4]:
# Walk the rows in file order. Every row's sentence is buffered; a row whose
# Sarcasm value is 0 or 1 closes the current instance, which then receives
# the whole buffer (including that row's own sentence) as its text.
data_dict = []
list_of_text = []

for _, row in df.iterrows():
    # Newlines inside a sentence are flattened to single spaces.
    list_of_text.append(re.sub("[\n]", " ", row['SENTENCE']))

    if row['Sarcasm'] not in [0.0, 1.0]:
        # No 0/1 label on this row: keep buffering.
        continue

    data_dict.append({'key': row['SCENE'],
                      'image': row['KEY'],
                      'text': list_of_text,
                      'label': row['Sarcasm']})

    # Start a fresh buffer for the next instance.
    list_of_text = []

In [5]:
# Store every instance's label as a plain Python int.
for instance in data_dict:
    instance['label'] = int(instance['label'])

In [None]:
# Display how many labeled instances were collected into data_dict.
len(data_dict)

# Preprocess valid data instances

In [6]:
def is_valid_frame(frame):
    """Tell whether `frame` is usable.

    A frame is rejected when it is ``None`` or when its ``size`` attribute
    is not greater than zero.
    """
    if frame is None:
        return False
    return frame.size > 0

# Paths of videos that were skipped (too short or containing unreadable frames).
failed_data_points = []

# Parallel lists: entry k of each list belongs to the same data instance.
videos = []
text = []
labels = []
ids = []

down_width = 384
down_height = 224
down_points = (down_width, down_height)  # cv2.resize takes (width, height)

num_frames = 16
max_read_attempts = 3

# Dedicated seeded generator so the sampled frame indices (and therefore the
# saved tensors) are identical on every run, without touching the global
# `random` state.
frame_rng = random.Random(42)

for data in data_dict:
    video_id = data['image']
    video_path = 'videos/final_utterance_videos/'+video_id+'.mp4'
    cam = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))

        # Skip data points that are shorter than num_frames.
        if total_frames < num_frames:
            failed_data_points.append(video_path)
            continue

        sampled_frame_idxs = sorted(frame_rng.sample(range(total_frames), num_frames))

        frames = []
        for frame_idx in sampled_frame_idxs:
            for _ in range(max_read_attempts):
                cam.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
                ret, frame = cam.read()

                if ret and is_valid_frame(frame):
                    frames.append(cv2.resize(frame, down_points, interpolation=cv2.INTER_LINEAR))
                    break

                # Unreadable frame: retry with a neighbouring index, stepping
                # forward unless already at the last frame.
                if frame_idx < total_frames - 1:
                    frame_idx += 1
                else:
                    frame_idx -= 1
            else:
                # Every attempt failed, so this video can no longer reach
                # num_frames frames; stop decoding it.
                break
    finally:
        # Always free the decoder handle, including on the `continue` above.
        cam.release()

    # If any frame could not be read, skip the data point.
    if len(frames) < num_frames:
        failed_data_points.append(video_path)
        continue

    # Stack the resized frames into one array and wrap it as a tensor.
    video = np.array(frames)
    tensor_video = torch.from_numpy(video)
    videos.append(tensor_video)

    text.append(data['text'])
    labels.append(data['label'])
    ids.append(data['key'])


# Holdout Data Split

In [None]:
# Scene ids of every row whose show is BBT or SV, in file order.
train_data = [
    scene
    for show, scene in zip(df['SHOW'], df['SCENE'])
    if show in ['BBT', 'SV']
]

In [None]:
# Instances whose scene id is listed in train_data go to the training split;
# everything else is kept aside in the rest_* lists.
train_video = []
train_text = []
train_label = []
train_id = []
rest_videos = []
rest_text = []
rest_labels = []
rest_ids = []

# Build the lookup once: a set gives constant-time membership tests instead
# of rescanning the train_data list for every instance.
holdout_train_scenes = set(train_data)

# The loop variable is deliberately not called `id`, which would overwrite
# the builtin for the rest of the notebook.
for position, scene_id in enumerate(ids):
    if scene_id in holdout_train_scenes:
        train_video.append(videos[position])
        train_text.append(text[position])
        train_label.append(labels[position])
        train_id.append(scene_id)
    else:
        rest_videos.append(videos[position])
        rest_text.append(text[position])
        rest_labels.append(labels[position])
        rest_ids.append(scene_id)

In [None]:
# Split the held-out instances 50/50 into validation and test, stratified on
# the label. random_state is fixed so the saved split is the same on every run.
(val_text, test_text,
 val_video, test_video,
 val_label, test_label,
 val_id, test_id) = train_test_split(
    rest_text, rest_videos, rest_labels, rest_ids,
    test_size=0.5,
    stratify=rest_labels,
    random_state=42,
)

In [None]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("preprocessed", exist_ok=True)

# One (video, text, labels, ids) bundle per split, written in the same order
# and to the same file names as before.
holdout_outputs = {
    "train": (train_video, train_text, train_label, train_id),
    "val": (val_video, val_text, val_label, val_id),
    "test": (test_video, test_text, test_label, test_id),
}

for split_name, (split_video, split_text, split_label, split_id) in holdout_outputs.items():
    torch.save(split_video, f"preprocessed/video_{split_name}_holdout.pt")
    torch.save(split_text, f"preprocessed/text_{split_name}_holdout.pt")
    torch.save(split_label, f"preprocessed/labels_{split_name}_holdout.pt")
    torch.save(split_id, f"preprocessed/ids_{split_name}_holdout.pt")

# Stratified Data Split

In [None]:
# Remove, in place, every row whose Sarcasm value is missing.
# NOTE(review): this mutates `df` itself, so any earlier cell that iterates
# `df` will no longer see the removed rows if it is re-run after this cell.
# Run the notebook top to bottom.
df.dropna(subset=['Sarcasm'], inplace=True)

In [None]:
# Bucket the rows by show in a single pass over df. Rows whose SHOW is not
# one of the five names below are ignored.
rows_by_show = {
    'BBT': [],
    'SV': [],
    'FRIENDS': [],
    'GOLDENGIRLS': [],
    'SARCASMOHOLICS': [],
}

for _, row in df.iterrows():
    bucket = rows_by_show.get(row['SHOW'])
    if bucket is not None:
        bucket.append(row)

# Expose each bucket under its original name and build one frame per show.
BBT = rows_by_show['BBT']
bbt_df = pd.DataFrame(BBT)

SV = rows_by_show['SV']
sv_df = pd.DataFrame(SV)

FRIENDS = rows_by_show['FRIENDS']
friends_df = pd.DataFrame(FRIENDS)

GOLDENGIRLS = rows_by_show['GOLDENGIRLS']
golden_df = pd.DataFrame(GOLDENGIRLS)

SARCASMOHOLICS = rows_by_show['SARCASMOHOLICS']
sar_df = pd.DataFrame(SARCASMOHOLICS)

In [None]:
# Every show is split on its own: first train vs. rest, then rest is halved
# into test and validation. A fixed seed makes the saved splits reproducible.
split_seed = 42

train_friends, rest_friends = train_test_split(
    friends_df, test_size=0.3, stratify=friends_df['Sarcasm'], random_state=split_seed)
test_friends, val_friends = train_test_split(
    rest_friends, test_size=0.5, stratify=rest_friends['Sarcasm'], random_state=split_seed)

train_bbt, rest_bbt = train_test_split(
    bbt_df, test_size=0.3, stratify=bbt_df['Sarcasm'], random_state=split_seed)
test_bbt, val_bbt = train_test_split(
    rest_bbt, test_size=0.5, stratify=rest_bbt['Sarcasm'], random_state=split_seed)

train_sv, rest_sv = train_test_split(
    sv_df, test_size=0.3, stratify=sv_df['Sarcasm'], random_state=split_seed)
test_sv, val_sv = train_test_split(
    rest_sv, test_size=0.5, stratify=rest_sv['Sarcasm'], random_state=split_seed)

# NOTE(review): the two shows below are split without stratification, and
# SARCASMOHOLICS keeps only half of its rows for training (0.5 rather than
# 0.3). Presumably deliberate because these shows are small -- confirm.
train_golde, rest_golde = train_test_split(
    golden_df, test_size=0.3, random_state=split_seed)
test_golde, val_golde = train_test_split(
    rest_golde, test_size=0.5, random_state=split_seed)

train_sarc, rest_sarc = train_test_split(
    sar_df, test_size=0.5, random_state=split_seed)
test_sarc, val_sarc = train_test_split(
    rest_sarc, test_size=0.5, random_state=split_seed)

In [None]:
def _scene_ids(*parts):
    """Return the SCENE value of every row of the given frames, in order."""
    return [row['SCENE'] for part in parts for _, row in part.iterrows()]


# Scene ids per split, concatenated show by show in the order
# friends, golden girls, sarcasmoholics, bbt, sv.
train_data = _scene_ids(train_friends, train_golde, train_sarc, train_bbt, train_sv)

val_data = _scene_ids(val_friends, val_golde, val_sarc, val_bbt, val_sv)

test_data = _scene_ids(test_friends, test_golde, test_sarc, test_bbt, test_sv)

In [None]:
# Route every preprocessed instance to the split that lists its scene id.
# Instances whose scene id appears in none of the three lists are dropped.
train_video = []
train_text = []
train_label = []
train_id = []

val_video = []
val_text = []
val_label = []
val_id = []

test_video = []
test_text = []
test_label = []
test_id = []

# Build the lookups once: sets give constant-time membership tests instead
# of rescanning three lists for every instance.
train_scenes = set(train_data)
val_scenes = set(val_data)
test_scenes = set(test_data)

# The loop variable is deliberately not called `id`, which would overwrite
# the builtin for the rest of the notebook.
for position, scene_id in enumerate(ids):
    if scene_id in train_scenes:
        train_video.append(videos[position])
        train_text.append(text[position])
        train_label.append(labels[position])
        train_id.append(scene_id)
    elif scene_id in val_scenes:
        val_video.append(videos[position])
        val_text.append(text[position])
        val_label.append(labels[position])
        val_id.append(scene_id)
    elif scene_id in test_scenes:
        test_video.append(videos[position])
        test_text.append(text[position])
        test_label.append(labels[position])
        test_id.append(scene_id)

In [None]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("preprocessed", exist_ok=True)

# One (video, text, labels, ids) bundle per split, written in the same order
# and to the same file names as before.
stratified_outputs = {
    "train": (train_video, train_text, train_label, train_id),
    "val": (val_video, val_text, val_label, val_id),
    "test": (test_video, test_text, test_label, test_id),
}

for split_name, (split_video, split_text, split_label, split_id) in stratified_outputs.items():
    torch.save(split_video, f"preprocessed/video_{split_name}_stratified.pt")
    torch.save(split_text, f"preprocessed/text_{split_name}_stratified.pt")
    torch.save(split_label, f"preprocessed/labels_{split_name}_stratified.pt")
    torch.save(split_id, f"preprocessed/ids_{split_name}_stratified.pt")