In [3]:
#---cell-1
# --- Download and Save MELD Persistently to Google Drive ---

from google.colab import drive
import os

# 1. Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted successfully!")

# 2. Define the path in your Google Drive where you want to save MELD
#    You can change 'My Drive/MELD_Dataset' to whatever you prefer.
#    The folder will be created if it doesn't exist.
drive_save_path = '/content/drive/My Drive/MELD_Dataset'
os.makedirs(drive_save_path, exist_ok=True)
print(f"Data will be saved in: {drive_save_path}")

# 3. Change the current working directory TO your Google Drive folder
os.chdir(drive_save_path)
print(f"Current working directory changed to: {os.getcwd()}") # Verify we are in the Drive folder

# 4. Download the MELD Raw video file DIRECTLY into the Drive folder
#    Only run this if the file doesn't already exist!
meld_zip_file = 'MELD.Raw.tar.gz'
meld_raw_folder = 'MELD.Raw' # This is the folder created after extraction

if not os.path.exists(meld_zip_file) and not os.path.exists(meld_raw_folder):
    print(f"Downloading {meld_zip_file} (10GB) to Google Drive...")
    print("This will take a significant amount of time, depending on Drive/Colab speed.")
    # Use !wget to download directly here
    !wget -O '{meld_zip_file}' http://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz
    print("Download complete!")
elif os.path.exists(meld_zip_file):
    print(f"'{meld_zip_file}' already exists in Google Drive. Skipping download.")
elif os.path.exists(meld_raw_folder):
     print(f"Extracted folder '{meld_raw_folder}' already exists. Skipping download and extraction.")

# 5. Extract the file DIRECTLY in the Drive folder
#    Only run this if the zip file exists AND the extracted folder doesn't
if os.path.exists(meld_zip_file) and not os.path.exists(meld_raw_folder):
    print(f"Extracting {meld_zip_file} within Google Drive...")
    print("This will also take a while.")
    # Use !tar to extract here
    !tar -xzf '{meld_zip_file}'
    print("Extraction complete!")
    # Optional: Remove the large zip file after extraction to save Drive space
    # os.remove(meld_zip_file)
    # print(f"Removed '{meld_zip_file}' to save space.")
elif not os.path.exists(meld_zip_file) and os.path.exists(meld_raw_folder):
     print(f"Extracted folder '{meld_raw_folder}' already exists. Nothing to extract.")
elif not os.path.exists(meld_zip_file) and not os.path.exists(meld_raw_folder):
    print("Neither zip file nor extracted folder found. Please check download step.")


print("\n--- MELD dataset is now stored persistently in Google Drive ---")
print(f"Video files should be in: {os.path.join(drive_save_path, meld_raw_folder)}")

# IMPORTANT: Change back to the default Colab directory if needed for later steps
# os.chdir('/content')
# print(f"Current working directory changed back to: {os.getcwd()}")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully!
Data will be saved in: /content/drive/My Drive/MELD_Dataset
Current working directory changed to: /content/drive/My Drive/MELD_Dataset
'MELD.Raw.tar.gz' already exists in Google Drive. Skipping download.

--- MELD dataset is now stored persistently in Google Drive ---
Video files should be in: /content/drive/My Drive/MELD_Dataset/MELD.Raw


In [4]:
#---cell-2
# --- Step 1: Install All Libraries ---
# (This cell combines all the libraries we need)

# For loading Hugging Face models and datasets
!pip install transformers datasets

# For deep learning
!pip install torch

# For video/image processing
!pip install opencv-python-headless

# For detecting faces
!pip install mtcnn

Collecting mtcnn
  Downloading mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting lz4>=4.3.3 (from mtcnn)
  Downloading lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading mtcnn-1.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lz4, mtcnn
Successfully installed lz4-4.4.4 mtcnn-1.0.0


In [5]:
#---cell-3
# Make sure the Hugging Face Hub client is installed (pip output suppressed).
!pip install huggingface_hub --quiet

from huggingface_hub import login
# Called without arguments: paste your access token when prompted.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
#---cell-4
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook itself. The token that
# used to be hard-coded in this cell must be treated as leaked and revoked in
# the Hugging Face account settings.
# Reuse HF_TOKEN when the runtime already provides it; otherwise prompt for it
# without echoing the value, and export it for the rest of the session.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ").strip()

login(token=os.environ["HF_TOKEN"])


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
#---cell-5
# (Run your actual Step 2 cell)
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch

# Checkpoint used for both the image processor and the classification model.
model_name = "trpakov/vit-face-expression"

# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModelForImageClassification.from_pretrained(model_name)

# Move the weights to the chosen device and switch to inference mode.
model.to(device).eval()
print(f"Model {model_name} loaded to {device}")

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/915 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

Model trpakov/vit-face-expression loaded to cpu


In [9]:
#---cell-6
import os

meld_path = '/content/drive/My Drive/MELD_Dataset/MELD.Raw'

# Walk the extracted tree: print every directory, followed by at most five of its files.
for folder, _subfolders, filenames in os.walk(meld_path):
    print(folder)
    preview = filenames[:5]
    for filename in preview:
        print("   ", filename)


/content/drive/My Drive/MELD_Dataset/MELD.Raw
    ._output_repeated_splits_test
    dev_sent_emo.csv
    train_sent_emo.csv
    test_sent_emo.csv
    test.tar.gz
/content/drive/My Drive/MELD_Dataset/MELD.Raw/dev_splits_complete
    dia53_utt3.mp4
    dia108_utt14.mp4
    dia68_utt11.mp4
    dia89_utt1.mp4
    dia76_utt15.mp4
/content/drive/My Drive/MELD_Dataset/MELD.Raw/output_repeated_splits_test
    final_videos_testdia24_utt0.mp4
    dia45_utt5.mp4
    dia17_utt20.mp4
    ._dia259_utt3.mp4
    ._dia259_utt9.mp4
/content/drive/My Drive/MELD_Dataset/MELD.Raw/train_splits
    dia72_utt2.mp4
    dia132_utt5.mp4
    dia138_utt3.mp4
    dia351_utt12.mp4
    dia621_utt1.mp4


In [10]:
#---cell-7
# --- Step 4 (Revised): Extract Inner Archives & Load CSVs ---

import pandas as pd
import os

# Define the path where MELD.Raw was extracted in your Google Drive
meld_raw_path = '/content/drive/My Drive/MELD_Dataset/MELD.Raw'
print(f"Looking for data in: {meld_raw_path}")

# Change directory to the MELD.Raw folder. The archive, folder and CSV names used
# further down in this cell are all relative to it.
try:
    os.chdir(meld_raw_path)
except FileNotFoundError:
    print(f"ERROR: Directory not found: {meld_raw_path}")
    print("Please ensure Step 3 completed successfully and the path is correct.")
    # Stop here: continuing would run the relative-path steps below in whatever
    # directory happens to be current.
    raise
print(f"Changed directory to: {os.getcwd()}")

# --- Extract Inner Archives ---
archives = {
    "train": "train.tar.gz",
    "dev": "dev.tar.gz",
    "test": "test.tar.gz" # Note: The test videos might be in 'output_repeated_splits_test' folder from initial extract
}

# Define expected output folders after extraction
output_folders = {
    "train": "train_splits", # Check exact name after extraction
    "dev": "dev_splits",     # Check exact name after extraction
    "test": "output_repeated_splits_test" # This often seems to be the test folder name
}


for split, archive_name in archives.items():
    output_folder = output_folders[split]
    if os.path.exists(archive_name) and not os.path.exists(output_folder):
        print(f"Extracting {archive_name}...")
        # Use '--one-top-level' if available to prevent tarbomb, adjust folder name if needed
        !tar -xzf {archive_name}
        # Check if extraction created the expected folder, might need adjustment
        if os.path.exists(output_folder):
             print(f"✅ {archive_name} extracted successfully to {output_folder}!")
        else:
             # If extraction didn't create the specific folder, list contents to see what was made
             print(f"Extraction of {archive_name} finished. Checking contents...")
             !ls -ld */ # List only directories
    elif not os.path.exists(archive_name) and os.path.exists(output_folder):
         print(f"✅ {output_folder} already exists. Skipping extraction for {split}.")
    elif os.path.exists(archive_name) and os.path.exists(output_folder):
         print(f"✅ {output_folder} already exists. Skipping extraction for {split}.")
    elif split == 'test' and os.path.exists(output_folders['test']):
         # Special check for test folder potentially already existing from first extraction
         print(f"✅ {output_folders['test']} already exists. Assuming test videos are present.")
    else:
        # Only print warning if the archive is truly missing and output folder isn't there either
        if not os.path.exists(output_folder):
             print(f"Warning: {archive_name} not found and {output_folder} doesn't exist.")


# --- Load CSV Files ---
# Annotation file for each split.
csv_files = {
    "train": "train_sent_emo.csv",
    "dev": "dev_sent_emo.csv",
    "test": "test_sent_emo.csv"
}

dataframes = {}

print("\nLoading CSV files...")
for split, csv_name in csv_files.items():
    # Candidate locations in priority order: the current directory first, then
    # the split's own video folder.
    candidates = [csv_name]
    split_folder = output_folders.get(split)
    if split_folder:
        candidates.append(os.path.join(split_folder, csv_name))

    csv_path = next((path for path in candidates if os.path.exists(path)), None)
    if csv_path is None:
        print(f"Warning: {csv_name} not found in {meld_raw_path} or potential subdirectories.")
        continue

    # A read failure is reported and the split is simply left out of `dataframes`.
    try:
        dataframes[split] = pd.read_csv(csv_path)
        print(f"✅ Loaded {csv_path} successfully ({len(dataframes[split])} rows).")
    except Exception as e:
        print(f"ERROR loading {csv_path}: {e}")


# --- Inspect the data ---
# Show a preview and the column summary of the training annotations, if they were loaded.
train_df = dataframes.get("train")
if train_df is None:
    print("\nCould not load training data.")
else:
    print("\n--- First 5 rows of Training Data ---")
    print(train_df.head())
    print("\n--- Training Data Info ---")
    train_df.info()

# Optional: Change back to the main content directory if needed
# os.chdir('/content')
# print(f"\nChanged directory back to: {os.getcwd()}")

Looking for data in: /content/drive/My Drive/MELD_Dataset/MELD.Raw
Changed directory to: /content/drive/My Drive/MELD_Dataset/MELD.Raw
✅ train_splits already exists. Skipping extraction for train.
Extracting dev.tar.gz...
Extraction of dev.tar.gz finished. Checking contents...
drwx------ 2 root root 4096 Jul 23  2018 dev_splits_complete/
drwx------ 2 root root 4096 Jul 26  2018 output_repeated_splits_test/
drwx------ 2 root root 4096 Oct 27 18:14 train_splits/
✅ output_repeated_splits_test already exists. Skipping extraction for test.

Loading CSV files...
✅ Loaded train_sent_emo.csv successfully (9989 rows).
✅ Loaded dev_sent_emo.csv successfully (1109 rows).
✅ Loaded test_sent_emo.csv successfully (2610 rows).

--- First 5 rows of Training Data ---
   Sr No.                                          Utterance          Speaker  \
0       1  also I was the point person on my companys tr...         Chandler   
1       2                   You mustve had your hands full.  The Interviewer

In [11]:
#---cell-8
!pip install lz4



In [12]:
#---cell-9
# --- Step 5 (Debugging Version - Install lz4 inside): Check Paths and Process One Video ---

import cv2
# Import MTCNN later, after installing lz4
# from mtcnn import MTCNN
from PIL import Image
import numpy as np
import os
from datasets import Dataset, DatasetDict
import torch

# --- 1. Install lz4 ---
print("Attempting to install lz4...")
!pip install lz4
print("lz4 installation command finished.")

# --- 2. Verify Paths and Initialize Detector ---
drive_base_path = '/content/drive/My Drive/MELD_Dataset/'
meld_raw_path = os.path.join(drive_base_path, 'MELD.Raw')
print(f"\nBase path for MELD.Raw: {meld_raw_path}")

print("\nListing contents of MELD.Raw to verify folder names:")
try:
    original_dir = os.getcwd()
    os.chdir(meld_raw_path)
    print("Contents:")
    !ls -l
    os.chdir(original_dir)
except FileNotFoundError:
    print(f"ERROR: Cannot access {meld_raw_path}. Please check the path.")

# --- UPDATE THESE FOLDER NAMES BASED ON THE `ls` OUTPUT ABOVE ---
video_folders = {
    "train": os.path.join(meld_raw_path, "train_splits"),
    "dev": os.path.join(meld_raw_path, "dev_splits_complete"),
    "test": os.path.join(meld_raw_path, "output_repeated_splits_test")
}
print(f"\nExpected video folders:")
print(f"Train: {video_folders['train']}")
print(f"Dev:   {video_folders['dev']}")
print(f"Test:  {video_folders['test']}")


print("\nImporting MTCNN and initializing face detector...")
# mtcnn is imported here, after the lz4 install earlier in this cell.
# Whatever goes wrong, `detector` ends up as None so later steps can test for it.
try:
    from mtcnn import MTCNN
    detector = MTCNN()
    print("✅ MTCNN detector initialized.")
except Exception as init_error:
    if isinstance(init_error, ImportError):
        print(f"❌ ERROR: Failed to import MTCNN even after pip install.")
    else:
        print(f"❌ ERROR initializing MTCNN: {init_error}")
        # Call out the LZ4-related failure specifically.
        if 'LZ4' in str(init_error):
            print("The LZ4 error persists. This is very unusual.")
    detector = None

# Reuse the image processor already in the kernel; load it only when it is missing.
try:
    processor
except NameError:
    print("Reloading processor...")
    from transformers import AutoImageProcessor
    processor_name = "trpakov/vit-face-expression"
    processor = AutoImageProcessor.from_pretrained(processor_name)
    print("Processor reloaded.")

# --- 3. Manually Process One Example ---
# Walks the first training row through every preprocessing stage and prints a
# ✅ / ❌ line after each one, so the first failing stage is visible.
print("\n--- Attempting to process one training example manually ---")

# Needs a working detector and the 'train' DataFrame. `dataframes` is not
# defined in this cell; it must already exist in the kernel.
if detector and "train" in dataframes:
    example = dataframes["train"].iloc[0].to_dict()
    split_name = "train"
    print(f"Using example: Dia_ID={example.get('Dialogue_ID')}, Utt_ID={example.get('Utterance_ID')}, Emotion={example.get('Emotion')}")

    try:
        # Stage 1: build the video file name, dia<Dialogue_ID>_utt<Utterance_ID>.mp4.
        dialogue_id = example['Dialogue_ID']
        utterance_id = example['Utterance_ID']
        video_filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
        print(f"Constructed filename: {video_filename}")

        # Stage 2: resolve the folder for this split and the full path of the video.
        current_video_folder = video_folders.get(split_name)
        if not current_video_folder:
             raise ValueError(f"Video folder path for split '{split_name}' is not defined correctly.")
        print(f"Looking in folder: {current_video_folder}")

        full_video_path = os.path.join(current_video_folder, video_filename)
        print(f"Full video path: {full_video_path}")

        print(f"Checking if video file exists at path...")
        if not os.path.exists(full_video_path):
            print(f"❌ ERROR: Video file NOT FOUND at {full_video_path}")
            # Show the first entries of the folder to help spot a naming mismatch.
            if os.path.exists(current_video_folder):
                 !ls -l '{current_video_folder}' | head -10
            else:
                 print(f"Folder {current_video_folder} does not exist.")
        else:
            print(f"✅ Video file found!")
            # Stage 3: open the video and read the frame at index frame_count // 2.
            print("Attempting to open video with OpenCV...")
            cap = cv2.VideoCapture(full_video_path)
            if not cap.isOpened(): print(f"❌ ERROR: Could not open video file {full_video_path}")
            else:
                print("✅ Video opened successfully.")
                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                middle_frame_idx = frame_count // 2
                print(f"Total frames: {frame_count}. Reading frame index: {middle_frame_idx}")
                cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_idx)
                # The capture is released right after the single read.
                ret, frame = cap.read(); cap.release()
                if not ret or frame is None: print(f"❌ ERROR: Could not read middle frame from video.")
                else:
                    print(f"✅ Frame read successfully (shape: {frame.shape}).")
                    # Convert BGR -> RGB before face detection and cropping.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    # Stage 4: detect faces; only the first detection is used below.
                    print("Attempting to detect face with MTCNN...")
                    faces = detector.detect_faces(frame_rgb) # Use detector here
                    if not faces: print(f"❌ ERROR: No face detected in the frame.")
                    else:
                        print(f"✅ Face detected! Details: {faces[0]['box']}")
                        # Stage 5: grow the box by `padding` pixels on every side,
                        # clamped to the frame boundaries, and crop.
                        bounding_box = faces[0]['box']; x, y, w, h = bounding_box; padding = 20
                        x1, y1 = max(0, x - padding), max(0, y - padding)
                        x2, y2 = min(frame_rgb.shape[1], x + w + padding), min(frame_rgb.shape[0], y + h + padding)
                        print("Attempting to crop face...")
                        face_crop = frame_rgb[y1:y2, x1:x2]
                        if face_crop.size == 0: print(f"❌ ERROR: Face crop resulted in an empty image.")
                        else:
                            print(f"✅ Face cropped successfully (shape: {face_crop.shape}).")
                            face_image = Image.fromarray(face_crop)
                            # Stage 6: run the image processor and map the emotion
                            # string to an integer label (-1 when it is not in the map).
                            print("Attempting to process face image with processor...")
                            try:
                                inputs = processor(images=face_image, return_tensors="pt")
                                # squeeze(0) removes dimension 0 when it has size 1.
                                pixel_values = inputs['pixel_values'].squeeze(0)
                                print(f"✅ Image processed successfully! Tensor shape: {pixel_values.shape}")
                                emotion_map = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}
                                emotion_str = example.get('Emotion', 'unknown').lower()
                                emotion_label = emotion_map.get(emotion_str, -1)
                                print(f"Original emotion: '{example.get('Emotion')}', Mapped label: {emotion_label}")
                                print("\n--- ✅ MANUAL PREPROCESSING SUCCEEDED FOR ONE EXAMPLE ---")
                            except Exception as proc_e: print(f"❌ ERROR: Processor failed on face image: {proc_e}")
    except Exception as e:
        # Catch-all so that an unexpected failure is reported instead of raising.
        print(f"--- ❌ An unexpected error occurred during manual processing ---")
        print(f"Error: {e}")

elif not detector:
    print("MTCNN detector failed to initialize. Cannot proceed with processing.")
else:
    print("Could not load 'train' dataframe for testing.")

Attempting to install lz4...
lz4 installation command finished.

Base path for MELD.Raw: /content/drive/My Drive/MELD_Dataset/MELD.Raw

Listing contents of MELD.Raw to verify folder names:
Contents:
total 10622914
-rw------- 1 root root     120071 Sep 29  2018 dev_sent_emo.csv
drwx------ 2 root root       4096 Jul 23  2018 dev_splits_complete
-rw------- 1 root root  776067297 Oct 17  2018 dev.tar.gz
drwx------ 2 root root       4096 Jul 26  2018 output_repeated_splits_test
-rw------- 1 root root       9822 Oct  5  2018 README.txt
-rw------- 1 root root     290841 Sep 29  2018 test_sent_emo.csv
-rw------- 1 root root 2237023908 Oct  5  2018 test.tar.gz
-rw------- 1 root root    1105502 Sep 29  2018 train_sent_emo.csv
drwx------ 2 root root       4096 Oct 27 18:14 train_splits
-rw------- 1 root root 7863231551 Nov  2  2018 train.tar.gz

Expected video folders:
Train: /content/drive/My Drive/MELD_Dataset/MELD.Raw/train_splits
Dev:   /content/drive/My Drive/MELD_Dataset/MELD.Raw/dev_splits

In [15]:
#---cell-10
# --- Step 5: Define and Apply Video Preprocessing ---

import cv2
from mtcnn import MTCNN
from PIL import Image
import numpy as np
import os
from datasets import Dataset, DatasetDict # Import Dataset classes
import torch # Ensure torch is imported

# --- 1. Re-define paths and initialize detector ---
# (In case the session restarted or variables were lost)
drive_base_path = '/content/drive/My Drive/MELD_Dataset/'
meld_raw_path = os.path.join(drive_base_path, 'MELD.Raw')
print(f"Base path for videos: {meld_raw_path}")

print("Initializing MTCNN face detector...")
try:
    detector = MTCNN()
except Exception as e:
    print(f"Error initializing MTCNN: {e}. Please ensure mtcnn library is installed.")
    # The detector is needed for every example. Without it, the mapping step below
    # would turn each row into a silent 'unknown_error' (or reuse a stale detector
    # left in the kernel), so stop here instead of continuing.
    raise
print("MTCNN detector initialized.")

# Folder that holds the videos of each split.
video_folders = {
    "train": os.path.join(meld_raw_path, "train_splits"),
    "dev": os.path.join(meld_raw_path, "dev_splits_complete"), # Adjust if needed
    "test": os.path.join(meld_raw_path, "output_repeated_splits_test") # Adjust if needed
}

# --- 2. The Preprocessing Function ---
# The image processor normally comes from the model-loading cell. If it is missing
# from the kernel (e.g. after a restart), reload it from the SAME checkpoint as the
# model; a processor from a different checkpoint could preprocess the crops
# differently from what the model expects.
if 'processor' not in globals():
    print("Reloading processor...")
    from transformers import AutoImageProcessor
    processor_name = "trpakov/vit-face-expression"
    processor = AutoImageProcessor.from_pretrained(processor_name)
    print("Processor reloaded.")


def preprocess_video_frame(example):
    """
    Placeholder preprocessing step: only derives the video file name.

    The name is built from the example's 'Dialogue_ID' and 'Utterance_ID'
    (e.g. dia0_utt0.mp4). No video is opened here; resolving the split folder
    and processing the frame are left to the function passed to `.map()`.

    Returns a dict with 'pixel_values' (always None), 'label' (always -1),
    'video_filename_used' and 'status'. 'status' is 'placeholder' on success;
    if the name cannot be built, the problem is printed and 'error' is used for
    both 'video_filename_used' and 'status'.
    """
    try:
        video_filename = f"dia{example['Dialogue_ID']}_utt{example['Utterance_ID']}.mp4"
    except Exception as e:
        print(f"ERROR processing example (Dia:{example.get('Dialogue_ID', 'N/A')}, Utt:{example.get('Utterance_ID', 'N/A')}): {e}")
        return {'pixel_values': None, 'label': -1, 'video_filename_used': 'error', 'status': 'error'}

    # Placeholder return - actual logic needed
    return {'pixel_values': None, 'label': -1, 'video_filename_used': video_filename, 'status': 'placeholder'}


# --- 3. Convert Pandas DataFrames to Hugging Face Datasets ---
# Splits that convert cleanly are recorded in `successful_conversions`; only
# those are preprocessed later on.
print("\nConverting pandas DataFrames to Datasets...")
raw_datasets = DatasetDict()
successful_conversions = []
for split in ("train", "dev", "test"):
    if split not in dataframes:
        print(f"No DataFrame found for {split} split.")
        continue
    try:
        raw_datasets[split] = Dataset.from_pandas(dataframes[split])
    except Exception as e:
        print(f"Error converting {split} DataFrame: {e}")
    else:
        print(f"Successfully converted {split} DataFrame.")
        successful_conversions.append(split)

print(f"\nRaw Datasets structure: {raw_datasets}")


# --- 4. Apply the Preprocessing Function using .map() ---

# Define the updated preprocessing function that correctly uses the split info
import logging

# Emotion name (lower-case) -> integer class id used as the training label.
_EMOTION_TO_LABEL = {'neutral': 0, 'surprise': 1, 'fear': 2, 'sadness': 3, 'joy': 4, 'disgust': 5, 'anger': 6}


def _failed_example(status):
    """Build the result row for an example that could not be preprocessed."""
    return {'pixel_values': None, 'label': -1, 'status': status}


def map_preprocess_function(example, split_name):
    """
    Turn one annotation row into model input: middle video frame -> face crop -> tensor.

    Args:
        example: mapping with at least 'Dialogue_ID' and 'Utterance_ID' (used to
            build the file name dia<Dialogue_ID>_utt<Utterance_ID>.mp4) and,
            optionally, 'Emotion'.
        split_name: key into the module-level `video_folders` mapping that
            selects the folder to look in.

    Returns:
        dict with keys 'pixel_values', 'label' and 'status'. On success 'status'
        is 'success', 'pixel_values' is the processor output for the face crop
        and 'label' is the class id of the emotion (-1 when the emotion is
        missing or unknown). On failure 'pixel_values' is None, 'label' is -1
        and 'status' names the stage that failed.

    Never raises for ordinary errors: anything unexpected is logged as a warning
    and reported as status 'unknown_error'.
    """
    try:
        video_filename = f"dia{example['Dialogue_ID']}_utt{example['Utterance_ID']}.mp4"

        # Determine the correct video folder for the current split
        current_video_folder = video_folders.get(split_name)
        if not current_video_folder or not os.path.exists(current_video_folder):
            return _failed_example('folder_not_found')

        full_video_path = os.path.join(current_video_folder, video_filename)
        if not os.path.exists(full_video_path):
            return _failed_example('file_not_found')

        # Read the frame in the middle of the clip. The capture is released on
        # every path, including "cannot open" and exceptions raised while seeking.
        cap = cv2.VideoCapture(full_video_path)
        try:
            if not cap.isOpened():
                return _failed_example('cant_open_video')
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Clamp: never seek to a negative index if the frame count is bogus.
            cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, frame_count // 2))
            ret, frame = cap.read()
        finally:
            cap.release()

        if not ret or frame is None:
            return _failed_example('cant_read_frame')

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        faces = detector.detect_faces(frame_rgb)
        if not faces:
            return _failed_example('no_face_detected')

        # NOTE(review): the first detection is used; when several faces are in the
        # frame this is not necessarily the speaker - confirm this is acceptable.
        x, y, w, h = faces[0]['box']
        padding = 20
        # Grow the box by `padding` pixels on each side, clamped to the frame.
        x1, y1 = max(0, x - padding), max(0, y - padding)
        x2, y2 = min(frame_rgb.shape[1], x + w + padding), min(frame_rgb.shape[0], y + h + padding)
        face_crop = frame_rgb[y1:y2, x1:x2]
        if face_crop.size == 0:
            return _failed_example('empty_crop')

        face_image = Image.fromarray(face_crop)
        try:
            inputs = processor(images=face_image, return_tensors="pt")
            pixel_values = inputs['pixel_values'].squeeze(0)
        except Exception:
            return _failed_example('processor_error')

        # A missing or non-string emotion (e.g. NaN from pandas) maps to -1 instead
        # of crashing; rows with label -1 are dropped by the later filtering step.
        emotion = example.get('Emotion')
        if isinstance(emotion, str):
            emotion_label = _EMOTION_TO_LABEL.get(emotion.strip().lower(), -1)
        else:
            emotion_label = -1

        return {'pixel_values': pixel_values, 'label': emotion_label, 'status': 'success'}

    except Exception as e:
        # Keep the mapping run alive, but leave a trace: a systematic problem (for
        # example an undefined `detector`) would otherwise discard every row silently.
        logging.getLogger(__name__).warning(
            "Preprocessing failed for split %r, example %r: %r", split_name, example, e
        )
        return _failed_example('unknown_error')


# Run the per-example preprocessing over every split that was converted successfully.
# A single worker process is used (num_proc = 1).
num_proc = 1
processed_datasets = DatasetDict()

print(f"\nApplying preprocessing function using {num_proc} processes (this will be slower)...")
for split_name in successful_conversions:
    print(f"Processing '{split_name}' split...")
    split_dataset = raw_datasets[split_name]
    processed_datasets[split_name] = split_dataset.map(
        map_preprocess_function,
        batched=False,  # one example per call
        num_proc=num_proc,
        fn_kwargs={'split_name': split_name},  # tells the function which folder to use
        remove_columns=split_dataset.column_names,  # keep only what the function returns
    )
    print(f"Finished processing '{split_name}'.")


# --- 5. Filter out failed examples ---
def _is_valid_example(example):
    """Keep examples that have pixel values and a non-negative label."""
    return example['pixel_values'] is not None and example['label'] >= 0


print("\nFiltering out examples where preprocessing failed...")
filtered_datasets = DatasetDict()
for split_name, processed_split in processed_datasets.items():
    valid_split = processed_split.filter(_is_valid_example)
    # The 'status' column is only needed for filtering; drop it when present.
    if 'status' in valid_split.column_names:
        valid_split = valid_split.remove_columns(['status'])
    filtered_datasets[split_name] = valid_split

    print(f"'{split_name}' split: {len(processed_split)} processed -> {len(valid_split)} valid examples.")


# --- 6. Set Format for PyTorch ---
print("\nSetting dataset format to PyTorch tensors...")
try:
    filtered_datasets.set_format("torch", columns=["pixel_values", "label"])
    print("Format set successfully.")
except Exception as e:
    print(f"Error setting format: {e}")

# --- Final Check ---
print("\n--- Preprocessing Complete ---")
print(f"Final dataset structure: {filtered_datasets}")
# The train split can exist and still be empty when every example was filtered
# out; indexing [0] would then raise, so check the length as well.
if "train" in filtered_datasets and len(filtered_datasets["train"]) > 0:
    first_example = filtered_datasets["train"][0]
    print("\nFirst example from processed training set:")
    print(first_example)
    print(f"Label: {first_example['label']}")
    print(f"Tensor shape: {first_example['pixel_values'].shape}")
else:
    print("\nNo valid training data after processing.")

Base path for videos: /content/drive/My Drive/MELD_Dataset/MELD.Raw
Initializing MTCNN face detector...
MTCNN detector initialized.

Converting pandas DataFrames to Datasets...
Successfully converted train DataFrame.
Successfully converted dev DataFrame.
Successfully converted test DataFrame.

Raw Datasets structure: DatasetDict({
    train: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 9989
    })
    dev: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 1109
    })
    test: Dataset({
        features: ['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
        num_rows: 2610
    })
})

Applying preprocessing function using 1 processes (

Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

KeyboardInterrupt: 