In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import LabelEncoder # <--- Added Import

# --- 1. Load Text Embeddings ---
print("Step 1: Loading Text Embeddings...")
# Prefer the copy on the mounted Google Drive; if that path is absent, fall
# back to a file of the same name in the current working directory.
pkl_path = '/content/drive/MyDrive/DS 340 Project/MELD.Raw/text_emotion.pkl'
if not os.path.exists(pkl_path):
    pkl_path = 'text_emotion.pkl' # Fallback to local

# Everything that needs the embeddings is placed in the `else` branch, so a
# missing pickle only prints an error instead of raising.
if not os.path.exists(pkl_path):
    print("‚ùå ERROR: 'text_emotion.pkl' not found.")
else:
    # SECURITY NOTE: pickle.load can execute arbitrary code while
    # deserializing; only load this file if it comes from a trusted source.
    with open(pkl_path, 'rb') as f:
        text_data = pickle.load(f)

    # --- 2. Load Labels from CSV ---
    print("Step 2: Loading CSV Labels...")
    base_path = '/content/drive/MyDrive/DS 340 Project/MELD.Raw'
    # The embedding pickle falls back to the current working directory when
    # the Drive copy is absent; do the same for the label CSVs so a local run
    # that found the pickle does not crash here on the Drive-only path.
    if not os.path.isdir(base_path):
        base_path = '.'

    # One label table per MELD split (train / dev / test).
    train_labels_df = pd.read_csv(os.path.join(base_path, 'train_sent_emo.csv'))
    dev_labels_df = pd.read_csv(os.path.join(base_path, 'dev_sent_emo.csv'))
    test_labels_df = pd.read_csv(os.path.join(base_path, 'test_sent_emo.csv'))

    # Helper to map IDs to Emotions
    def create_label_dict(df):
        """
        Build a lookup from (Dialogue_ID, Utterance_ID) to Emotion.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns 'Dialogue_ID', 'Utterance_ID' and
            'Emotion'.

        Returns
        -------
        dict
            Maps ``(dialogue_id, utterance_id)`` tuples to the emotion label
            of that row. If a pair occurs more than once, the last row wins.
        """
        # Zip the three columns directly instead of using iterrows(), which
        # builds a Series per row (slow) and may coerce dtypes across columns.
        id_pairs = zip(df['Dialogue_ID'], df['Utterance_ID'])
        return dict(zip(id_pairs, df['Emotion']))

    # One (Dialogue_ID, Utterance_ID) -> Emotion lookup per split, built in
    # train / dev / test order.
    train_label_dict, dev_label_dict, test_label_dict = map(
        create_label_dict,
        (train_labels_df, dev_labels_df, test_labels_df),
    )

    # --- 3. Extract IDs and Sync ---
    def extract_ids_and_labels(text_dict, label_dict):
        """
        Collect the dialogue/utterance IDs that have a label, with the label.

        For each dialogue key in ``text_dict`` the row count of its value
        (``.shape[0]``) is taken as the number of utterances; row position
        ``i`` is looked up in ``label_dict`` as ``(int(dialogue_key), i)``.
        Positions without a label are skipped.

        Parameters
        ----------
        text_dict : dict
            Maps a dialogue key (anything ``int()`` accepts) to an object
            exposing ``.shape[0]``.
        label_dict : dict
            Maps ``(dialogue_id, utterance_id)`` integer tuples to labels.

        Returns
        -------
        tuple of (list, list, list)
            Dialogue keys (as stored in ``text_dict``), utterance positions
            and labels, index-aligned with one another.
        """
        matched = []
        for dialogue, features in text_dict.items():
            for position in range(features.shape[0]):
                lookup = (int(dialogue), position)
                if lookup not in label_dict:
                    continue
                matched.append((dialogue, position, label_dict[lookup]))

        dia_ids = [dialogue for dialogue, _, _ in matched]
        utt_ids = [position for _, position, _ in matched]
        emotions = [emotion for _, _, emotion in matched]
        return dia_ids, utt_ids, emotions

    print("Step 3: Generating ID lists...")
    # text_data unpacks into one dict per split, in train / val / test order.
    train_text_dict, val_text_dict, test_text_dict = text_data

    # Index-aligned ID and label lists per split; the spectrogram cell uses
    # the ID lists to look up the matching audio files.
    train_dia_ids, train_utt_ids, train_emotions = extract_ids_and_labels(
        train_text_dict, train_label_dict
    )
    val_dia_ids, val_utt_ids, val_emotions = extract_ids_and_labels(
        val_text_dict, dev_label_dict
    )
    test_dia_ids, test_utt_ids, test_emotions = extract_ids_and_labels(
        test_text_dict, test_label_dict
    )

    # --- 4. ENCODE LABELS ---
    print("Step 4: Encoding Labels...")
    # Fit on the union of all three splits so every split shares a single
    # label -> integer mapping.
    le = LabelEncoder().fit([*train_emotions, *val_emotions, *test_emotions])

    # Integer-encoded targets, index-aligned with the ID lists above.
    y_train, y_val, y_test = (
        le.transform(split_emotions)
        for split_emotions in (train_emotions, val_emotions, test_emotions)
    )

    print("‚úÖ Success! Variables restored and encoded.")
    print(f"Train samples: {len(y_train)}")
    print(f"Classes: {le.classes_}")

Step 1: Loading Text Embeddings...
Step 2: Loading CSV Labels...
Step 3: Generating ID lists...
Step 4: Encoding Labels...
‚úÖ Success! Variables restored and encoded.
Train samples: 9989
Classes: ['anger' 'disgust' 'fear' 'joy' 'neutral' 'sadness' 'surprise']


# Generate spectrograms from previously extracted MP4 files

In [None]:
import librosa
import numpy as np
import os
import glob
from tqdm import tqdm
from google.colab import drive

# --- 1. SET UP ENVIRONMENT AND PATHS ---

# Mount Google Drive only if /content/drive is not already present. A mount
# failure is reported but not re-raised, so the rest of the cell still runs.
try:
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive', force_remount=True)
        print("‚úì Drive mounted.")
except Exception:
    print("Drive mounting failed. Please ensure the Drive mount cell is run.")

# Directories on the local Colab disk that are searched for each split's
# MP4/WAV files.
# NOTE(review): nothing in this cell creates or populates these directories;
# they must already have been filled by a separate extraction step.
ACTUAL_MP4_PATHS = {
    'Train': '/content/meld_audio_splits/train',
    'Validation': '/content/meld_audio_splits/dev/dev_splits_complete',
    'Test': '/content/meld_audio_splits/test'
}

# Informational only: the path is printed here, not checked for existence.
print(f"Verified MP4 Source Path for Train: {ACTUAL_MP4_PATHS['Train']}")
print("-" * 50)


# --- 2. Improved Spectrogram Function ---
def process_audio_to_spectrogram(file_path, max_time_steps=128, n_mels=128):
    """
    Convert an audio/video file into a normalized log-mel spectrogram image.

    At most the first 3 seconds are loaded at 16 kHz, turned into a mel
    spectrogram with ``n_mels`` bands, converted to dB, min-max scaled to
    [0, 1], and zero-padded or truncated along the time axis to exactly
    ``max_time_steps`` columns.

    Parameters
    ----------
    file_path : str
        Path of the file handed to ``librosa.load``.
    max_time_steps : int, default 128
        Number of time columns in the output.
    n_mels : int, default 128
        Number of mel bands (rows) in the output.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(n_mels, max_time_steps, 1)``. If loading or
        processing fails, a warning is emitted and an all-zero array of the
        same shape and dtype is returned.
    """
    import warnings

    try:
        # Load audio (limited to 3 seconds to bound the spectrogram width)
        y, sr = librosa.load(file_path, sr=16000, duration=3.0)

        # Mel spectrogram -> log scale (dB relative to the peak)
        mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mels_db = librosa.power_to_db(mels, ref=np.max)

        # Min-max scaling; a (near-)constant spectrogram maps to all zeros
        # instead of dividing by ~0.
        min_val = mels_db.min()
        max_val = mels_db.max()
        if max_val - min_val > 1e-8:
            mels_db = (mels_db - min_val) / (max_val - min_val)
        else:
            mels_db = np.zeros_like(mels_db)

        # Pad (with zeros, i.e. the scaled minimum) or truncate the time axis.
        current_width = mels_db.shape[1]
        if current_width < max_time_steps:
            padding = max_time_steps - current_width
            mels_db = np.pad(mels_db, ((0, 0), (0, padding)), mode='constant')
        else:
            mels_db = mels_db[:, :max_time_steps]

        # Add the channel axis and pin the dtype so real and fallback outputs
        # stack into one float32 array without upcasting.
        return mels_db[..., np.newaxis].astype(np.float32, copy=False)

    except Exception as e:
        # Best effort: keep the caller's index alignment with a blank image,
        # but surface the failure instead of hiding it.
        warnings.warn(
            f"Could not process {file_path!r}: {e}; using a blank spectrogram.",
            RuntimeWarning,
        )
        return np.zeros((n_mels, max_time_steps, 1), dtype=np.float32)


# --- 3. Optimized File Loading and Processing ---
def find_and_process_optimized(dia_ids, utt_ids, split_name, base_path):
    """
    Build spectrograms for the given utterances, aligned by index.

    All ``.mp4``/``.wav`` files under ``base_path`` are indexed by file name
    (recursively). For each ``(dia_id, utt_id)`` pair the file
    ``dia{dia_id}_utt{utt_id}.mp4`` (or ``.wav``) is converted with
    ``process_audio_to_spectrogram``; pairs without a file keep an all-zero
    spectrogram so row ``i`` always belongs to pair ``i``.

    Parameters
    ----------
    dia_ids : list
        Dialogue IDs, index-aligned with ``utt_ids``.
    utt_ids : list
        Utterance IDs.
    split_name : str
        Split name used only in log messages and the progress bar.
    base_path : str
        Directory searched recursively for audio/video files.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(number_of_pairs, 128, 128, 1)``. If
        ``base_path`` does not exist, an error is printed and an empty array
        (``np.array([])``) is returned instead.
    """
    print(f"Indexing audio files for {split_name} set from: {base_path}")

    if not os.path.exists(base_path):
        print(f"Error: Base path {base_path} does not exist. Did the extraction run correctly?")
        return np.array([])

    # --- OPTIMIZATION: Index files once so each lookup below is O(1) ---
    audio_file_index = {}
    file_count = 0
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith(('.mp4', '.wav')):
                audio_file_index[file] = os.path.join(root, file)
                file_count += 1

    print(f"Indexed {file_count} audio/video files in the local Colab disk.")

    target_count = len(dia_ids)
    print(f"Generating spectrograms for {target_count} target labels...")

    # Preallocate the output instead of collecting a list and copying it with
    # np.array(): this roughly halves peak memory, fixes the dtype to float32
    # and leaves missing entries as blank (zero) spectrograms automatically.
    id_pairs = list(zip(dia_ids, utt_ids))
    spectrograms = np.zeros((len(id_pairs), 128, 128, 1), dtype=np.float32)
    missing_count = 0

    # Iterate strictly over the provided IDs
    progress = tqdm(id_pairs, total=target_count, desc=f"Processing {split_name}")
    for row, (d_id, u_id) in enumerate(progress):
        filename_mp4 = f"dia{d_id}_utt{u_id}.mp4"
        filename_wav = f"dia{d_id}_utt{u_id}.wav"

        # Prefer the .mp4 when both variants exist.
        path = None
        if filename_mp4 in audio_file_index:
            path = audio_file_index[filename_mp4]
        elif filename_wav in audio_file_index:
            path = audio_file_index[filename_wav]

        if path:
            spectrograms[row] = process_audio_to_spectrogram(path)
        else:
            # CRITICAL: the row stays zero-filled to maintain index alignment
            missing_count += 1

    if missing_count > 0:
        print(f"\nWarning: {missing_count} audio files were not found for the {split_name} set.")

    return spectrograms


# --- 4. Execution and Saving ---
print("\n" + "=" * 70)
print("üéØ STARTING SYNCHRONIZED SPECTROGRAM GENERATION")
print("=" * 70)

# The following variables (train_dia_ids, train_utt_ids, etc.) MUST be defined
# by the label-processing cell before this cell runs; otherwise a NameError
# will occur.

# Process Train Set
X_audio_train = find_and_process_optimized(
    train_dia_ids, train_utt_ids, "Train", ACTUAL_MP4_PATHS['Train']
)

# Process Validation Set (Dev)
X_audio_val = find_and_process_optimized(
    val_dia_ids, val_utt_ids, "Validation", ACTUAL_MP4_PATHS['Validation']
)

# Process Test Set
X_audio_test = find_and_process_optimized(
    test_dia_ids, test_utt_ids, "Test", ACTUAL_MP4_PATHS['Test']
)

print(f"\n{'=' * 70}")
print(f"‚úÖ SPECTROGRAM GENERATION COMPLETE")
print(f"{'=' * 70}")

print(f"\nFinal Audio Data Shapes:")
print(f"  Train: {X_audio_train.shape}")
print(f"  Val:   {X_audio_val.shape}")
print(f"  Test:  {X_audio_test.shape}")


# --- SAVING TO GOOGLE DRIVE ---
SAVE_DIR = '/content/drive/MyDrive/DS 340 Project/MELD_SPECTROGRAMS/'

# find_and_process_optimized returns an empty array when a split's source
# directory is missing. Saving in that state would overwrite any good files
# on Drive with empty ones and still report success, so check that every
# array has one row per requested ID before writing anything.
_expected_rows = {
    'Train': (X_audio_train, len(train_dia_ids)),
    'Validation': (X_audio_val, len(val_dia_ids)),
    'Test': (X_audio_test, len(test_dia_ids)),
}
_misaligned = [
    f"{_name} (got {len(_arr)}, expected {_count})"
    for _name, (_arr, _count) in _expected_rows.items()
    if len(_arr) != _count
]

if _misaligned:
    print(f"\nError: spectrogram arrays do not match the label counts: {', '.join(_misaligned)}.")
    print("Nothing was saved. Check the audio source paths and re-run this cell.")
else:
    # Plain-Python replacement for the `!mkdir -p` shell escape.
    os.makedirs(SAVE_DIR, exist_ok=True)

    print(f"\nSaving generated spectrogram arrays to Drive: {SAVE_DIR}")

    np.save(f"{SAVE_DIR}X_audio_train.npy", X_audio_train)
    np.save(f"{SAVE_DIR}X_audio_val.npy", X_audio_val)
    np.save(f"{SAVE_DIR}X_audio_test.npy", X_audio_test)

    print("‚úì All Spectrogram Arrays Saved Successfully to Google Drive!")

Verified MP4 Source Path for Train: /content/meld_audio_splits/train
--------------------------------------------------

üéØ STARTING SYNCHRONIZED SPECTROGRAM GENERATION
Indexing audio files for Train set from: /content/meld_audio_splits/train
Error: Base path /content/meld_audio_splits/train does not exist. Did the extraction run correctly?
Indexing audio files for Validation set from: /content/meld_audio_splits/dev/dev_splits_complete
Error: Base path /content/meld_audio_splits/dev/dev_splits_complete does not exist. Did the extraction run correctly?
Indexing audio files for Test set from: /content/meld_audio_splits/test
Error: Base path /content/meld_audio_splits/test does not exist. Did the extraction run correctly?

‚úÖ SPECTROGRAM GENERATION COMPLETE

Final Audio Data Shapes:
  Train: (0,)
  Val:   (0,)
  Test:  (0,)

Saving generated spectrogram arrays to Drive: /content/drive/MyDrive/DS 340 Project/MELD_SPECTROGRAMS/
‚úì All Spectrogram Arrays Saved Successfully to Google Driv

# Extract audio archives and convert audio to spectrograms

In [None]:
import librosa
import numpy as np
import os
import glob
from tqdm import tqdm
from google.colab import drive
import tarfile # Added for archive extraction

# --- 1. SET UP ENVIRONMENT AND PATHS ---

# Mount Google Drive only if /content/drive is not already present. A mount
# failure is reported but not re-raised, so the rest of the cell still runs.
try:
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive', force_remount=True)
        print("‚úì Drive mounted.")
except Exception:
    print("Drive mounting failed. Please ensure the Drive mount cell is run.")

# Directories on the local Colab disk where each split's MP4 files are
# EXPECTED after extraction; they double as the extraction destinations and
# as the search roots for spectrogram generation.
ACTUAL_MP4_PATHS = {
    'Train': '/content/meld_audio_splits/train',
    'Validation': '/content/meld_audio_splits/dev/dev_splits_complete',
    'Test': '/content/meld_audio_splits/test'
}

# Path to the directory containing the .tar.gz archives (in Google Drive)
MELD_RAW_PATH = '/content/drive/MyDrive/DS 340 Project/MELD.Raw/'

# Informational only: the path is printed here, not checked for existence.
print(f"Verified MP4 Source Path for Train: {ACTUAL_MP4_PATHS['Train']}")
print("-" * 50)


# --- NEW: 2. EXTRACT MP4 FILES FROM THE .tar.gz ARCHIVES ---
print("Step 2: Checking for and extracting audio archives...")

# Archive file name (inside MELD_RAW_PATH) -> local extraction directory.
archive_details = {
    'train.tar.gz': ACTUAL_MP4_PATHS['Train'],
    'dev.tar.gz': ACTUAL_MP4_PATHS['Validation'],
    'test.tar.gz': ACTUAL_MP4_PATHS['Test']
}

for archive_name, dest_path in archive_details.items():
    source_archive_path = os.path.join(MELD_RAW_PATH, archive_name)

    # Skip archives that were already unpacked. The search is recursive,
    # matching how the audio files are indexed later: an archive may unpack
    # into nested folders, which a top-level listdir() would not see, causing
    # a full re-extraction on every run. os.walk yields nothing for a
    # directory that does not exist yet.
    already_extracted = any(
        member.endswith(('.mp4', '.wav'))
        for _, _, members in os.walk(dest_path)
        for member in members
    )
    if already_extracted:
        print(f"‚úì {archive_name} already extracted to {dest_path}. Skipping.")
        continue

    if os.path.exists(source_archive_path):
        print(f"Extracting {archive_name} to {dest_path}...")
        # Plain-Python replacement for the `!mkdir -p` shell escape.
        os.makedirs(dest_path, exist_ok=True)
        try:
            with tarfile.open(source_archive_path, 'r:gz') as tar:
                # Use the 'data' extraction filter where the running Python
                # supports it: it blocks members that would escape dest_path
                # and avoids the DeprecationWarning for unfiltered extraction.
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(path=dest_path, filter='data')
                else:
                    tar.extractall(path=dest_path)
            print(f"‚úÖ Successfully extracted {archive_name}.")
        except Exception as e:
            print(f"‚ùå Error extracting {archive_name}: {e}")
    else:
        print(f"‚ö†Ô∏è  Archive not found: {source_archive_path}. Skipping extraction for this split.")

print("Extraction check complete.")
print("-" * 50)

# --- OLD 2 / NEW 3. Improved Spectrogram Function ---
def process_audio_to_spectrogram(file_path, max_time_steps=128, n_mels=128):
    """
    Convert an audio/video file into a normalized log-mel spectrogram image.

    At most the first 3 seconds are loaded at 16 kHz, turned into a mel
    spectrogram with ``n_mels`` bands, converted to dB, min-max scaled to
    [0, 1], and zero-padded or truncated along the time axis to exactly
    ``max_time_steps`` columns.

    Parameters
    ----------
    file_path : str
        Path of the file handed to ``librosa.load``.
    max_time_steps : int, default 128
        Number of time columns in the output.
    n_mels : int, default 128
        Number of mel bands (rows) in the output.

    Returns
    -------
    np.ndarray
        float32 array of shape ``(n_mels, max_time_steps, 1)``. If loading or
        processing fails, a warning is emitted and an all-zero array of the
        same shape and dtype is returned.
    """
    import warnings

    try:
        # Load audio (limited to 3 seconds to bound the spectrogram width)
        y, sr = librosa.load(file_path, sr=16000, duration=3.0)

        # Mel spectrogram -> log scale (dB relative to the peak)
        mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        mels_db = librosa.power_to_db(mels, ref=np.max)

        # Min-max scaling; a (near-)constant spectrogram maps to all zeros
        # instead of dividing by ~0.
        min_val = mels_db.min()
        max_val = mels_db.max()
        if max_val - min_val > 1e-8:
            mels_db = (mels_db - min_val) / (max_val - min_val)
        else:
            mels_db = np.zeros_like(mels_db)

        # Pad (with zeros, i.e. the scaled minimum) or truncate the time axis.
        current_width = mels_db.shape[1]
        if current_width < max_time_steps:
            padding = max_time_steps - current_width
            mels_db = np.pad(mels_db, ((0, 0), (0, padding)), mode='constant')
        else:
            mels_db = mels_db[:, :max_time_steps]

        # Add the channel axis and pin the dtype so real and fallback outputs
        # stack into one float32 array without upcasting.
        return mels_db[..., np.newaxis].astype(np.float32, copy=False)

    except Exception as e:
        # Best effort: keep the caller's index alignment with a blank image,
        # but surface the failure instead of hiding it.
        warnings.warn(
            f"Could not process {file_path!r}: {e}; using a blank spectrogram.",
            RuntimeWarning,
        )
        return np.zeros((n_mels, max_time_steps, 1), dtype=np.float32)


# --- OLD 3 / NEW 4. Optimized File Loading and Processing ---
def find_and_process_optimized(dia_ids, utt_ids, split_name, base_path):
    """
    Build spectrograms for the given utterances, aligned by index.

    Synchronization guarantee: spectrograms are produced in the exact order
    of ``dia_ids``/``utt_ids`` (the filtered ID lists from the label step),
    so the i-th spectrogram belongs to the i-th label. Utterances whose audio
    file is missing keep an all-zero spectrogram, so no later trimming or
    re-alignment is needed.

    All ``.mp4``/``.wav`` files under ``base_path`` are indexed by file name
    (recursively); each pair is then looked up as
    ``dia{dia_id}_utt{utt_id}.mp4`` (or ``.wav``) and converted with
    ``process_audio_to_spectrogram``.

    Parameters
    ----------
    dia_ids : list
        List of dialogue IDs from the label processing step
    utt_ids : list
        List of utterance IDs from the label processing step
    split_name : str
        Name of the split (Train/Validation/Test) for logging
    base_path : str
        Base directory containing the audio files

    Returns
    -------
    np.ndarray
        float32 array of shape ``(number_of_pairs, 128, 128, 1)``. If
        ``base_path`` does not exist, an error is printed and an empty array
        (``np.array([])``) is returned instead.
    """
    print(f"Indexing audio files for {split_name} set from: {base_path}")

    if not os.path.exists(base_path):
        print(f"Error: Base path {base_path} does not exist. Did the extraction run correctly?")
        return np.array([])

    # --- OPTIMIZATION: Index files once so each lookup below is O(1) ---
    # Recursively walk through folders and find all .mp4 and .wav files
    audio_file_index = {}
    file_count = 0
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith(('.mp4', '.wav')):
                audio_file_index[file] = os.path.join(root, file)
                file_count += 1

    print(f"Indexed {file_count} audio/video files in the local Colab disk.")

    print(f"Generating spectrograms for {len(dia_ids)} utterances...")
    print(f"  ‚Üí Processing ONLY the IDs from the filtered label set")
    print(f"  ‚Üí This ensures 1:1 correspondence with y_{split_name.lower()}")

    # Preallocate the output instead of collecting a list and copying it with
    # np.array(): this roughly halves peak memory, fixes the dtype to float32
    # and leaves missing entries as blank (zero) spectrograms automatically.
    id_pairs = list(zip(dia_ids, utt_ids))
    spectrograms = np.zeros((len(id_pairs), 128, 128, 1), dtype=np.float32)
    missing_count = 0

    progress = tqdm(id_pairs, total=len(dia_ids), desc=f"Processing {split_name}")
    for row, (d_id, u_id) in enumerate(progress):
        # MELD filenames are consistent: diaX_uttY.mp4
        filename_mp4 = f"dia{d_id}_utt{u_id}.mp4"
        filename_wav = f"dia{d_id}_utt{u_id}.wav"

        # Prefer the .mp4 when both variants exist.
        path = None
        if filename_mp4 in audio_file_index:
            path = audio_file_index[filename_mp4]
        elif filename_wav in audio_file_index:
            path = audio_file_index[filename_wav]

        if path:
            spectrograms[row] = process_audio_to_spectrogram(path)
        else:
            # IMPORTANT: the row stays zero-filled to maintain index alignment
            missing_count += 1

    if missing_count > 0:
        print(f"\nWarning: {missing_count} audio files were not found for the {split_name} set.")
        print(f"  ‚Üí Zero spectrograms were inserted to maintain alignment with labels")

    return spectrograms


# --- OLD 4 / NEW 5. Execution and Saving ---
print("\n" + "=" * 70)
print("üéØ STARTING SYNCHRONIZED SPECTROGRAM GENERATION")
print("=" * 70)
print("\n‚ö†Ô∏è  CRITICAL: This cell MUST be run AFTER the label processing cell!")
print("   The spectrogram generation uses the filtered ID lists as input.")
print("   This ensures perfect correspondence between audio and labels.\n")
# The following variables (train_dia_ids, train_utt_ids, etc.) MUST be defined
# by the label-processing cell before this cell runs; otherwise a NameError
# will occur.

# Process Train Set
# The train_dia_ids and train_utt_ids come from the label filtering step,
# so X_audio_train[i] is meant to correspond to y_train[i].
X_audio_train = find_and_process_optimized(
    train_dia_ids, train_utt_ids, "Train", ACTUAL_MP4_PATHS['Train']
)

# Process Validation Set (Dev)
X_audio_val = find_and_process_optimized(
    val_dia_ids, val_utt_ids, "Validation", ACTUAL_MP4_PATHS['Validation']
)

# Process Test Set
X_audio_test = find_and_process_optimized(
    test_dia_ids, test_utt_ids, "Test", ACTUAL_MP4_PATHS['Test']
)

print(f"\n{'=' * 70}")
print(f"‚úÖ SPECTROGRAM GENERATION COMPLETE")
print(f"{'=' * 70}")
print(f"\nFinal Audio Data Shapes:")
print(f"  Train: {X_audio_train.shape}")
print(f"  Val:   {X_audio_val.shape}")
print(f"  Test:  {X_audio_test.shape}")

# --- SAVING TO GOOGLE DRIVE ---
SAVE_DIR = '/content/drive/MyDrive/DS 340 Project/MELD_SPECTROGRAMS/'

# find_and_process_optimized returns an empty array when a split's source
# directory is missing. Saving in that state would overwrite any good files
# on Drive with empty ones and still report success, so check that every
# array has one row per requested ID before writing anything.
_expected_rows = {
    'Train': (X_audio_train, len(train_dia_ids)),
    'Validation': (X_audio_val, len(val_dia_ids)),
    'Test': (X_audio_test, len(test_dia_ids)),
}
_misaligned = [
    f"{_name} (got {len(_arr)}, expected {_count})"
    for _name, (_arr, _count) in _expected_rows.items()
    if len(_arr) != _count
]

if _misaligned:
    print(f"\nError: spectrogram arrays do not match the label counts: {', '.join(_misaligned)}.")
    print("Nothing was saved. Check the audio source paths and re-run this cell.")
else:
    # Create save directory on Drive (plain-Python replacement for the
    # `!mkdir -p` shell escape).
    os.makedirs(SAVE_DIR, exist_ok=True)

    print(f"\nSaving generated spectrogram arrays to Drive: {SAVE_DIR}")

    np.save(f"{SAVE_DIR}X_audio_train.npy", X_audio_train)
    np.save(f"{SAVE_DIR}X_audio_val.npy", X_audio_val)
    np.save(f"{SAVE_DIR}X_audio_test.npy", X_audio_test)

    print("‚úì All Spectrogram Arrays Saved Successfully to Google Drive!")

Verified MP4 Source Path for Train: /content/meld_audio_splits/train
--------------------------------------------------
Step 2: Checking for and extracting audio archives...
Extracting train.tar.gz to /content/meld_audio_splits/train...


  tar.extractall(path=dest_path)


‚úÖ Successfully extracted train.tar.gz.
Extracting dev.tar.gz to /content/meld_audio_splits/dev/dev_splits_complete...
‚úÖ Successfully extracted dev.tar.gz.
Extracting test.tar.gz to /content/meld_audio_splits/test...
‚úÖ Successfully extracted test.tar.gz.
Extraction check complete.
--------------------------------------------------

üéØ STARTING SYNCHRONIZED SPECTROGRAM GENERATION

‚ö†Ô∏è  CRITICAL: This cell MUST be run AFTER the label processing cell!
   The spectrogram generation uses the filtered ID lists as input.
   This ensures perfect correspondence between audio and labels.

Indexing audio files for Train set from: /content/meld_audio_splits/train
Indexed 9989 audio/video files in the local Colab disk.
Generating spectrograms for 9989 utterances...
  ‚Üí Processing ONLY the IDs from the filtered label set
  ‚Üí This ensures 1:1 correspondence with y_train


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(file_path, sr=16000, duration=3.0)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(file_path, sr=16000, duration=3.0)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(file_path, sr=16000, duration=3.0)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(file_path, sr=16000, duration=3.0)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_nativ