In [20]:
import pandas as pd
import numpy as np
import os
from scipy.io import loadmat


In [41]:
# Read the quality ratings, using the first CSV column as the index, and
# transpose in the same expression so the CSV's header labels become the
# row labels and its index values become the column labels.
quality_df = pd.read_csv('Data_Quality_Evaluation.csv', index_col=0).transpose()


In [42]:
# Preview the first five rows of the transposed quality table
quality_df.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,2,1,1,4,1,5,3,5,1,5,...,5,1,1,5,3,4,1,1,5,1
2,1,4,5,1,1,5,1,1,1,1,...,3,1,1,3,1,1,1,2,1,2
3,1,1,1,1,1,1,1,1,3,1,...,1,4,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,5
5,5,5,4,3,3,4,4,4,5,4,...,4,3,4,3,3,3,3,3,5,4


In [36]:
# Function to extract first row from a .mat file
def get_first_row_from_mat(file_path, data_key='ThisEEG'):
    """
    Load a .mat file and extract the first row of one variable as a pandas Series.

    Parameters
    ----------
    file_path : str
        Path of the .mat file; handed unchanged to ``scipy.io.loadmat``.
    data_key : str, optional
        Name of the variable to read from the loaded file.
        Defaults to 'ThisEEG'.

    Returns
    -------
    pandas.Series or None
        The first row of the variable, flattened to one dimension.
        None is returned, after printing a message, when the variable is
        absent or when loading/extraction raises any exception (for
        example because the file does not exist).
    """
    try:
        mat_data = loadmat(file_path)

        if data_key not in mat_data:
            # Report the key that was actually requested
            print(f"Warning: '{data_key}' key not found in {file_path}")
            return None

        data_array = mat_data[data_key]

        # Extract first row
        n_dims = len(data_array.shape)
        if n_dims >= 2:
            # Row 0 along the first axis; any remaining axes are flattened below
            first_row = data_array[0, :]
        elif n_dims == 1:
            # 1-D input: take the first element, or keep the array if it is empty
            first_row = data_array[0] if data_array.shape[0] > 0 else data_array
        else:
            # 0-d input: flatten to a one-element array
            first_row = data_array.flatten()

        # Convert to pandas Series
        return pd.Series(first_row.flatten())

    except Exception as e:
        # Deliberate best-effort: callers treat None as "no data for this
        # file", so every failure is reported and converted to None.
        print(f"Error loading {file_path}: {e}")
        return None


In [43]:
# Assemble the main table of EEG first rows: one row per participant and
# one column per video clip.
# Participant labels are the strings "1".."58" (strings, to match quality_df);
# clip labels are the integers 1..36.
participants = list(map(str, range(1, 59)))
video_clips = [*range(1, 37)]

# object dtype so that each cell can hold an entire pandas Series (or None)
eeg_df = pd.DataFrame(index=participants, columns=video_clips, dtype=object)

for participant in participants:
    # Folder names carry a zero-padded two-digit id, e.g. "P01"
    participant_folder = f"P{int(participant):02d}"

    for video_id in video_clips:
        file_path = f'EEGData/Movie_{participant_folder}/EEG_Clip{video_id}.mat'
        first_row_series = get_first_row_from_mat(file_path)

        # The loader already returns None on failure, so its result is
        # stored as-is: a Series when loading worked, None otherwise.
        eeg_df.at[participant, video_id] = first_row_series

eeg_df.shape

(58, 36)

In [40]:
# Preview the first five participants; each cell holds a Series (or None)
eeg_df.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,0 336.0 1 234.5 2 133.0 3 ...,0 435.0 1 435.0 2 372.0 3 ...,0 325.0 1 325.0 2 328.5 3 ...,0 374.0 1 374.0 2 374.0 3 ...,0 264.0 1 260.0 2 256.0 3 ...,0 272.5 1 268.0 2 268.0 3 ...,0 345.5 1 314.0 2 314.0 3 ...,0 385.0 1 385.0 2 333.5 3 ...,0 289.0 1 289.0 2 281.5 3 ...,0 470.0 1 470.0 2 381.0 3 ...,...,0 89.0 1 89.0 2 89.0 3 ...,0 305.0 1 306.5 2 308.0 3 ...,0 320.5 1 315.0 2 315.0 3 ...,0 277.0 1 278.0 2 279.0 3 ...,0 528.0 1 528.0 2 404.0 3 ...,0 135.0 1 135.0 2 161.5 3 ...,0 401.0 1 401.0 2 369.0 3 ...,0 329.0 1 329.0 2 329.0 3 ...,0 213.0 1 276.5 2 340.0 3 ...,0 458.0 1 458.0 2 458.0 3 ...
2,0 410.0 1 381.5 2 353.0 3 ...,0 257.0 1 257.0 2 257.0 3 ...,0 259.0 1 343.5 2 428.0 3 ...,0 497.0 1 337.0 2 337.0 3 ...,0 389.0 1 389.0 2 389.0 3 ...,0 295.0 1 295.0 2 305.5 3 ...,0 170.5 1 341.0 2 341.0 3 ...,0 628.0 1 628.0 2 628.0 3 ...,0 603.0 1 392.0 2 181.0 3 ...,0 700.0 1 700.0 2 700.0 3 ...,...,0 120.0 1 206.5 2 293.0 3 ...,0 749.0 1 749.0 2 624.0 3 ...,0 88.0 1 88.0 2 223.0 3 ...,0 314.0 1 314.0 2 519.5 3 ...,0 342.0 1 348.0 2 354.0 3 ...,0 217.0 1 217.0 2 256.0 3 ...,0 240.0 1 322.0 2 322.0 3 ...,0 116.5 1 194.0 2 194.0 3 ...,0 626.0 1 631.5 2 637.0 3 ...,0 362.0 1 362.0 2 362.0 3 ...
3,0 338.0 1 356.0 2 374.0 3 ...,0 300.0 1 300.0 2 328.5 3 ...,0 376.0 1 360.0 2 360.0 3 ...,0 0.0 1 0.0 2 105.0 3 ...,0 321.0 1 321.0 2 321.0 3 ...,0 132.0 1 132.0 2 150.0 3 ...,0 344.5 1 361.0 2 361.0 3 ...,0 327.5 1 360.0 2 360.0 3 ...,0 NaN 1 NaN 2 NaN 3 ...,0 358.0 1 314.0 2 314.0 3 ...,...,0 254.0 1 254.0 2 263.0 3 ...,0 NaN 1 NaN 2 NaN 3 ...,0 315.0 1 315.0 2 314.5 3 ...,0 348.5 1 348.0 2 348.0 3 ...,0 465.0 1 352.0 2 352.0 3 ...,0 244.0 1 244.0 2 244.0 3 ...,0 421.0 1 366.0 2 311.0 3 ...,0 292.0 1 232.5 2 173.0 3 ...,0 323.0 1 323.0 2 161.5 3 ...,0 459.0 1 459.0 2 459.0 3 ...
4,0 419.0 1 663.5 2 908.0 3 ...,0 267.0 1 343.5 2 420.0 3 ...,0 408.0 1 376.0 2 344.0 3 ...,0 333.0 1 226.0 2 226.0 3 ...,0 320.0 1 320.0 2 340.0 3 ...,0 456.0 1 415.5 2 375.0 3 ...,0 296.0 1 296.0 2 409.5 3 ...,0 357.0 1 374.0 2 374.0 3 ...,0 358.0 1 528.0 2 528.0 3 ...,0 182.0 1 9.0 2 9.0 3 ...,...,0 150.0 1 150.0 2 125.0 3 ...,0 283.0 1 283.0 2 303.5 3 ...,0 352.0 1 340.0 2 328.0 3 ...,0 111.0 1 111.0 2 111.0 3 ...,0 421.0 1 343.0 2 265.0 3 ...,0 148.0 1 148.0 2 185.0 3 ...,0 374.0 1 329.0 2 284.0 3 ...,0 364.0 1 364.0 2 432.0 3 ...,0 312.0 1 312.0 2 319.5 3 ...,0 796.0 1 999.0 2 999.0 3 ...
5,0 315.0 1 326.5 2 338.0 3 ...,0 362.0 1 331.5 2 301.0 3 ...,0 258.0 1 258.0 2 276.5 3 ...,0 NaN 1 NaN 2 NaN 3 ...,0 346.0 1 333.5 2 321.0 3 ...,0 284.0 1 284.0 2 311.0 3 ...,0 251.5 1 235.0 2 235.0 3 ...,0 276.0 1 498.0 2 498.0 3 ...,0 411.0 1 411.0 2 411.0 3 ...,0 374.000000 1 374.000000 2 ...,...,0 309.0 1 328.5 2 348.0 3 ...,0 183.0 1 183.0 2 183.0 3 ...,0 571.0 1 571.0 2 571.0 3 ...,0 312.0 1 312.0 2 582.0 3 ...,0 354.0 1 354.0 2 354.0 3 ...,0 330.0 1 317.5 2 305.0 3 ...,0 359.0 1 359.0 2 359.0 3 ...,0 93.5 1 187.0 2 187.0 3 ...,0 356.0 1 356.0 2 356.0 3 ...,0 8.0 1 8.0 2 70.0 3 ...


In [48]:
# Build a quality-filtered copy of the EEG table.
# Quality scale: 1 = perfect, 2 = good, 3 = ok, 4 = problematic, 5 = bad, 6 = missing
# Every cell whose quality score is >= 3 is blanked out.

# Work on a copy so that eeg_df itself is left untouched
eeg_cleaned_df = eeg_df.copy()

for participant in eeg_cleaned_df.index:
    # Participants without a row in quality_df are left unchanged
    if participant not in quality_df.index:
        continue

    for video_id in eeg_cleaned_df.columns:
        # Clips without a column in quality_df are left unchanged
        if video_id not in quality_df.columns:
            continue

        quality_score = quality_df.loc[participant, video_id]

        # A missing (NaN) score or a score below 3 keeps the data
        if pd.isna(quality_score) or quality_score < 3:
            continue

        eeg_cleaned_df.at[participant, video_id] = None


# Count the populated cells once per table and reuse the numbers below
original_count = eeg_df.notna().sum().sum()
cleaned_count = eeg_cleaned_df.notna().sum().sum()

print(f"Original non-null cells: {original_count}")
print(f"Cleaned non-null cells: {cleaned_count}")
print(f"Cells removed: {original_count - cleaned_count}")


Original non-null cells: 2088
Cleaned non-null cells: 1450
Cells removed: 638


In [52]:
# Preview the first five participants after quality filtering
eeg_cleaned_df.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,0 336.0 1 234.5 2 133.0 3 ...,0 435.0 1 435.0 2 372.0 3 ...,0 325.0 1 325.0 2 328.5 3 ...,,0 264.0 1 260.0 2 256.0 3 ...,,,,0 289.0 1 289.0 2 281.5 3 ...,,...,,0 305.0 1 306.5 2 308.0 3 ...,0 320.5 1 315.0 2 315.0 3 ...,,,,0 401.0 1 401.0 2 369.0 3 ...,0 329.0 1 329.0 2 329.0 3 ...,,0 458.0 1 458.0 2 458.0 3 ...
2,0 410.0 1 381.5 2 353.0 3 ...,,,0 497.0 1 337.0 2 337.0 3 ...,0 389.0 1 389.0 2 389.0 3 ...,,0 170.5 1 341.0 2 341.0 3 ...,0 628.0 1 628.0 2 628.0 3 ...,0 603.0 1 392.0 2 181.0 3 ...,0 700.0 1 700.0 2 700.0 3 ...,...,,0 749.0 1 749.0 2 624.0 3 ...,0 88.0 1 88.0 2 223.0 3 ...,,0 342.0 1 348.0 2 354.0 3 ...,0 217.0 1 217.0 2 256.0 3 ...,0 240.0 1 322.0 2 322.0 3 ...,0 116.5 1 194.0 2 194.0 3 ...,0 626.0 1 631.5 2 637.0 3 ...,0 362.0 1 362.0 2 362.0 3 ...
3,0 338.0 1 356.0 2 374.0 3 ...,0 300.0 1 300.0 2 328.5 3 ...,0 376.0 1 360.0 2 360.0 3 ...,0 0.0 1 0.0 2 105.0 3 ...,0 321.0 1 321.0 2 321.0 3 ...,0 132.0 1 132.0 2 150.0 3 ...,0 344.5 1 361.0 2 361.0 3 ...,0 327.5 1 360.0 2 360.0 3 ...,,0 358.0 1 314.0 2 314.0 3 ...,...,0 254.0 1 254.0 2 263.0 3 ...,,0 315.0 1 315.0 2 314.5 3 ...,0 348.5 1 348.0 2 348.0 3 ...,0 465.0 1 352.0 2 352.0 3 ...,0 244.0 1 244.0 2 244.0 3 ...,0 421.0 1 366.0 2 311.0 3 ...,0 292.0 1 232.5 2 173.0 3 ...,0 323.0 1 323.0 2 161.5 3 ...,0 459.0 1 459.0 2 459.0 3 ...
4,0 419.0 1 663.5 2 908.0 3 ...,0 267.0 1 343.5 2 420.0 3 ...,0 408.0 1 376.0 2 344.0 3 ...,0 333.0 1 226.0 2 226.0 3 ...,0 320.0 1 320.0 2 340.0 3 ...,0 456.0 1 415.5 2 375.0 3 ...,0 296.0 1 296.0 2 409.5 3 ...,0 357.0 1 374.0 2 374.0 3 ...,0 358.0 1 528.0 2 528.0 3 ...,0 182.0 1 9.0 2 9.0 3 ...,...,0 150.0 1 150.0 2 125.0 3 ...,0 283.0 1 283.0 2 303.5 3 ...,0 352.0 1 340.0 2 328.0 3 ...,0 111.0 1 111.0 2 111.0 3 ...,0 421.0 1 343.0 2 265.0 3 ...,0 148.0 1 148.0 2 185.0 3 ...,0 374.0 1 329.0 2 284.0 3 ...,0 364.0 1 364.0 2 432.0 3 ...,0 312.0 1 312.0 2 319.5 3 ...,
5,,,,,,,,,,,...,,,,,,,,,,


In [54]:
# Participants with no usable clip left after cleaning: a row qualifies
# when every one of its cells is None/NaN.
row_is_empty = eeg_cleaned_df.isna().all(axis=1)
participants_all_none = eeg_cleaned_df.index[row_is_empty].tolist()

print(f"Participants with all data quality values >= 3: {participants_all_none}")
print(f"Count: {len(participants_all_none)}")

Participants with all data quality values >= 3: ['5', '6', '9', '13', '18']
Count: 5
