In [120]:
import pandas as pd
import numpy as np
import os
from scipy.io import loadmat


In [121]:
# Read the quality ratings (first CSV column becomes the index) and transpose,
# so the CSV's column headers end up as the row labels of quality_df.
quality_df = pd.read_csv('Data_Quality_Evaluation.csv', index_col=0).T


In [122]:
# Preview the first rows of the transposed quality table; later cells look it
# up as quality_df.loc[participant, video_id].
quality_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,2,1,1,4,1,5,3,5,1,5,...,5,1,1,5,3,4,1,1,5,1
2,1,4,5,1,1,5,1,1,1,1,...,3,1,1,3,1,1,1,2,1,2
3,1,1,1,1,1,1,1,1,3,1,...,1,4,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,5
5,5,5,4,3,3,4,4,4,5,4,...,4,3,4,3,3,3,3,3,5,4


In [123]:
# Define EEG channels
# NOTE(review): the position of each name in this list is used as the row index
# into the per-clip EEG arrays (channels.index(...) / enumerate(channels) in
# later cells), so the order is assumed to match the row order of 'ThisEEG'
# in the .mat files — confirm against the dataset documentation.
channels = ["delta", "theta", "alpha1", "alpha2", "beta1", "beta2", "gamma1", "gamma2"]

# Function to extract all channels from a .mat file
def get_all_channels_from_mat(file_path):
    """
    Read the 'ThisEEG' variable from a MATLAB .mat file.

    Parameters
    ----------
    file_path : str
        Path of the .mat file to load.

    Returns
    -------
    numpy.ndarray or None
        A float64 copy of the 'ThisEEG' variable, returned in whatever layout
        it is stored in (the notebook assumes 8 x time; this is not checked
        here). None is returned, after printing a message, when the file
        cannot be loaded or does not contain a 'ThisEEG' variable.
    """
    data_key = 'ThisEEG'

    try:
        contents = loadmat(file_path)
        if data_key in contents:
            # No reshaping: the stored layout is passed through unchanged.
            return np.array(contents[data_key], dtype=np.float64)
        print(f"Warning: 'ThisEEG' key not found in {file_path}")
    except Exception as e:
        # Best effort: any load/conversion failure is reported and mapped to None.
        print(f"Error loading {file_path}: {e}")

    return None


In [124]:
# Assemble one table of raw recordings: rows are participants, columns are clips.
# Row labels are the strings "1".."58" (string labels, to line up with quality_df);
# column labels are the integers 1..36.
participants = [str(i) for i in range(1, 59)]
video_clips = list(range(1, 37))

# dtype=object so that each cell can hold a whole NumPy array (or None).
eeg_df = pd.DataFrame(index=participants, columns=video_clips, dtype=object)

for participant in participants:
    # Folder names use a zero-padded id, e.g. "7" -> "P07".
    participant_folder = f"P{int(participant):02d}"

    for video_id in video_clips:
        file_path = f'EEGData/Movie_{participant_folder}/EEG_Clip{video_id}.mat'

        # The loader returns None on failure, so its result is stored as-is.
        eeg_df.at[participant, video_id] = get_all_channels_from_mat(file_path)

eeg_df.shape

(58, 36)

In [125]:
# Preview the first rows of the raw EEG table; each cell holds the array
# returned by get_all_channels_from_mat (or None if loading failed).
eeg_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,"[[336.0, 234.5, 133.0, 133.0, 133.0, 133.0, 13...","[[435.0, 435.0, 372.0, 309.0, 309.0, 357.0, 40...","[[325.0, 325.0, 328.5, 332.0, 332.0, 332.0, 35...","[[374.0, 374.0, 374.0, 413.5, 453.0, 453.0, 45...","[[264.0, 260.0, 256.0, 256.0, 307.0, 358.0, 35...","[[272.5, 268.0, 268.0, 290.0, 312.0, 312.0, 31...","[[345.5, 314.0, 314.0, 323.0, 332.0, 332.0, 34...","[[385.0, 385.0, 333.5, 282.0, 282.0, 326.5, 37...","[[289.0, 289.0, 281.5, 274.0, 274.0, 274.0, 27...","[[470.0, 470.0, 381.0, 292.0, 292.0, 355.0, 41...",...,"[[89.0, 89.0, 89.0, 89.0, 89.0, 89.0, 188.5, 2...","[[305.0, 306.5, 308.0, 308.0, 272.5, 237.0, 23...","[[320.5, 315.0, 315.0, 315.0, 303.5, 292.0, 29...","[[277.0, 278.0, 279.0, 289.0, 299.0, 299.0, 29...","[[528.0, 528.0, 404.0, 280.0, 280.0, 261.0, 24...","[[135.0, 135.0, 161.5, 188.0, 188.0, 218.0, 24...","[[401.0, 401.0, 369.0, 337.0, 337.0, 364.5, 39...","[[329.0, 329.0, 329.0, 329.0, 329.0, 354.0, 37...","[[213.0, 276.5, 340.0, 340.0, 365.0, 390.0, 39...","[[458.0, 458.0, 458.0, 369.5, 281.0, 281.0, 33..."
2,"[[410.0, 381.5, 353.0, 353.0, 318.5, 284.0, 28...","[[257.0, 257.0, 257.0, 257.0, 257.0, 318.0, 37...","[[259.0, 343.5, 428.0, 428.0, 398.5, 369.0, 36...","[[497.0, 337.0, 337.0, 337.0, 333.0, 329.0, 32...","[[389.0, 389.0, 389.0, 352.0, 315.0, 315.0, 27...","[[295.0, 295.0, 305.5, 316.0, 316.0, 324.0, 33...","[[170.5, 341.0, 341.0, 379.0, 417.0, 417.0, 41...","[[628.0, 628.0, 628.0, 628.0, 637.0, 646.0, 64...","[[603.0, 392.0, 181.0, 181.0, 208.5, 236.0, 23...","[[700.0, 700.0, 700.0, 0.0, 0.0, 0.0, 117.0, 1...",...,"[[120.0, 206.5, 293.0, 293.0, 293.0, 299.5, 30...","[[749.0, 749.0, 624.0, 499.0, 499.0, 476.5, 45...","[[88.0, 88.0, 223.0, 358.0, 358.0, 357.5, 357....","[[314.0, 314.0, 519.5, 725.0, 725.0, 401.5, 78...","[[342.0, 348.0, 354.0, 354.0, 340.0, 326.0, 32...","[[217.0, 217.0, 256.0, 295.0, 295.0, 295.0, 32...","[[240.0, 322.0, 322.0, 462.5, 603.0, 603.0, 30...","[[116.5, 194.0, 194.0, 173.5, 153.0, 153.0, 21...","[[626.0, 631.5, 637.0, 637.0, 637.0, 476.0, 31...","[[362.0, 362.0, 362.0, 362.0, 362.0, 362.0, 36..."
3,"[[338.0, 356.0, 374.0, 374.0, 331.0, 288.0, 28...","[[300.0, 300.0, 328.5, 357.0, 357.0, 357.0, 34...","[[376.0, 360.0, 360.0, 322.0, 284.0, 284.0, 22...","[[0.0, 0.0, 105.0, 210.0, 210.0, 349.0, 488.0,...","[[321.0, 321.0, 321.0, 321.0, 321.0, 309.5, 29...","[[132.0, 132.0, 150.0, 168.0, 168.0, 175.5, 18...","[[344.5, 361.0, 361.0, 350.0, 339.0, 339.0, 34...","[[327.5, 360.0, 360.0, 345.5, 331.0, 331.0, 32...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[358.0, 314.0, 314.0, 302.0, 290.0, 290.0, 30...",...,"[[254.0, 254.0, 263.0, 272.0, 272.0, 262.5, 25...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[315.0, 315.0, 314.5, 314.0, 314.0, 314.0, 29...","[[348.5, 348.0, 348.0, 331.5, 315.0, 315.0, 31...","[[465.0, 352.0, 352.0, 267.0, 182.0, 182.0, 18...","[[244.0, 244.0, 244.0, 244.0, 244.0, 244.0, 24...","[[421.0, 366.0, 311.0, 311.0, 295.5, 280.0, 28...","[[292.0, 232.5, 173.0, 173.0, 284.5, 396.0, 39...","[[323.0, 323.0, 161.5, 0.0, 0.0, 0.0, 0.0, 0.0...","[[459.0, 459.0, 459.0, 459.0, 433.5, 408.0, 40..."
4,"[[419.0, 663.5, 908.0, 908.0, 908.0, 646.5, 38...","[[267.0, 343.5, 420.0, 420.0, 210.0, 0.0, 0.0,...","[[408.0, 376.0, 344.0, 344.0, 262.0, 180.0, 18...","[[333.0, 226.0, 226.0, 226.0, 303.5, 381.0, 38...","[[320.0, 320.0, 340.0, 360.0, 360.0, 679.5, 99...","[[456.0, 415.5, 375.0, 320.0, 265.0, 265.0, 26...","[[296.0, 296.0, 409.5, 523.0, 523.0, 523.0, 76...","[[357.0, 374.0, 374.0, 366.0, 358.0, 358.0, 42...","[[358.0, 528.0, 528.0, 528.0, 237.0, 237.0, 23...","[[182.0, 9.0, 9.0, 80.0, 151.0, 151.0, 304.5, ...",...,"[[150.0, 150.0, 125.0, 100.0, 100.0, 132.5, 16...","[[283.0, 283.0, 303.5, 324.0, 324.0, 362.0, 40...","[[352.0, 340.0, 328.0, 328.0, 385.5, 443.0, 44...","[[111.0, 111.0, 111.0, 164.0, 164.0, 164.0, 16...","[[421.0, 343.0, 265.0, 265.0, 410.5, 556.0, 55...","[[148.0, 148.0, 185.0, 185.0, 185.0, 213.0, 21...","[[374.0, 329.0, 284.0, 284.0, 340.0, 396.0, 39...","[[364.0, 364.0, 432.0, 432.0, 432.0, 456.0, 45...","[[312.0, 312.0, 319.5, 327.0, 327.0, 163.5, 0....","[[796.0, 999.0, 999.0, 499.5, 0.0, 0.0, 189.0,..."
5,"[[315.0, 326.5, 338.0, 338.0, 317.5, 297.0, 29...","[[362.0, 331.5, 301.0, 301.0, 307.0, 313.0, 31...","[[258.0, 258.0, 276.5, 295.0, 295.0, 357.5, 42...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[346.0, 333.5, 321.0, 321.0, 296.5, 272.0, 27...","[[284.0, 284.0, 311.0, 338.0, 338.0, 301.5, 26...","[[251.5, 235.0, 235.0, 235.0, 235.0, 235.0, 23...","[[276.0, 498.0, 498.0, 498.0, 625.0, 752.0, 75...","[[411.0, 411.0, 411.0, 411.0, 411.0, 411.0, 53...","[[374.0, 374.0, 331.0, 288.0, 288.0, 288.0, 26...",...,"[[309.0, 328.5, 348.0, 348.0, 340.0, 332.0, 33...","[[183.0, 183.0, 183.0, 276.0, 369.0, 369.0, 39...","[[571.0, 571.0, 571.0, 571.0, 571.0, 443.0, 31...","[[312.0, 312.0, 582.0, 582.0, 582.0, 410.0, 41...","[[354.0, 354.0, 354.0, 354.0, 354.0, 354.0, 35...","[[330.0, 317.5, 305.0, 305.0, 306.5, 308.0, 30...","[[359.0, 359.0, 359.0, 801.0, 801.0, 801.0, 76...","[[93.5, 187.0, 187.0, 282.5, 378.0, 378.0, 378...","[[356.0, 356.0, 356.0, 356.0, 356.0, 356.0, 35...","[[8.0, 8.0, 70.0, 132.0, 132.0, 152.5, 173.0, ..."


In [126]:
# Drop recordings according to the quality ratings.
# Rating scale: 1 = perfect, 2 = good, 3 = ok, 4 = problematic, 5 = bad, 6 = missing
# Every recording rated >= 3 is blanked out (set to None).

# Work on a copy so eeg_df keeps the unfiltered data.
eeg_cleaned_df = eeg_df.copy()

for participant in eeg_cleaned_df.index:
    # Participants without a row in the quality table are left untouched.
    if participant not in quality_df.index:
        continue

    for video_id in eeg_cleaned_df.columns:
        # Likewise for clips without a column in the quality table.
        if video_id not in quality_df.columns:
            continue

        quality_score = quality_df.loc[participant, video_id]
        if pd.notna(quality_score) and quality_score >= 3:
            eeg_cleaned_df.at[participant, video_id] = None


# Report how many cells survived the filter.
n_original = eeg_df.notna().sum().sum()
n_cleaned = eeg_cleaned_df.notna().sum().sum()
print(f"Original non-null cells: {n_original}")
print(f"Cleaned non-null cells: {n_cleaned}")
print(f"Cells removed: {n_original - n_cleaned}")


Original non-null cells: 2088
Cleaned non-null cells: 1450
Cells removed: 638


In [127]:
# Preview the filtered table; cells blanked by the quality filter show as empty.
eeg_cleaned_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,27,28,29,30,31,32,33,34,35,36
1,"[[336.0, 234.5, 133.0, 133.0, 133.0, 133.0, 13...","[[435.0, 435.0, 372.0, 309.0, 309.0, 357.0, 40...","[[325.0, 325.0, 328.5, 332.0, 332.0, 332.0, 35...",,"[[264.0, 260.0, 256.0, 256.0, 307.0, 358.0, 35...",,,,"[[289.0, 289.0, 281.5, 274.0, 274.0, 274.0, 27...",,...,,"[[305.0, 306.5, 308.0, 308.0, 272.5, 237.0, 23...","[[320.5, 315.0, 315.0, 315.0, 303.5, 292.0, 29...",,,,"[[401.0, 401.0, 369.0, 337.0, 337.0, 364.5, 39...","[[329.0, 329.0, 329.0, 329.0, 329.0, 354.0, 37...",,"[[458.0, 458.0, 458.0, 369.5, 281.0, 281.0, 33..."
2,"[[410.0, 381.5, 353.0, 353.0, 318.5, 284.0, 28...",,,"[[497.0, 337.0, 337.0, 337.0, 333.0, 329.0, 32...","[[389.0, 389.0, 389.0, 352.0, 315.0, 315.0, 27...",,"[[170.5, 341.0, 341.0, 379.0, 417.0, 417.0, 41...","[[628.0, 628.0, 628.0, 628.0, 637.0, 646.0, 64...","[[603.0, 392.0, 181.0, 181.0, 208.5, 236.0, 23...","[[700.0, 700.0, 700.0, 0.0, 0.0, 0.0, 117.0, 1...",...,,"[[749.0, 749.0, 624.0, 499.0, 499.0, 476.5, 45...","[[88.0, 88.0, 223.0, 358.0, 358.0, 357.5, 357....",,"[[342.0, 348.0, 354.0, 354.0, 340.0, 326.0, 32...","[[217.0, 217.0, 256.0, 295.0, 295.0, 295.0, 32...","[[240.0, 322.0, 322.0, 462.5, 603.0, 603.0, 30...","[[116.5, 194.0, 194.0, 173.5, 153.0, 153.0, 21...","[[626.0, 631.5, 637.0, 637.0, 637.0, 476.0, 31...","[[362.0, 362.0, 362.0, 362.0, 362.0, 362.0, 36..."
3,"[[338.0, 356.0, 374.0, 374.0, 331.0, 288.0, 28...","[[300.0, 300.0, 328.5, 357.0, 357.0, 357.0, 34...","[[376.0, 360.0, 360.0, 322.0, 284.0, 284.0, 22...","[[0.0, 0.0, 105.0, 210.0, 210.0, 349.0, 488.0,...","[[321.0, 321.0, 321.0, 321.0, 321.0, 309.5, 29...","[[132.0, 132.0, 150.0, 168.0, 168.0, 175.5, 18...","[[344.5, 361.0, 361.0, 350.0, 339.0, 339.0, 34...","[[327.5, 360.0, 360.0, 345.5, 331.0, 331.0, 32...",,"[[358.0, 314.0, 314.0, 302.0, 290.0, 290.0, 30...",...,"[[254.0, 254.0, 263.0, 272.0, 272.0, 262.5, 25...",,"[[315.0, 315.0, 314.5, 314.0, 314.0, 314.0, 29...","[[348.5, 348.0, 348.0, 331.5, 315.0, 315.0, 31...","[[465.0, 352.0, 352.0, 267.0, 182.0, 182.0, 18...","[[244.0, 244.0, 244.0, 244.0, 244.0, 244.0, 24...","[[421.0, 366.0, 311.0, 311.0, 295.5, 280.0, 28...","[[292.0, 232.5, 173.0, 173.0, 284.5, 396.0, 39...","[[323.0, 323.0, 161.5, 0.0, 0.0, 0.0, 0.0, 0.0...","[[459.0, 459.0, 459.0, 459.0, 433.5, 408.0, 40..."
4,"[[419.0, 663.5, 908.0, 908.0, 908.0, 646.5, 38...","[[267.0, 343.5, 420.0, 420.0, 210.0, 0.0, 0.0,...","[[408.0, 376.0, 344.0, 344.0, 262.0, 180.0, 18...","[[333.0, 226.0, 226.0, 226.0, 303.5, 381.0, 38...","[[320.0, 320.0, 340.0, 360.0, 360.0, 679.5, 99...","[[456.0, 415.5, 375.0, 320.0, 265.0, 265.0, 26...","[[296.0, 296.0, 409.5, 523.0, 523.0, 523.0, 76...","[[357.0, 374.0, 374.0, 366.0, 358.0, 358.0, 42...","[[358.0, 528.0, 528.0, 528.0, 237.0, 237.0, 23...","[[182.0, 9.0, 9.0, 80.0, 151.0, 151.0, 304.5, ...",...,"[[150.0, 150.0, 125.0, 100.0, 100.0, 132.5, 16...","[[283.0, 283.0, 303.5, 324.0, 324.0, 362.0, 40...","[[352.0, 340.0, 328.0, 328.0, 385.5, 443.0, 44...","[[111.0, 111.0, 111.0, 164.0, 164.0, 164.0, 16...","[[421.0, 343.0, 265.0, 265.0, 410.5, 556.0, 55...","[[148.0, 148.0, 185.0, 185.0, 185.0, 213.0, 21...","[[374.0, 329.0, 284.0, 284.0, 340.0, 396.0, 39...","[[364.0, 364.0, 432.0, 432.0, 432.0, 456.0, 45...","[[312.0, 312.0, 319.5, 327.0, 327.0, 163.5, 0....",
5,,,,,,,,,,,...,,,,,,,,,,


In [128]:
# Participants with no usable recording left: every cell in their row is None/NaN.
row_is_empty = eeg_cleaned_df.isna().all(axis=1)
participants_all_none = eeg_cleaned_df.index[row_is_empty].tolist()

print(f"Participants with all data quality values >= 3: {participants_all_none}")
print(f"Count: {len(participants_all_none)}")

Participants with all data quality values >= 3: ['5', '6', '9', '13', '18']
Count: 5


In [129]:
# Example lookup: the 'delta' row of the recording for participant '3', clip 5.
eeg_cleaned_df.at['3', 5][channels.index('delta')]

array([321., 321., 321., ..., 205., 524., 524.])

In [130]:
from antropy import lziv_complexity

def calculate_lzc(signal, hz=32, epoch_len_sec=2.0):
    """
    Average Lempel-Ziv complexity (LZC) of a signal over non-overlapping epochs.

    The signal is cut into consecutive epochs of int(hz * epoch_len_sec)
    samples (a trailing partial epoch is dropped). Each epoch is z-scored,
    binarized at 0 (i.e. split around its own mean) and passed to
    antropy's lziv_complexity with that function's default arguments.

    Parameters
    ----------
    signal : array-like
        Shape (samples,) or (1, samples); any multi-dimensional input is
        flattened.
    hz : number
        Sampling rate in samples per second.
    epoch_len_sec : float
        Epoch length in seconds over which LZC is calculated.

    Returns
    -------
    float
        Mean LZC over all usable epochs. np.nan when the signal is shorter
        than one epoch or when no epoch consists entirely of finite values.

    Notes
    -----
    Epochs containing NaN or inf are skipped. Without this, the z-score of
    such an epoch is NaN everywhere, every `> 0` comparison is False, and the
    epoch would be scored as an all-zero sequence, silently dragging the
    average towards the complexity of a constant signal.
    """
    # Accept lists as well as arrays and make sure the signal is 1-D.
    signal = np.asarray(signal, dtype=np.float64).ravel()

    n_samples = len(signal)
    epoch_samples = int(hz * epoch_len_sec)  # number of samples per epoch
    n_epochs = n_samples // epoch_samples  # number of complete epochs

    if n_epochs == 0:
        return np.nan

    lzc_vals = []

    # Calculate LZC for each complete epoch
    for e in range(n_epochs):
        start_idx = e * epoch_samples
        epoch = signal[start_idx:start_idx + epoch_samples]

        # Missing/corrupt samples: treat the epoch as unusable rather than
        # letting it binarize to all zeros.
        if not np.all(np.isfinite(epoch)):
            continue

        # Z-score the epoch (the small constant avoids division by zero for
        # a constant epoch)
        z_scored = (epoch - np.mean(epoch)) / (np.std(epoch) + 1e-9)

        # Binarize (threshold at 0)
        binarized = (z_scored > 0).astype(int)

        lzc_vals.append(lziv_complexity(binarized))

    # Every epoch was unusable: report "no value" instead of averaging nothing.
    if not lzc_vals:
        return np.nan

    # Return average LZC across the usable epochs
    return np.mean(lzc_vals)


In [131]:
# Per-recording LZC: lzc_df mirrors the layout of eeg_cleaned_df, and each
# non-empty cell holds one LZC value per channel row.
lzc_df = pd.DataFrame(index=eeg_cleaned_df.index, columns=eeg_cleaned_df.columns, dtype=object)

for participant in eeg_cleaned_df.index:
    for video_id in eeg_cleaned_df.columns:
        eeg_data = eeg_cleaned_df.at[participant, video_id]

        # Recordings removed earlier stay empty here as well.
        if eeg_data is None:
            lzc_df.at[participant, video_id] = None
            continue

        # eeg_data is treated as an 8 x time array: one LZC value per row.
        # NaN results from calculate_lzc are kept in the stored array.
        lzc_df.at[participant, video_id] = np.array(
            [
                calculate_lzc(eeg_data[channel_idx, :], hz=32, epoch_len_sec=2.0)
                for channel_idx in range(8)
            ],
            dtype=np.float64,
        )

print(f"LZC DataFrame shape: {lzc_df.shape}")
print(f"Non-null cells: {lzc_df.notna().sum().sum()}")


LZC DataFrame shape: (58, 36)
Non-null cells: 1450


In [133]:
# Descriptive statistics of the LZC scores, one row per channel.

# Step 1: pool every non-NaN LZC value by channel name.
channel_lzc_data = {channel: [] for channel in channels}

for participant in lzc_df.index:
    for video_id in lzc_df.columns:
        lzc_array = lzc_df.at[participant, video_id]

        # Empty cells contribute nothing.
        if lzc_array is None:
            continue

        # Position in lzc_array corresponds to position in `channels`.
        for channel_idx, channel_name in enumerate(channels):
            lzc_value = lzc_array[channel_idx]
            if not np.isnan(lzc_value):
                channel_lzc_data[channel_name].append(lzc_value)

# Step 2: summarise each channel. The dict order below fixes the column order.
summary_functions = {
    'Mean': np.mean,
    'Std': np.std,
    'Min': np.min,
    'Max': np.max,
    'Median': np.median,
    '25th Percentile': lambda v: np.percentile(v, 25),
    '75th Percentile': lambda v: np.percentile(v, 75),
}

lzc_stats = []
for channel in channels:
    values = channel_lzc_data[channel]
    stats = {'Channel': channel, 'Count': len(values)}
    for label, summarize in summary_functions.items():
        # A channel with no values gets NaN for every statistic.
        stats[label] = summarize(values) if values else np.nan
    lzc_stats.append(stats)

# Step 3: one table, displayed as the cell output.
lzc_stats_df = pd.DataFrame(lzc_stats)
lzc_stats_df


Unnamed: 0,Channel,Count,Mean,Std,Min,Max,Median,25th Percentile,75th Percentile
0,delta,1450,8.225088,0.672919,2.0,10.041667,8.339469,7.913453,8.62963
1,theta,1450,2.641821,0.592472,2.0,3.68,2.125,2.105769,3.266667
2,alpha1,1450,2.6987,0.59064,2.0,3.68,2.951744,2.108974,3.285714
3,alpha2,1450,2.645249,0.525717,2.0,3.529412,2.788889,2.117647,3.155556
4,beta1,1450,2.646325,0.528779,2.0,3.555556,2.646753,2.117647,3.16
5,beta2,1450,2.590623,0.478792,2.0,3.537037,2.633953,2.117647,3.066667
6,gamma1,1450,2.641975,0.524695,2.0,3.515152,2.812149,2.117647,3.155556
7,gamma2,1450,2.629764,0.503866,2.0,3.457143,2.957778,2.117647,3.114123


In [132]:
# Example lookup: the per-channel LZC values for participant '25', clip 20.
lzc_df.at['25', 20]

array([8.83333333, 3.3       , 3.3       , 3.13333333, 3.13333333,
       2.8       , 3.1       , 3.03333333])