In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import os

# --- Configuration: EEG file names (bare names, so they resolve against the working directory) ---
INPUT_EEG_FILE = "extracted_EEG_data.csv"        # CSV read by extract_eeg_features_from_file
OUTPUT_EEG_FEATURES = "T1_G24_EEG_features.csv"  # CSV written by extract_eeg_features_from_file

def extract_eeg_features_from_file(input_file, output_file):
    """Compute per-trial summary statistics for every numeric EEG column.

    The CSV at ``input_file`` is grouped by ``Trial_Index``. For each numeric
    column other than ``Trial_Index``, ``Timestamp`` and ``Participant_ID`` the
    mean, standard deviation, maximum, minimum, skewness and kurtosis are
    computed per trial. One row per trial is written to ``output_file`` and the
    first rows of the result are printed.

    Args:
        input_file: Path of the CSV file holding the extracted EEG data.
        output_file: Path of the CSV file the feature table is written to.

    Returns:
        None. When the input file does not exist or has no ``Trial_Index``
        column, a message is printed and nothing is written.
    """
    print("--- Processing EEG data for Feature Engineering ---")
    try:
        # Load the synchronized/extracted EEG data
        df_eeg = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Skipping EEG feature extraction.")
        return

    if 'Trial_Index' not in df_eeg.columns:
        print("Error: 'Trial_Index' column is missing. Cannot group by trial.")
        return

    # Every column that is not an identifier is a feature candidate, but only
    # numeric ones can be summarised: a text column (e.g. a label) would make
    # mean()/skew() raise and abort the whole extraction.
    id_cols = ('Trial_Index', 'Timestamp', 'Participant_ID')
    candidate_cols = [col for col in df_eeg.columns if col not in id_cols]
    band_cols = [col for col in candidate_cols
                 if pd.api.types.is_numeric_dtype(df_eeg[col])]
    skipped_cols = [col for col in candidate_cols if col not in band_cols]
    if skipped_cols:
        print(f"Note: skipping non-numeric columns: {skipped_cols}")

    def extract_stats(series, prefix):
        """Return the summary statistics of one column of one trial as a dict."""
        if series.empty:
            return {}
        valid = series.dropna()  # scipy's skew/kurtosis do not skip NaN by default
        # float(...) keeps every statistic a float, whatever the column dtype.
        return {
            f'{prefix}_mean': float(series.mean()),
            f'{prefix}_std': float(series.std()),
            f'{prefix}_max': float(series.max()),
            f'{prefix}_min': float(series.min()),
            f'{prefix}_skew': float(skew(valid)),
            f'{prefix}_kurt': float(kurtosis(valid)),
        }

    all_eeg_features = []
    for trial_index, df_trial in df_eeg.groupby('Trial_Index'):
        trial_features = {'Trial_Index': trial_index}

        # Carry the participant identifier through (first value seen in the trial).
        if 'Participant_ID' in df_trial.columns:
            trial_features['Participant_ID'] = df_trial['Participant_ID'].iloc[0]

        for col in band_cols:
            trial_features.update(extract_stats(df_trial[col], col))

        all_eeg_features.append(trial_features)

    df_features = pd.DataFrame(all_eeg_features)
    df_features.to_csv(output_file, index=False)
    print(f"-> Successfully saved EEG features to {output_file}")
    print(df_features.head())

# extract_eeg_features_from_file(INPUT_EEG_FILE, OUTPUT_EEG_FEATURES)

In [None]:
from scipy.signal import find_peaks

# GSR file names (bare names, so they resolve against the working directory).
INPUT_GSR_FILE = "extracted_GSR_data.csv"        # CSV read by extract_gsr_features_from_file
OUTPUT_GSR_FEATURES = "T1_G24_GSR_features.csv"  # CSV written by extract_gsr_features_from_file

def extract_gsr_features_from_file(input_file, output_file, target_rate_hz=None):
    """Compute per-trial GSR features (slope, peak count, mean, variance).

    The CSV at ``input_file`` is grouped by ``Trial_Index``. The signal column
    is the first column whose name contains ``'GSR'`` or ``'arousal'``. One row
    per trial is written to ``output_file``; a trial without any valid sample
    still gets a row, with NaN features, so the table always has one row per
    trial and the same columns.

    Args:
        input_file: Path of the CSV file holding the extracted GSR data.
        output_file: Path of the CSV file the feature table is written to.
        target_rate_hz: Rate used to derive the minimum distance between peaks
            (``target_rate_hz * 2`` samples). When None, the notebook-level
            ``TARGET_RATE_HZ`` is used, which must then already be defined.

    Returns:
        None. When the input file does not exist, or the ``Trial_Index`` or
        signal column cannot be found, a message is printed and nothing is
        written.
    """
    print("\n--- Processing GSR data for Feature Engineering ---")
    try:
        df_gsr = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Skipping GSR feature extraction.")
        return

    if 'Trial_Index' not in df_gsr.columns:
        print("Error: 'Trial_Index' column is missing. Cannot group by trial.")
        return

    # Assuming the arousal data is in a column named 'GSR_Value' or similar
    gsr_candidates = [col for col in df_gsr.columns if 'GSR' in col or 'arousal' in col]
    if not gsr_candidates:
        print("Error: Could not identify GSR value column (looking for 'GSR' or 'arousal').")
        return
    gsr_column = gsr_candidates[0]

    if target_rate_hz is None:
        # Fall back to the notebook-wide setting (defined in the configuration cell).
        target_rate_hz = TARGET_RATE_HZ
    min_peak_distance = target_rate_hz * 2

    feature_names = ('GSR_Arousal_Slope', 'GSR_Peak_Count', 'GSR_Mean', 'GSR_Variance')

    # Build one fixed-key record per trial. Returning Series of differing
    # length from groupby.apply makes pandas stack the result into a long
    # MultiIndex Series, which breaks the one-row-per-trial output.
    records = []
    for trial_index, df_trial in df_gsr.groupby('Trial_Index'):
        series = df_trial[gsr_column].dropna()
        record = {'Trial_Index': trial_index}
        record.update(dict.fromkeys(feature_names, float('nan')))

        if not series.empty:
            # 1. Slope of arousal: first/last difference divided by the number
            #    of valid samples.
            #    NOTE(review): n samples span n - 1 intervals; confirm that
            #    dividing by n is the intended approximation.
            record['GSR_Arousal_Slope'] = (series.iloc[-1] - series.iloc[0]) / len(series)

            # 2. Peaks at least 0.8 * mean high and min_peak_distance samples apart.
            peaks, _ = find_peaks(series.to_numpy(), height=series.mean() * 0.8,
                                  distance=min_peak_distance)
            # Stored as float so the column dtype does not depend on whether
            # some trial is empty (NaN).
            record['GSR_Peak_Count'] = float(len(peaks))

            # Additional features
            record['GSR_Mean'] = series.mean()
            record['GSR_Variance'] = series.var()

        records.append(record)

    df_features = pd.DataFrame(records, columns=['Trial_Index', *feature_names])
    df_features.to_csv(output_file, index=False)
    print(f"-> Successfully saved GSR features to {output_file}")
    print(df_features.head())

# extract_gsr_features_from_file(INPUT_GSR_FILE, OUTPUT_GSR_FEATURES)

In [None]:
def extract_eeg_features_from_file(input_file, output_file):
    """Compute per-trial summary statistics for every numeric EEG column.

    Same extraction as the first definition in this notebook, but the CSV is
    read with the Python parser engine and malformed lines are reported with a
    warning instead of aborting the load. The body is complete because defining
    this function replaces the earlier one; a truncated body would silently
    write no features.

    Args:
        input_file: Path of the CSV file holding the extracted EEG data.
        output_file: Path of the CSV file the feature table is written to.

    Returns:
        None. When the input file does not exist or has no ``Trial_Index``
        column, a message is printed and nothing is written.
    """
    print("--- Processing EEG data for Feature Engineering ---")
    try:
        # Load the synchronized/extracted EEG data.
        # engine='python' copes with irregular lines better than the C engine;
        # on_bad_lines='warn' reports each malformed line and leaves it out.
        # If the delimiter is the issue, try adding sep=';' to this call.
        df_eeg = pd.read_csv(
            input_file,
            engine='python',
            on_bad_lines='warn'
        )
    except FileNotFoundError:
        print(f"Error: {input_file} not found. Skipping EEG feature extraction.")
        return

    if 'Trial_Index' not in df_eeg.columns:
        print("Error: 'Trial_Index' column is missing. Cannot group by trial.")
        return

    # Only numeric, non-identifier columns can be summarised; text columns
    # would make mean()/skew() raise.
    id_cols = ('Trial_Index', 'Timestamp', 'Participant_ID')
    candidate_cols = [col for col in df_eeg.columns if col not in id_cols]
    band_cols = [col for col in candidate_cols
                 if pd.api.types.is_numeric_dtype(df_eeg[col])]
    skipped_cols = [col for col in candidate_cols if col not in band_cols]
    if skipped_cols:
        print(f"Note: skipping non-numeric columns: {skipped_cols}")

    def extract_stats(series, prefix):
        """Return the summary statistics of one column of one trial as a dict."""
        if series.empty:
            return {}
        valid = series.dropna()  # scipy's skew/kurtosis do not skip NaN by default
        return {
            f'{prefix}_mean': float(series.mean()),
            f'{prefix}_std': float(series.std()),
            f'{prefix}_max': float(series.max()),
            f'{prefix}_min': float(series.min()),
            f'{prefix}_skew': float(skew(valid)),
            f'{prefix}_kurt': float(kurtosis(valid)),
        }

    all_eeg_features = []
    for trial_index, df_trial in df_eeg.groupby('Trial_Index'):
        trial_features = {'Trial_Index': trial_index}

        # Carry the participant identifier through (first value seen in the trial).
        if 'Participant_ID' in df_trial.columns:
            trial_features['Participant_ID'] = df_trial['Participant_ID'].iloc[0]

        for col in band_cols:
            trial_features.update(extract_stats(df_trial[col], col))

        all_eeg_features.append(trial_features)

    df_features = pd.DataFrame(all_eeg_features)
    df_features.to_csv(output_file, index=False)
    print(f"-> Successfully saved EEG features to {output_file}")
    print(df_features.head())

In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks

In [None]:
# Step 1: attach Google Drive to the Colab runtime (run this cell first!).
drive.mount('/content/drive')

# Step 2: project folder layout on Drive -- ADJUST PROJECT_ROOT to your own location.
PROJECT_ROOT = '/content/drive/MyDrive/emotion_clustering/'
DATA_DIR, MODELS_DIR = (
    os.path.join(PROJECT_ROOT, subfolder) for subfolder in ('data', 'models')
)

# Make sure both folders exist; folders that are already there are left as they are.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

print("Project files will be read from/saved to: {}".format(PROJECT_ROOT))

# --- Global Configurations ---
# Target downsample rate from Step 2.1; the GSR peak detection in this
# notebook uses twice this value as the minimum number of samples between peaks.
TARGET_RATE_HZ = 5

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project files will be read from/saved to: /content/drive/MyDrive/emotion_clustering/


In [None]:
# EEG input is a bare file name (resolved against the working directory);
# the feature table is written into the project folder on Drive.
INPUT_EEG_FILE = "extracted_EEG_data.csv"
OUTPUT_EEG_FEATURES = os.path.join(PROJECT_ROOT, "T1_G24_EEG_features.csv")

def extract_eeg_features_from_file(input_file, output_file):
    """Compute per-trial mean, std, skewness and kurtosis of each numeric EEG column.

    The CSV at ``input_file`` is read with the Python parser engine (malformed
    lines are skipped) and grouped by ``Trial_Index``. Every numeric column
    other than ``Trial_Index``, ``Timestamp`` and ``Participant_ID`` is
    summarised per trial, ignoring missing values. One row per trial is
    written to ``output_file``.

    Args:
        input_file: Path of the CSV file holding the extracted EEG data.
        output_file: Path of the CSV file the feature table is written to.

    Returns:
        None. When the file cannot be loaded or has no ``Trial_Index`` column,
        a message is printed and nothing is written.
    """
    print(f"\n--- Processing EEG data: {input_file} ---")
    try:
        # Using engine='python' and on_bad_lines='skip' to handle potential errors from Step 2.1 output
        df_eeg = pd.read_csv(input_file, engine='python', on_bad_lines='skip')
    except Exception as e:
        # Deliberately broad: any load failure is reported and the step is skipped.
        print(f"Error loading {input_file}: {e}")
        return

    if 'Trial_Index' not in df_eeg.columns:
        print("Error: 'Trial_Index' column is missing.")
        return

    # Only numeric, non-identifier columns can be summarised; a text column
    # (e.g. a label) would make mean()/skew() raise and abort the extraction.
    id_cols = ('Trial_Index', 'Timestamp', 'Participant_ID')
    candidate_cols = [col for col in df_eeg.columns if col not in id_cols]
    band_cols = [col for col in candidate_cols
                 if pd.api.types.is_numeric_dtype(df_eeg[col])]
    skipped_cols = [col for col in candidate_cols if col not in band_cols]
    if skipped_cols:
        print(f"Note: skipping non-numeric columns: {skipped_cols}")

    def extract_stats(series, prefix):
        """Return the statistics of one column of one trial; {} when it has no valid value."""
        valid = series.dropna()
        if valid.empty:
            return {}
        return {
            f'{prefix}_mean': float(valid.mean()),
            f'{prefix}_std': float(valid.std()),
            f'{prefix}_skew': float(skew(valid)),
            f'{prefix}_kurt': float(kurtosis(valid)),
        }

    all_eeg_features = []
    for trial_index, df_trial in df_eeg.groupby('Trial_Index'):
        trial_features = {'Trial_Index': trial_index}

        # Extract features for each band/channel
        for col in band_cols:
            trial_features.update(extract_stats(df_trial[col], col))

        all_eeg_features.append(trial_features)

    df_features = pd.DataFrame(all_eeg_features)
    df_features.to_csv(output_file, index=False)
    print(f"-> Saved EEG features to {output_file}")

# extract_eeg_features_from_file(INPUT_EEG_FILE, OUTPUT_EEG_FEATURES) # RUN THIS

In [None]:
# GSR input is a bare file name (resolved against the working directory);
# the feature table is written into the project folder on Drive.
INPUT_GSR_FILE = "extracted_GSR_data.csv"
OUTPUT_GSR_FEATURES = os.path.join(PROJECT_ROOT, "T1_G24_GSR_features.csv")

def extract_gsr_features_from_file(input_file, output_file, target_rate_hz=None):
    """Compute per-trial GSR features (arousal slope, peak count, mean).

    The CSV at ``input_file`` is read with the Python parser engine (malformed
    lines are skipped) and grouped by ``Trial_Index``. The signal column is
    ``GSR_Value`` when present, otherwise the first column whose name contains
    ``'GSR'`` or ``'arousal'``. One row per trial is written to
    ``output_file``; a trial with fewer than two valid samples still gets a
    row, with NaN features, so the table always has one row per trial and the
    same columns.

    Args:
        input_file: Path of the CSV file holding the extracted GSR data.
        output_file: Path of the CSV file the feature table is written to.
        target_rate_hz: Rate used to derive the minimum distance between peaks
            (``target_rate_hz * 2`` samples). When None, the notebook-level
            ``TARGET_RATE_HZ`` is used.

    Returns:
        None. When the file cannot be loaded, or the ``Trial_Index`` or signal
        column cannot be found, a message is printed and nothing is written.
    """
    print(f"\n--- Processing GSR data: {input_file} ---")
    try:
        df_gsr = pd.read_csv(input_file, engine='python', on_bad_lines='skip')
    except Exception as e:
        # Deliberately broad: any load failure is reported and the step is skipped.
        print(f"Error loading {input_file}: {e}")
        return

    if 'Trial_Index' not in df_gsr.columns:
        print("Error: 'Trial_Index' column is missing.")
        return

    # Assuming 'GSR_Value' is the column name for arousal
    gsr_column = 'GSR_Value'
    if gsr_column not in df_gsr.columns:
        # Try to infer a GSR column if 'GSR_Value' doesn't exist. The name test
        # is parenthesised so the exclusion list applies to both patterns
        # ('and' binds tighter than 'or').
        reserved_cols = ('Trial_Index', 'Timestamp')
        potential_cols = [
            col for col in df_gsr.columns
            if col not in reserved_cols and ('GSR' in col or 'arousal' in col)
        ]
        if not potential_cols:
            print("Error: Could not find a suitable GSR column.")
            return
        gsr_column = potential_cols[0]

    if target_rate_hz is None:
        # Fall back to the notebook-wide setting (defined in the configuration cell).
        target_rate_hz = TARGET_RATE_HZ
    min_peak_distance = target_rate_hz * 2

    feature_names = ('GSR_Arousal_Slope', 'GSR_Peak_Count', 'GSR_Mean')

    # Build one fixed-key record per trial. Returning Series of differing
    # length from groupby.apply makes pandas stack the result into a long
    # MultiIndex Series, which breaks the one-row-per-trial output.
    records = []
    for trial_index, df_trial in df_gsr.groupby('Trial_Index'):
        series = df_trial[gsr_column].dropna()
        record = {'Trial_Index': trial_index}
        record.update(dict.fromkeys(feature_names, float('nan')))

        if len(series) > 1:
            # Slope of arousal: first/last difference divided by the number of
            # valid samples.
            # NOTE(review): n samples span n - 1 intervals; confirm that
            # dividing by n is the intended approximation.
            record['GSR_Arousal_Slope'] = (series.iloc[-1] - series.iloc[0]) / len(series)

            # Peaks at least 0.8 * mean high and min_peak_distance samples apart.
            peaks, _ = find_peaks(series.to_numpy(), height=series.mean() * 0.8,
                                  distance=min_peak_distance)
            # Stored as float so the column dtype does not depend on whether
            # some trial lacks data (NaN).
            record['GSR_Peak_Count'] = float(len(peaks))

            record['GSR_Mean'] = series.mean()

        records.append(record)

    df_features = pd.DataFrame(records, columns=['Trial_Index', *feature_names])
    df_features.to_csv(output_file, index=False)
    print(f"-> Saved GSR features to {output_file}")

# extract_gsr_features_from_file(INPUT_GSR_FILE, OUTPUT_GSR_FEATURES) # RUN THIS