<a href="https://colab.research.google.com/github/chimera-org/chimera_v2.0/blob/main/notebooks/eegencoder_experiments/04_preprocessing_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone your GitHub repository
!git clone https://github.com/chimera-org/chimera_v2.0/

print("‚úÖ Repository cloned.")

Cloning into 'chimera_v2.0'...
remote: Enumerating objects: 1734, done.[K
remote: Counting objects: 100% (293/293), done.[K
remote: Compressing objects: 100% (181/181), done.[K
remote: Total 1734 (delta 213), reused 112 (delta 112), pack-reused 1441 (from 2)[K
Receiving objects: 100% (1734/1734), 1.01 MiB | 12.76 MiB/s, done.
Resolving deltas: 100% (885/885), done.
‚úÖ Repository cloned.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Deep Analysis of Raw GDF Files

!pip install mne
import mne
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd

# CONFIGURATION
RAW_GDF_PATH = Path("/content/drive/MyDrive/Motor_Imagery_Datasets/OpenBCI/BCI_cIV_2a/BCI_IV_2a/")
SAVE_ANALYSIS_PATH = Path("/content/drive/MyDrive/chimera_v2.0/analysis/")  # NEW FOLDER
SAVE_ANALYSIS_PATH.mkdir(exist_ok=True)

print("üîç DEEP EEG DATA ANALYSIS")
print("="*60)
print(f"Raw data path: {RAW_GDF_PATH}")
print(f"Analysis save path: {SAVE_ANALYSIS_PATH}")

# Analyze all 9 subjects
analysis_results = []

for subject_id in range(1, 10):
    print(f"\n{'='*45}")
    print(f"SUBJECT A{subject_id:02d}T")
    print(f"{'='*45}")

    # Load raw GDF
    gdf_file = RAW_GDF_PATH / f"A{subject_id:02d}T.gdf"
    raw = mne.io.read_raw_gdf(gdf_file, preload=True, verbose=False)

    # Basic info
    n_channels = len(raw.ch_names)
    sfreq = raw.info['sfreq']
    duration = raw.times[-1]

    # Check events
    events, event_id = mne.events_from_annotations(raw, verbose=False)

    # Filter to motor imagery events only
    mi_events = events[np.isin(events[:, 2], [1, 2, 3, 4])]  # 769-772 map to 1-4
    n_trials = len(mi_events)

    # Quality metrics
    max_amplitude = np.abs(raw.get_data()).max()
    mean_amplitude = np.abs(raw.get_data()).mean()

    # Missing events check
    expected_events = ['769', '770', '771', '772']
    found_events = {k: v for k, v in event_id.items() if k in expected_events}

    print(f"Channels: {n_channels} | SFreq: {sfreq} Hz | Duration: {duration/60:.1f} min")
    print(f"MI trials: {n_trials}")
    print(f"Event codes: {found_events}")
    print(f"Amplitude: max={max_amplitude:.6f} V, mean={mean_amplitude:.6f} V")

    # Detect artifacts (amplitude > 100 ¬µV is suspicious)
    artifact_trials = 0
    for ev in mi_events:
        start, end = ev[0], ev[0] + int(4 * sfreq)  # 4 second epochs
        trial_data = raw.get_data(start=start, stop=end)
        if np.abs(trial_data).max() > 100e-6:  # 100 ¬µV threshold
            artifact_trials += 1

    print(f"Trials with artifacts (>{100e-6:.0e}V): {artifact_trials}/{n_trials}")

    # Store results
    analysis_results.append({
        'subject': subject_id,
        'channels': n_channels,
        'trials': n_trials,
        'max_amp': max_amplitude,
        'mean_amp': mean_amplitude,
        'artifact_trials': artifact_trials
    })

    # Save raw plot for first subject
    if subject_id == 1:
        fig = raw.plot(duration=10, n_channels=22, show=False)
        fig.savefig(SAVE_ANALYSIS_PATH / "subject_01_raw_plot.png", dpi=300)
        plt.close(fig)
        print("üìä Saved raw plot to analysis/subject_01_raw_plot.png")

# Save analysis summary
analysis_df = pd.DataFrame(analysis_results)
analysis_df.to_csv(SAVE_ANALYSIS_PATH / "preprocessing_analysis.csv", index=False)

print("\n" + "="*60)
print("üìä ANALYSIS COMPLETE")
print("="*60)
print(analysis_df.to_string(index=False))

# Summary statistics
print(f"\nüìà Summary:")
print(f"Total trials across subjects: {analysis_df['trials'].sum()}")
print(f"Mean trials per subject: {analysis_df['trials'].mean():.1f} ¬± {analysis_df['trials'].std():.1f}")
print(f"Mean max amplitude: {analysis_df['max_amp'].mean():.6f} V")
print(f"Mean artifact rate: {analysis_df['artifact_trials'].sum() / analysis_df['trials'].sum() * 100:.1f}%")

# Check for anomalies
if analysis_df['trials'].std() > 20:
    print("\n‚ö†Ô∏è WARNING: Large variation in trial counts!")
    print("Some subjects may have missing/corrupted trials")

if analysis_df['max_amp'].max() > 500e-6:
    print("\n‚ö†Ô∏è WARNING: High amplitude artifacts detected!")
    print("ICA will be critical for cleaning")

Collecting mne
  Downloading mne-1.11.0-py3-none-any.whl.metadata (15 kB)
Downloading mne-1.11.0-py3-none-any.whl (7.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m7.5/7.5 MB[0m [31m109.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mne
Successfully installed mne-1.11.0
üîç DEEP EEG DATA ANALYSIS
Raw data path: /content/drive/MyDrive/Motor_Imagery_Datasets/OpenBCI/BCI_cIV_2a/BCI_IV_2a
Analysis save path: /content/drive/MyDrive/chimera_v2.0/analysis

SUBJECT A01T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 44.8 min
MI trials: 18
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000013 V
Trials with artifacts (>1e-04V): 15/18
Using matplotlib as 2D backend.
üìä Saved raw plot to analysis/subject_01_raw_plot.png

SUBJECT A02T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 45.1 min
MI trials: 21
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000013 V
Trials with artifacts (>1e-04V): 10/21

SUBJECT A03T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 44.0 min
MI trials: 21
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000015 V
Trials with artifacts (>1e-04V): 15/21

SUBJECT A04T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 40.1 min
MI trials: 322
Event codes: {np.str_('769'): 5, np.str_('770'): 6, np.str_('771'): 7, np.str_('772'): 8}
Amplitude: max=0.016000 V, mean=0.000011 V
Trials with artifacts (>1e-04V): 53/322

SUBJECT A05T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 45.7 min
MI trials: 29
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000012 V
Trials with artifacts (>1e-04V): 16/29

SUBJECT A06T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 45.3 min
MI trials: 72
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000015 V
Trials with artifacts (>1e-04V): 17/72

SUBJECT A07T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 45.4 min
MI trials: 20
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000013 V
Trials with artifacts (>1e-04V): 10/20

SUBJECT A08T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 45.0 min
MI trials: 27
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000016 V
Trials with artifacts (>1e-04V): 16/27

SUBJECT A09T


  next(self.gen)


Channels: 25 | SFreq: 250.0 Hz | Duration: 44.9 min
MI trials: 54
Event codes: {np.str_('769'): 7, np.str_('770'): 8, np.str_('771'): 9, np.str_('772'): 10}
Amplitude: max=0.016000 V, mean=0.000016 V
Trials with artifacts (>1e-04V): 36/54

üìä ANALYSIS COMPLETE
 subject  channels  trials  max_amp  mean_amp  artifact_trials
       1        25      18    0.016  0.000013               15
       2        25      21    0.016  0.000013               10
       3        25      21    0.016  0.000015               15
       4        25     322    0.016  0.000011               53
       5        25      29    0.016  0.000012               16
       6        25      72    0.016  0.000015               17
       7        25      20    0.016  0.000013               10
       8        25      27    0.016  0.000016               16
       9        25      54    0.016  0.000016               36

üìà Summary:
Total trials across subjects: 584
Mean trials per subject: 64.9 ¬± 98.1
Mean max amplitude: 

In [6]:
# ================================================
# CELL: Direct Loader Class (Bypass Module Issues)
# ================================================

import sys
import numpy as np
from pathlib import Path
import mne
import torch

# Define the loader class directly (copy from fixed file)
class BCIC4_2A_Loader:
    def __init__(self, data_path):
        self.data_path = Path(data_path)

    def load_subject(self, subject_id, return_raw=False):
        """Fixed: Load exact 288 trials from BCI IV 2a"""
        if not (1 <= subject_id <= 9):
            raise ValueError(f"subject_id must be 1-9, got {subject_id}")

        gdf_file = self.data_path / f"A{subject_id:02d}T.gdf"
        raw = mne.io.read_raw_gdf(gdf_file, preload=True, verbose=False)

        events, event_id = mne.events_from_annotations(raw, verbose=False)

        # Map MI events
        mi_event_map = {
            '769': 0,  # Left hand
            '770': 1,  # Right hand
            '771': 2,  # Foot
            '772': 3,  # Tongue
        }

        mi_codes = {}
        for event_name, mi_label in mi_event_map.items():
            if event_name in event_id:
                mi_codes[event_id[event_name]] = mi_label

        mi_events = events[np.isin(events[:, 2], list(mi_codes.keys()))]

        epochs = mne.Epochs(raw, mi_events, event_id=mi_codes,
                            tmin=0, tmax=4.0, baseline=None, preload=True, verbose=False)

        if len(epochs) != 288:
            print(f"‚ö†Ô∏è WARNING: Subject {subject_id} has {len(epochs)} trials (expected 288)")

        X = epochs.get_data()
        y = epochs.events[:, -1]

        print(f"‚úÖ Subject {subject_id:02d}: X={X.shape}, labels={np.bincount(y)}")

        if return_raw:
            return X, y, raw, epochs
        return X, y

    def load_all_subjects(self, subject_ids=None):
        """Load multiple subjects"""
        if subject_ids is None:
            subject_ids = list(range(1, 10))

        X_list, y_list = [], []
        for subj_id in subject_ids:
            try:
                X, y = self.load_subject(subj_id)
                X_list.append(X)
                y_list.append(y)
            except Exception as e:
                print(f"‚ùå Failed to load subject {subj_id}: {e}")
                continue

        if not X_list:
            raise ValueError("No subjects loaded successfully!")

        return np.concatenate(X_list, axis=0), np.concatenate(y_list, axis=0)

# Test it directly
DATA_PATH = Path("/content/drive/MyDrive/Motor_Imagery_Datasets/OpenBCI/BCI_cIV_2a/BCI_IV_2a/")

loader = BCIC4_2A_Loader(DATA_PATH)

print("üîç TESTING DIRECT LOADER")
print("="*50)

for subj in [1, 2, 4, 9]:
    X, y = loader.load_subject(subj)
    print(f"Subject {subj}: {X.shape} | max_amp: {np.abs(X).max():.6f} V | labels: {np.bincount(y)}")

üîç TESTING DIRECT LOADER


  next(self.gen)


TypeError: Event names must be an instance of str, got <class 'int'> instead.