In [1]:
# =============================================================================
# INFORMATION DYNAMICS VALIDATION - Stanford Self-Regulation Dataset
# =============================================================================
# Goal: Validate G_info formula using real behavioral data
# Formula: G_info = k_individual × attention_focus × (1 - cognitive_load)
# Dataset: Stanford Self-Regulation Study (ds004636) - 110 participants
# =============================================================================

import pandas as pd
import numpy as np
from pathlib import Path

# Location of the Stanford Self-Regulation dataset (ds004636), relative to
# the directory this notebook is run from.
data_path = Path("../../../data/ds004636-main")
dataset_found = data_path.exists()
print(f"Data exists: {dataset_found}")

# Always bind `participants` so that later cells see an empty list instead of
# raising NameError when the dataset is missing. Path.glob yields entries in
# no guaranteed order, so sort to keep the listing reproducible across
# machines and kernel restarts.
participants = sorted(data_path.glob("sub-*")) if dataset_found else []

if dataset_found:
    # Report the participant count and a small sample of the entry names.
    print(f"Found participants: {len(participants)}")
    print(f"First 3: {[p.name for p in participants[:3]]}")

Data exists: True
Found participants: 110
First 3: ['sub-s061', 'sub-s130', 'sub-s144']


In [2]:
# =============================================================================
# STEP 1: EXPLORE BEHAVIORAL DATA STRUCTURE
# =============================================================================
# We need event files (.tsv) containing response times and accuracy data
# Available tasks: CCTHot, stopSignal, twoByTwo, WATT3, DPX, stroop, etc.
# =============================================================================

# Hard-coded participant used for the initial single-subject exploration.
participant = "sub-s061"
participant_path = data_path / participant

# Collect the per-task event files (*_events.tsv) from each session's
# `func` directory; sessions without that directory are skipped.
events_files = []
for session in ['ses-1', 'ses-2']:
    func_dir = participant_path / session / 'func'
    if func_dir.exists():
        # Path.glob returns matches in no guaranteed order. Sort them
        # case-insensitively by file name so that `events_files[0]` (used by
        # the next cell) is the same file on every platform and re-run.
        events_files.extend(
            sorted(func_dir.glob('*_events.tsv'), key=lambda path: path.name.lower())
        )

print(f"Found event files: {len(events_files)}")
print("Available cognitive tasks:")
for f in events_files:
    print(f"  {f.name}")

Found event files: 9
Available cognitive tasks:
  sub-s061_ses-1_task-CCTHot_run-1_events.tsv
  sub-s061_ses-1_task-stopSignal_run-1_events.tsv
  sub-s061_ses-1_task-twoByTwo_run-1_events.tsv
  sub-s061_ses-1_task-WATT3_run-1_events.tsv
  sub-s061_ses-2_task-discountFix_run-1_events.tsv
  sub-s061_ses-2_task-DPX_run-1_events.tsv
  sub-s061_ses-2_task-motorSelectiveStop_run-1_events.tsv
  sub-s061_ses-2_task-stroop_run-1_events.tsv
  sub-s061_ses-2_task-surveyMedley_run-1_events.tsv


In [3]:
# =============================================================================
# STEP 2: EXAMINE FIRST TASK DATA STRUCTURE  
# =============================================================================
# Goal: Understand what columns contain response_time, accuracy, trial info
# This helps us identify the key variables for G_info validation
# =============================================================================

# Load the first discovered events file and report its shape, its column
# names, and which of the variables needed for validation it provides.
if events_files:
    first_file = events_files[0]
    print(f"Analyzing: {first_file.name}")

    # Event files are tab-separated.
    df = pd.read_csv(first_file, sep='\t')
    column_names = list(df.columns)

    print(f"Rows: {len(df)}")
    print(f"Columns: {len(column_names)}")
    print(f"Available columns: {column_names}")

    # Each entry pairs "is this kind of variable present?" with the line to
    # print when it is; absent variables produce no output.
    print("\nKey variables for validation:")
    availability_checks = [
        ('response_time' in column_names,
         "✓ Response time data available"),
        (any('correct' in name for name in column_names),
         "✓ Accuracy data available"),
        ('trial_type' in column_names or 'condition' in column_names,
         "✓ Trial condition data available"),
    ]
    for is_present, message in availability_checks:
        if is_present:
            print(message)

    print("\nFirst 5 rows:")
    print(df.head())

Analyzing: sub-s061_ses-1_task-CCTHot_run-1_events.tsv
Rows: 449
Columns: 19
Available columns: ['onset', 'duration', 'EV', 'action', 'clicked_on_loss_card', 'experiment_exp_id', 'gain_amount', 'gain_probability', 'key_press', 'loss_amount', 'loss_probability', 'num_cards', 'num_click_in_round', 'num_loss_cards', 'risk', 'response_time', 'total_cards', 'trial_id', 'worker_id']

Key variables for validation:
✓ Response time data available

First 5 rows:
    onset  duration         EV     action  clicked_on_loss_card  \
0   3.015     3.974  -0.734958  draw_card                   0.0   
1   6.994     1.194  -3.468291  draw_card                   0.0   
2   8.192     1.384  -7.568291  draw_card                   0.0   
3   9.580     1.131 -14.401625  end_round                   0.0   
4  10.715     2.250        NaN       -1.0                   NaN   

         experiment_exp_id  gain_amount  gain_probability  key_press  \
0  columbia_card_task_fmri         12.0          0.833333       89.0

In [None]:
# =============================================================================
# STEP 4: EXTRACT KEY METRICS FOR G_INFO VALIDATION
# =============================================================================
# Goal: Calculate performance metrics that represent information flow
# - Mean RT per condition (information processing speed)
# - Accuracy per condition (attention focus quality)  
# - Stroop effect (cognitive load impact)
# These will become inputs for testing the G_info formula
# =============================================================================

# Summarise the loaded Stroop run: per-condition response time and accuracy,
# the congruent/incongruent differences, and an overall profile.
# Reads `stroop_files`, `stroop_df` and `participant` from earlier cells.
if stroop_files:
    print("=== STROOP TASK PERFORMANCE METRICS ===")

    # Keep only trials with a recorded, strictly positive response time.
    rt_recorded = stroop_df['response_time'].notna() & (stroop_df['response_time'] > 0)
    valid_trials = stroop_df[rt_recorded]
    print(f"Valid trials: {len(valid_trials)} out of {len(stroop_df)}")

    # Both columns are treated as optional; every later access is gated on
    # these flags so a missing column degrades to "N/A" instead of KeyError.
    has_condition = 'condition' in valid_trials.columns
    has_accuracy = 'correct' in valid_trials.columns

    if has_condition:
        print("\nPerformance by condition:")

        # dropna(): unique() also returns NaN for unlabeled trials, and a
        # float NaN has no .upper() method.
        for condition in valid_trials['condition'].dropna().unique():
            condition_data = valid_trials[valid_trials['condition'] == condition]

            mean_rt = condition_data['response_time'].mean()
            mean_acc = condition_data['correct'].mean() if has_accuracy else 'N/A'

            print(f"{str(condition).upper()}:")
            print(f"  Mean RT: {mean_rt:.3f} seconds")
            print(f"  Accuracy: {mean_acc:.3f}" if has_accuracy else "  Accuracy: N/A")
            print(f"  Trials: {len(condition_data)}")

        congruent = valid_trials[valid_trials['condition'] == 'congruent']
        incongruent = valid_trials[valid_trials['condition'] == 'incongruent']
    else:
        # No condition labels: use empty selections so the effect section
        # below is skipped cleanly.
        congruent = valid_trials.iloc[0:0]
        incongruent = valid_trials.iloc[0:0]

    # The effect is only defined when both conditions have valid trials.
    # NOTE(review): means are taken over all valid trials, including
    # incorrect responses -- confirm that this is the intended definition.
    if len(congruent) > 0 and len(incongruent) > 0:
        stroop_effect_rt = incongruent['response_time'].mean() - congruent['response_time'].mean()
        if has_accuracy:
            stroop_effect_acc = congruent['correct'].mean() - incongruent['correct'].mean()
            accuracy_change = incongruent['correct'].mean() - congruent['correct'].mean()
        else:
            stroop_effect_acc = float('nan')
            accuracy_change = float('nan')

        print("\n=== STROOP EFFECT (Cognitive Load Impact) ===")
        # Let the format spec emit the sign. A hard-coded "+" or "-" prefix
        # produces "+-0.012" / "--0.010" when the effect is reversed.
        print(f"RT Cost: {stroop_effect_rt:+.3f} seconds (incongruent minus congruent RT)")
        if has_accuracy:
            print(f"Accuracy Cost: {accuracy_change:+.3f} (incongruent minus congruent accuracy)")
        else:
            print("Accuracy Cost: N/A")

        # Overall profile across all valid trials, regardless of condition.
        overall_rt = valid_trials['response_time'].mean()
        overall_acc = valid_trials['correct'].mean() if has_accuracy else float('nan')

        print(f"\n=== PARTICIPANT {participant.upper()} PROFILE ===")
        print(f"Overall Mean RT: {overall_rt:.3f} seconds")
        print(f"Overall Accuracy: {overall_acc:.3f}" if has_accuracy else "Overall Accuracy: N/A")
        print(f"Cognitive Load Sensitivity: {stroop_effect_rt:.3f} seconds")

        # Status summary; only claim accuracy was extracted when it was.
        print("\n=== READY FOR G_INFO VALIDATION ===")
        print("✓ Response time data extracted")
        if has_accuracy:
            print("✓ Accuracy data extracted")
        print("✓ Cognitive load effects quantified")
        print("Next: Apply G_info formula and test predictions!")


Found Stroop task file: sub-s061_ses-2_task-stroop_run-1_events.tsv
Stroop data shape: (96, 12)
Columns: ['onset', 'duration', 'condition', 'correct', 'correct_response', 'experiment_exp_id', 'key_press', 'response_time', 'stim_color', 'stim_word', 'trial_type', 'worker_id']

Trial types: ['incongruent' 'congruent']
Response time columns: ['response_time']
Accuracy columns: ['correct', 'correct_response']

First few rows of Stroop data:
    onset  duration    condition  correct  correct_response experiment_exp_id  \
0   3.506       1.5  incongruent      1.0              89.0            stroop   
1   5.508       1.5    congruent      1.0              82.0            stroop   
2   7.510       1.5  incongruent      1.0              82.0            stroop   
3   9.783       1.5  incongruent      1.0              71.0            stroop   
4  11.784       1.5    congruent      1.0              82.0            stroop   

   key_press  response_time stim_color stim_word   trial_type worker_id 

In [None]:
# =============================================================================
# STEP 5: NEXT STEPS - MULTI-PARTICIPANT ANALYSIS & G_INFO VALIDATION
# =============================================================================
# TODO: Scale up analysis to all participants
# TODO: Calculate G_info components for each participant
# TODO: Test G_info predictions against observed performance
# TODO: Validate formula across different cognitive tasks
# =============================================================================

# Closing status banner for the single-participant walkthrough.
for status_line in (
    "✅ Single participant analysis complete!",
    "📊 Ready to scale up to full dataset validation",
    "🧠 Next: Implement G_info formula and test predictions",
):
    print(status_line)
