In [2]:
"""
VOTC Plasticity Data Quality Check
==================================

This notebook validates the extracted ROI coordinates and beta values
to ensure data quality before proceeding with RSA analysis.

Based on the paper: "Cross-sectional and longitudinal changes in 
category-selectivity in visual cortex following pediatric cortical resection"
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# =============================================================================
# CONFIGURATION
# =============================================================================

# Define paths
BASE_DIR = Path("/user_data/csimmon2/long_pt")
ANALYSES_DIR = BASE_DIR / "analyses"
ROI_DIR = ANALYSES_DIR / "roi_extraction"
BETA_DIR = ANALYSES_DIR / "beta_extraction"

# Load data
roi_coords = pd.read_csv(ROI_DIR / "peak_roi_coordinates.csv")
session_inventory = pd.read_csv(BETA_DIR / "session_inventory.csv")

with open(BETA_DIR / "qc_report.json", 'r') as f:
    qc_report = json.load(f)

print("=== VOTC Plasticity Data Quality Check ===")
print(f"ROI coordinates loaded: {len(roi_coords)} ROIs")
print(f"Session inventory loaded: {len(session_inventory)} sessions")
print(f"QC report loaded: {len(qc_report['sessions'])} sessions")

=== VOTC Plasticity Data Quality Check ===
ROI coordinates loaded: 280 ROIs
Session inventory loaded: 28 sessions


KeyError: 'sessions'

In [None]:
# =============================================================================
# 1. ROI COORDINATE VALIDATION
# =============================================================================

print("\n" + "="*50)
print("1. ROI COORDINATE VALIDATION")
print("="*50)

# ROI summary statistics
roi_summary = roi_coords.groupby(['subject', 'session', 'roi_name']).size().reset_index(name='count')
roi_counts = roi_coords.groupby('roi_name').size().sort_values(ascending=False)

print(f"\nROI Detection Summary:")
print(f"Total ROIs extracted: {len(roi_coords)}")
print(f"Unique ROI types: {roi_coords['roi_name'].nunique()}")
print(f"Subjects: {roi_coords['subject'].nunique()}")
print(f"Sessions: {len(roi_coords.groupby(['subject', 'session']))}")

print(f"\nROI counts by type:")
for roi_name, count in roi_counts.items():
    print(f"  {roi_name}: {count}")

# Check for expected ROI types from the paper
expected_rois = [
    'lFFA', 'rFFA', 'lSTS', 'rSTS', 'lPPA', 'rPPA', 'lTOS', 'rTOS',
    'lLOC', 'rLOC', 'lPF', 'rPF', 'VWFA', 'STG', 'IFG', 'lEVC', 'rEVC'
]

detected_rois = set(roi_coords['roi_name'].unique())
missing_rois = set(expected_rois) - detected_rois
print(f"\nExpected ROIs from paper: {len(expected_rois)}")
print(f"Detected ROIs: {len(detected_rois)}")
if missing_rois:
    print(f"Missing ROIs: {missing_rois}")
else:
    print("✓ All expected ROI types detected")

In [None]:
# =============================================================================
# 2. SPATIAL ORGANIZATION ANALYSIS
# =============================================================================

print("\n" + "="*50)
print("2. SPATIAL ORGANIZATION ANALYSIS")
print("="*50)

# Create spatial organization plots (matching Figure 2 from paper)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('ROI Spatial Organization by Subject', fontsize=16, fontweight='bold')

subjects = sorted(roi_coords['subject'].unique())
roi_colors = {
    'lFFA': 'magenta', 'rFFA': 'magenta',
    'lSTS': 'pink', 'rSTS': 'pink', 
    'lPPA': 'darkgreen', 'rPPA': 'darkgreen',
    'lTOS': 'lightgreen', 'rTOS': 'lightgreen',
    'lLOC': 'lightblue', 'rLOC': 'lightblue',
    'lPF': 'darkblue', 'rPF': 'darkblue',
    'VWFA': 'darkorange', 'STG': 'orange', 'IFG': 'yellow',
    'lEVC': 'gray', 'rEVC': 'gray'
}

for idx, subject in enumerate(subjects):
    if idx >= 6:  # Only plot first 6 subjects
        break
    
    row = idx // 3
    col = idx % 3
    ax = axes[row, col]
    
    subj_data = roi_coords[roi_coords['subject'] == subject]
    
    for roi_name in subj_data['roi_name'].unique():
        roi_data = subj_data[subj_data['roi_name'] == roi_name]
        color = roi_colors.get(roi_name, 'black')
        
        ax.scatter(roi_data['x'], roi_data['y'], 
                  c=color, s=100, alpha=0.7, 
                  label=roi_name, edgecolors='black', linewidth=0.5)
    
    ax.set_xlabel('X coordinate (L-R)')
    ax.set_ylabel('Y coordinate (P-A)')
    ax.set_title(f'{subject}')
    ax.grid(True, alpha=0.3)
    
    # Set consistent axis limits for comparison
    ax.set_xlim(0, 200)
    ax.set_ylim(100, 250)

# Create legend
handles = []
labels = []
for roi_name, color in roi_colors.items():
    if roi_name in detected_rois:
        handles.append(plt.Line2D([0], [0], marker='o', color='w', 
                                markerfacecolor=color, markersize=8, 
                                markeredgecolor='black'))
        labels.append(roi_name)

fig.legend(handles, labels, loc='center right', bbox_to_anchor=(1.0, 0.5), ncol=1)
plt.tight_layout()
plt.subplots_adjust(right=0.85)
plt.show()

# Analyze medial-lateral organization (key analysis from paper)
print("\nMedial-Lateral Organization Analysis:")
ventral_rois = ['lEVC', 'rEVC', 'lPPA', 'rPPA', 'lPF', 'rPF', 'lFFA', 'rFFA', 'VWFA']

for subject in subjects:
    subj_data = roi_coords[roi_coords['subject'] == subject]
    ventral_data = subj_data[subj_data['roi_name'].isin(ventral_rois)]
    
    if len(ventral_data) > 0:
        x_range = ventral_data['x'].max() - ventral_data['x'].min()
        print(f"  {subject}: {len(ventral_data)} ventral ROIs, X-range: {x_range:.1f}mm")

In [None]:
# =============================================================================
# 3. BETA VALUE VALIDATION
# =============================================================================

print("\n" + "="*50)
print("3. BETA VALUE VALIDATION")
print("="*50)

# Load a sample of beta matrices for validation
sample_sessions = session_inventory.head(5)  # Sample first 5 sessions
beta_stats = []

print("Loading sample beta matrices...")
for idx, row in sample_sessions.iterrows():
    session_dir = BETA_DIR / row['session_dir']
    
    try:
        # Load beta matrix and metadata
        beta_matrix = np.load(session_dir / "beta_matrix.npy")
        roi_info = pd.read_csv(session_dir / "roi_info.csv")
        
        with open(session_dir / "metadata.json", 'r') as f:
            metadata = json.load(f)
        
        # Calculate statistics
        stats = {
            'subject': row['subject'],
            'session': row['session'],
            'n_conditions': beta_matrix.shape[0],
            'n_rois': beta_matrix.shape[1],
            'mean_beta': np.mean(beta_matrix),
            'std_beta': np.std(beta_matrix),
            'min_beta': np.min(beta_matrix),
            'max_beta': np.max(beta_matrix),
            'n_zero_betas': np.sum(beta_matrix == 0),
            'extraction_success': metadata.get('extraction_success', False)
        }
        beta_stats.append(stats)
        
        print(f"  {row['subject']}_{row['session']}: "
              f"{stats['n_conditions']}×{stats['n_rois']} matrix, "
              f"mean={stats['mean_beta']:.3f}, "
              f"success={stats['extraction_success']}")
        
    except Exception as e:
        print(f"  Error loading {row['session_dir']}: {e}")

beta_stats_df = pd.DataFrame(beta_stats)

# Plot beta value distributions
if len(beta_stats_df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Beta Value Quality Metrics', fontsize=16, fontweight='bold')
    
    # Mean beta values
    axes[0,0].boxplot([beta_stats_df['mean_beta']])
    axes[0,0].set_title('Mean Beta Values')
    axes[0,0].set_ylabel('Beta value')
    
    # Standard deviation
    axes[0,1].boxplot([beta_stats_df['std_beta']])
    axes[0,1].set_title('Beta Value Standard Deviation')
    axes[0,1].set_ylabel('Standard deviation')
    
    # Matrix dimensions
    axes[1,0].scatter(beta_stats_df['n_conditions'], beta_stats_df['n_rois'], 
                     s=100, alpha=0.7)
    axes[1,0].set_xlabel('Number of conditions')
    axes[1,0].set_ylabel('Number of ROIs')
    axes[1,0].set_title('Matrix Dimensions')
    
    # Zero beta counts
    axes[1,1].bar(range(len(beta_stats_df)), beta_stats_df['n_zero_betas'])
    axes[1,1].set_xlabel('Session index')
    axes[1,1].set_ylabel('Number of zero betas')
    axes[1,1].set_title('Zero Beta Counts')
    
    plt.tight_layout()
    plt.show()


In [None]:
# =============================================================================
# 4. SESSION COVERAGE ANALYSIS
# =============================================================================

print("\n" + "="*50)
print("4. SESSION COVERAGE ANALYSIS")
print("="*50)

# Analyze session coverage per subject (important for longitudinal analysis)
session_coverage = roi_coords.groupby('subject')['session'].nunique().sort_values(ascending=False)
print("Sessions per subject:")
for subject, n_sessions in session_coverage.items():
    sessions = sorted(roi_coords[roi_coords['subject'] == subject]['session'].unique())
    print(f"  {subject}: {n_sessions} sessions ({', '.join(sessions)})")

# Check for longitudinal subjects (>1 session)
longitudinal_subjects = session_coverage[session_coverage > 1].index.tolist()
print(f"\nLongitudinal subjects (>1 session): {longitudinal_subjects}")

# ROI consistency across sessions for longitudinal subjects
print(f"\nROI consistency for longitudinal subjects:")
for subject in longitudinal_subjects:
    subj_data = roi_coords[roi_coords['subject'] == subject]
    sessions = sorted(subj_data['session'].unique())
    
    print(f"\n{subject}:")
    for session in sessions:
        sess_rois = sorted(subj_data[subj_data['session'] == session]['roi_name'].unique())
        print(f"  {session}: {len(sess_rois)} ROIs ({', '.join(sess_rois[:5])}{'...' if len(sess_rois) > 5 else ''})")



In [None]:
# =============================================================================
# 5. QC REPORT SUMMARY
# =============================================================================

print("\n" + "="*50)
print("5. EXTRACTION QC SUMMARY")
print("="*50)

if qc_report:
    print(f"Total sessions processed: {qc_report['summary']['total_sessions']}")
    print(f"Successful extractions: {qc_report['summary']['successful_sessions']}")
    print(f"Success rate: {qc_report['summary']['success_rate']:.1%}")
    
    if qc_report['summary']['failed_sessions']:
        print(f"Failed sessions: {qc_report['summary']['failed_sessions']}")



In [None]:
# =============================================================================
# 6. DATA READINESS ASSESSMENT
# =============================================================================

print("\n" + "="*50)
print("6. DATA READINESS ASSESSMENT")
print("="*50)

# Check data readiness for RSA analysis
readiness_score = 0
max_score = 5

# Check 1: ROI extraction success
if len(roi_coords) >= 200:  # Expect ~280 ROIs
    print("✓ ROI extraction: Sufficient ROIs extracted")
    readiness_score += 1
else:
    print("⚠ ROI extraction: Low ROI count")

# Check 2: Expected ROI types
if len(missing_rois) == 0:
    print("✓ ROI types: All expected ROI types detected")
    readiness_score += 1
else:
    print(f"⚠ ROI types: Missing {len(missing_rois)} ROI types")

# Check 3: Beta extraction success
if qc_report['summary']['success_rate'] >= 0.9:
    print("✓ Beta extraction: High success rate")
    readiness_score += 1
else:
    print("⚠ Beta extraction: Low success rate")

# Check 4: Longitudinal data availability
if len(longitudinal_subjects) >= 3:
    print("✓ Longitudinal data: Multiple subjects with repeated sessions")
    readiness_score += 1
else:
    print("⚠ Longitudinal data: Limited longitudinal subjects")

# Check 5: Beta value quality
if len(beta_stats_df) > 0 and all(beta_stats_df['extraction_success']):
    print("✓ Beta quality: Sample extractions successful")
    readiness_score += 1
else:
    print("⚠ Beta quality: Issues with sample extractions")

print(f"\nData Readiness Score: {readiness_score}/{max_score}")

if readiness_score >= 4:
    print("🚀 Data is ready for RSA analysis!")
    print("\nNext steps:")
    print("1. Proceed with RSA analysis (03_rsa_analysis.py)")
    print("2. Implement Crawford t-tests for statistical analysis")
    print("3. Recreate Figure 2 (spatial organization)")
    print("4. Recreate Figure 3 (RSA correlations)")
    print("5. Recreate Figure 5 (longitudinal competition)")
else:
    print("⚠ Data quality issues detected. Review and fix before proceeding.")

In [None]:
# =============================================================================
# 7. GENERATE SUMMARY REPORT
# =============================================================================

print("\n" + "="*50)
print("7. SUMMARY REPORT GENERATION")
print("="*50)

# Create summary report for documentation
summary_report = {
    'extraction_date': qc_report.get('extraction_date', 'unknown'),
    'total_rois': len(roi_coords),
    'unique_roi_types': roi_coords['roi_name'].nunique(),
    'subjects': roi_coords['subject'].unique().tolist(),
    'sessions_per_subject': session_coverage.to_dict(),
    'longitudinal_subjects': longitudinal_subjects,
    'beta_extraction_success_rate': qc_report['summary']['success_rate'],
    'data_readiness_score': f"{readiness_score}/{max_score}",
    'ready_for_rsa': readiness_score >= 4,
    'detected_roi_types': sorted(detected_rois),
    'missing_roi_types': list(missing_rois)
}

# Save summary report
summary_path = ANALYSES_DIR / "data_quality_summary.json"
with open(summary_path, 'w') as f:
    json.dump(summary_report, f, indent=2)

print(f"Summary report saved to: {summary_path}")
print("\n=== Quality Check Complete ===")