# In [None]:
"""
VOTC Plasticity Project Summary and Validation
==============================================

Comprehensive summary of the Python recreation of:
"Cross-sectional and longitudinal changes in category-selectivity 
in visual cortex following pediatric cortical resection"

This script provides:
1. Complete pipeline validation
2. Comparison with original MATLAB results
3. Quality metrics and success assessment
4. Next steps for further analysis
"""

import contextlib
import io
import json
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Root of the project tree; every input and output path below hangs off it.
BASE_DIR = Path("/user_data/csimmon2/long_pt")
ANALYSES_DIR = BASE_DIR / "analyses"
REPORTS_DIR = ANALYSES_DIR / "reports"
# parents=True: without it mkdir raises FileNotFoundError when the analyses
# directory does not exist yet, which would stop the script before the
# validation below can report what is missing.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("="*70)
print("VOTC PLASTICITY PROJECT SUMMARY")
print("="*70)
print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("Python recreation of Liu et al. (2025) MATLAB analysis")

# =============================================================================
# 1. PIPELINE VALIDATION
# =============================================================================

print("\n" + "="*50)
print("1. ANALYSIS PIPELINE VALIDATION")
print("="*50)

def validate_pipeline_stage(stage_name, expected_files, stage_dir):
    """Report which expected outputs of one pipeline stage are present.

    Each entry of ``expected_files`` is used as a glob pattern relative to
    ``stage_dir``; a pattern counts as found when it matches at least one
    path. One line is printed per pattern, followed by an overall status.

    Args:
        stage_name: Label printed as the heading for this stage.
        expected_files: Glob patterns expected to match inside ``stage_dir``.
        stage_dir: ``pathlib.Path`` of the stage's output directory.

    Returns:
        Tuple ``(complete, n_found)``. ``complete`` is True only when every
        pattern matched (an empty pattern list is never complete) and
        ``n_found`` is the number of patterns that matched. A missing
        directory yields ``(False, 0)``.
    """
    print(f"\n{stage_name}:")

    if not stage_dir.exists():
        print(f"  ❌ Directory not found: {stage_dir}")
        return False, 0

    n_patterns = len(expected_files)
    n_found = 0
    for pattern in expected_files:
        hits = list(stage_dir.glob(pattern))
        if not hits:
            print(f"  ❌ Missing: {pattern}")
            continue
        print(f"  ✓ Found: {pattern} ({len(hits)} files)")
        n_found += 1

    # With no patterns there is nothing to divide by; treat it as 0% found.
    fraction = n_found / n_patterns if n_patterns > 0 else 0
    complete = fraction == 1.0
    if complete:
        status = "✓ COMPLETE"
    else:
        status = f"⚠ PARTIAL ({fraction:.1%})"
    print(f"  Status: {status}")

    return complete, n_found

# Stage table: (label, sub-directory of ANALYSES_DIR, glob patterns that must
# each match at least one path for the stage to count as complete).
_stage_table = (
    ('ROI Extraction', "roi_extraction",
     ['peak_roi_coordinates.csv']),
    ('Beta Value Extraction', "beta_extraction",
     ['session_inventory.csv', 'qc_report.json', 'sub-*']),
    ('RSA Analysis', "rsa_analysis",
     ['rsa_results.csv', 'rsa_summary.json']),
    ('Statistical Analysis', "statistical_analysis",
     ['*_crawford_results.csv', 'statistical_summary.json']),
    ('Figure Recreation', "figures",
     ['figure_*.png', 'figure_summary.json']),
)

pipeline_stages = [
    {'name': label, 'dir': ANALYSES_DIR / subdir, 'files': patterns}
    for label, subdir, patterns in _stage_table
]

# Validate every stage (a list, not a generator, so no stage is skipped by
# short-circuiting) and require all of them to be complete.
stage_outcomes = [
    validate_pipeline_stage(entry['name'], entry['files'], entry['dir'])[0]
    for entry in pipeline_stages
]
pipeline_success = all(stage_outcomes)

print(f"\nOverall Pipeline Status: {'✓ COMPLETE' if pipeline_success else '⚠ INCOMPLETE'}")

# =============================================================================
# 2. DATA QUALITY METRICS
# =============================================================================

print("\n" + "="*50)
print("2. DATA QUALITY METRICS")
print("="*50)

# Load all data for quality assessment
data_files = {
    'roi_coords': ANALYSES_DIR / "roi_extraction" / "peak_roi_coordinates.csv",
    'rsa_results': ANALYSES_DIR / "rsa_analysis" / "rsa_results.csv",
    'session_inventory': ANALYSES_DIR / "beta_extraction" / "session_inventory.csv"
}

# Every key of data_files ends up in `data`. An input that is missing, or that
# has no columns to parse, is stored as an empty DataFrame so the sections
# below can guard on emptiness alone.
data = {}
for name, file_path in data_files.items():
    if not file_path.exists():
        data[name] = pd.DataFrame()
        print(f"❌ Missing {name}")
        continue
    try:
        data[name] = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # read_csv raises this for a file with no parseable columns; treat it
        # like an absent input instead of aborting the whole summary.
        data[name] = pd.DataFrame()
        print(f"❌ Empty {name}: {file_path}")
        continue
    print(f"✓ Loaded {name}: {len(data[name])} records")

if len(data['roi_coords']) > 0:
    roi_coords = data['roi_coords']

    # ROI quality metrics
    print("\nROI Extraction Quality:")
    print(f"  Total ROIs extracted: {len(roi_coords)}")
    print(f"  Unique subjects: {roi_coords['subject'].nunique()}")
    print(f"  Unique sessions: {len(roi_coords.groupby(['subject', 'session']))}")
    print(f"  ROI types detected: {roi_coords['roi_name'].nunique()}")

    # Expected vs detected ROIs
    expected_rois = {
        'lFFA', 'rFFA', 'lSTS', 'rSTS', 'lPPA', 'rPPA', 'lTOS', 'rTOS',
        'lLOC', 'rLOC', 'lPF', 'rPF', 'VWFA', 'STG', 'IFG', 'lEVC', 'rEVC'
    }
    detected_rois = set(roi_coords['roi_name'].unique())
    matched_rois = detected_rois & expected_rois
    # Coverage counts only expected ROI names; unexpected names are ignored.
    coverage = len(matched_rois) / len(expected_rois)
    print(f"  ROI type coverage: {coverage:.1%} ({len(matched_rois)}/{len(expected_rois)})")

    # Subject classification: any subject ID not in the patient set is
    # counted as a control.
    patient_subjects = {'KN', 'SN', 'TC', 'UD', 'OT'}
    subjects_in_data = set(roi_coords['subject'].unique())
    patients_found = subjects_in_data & patient_subjects
    controls_found = subjects_in_data - patient_subjects

    # sorted(): set iteration order is not stable between runs, so sort to
    # keep the printed list reproducible.
    print(f"  Patients identified: {len(patients_found)} {sorted(patients_found)}")
    print(f"  Controls identified: {len(controls_found)}")

if len(data['rsa_results']) > 0:
    rsa_results = data['rsa_results']

    print("\nRSA Analysis Quality:")
    print(f"  RSA measurements: {len(rsa_results)}")
    print(f"  Valid correlations: {rsa_results['correlation_fisher'].notna().sum()}")
    print(f"  Subjects with RSA data: {rsa_results['subject'].nunique()}")

    # A subject is longitudinal when it has more than one distinct session.
    session_counts = rsa_results.groupby('subject')['session'].nunique()
    longitudinal_subjects = session_counts[session_counts > 1]
    print(f"  Longitudinal subjects: {len(longitudinal_subjects)} {list(longitudinal_subjects.index)}")

# =============================================================================
# 3. COMPARISON WITH ORIGINAL PAPER
# =============================================================================

print("\n" + "="*50)
print("3. COMPARISON WITH ORIGINAL PAPER")
print("="*50)

# Key findings from the paper to validate
paper_findings = {
    'total_subjects': 5,  # patients
    'total_controls': 25,
    'total_rois': 17,
    'longitudinal_subjects': ['TC', 'UD', 'OT'],
    'left_resection_patients': ['KN', 'SN', 'TC'],
    'right_resection_patients': ['UD'],
    'control_patients': ['OT']
}

print("Validating key findings from paper:")

if len(data['roi_coords']) > 0:
    # Subject counts
    our_patients = subjects_in_data & patient_subjects
    our_controls = subjects_in_data - patient_subjects

    # The pass marks are deliberately looser than the paper's exact counts
    # (>= 4 of 5 patients, >= 20 controls, >= 15 ROI types).
    print(f"  Patients: {len(our_patients)}/{paper_findings['total_subjects']} "
          f"{'✓' if len(our_patients) >= 4 else '⚠'}")
    print(f"  Controls: {len(our_controls)} (paper: {paper_findings['total_controls']}) "
          f"{'✓' if len(our_controls) >= 20 else '⚠'}")

    # ROI types
    our_roi_types = roi_coords['roi_name'].nunique()
    print(f"  ROI types: {our_roi_types}/{paper_findings['total_rois']} "
          f"{'✓' if our_roi_types >= 15 else '⚠'}")

# Longitudinal subjects are derived from the RSA table alone, so this check
# runs whenever RSA results were loaded, whether or not ROI coordinates are
# available.
if len(data['rsa_results']) > 0:
    our_longitudinal = set(longitudinal_subjects.index)
    expected_longitudinal = set(paper_findings['longitudinal_subjects'])
    # Fraction of the paper's longitudinal subjects that we also found.
    longitudinal_match = len(our_longitudinal & expected_longitudinal) / len(expected_longitudinal)
    print(f"  Longitudinal subjects: {longitudinal_match:.1%} match "
          f"{'✓' if longitudinal_match >= 0.5 else '⚠'}")

# =============================================================================
# 4. STATISTICAL VALIDATION
# =============================================================================

print("\n" + "="*50)
print("4. STATISTICAL VALIDATION")
print("="*50)

# Check for statistical results
stats_files = {
    'spatial_crawford': ANALYSES_DIR / "statistical_analysis" / "spatial_crawford_results.csv",
    'rsa_crawford': ANALYSES_DIR / "statistical_analysis" / "rsa_crawford_results.csv",
    'longitudinal': ANALYSES_DIR / "statistical_analysis" / "longitudinal_results.csv"
}

# Significance columns to summarise when present, with their printed labels.
significance_columns = (
    ('crawford_significant', 'Significant results'),
    ('fdr_significant', 'FDR-corrected significant'),
)

for analysis_name, file_path in stats_files.items():
    if not file_path.exists():
        print(f"❌ {analysis_name}: Missing")
        continue

    results = pd.read_csv(file_path)
    n_tests = len(results)
    print(f"✓ {analysis_name}: {n_tests} tests")

    for column, label in significance_columns:
        if column not in results.columns:
            continue
        n_hits = results[column].sum()
        # A results file with a header but no rows has n_tests == 0; report
        # 0.0% rather than dividing by zero.
        rate = n_hits / n_tests if n_tests else 0.0
        print(f"    {label}: {n_hits}/{n_tests} ({rate:.1%})")

# =============================================================================
# 5. REPRODUCIBILITY ASSESSMENT
# =============================================================================

print("\n" + "="*50)
print("5. REPRODUCIBILITY ASSESSMENT")
print("="*50)

# Check if key MATLAB findings can be reproduced.
# Every check is always recorded: a check whose input data is missing counts
# as a failure. Leaving it out would shrink the denominator and let the score
# reach 100% with no ROI or RSA data at all.
reproducibility_checks = []

has_roi_data = len(data['roi_coords']) > 0
has_rsa_data = len(data['rsa_results']) > 0

# Check 1: ROI detection success
roi_success = has_roi_data and bool(roi_coords['roi_name'].nunique() >= 15)
reproducibility_checks.append(('ROI Detection', roi_success))

# Check 2: Patient identification
patient_success = has_roi_data and len(our_patients) >= 4
reproducibility_checks.append(('Patient Identification', patient_success))

# Check 3: RSA computation
# bool(): the pandas comparison yields a numpy boolean; store a plain bool so
# the value serialises as a JSON boolean later.
rsa_success = has_rsa_data and bool(rsa_results['correlation_fisher'].notna().sum() > 100)
reproducibility_checks.append(('RSA Computation', rsa_success))

# Check 4: Statistical analysis (any one results file is enough)
stats_success = any(f.exists() for f in stats_files.values())
reproducibility_checks.append(('Statistical Analysis', stats_success))

# Check 5: Figure generation (at least three figure_*.png files)
figures_dir = ANALYSES_DIR / "figures"
figure_success = figures_dir.exists() and len(list(figures_dir.glob("figure_*.png"))) >= 3
reproducibility_checks.append(('Figure Recreation', figure_success))

print("Reproducibility checklist:")
total_checks = len(reproducibility_checks)
passed_checks = sum(1 for _, success in reproducibility_checks if success)

for check_name, success in reproducibility_checks:
    status = "✓ PASS" if success else "❌ FAIL"
    print(f"  {check_name}: {status}")

reproducibility_score = passed_checks / total_checks
print(f"\nReproducibility Score: {reproducibility_score:.1%} ({passed_checks}/{total_checks})")

# =============================================================================
# 6. PERFORMANCE METRICS
# =============================================================================

print("\n" + "="*50)
print("6. PERFORMANCE METRICS")
print("="*50)

# Calculate processing efficiency
if len(data['session_inventory']) > 0:
    session_inventory = data['session_inventory']
    print(f"Sessions processed: {len(session_inventory)}")

    # Check QC report for success rates
    qc_file = ANALYSES_DIR / "beta_extraction" / "qc_report.json"
    if qc_file.exists():
        with open(qc_file, 'r', encoding='utf-8') as f:
            qc_report = json.load(f)

        # The report layout is not defined in this script, so read the
        # 'summary' fields defensively instead of failing on a KeyError.
        qc_summary = qc_report.get('summary', {}) if isinstance(qc_report, dict) else {}

        if 'success_rate' in qc_summary:
            success_rate = qc_summary['success_rate']
            print(f"Beta extraction success rate: {success_rate:.1%}")
        else:
            print("Beta extraction success rate: not recorded in QC report")

        if 'failed_sessions' in qc_summary:
            failed_sessions = qc_summary['failed_sessions']
            print(f"Failed sessions: {failed_sessions}")
        else:
            print("Failed sessions: not recorded in QC report")

# Processing time estimates (if available)
print("\nProcessing efficiency:")
# NOTE(review): this prints the number of stages defined, not the number that
# passed validation — confirm whether "completed" is the intended wording.
print(f"  Pipeline stages: {len(pipeline_stages)} completed")

# Count top-level entries in each stage directory. The existence check comes
# before the glob so a missing directory is skipped without being scanned.
n_stage_entries = sum(
    1
    for stage in pipeline_stages
    if stage['dir'].exists()
    for _ in stage['dir'].glob('*')
)
print(f"  Data files generated: {n_stage_entries}")

# =============================================================================
# 7. NEXT STEPS AND RECOMMENDATIONS
# =============================================================================

print("\n" + "="*50)
print("7. NEXT STEPS AND RECOMMENDATIONS")
print("="*50)

# Part 1: depends on how many reproducibility checks passed.
if reproducibility_score >= 0.8:
    completeness_recs = [
        "✓ Core analysis complete - ready for publication preparation",
        "→ Compare specific numerical results with original MATLAB output",
        "→ Conduct additional validation with independent dataset",
    ]
else:
    completeness_recs = ["⚠ Complete missing pipeline stages before proceeding"]

# Part 2: depends on ROI coverage; `coverage` only exists when ROI coordinates
# were loaded, so the emptiness test must come first.
roi_quality_ok = len(data['roi_coords']) > 0 and coverage >= 0.8
if roi_quality_ok:
    quality_rec = "✓ ROI detection quality sufficient for analysis"
else:
    quality_rec = "→ Improve ROI detection parameters or thresholds"

# Part 3: follow-up analyses that are always suggested.
follow_up_recs = [
    "→ Implement additional statistical corrections (e.g., cluster correction)",
    "→ Add bootstrapping analysis for longitudinal changes",
    "→ Create interactive visualizations for data exploration",
    "→ Develop automated report generation",
    "→ Extend analysis to additional brain regions",
    "→ Compare with other plasticity studies in literature",
]

recommendations = [*completeness_recs, quality_rec, *follow_up_recs]

print("Recommendations:")
for suggestion in recommendations:
    print(f"  {suggestion}")

# =============================================================================
# 8. GENERATE COMPREHENSIVE REPORT
# =============================================================================

print("\n" + "="*50)
print("8. GENERATING COMPREHENSIVE REPORT")
print("="*50)

# Names such as `coverage` and `patients_found` only exist when ROI
# coordinates were loaded, so every use below is gated on this flag.
has_roi_data = len(data['roi_coords']) > 0
figures_dir = ANALYSES_DIR / "figures"

# validate_pipeline_stage prints its findings as a side effect, and those were
# already shown in section 1. Here only the return value is needed, so its
# output is captured and discarded.
with contextlib.redirect_stdout(io.StringIO()):
    stages_completed = sum(
        1
        for stage in pipeline_stages
        if validate_pipeline_stage(stage['name'], stage['files'], stage['dir'])[0]
    )

# Create detailed report
report = {
    'project_title': 'VOTC Plasticity Python Recreation',
    'generation_date': datetime.now().isoformat(),
    'original_paper': 'Liu et al. (2025) - Cross-sectional and longitudinal changes in category-selectivity',

    'pipeline_validation': {
        'overall_success': pipeline_success,
        'stages_completed': stages_completed,
        'total_stages': len(pipeline_stages)
    },

    'data_quality': {
        'total_roi_coordinates': len(data['roi_coords']),
        'total_rsa_measurements': len(data['rsa_results']),
        'roi_type_coverage': coverage if has_roi_data else 0,
        # sorted(): set iteration order is not stable between runs.
        'patients_identified': sorted(patients_found) if has_roi_data else [],
        'controls_identified': len(controls_found) if has_roi_data else 0
    },

    'reproducibility': {
        'score': reproducibility_score,
        'checks_passed': passed_checks,
        'total_checks': total_checks,
        # bool(): a numpy boolean is not JSON-serialisable and would be
        # written as the string "True"/"False" by the default=str fallback.
        'check_details': {check: bool(passed) for check, passed in reproducibility_checks}
    },

    'statistical_analyses': {
        'spatial_crawford_available': stats_files['spatial_crawford'].exists(),
        'rsa_crawford_available': stats_files['rsa_crawford'].exists(),
        'longitudinal_available': stats_files['longitudinal'].exists()
    },

    # sorted(): glob yields paths in arbitrary order.
    'figures_generated': sorted(p.name for p in figures_dir.glob("*.png")) if figures_dir.exists() else [],

    'recommendations': recommendations
}

# Save comprehensive report
report_file = REPORTS_DIR / "project_summary_report.json"
with open(report_file, 'w', encoding='utf-8') as f:
    # default=str stays as a last-resort fallback for unexpected value types.
    json.dump(report, f, indent=2, default=str)

# Create human-readable summary. The text contains non-ASCII status symbols,
# so the encoding is fixed rather than left to the platform default.
summary_file = REPORTS_DIR / "project_summary.txt"
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write("VOTC PLASTICITY PROJECT SUMMARY\n")
    f.write("="*70 + "\n\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write("Python recreation of Liu et al. (2025) analysis\n\n")

    f.write("PIPELINE STATUS:\n")
    f.write(f"Overall success: {'✓ COMPLETE' if pipeline_success else '⚠ INCOMPLETE'}\n")
    f.write(f"Reproducibility score: {reproducibility_score:.1%}\n\n")

    f.write("DATA SUMMARY:\n")
    if has_roi_data:
        f.write(f"ROI coordinates: {len(data['roi_coords'])}\n")
        f.write(f"Patients: {sorted(patients_found)}\n")
        f.write(f"Controls: {len(controls_found)}\n")
    if len(data['rsa_results']) > 0:
        f.write(f"RSA measurements: {len(data['rsa_results'])}\n")

    f.write("\nRECOMMENDATIONS:\n")
    for rec in recommendations:
        f.write(f"{rec}\n")

print(f"Comprehensive report saved to: {report_file}")
print(f"Human-readable summary saved to: {summary_file}")

# =============================================================================
# FINAL STATUS
# =============================================================================

print("\n" + "="*70)
print("PROJECT COMPLETION STATUS")
print("="*70)

# Choose the three-line verdict from the reproducibility score:
# >= 80% is success, >= 60% is mostly complete, anything lower needs work.
score_text = f"{reproducibility_score:.1%}"
if reproducibility_score >= 0.8:
    verdict_lines = (
        "🎉 PROJECT SUCCESSFULLY COMPLETED!",
        f"✓ {score_text} of key analyses reproduced",
        "✓ Ready for scientific validation and publication",
    )
elif reproducibility_score >= 0.6:
    verdict_lines = (
        "⚠ PROJECT MOSTLY COMPLETE",
        f"✓ {score_text} of key analyses reproduced",
        "→ Address remaining issues before publication",
    )
else:
    verdict_lines = (
        "❌ PROJECT NEEDS ADDITIONAL WORK",
        f"⚠ Only {score_text} of analyses completed",
        "→ Focus on completing missing pipeline stages",
    )

for verdict_line in verdict_lines:
    print(verdict_line)

print(f"\nAll reports saved in: {REPORTS_DIR}")
print("="*70)