In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Set display options: show all columns, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

In [None]:
# Define paths - UPDATE THESE TO YOUR ABCD DATA LOCATION
# Each path may instead be supplied through an environment variable
# (ABCD_DATA_PATH / PLSC_OUTPUT_DIR) so the notebook can run on another machine
# without editing this cell. The literals below are the fallbacks used when the
# variables are not set.
ABCD_DATA_PATH = os.environ.get("ABCD_DATA_PATH", "/path/to/abcd/data")  # UPDATE THIS PATH
OUTPUT_DIR = os.environ.get(
    "PLSC_OUTPUT_DIR",
    "/Users/chloehampson/Desktop/hippo-amyg-depression/derivatives",
)

# Flag an unset/incorrect data location here rather than discovering it later
# through one "could not find table" message per table.
if not os.path.isdir(ABCD_DATA_PATH):
    print(f"Warning: ABCD_DATA_PATH does not exist: {ABCD_DATA_PATH}")

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Output directory: {OUTPUT_DIR}")

## 1. Load ABCD Data Tables

In [None]:
# Function to load ABCD tables
def load_abcd_table(table_name, data_path=None):
    """
    Load an ABCD data table.
    Tries multiple common file formats.

    Parameters
    ----------
    table_name : str
        Base name of the table file, without extension.
    data_path : str or path-like, optional
        Directory to search. When omitted, the current value of the
        module-level ``ABCD_DATA_PATH`` is used. It is looked up at call time,
        so editing the path and re-running the configuration cell takes effect
        without re-running this definition.

    Returns
    -------
    pandas.DataFrame or None
        The first existing file among ``<table_name>.csv``, ``.txt``, ``.tsv``
        (tried in that order), or ``None`` after printing a warning when none
        of them exists.
    """
    if data_path is None:
        data_path = ABCD_DATA_PATH

    extensions = ['.csv', '.txt', '.tsv']

    for ext in extensions:
        filepath = os.path.join(data_path, f"{table_name}{ext}")
        if not os.path.exists(filepath):
            continue

        if ext == '.tsv':
            sep = '\t'
        elif ext == '.txt':
            # A .txt export can be tab- or comma-delimited. Parsing a
            # tab-delimited file with the comma default yields a single column
            # and no usable merge keys, so pick the delimiter that dominates
            # the header line.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as fh:
                header = fh.readline()
            sep = '\t' if header.count('\t') > header.count(',') else ','
        else:
            sep = ','

        return pd.read_csv(filepath, sep=sep, low_memory=False)

    print(f"Warning: Could not find table {table_name}")
    return None

print("Table loading function defined.")

In [None]:
# Load data tables - UPDATE table names based on your ABCD data release
# Tables are requested in the order listed; each name is bound to whatever
# load_abcd_table returns for it.

# Demographics
demo_df, static_df, dynamic_df = (
    load_abcd_table(table) for table in ('ab_p_demo', 'ab_g_stc', 'ab_g_dyn')
)

# Family Environment Scale
fes_df = load_abcd_table('fc_p_fes')

# Cultural & Social Environment
meim_y_df, meim_p_df, nbhsoc_df, srpf_df = (
    load_abcd_table(table)
    for table in ('fc_y_meim', 'fc_p_meim', 'le_l_nbhsoc', 'fc_y_srpf')
)

# SES measures
coi_df = load_abcd_table('le_l_coi')

# Depression measures
cbcl_df, ysr_df, ksads_dep_p_df, ksads_dep_y_df = (
    load_abcd_table(table)
    for table in ('mh_p_cbcl', 'mh_y_ysr', 'mh_p_ksads__dep', 'mh_y_ksads__dep')
)

# MRI Quality Control
mri_qc_df = load_abcd_table('mr_y_qc')

# N-back task fMRI data
nback_aseg_df = load_abcd_table('mri_y_tfmr_nback_aseg')

print("Data tables loaded (or attempted to load).")

## 2. Define Variable Lists

In [None]:
# Column names identifying a participant and a visit/session
SUBJECT_ID, SESSION_ID = 'src_subject_id', 'eventname'

# ----- COVARIATE VARIABLES -----
# Maps a short covariate label to the column name that holds it.
COVARIATES = dict(
    age='ab_p_demo_age',
    sex='ab_g_stc__cohort_sex',
    ethnicity='ab_g_stc__cohort_ethnrace__meim',
    race='ab_g_stc__cohort_race__nih',
    site='ab_g_dyn__design_site',
    family_id='ab_g_stc__design_id__fam',
)

print(f"Covariates defined: {len(COVARIATES)}")

In [None]:
# ----- SOCIOENVIRONMENTAL VARIABLES (PLSC Block 1 / X) -----

# Family Environment
FAMILY_ENV_VARS = [
    'fc_p_fes__cohes_mean',      # Family cohesion
    'fc_p_fes__confl_mean',      # Family conflict
    'fc_p_fes__expr_mean',       # Family expression
    'fc_p_fes__intelcult_mean',  # Intellectual/cultural orientation
    'fc_p_fes__org_mean',        # Family organization
    'fc_p_fes__rec_mean',        # Active-recreational orientation
]

# Cultural & Social Environment
CULTURAL_SOCIAL_VARS = [
    'fc_y_meim_mean',                          # MEIM Youth
    'fc_p_meim_mean',                          # MEIM Parent
    'le_l_nbhsoc__addr1__factor3_score',       # Neighborhood ethnic/immigrant
    'fc_y_srpf__env_mean',                     # School environment
]

# Socioeconomic Status
SES_VARS = [
    'le_l_nbhsoc__addr1__factor1_score',       # Neighborhood disadvantage
    'le_l_nbhsoc__addr1__aff_score',           # Neighborhood affluence
    'le_l_coi__addr1__coi__total__metro_score',# Child Opportunity Index
    'ab_p_demo__income__hhold_001',            # Household income
    'ab_g_dyn__cohort_edu__cgs',               # Caregiver education
]

# Full Block 1 list: the three groups concatenated in the order above
SOCIOENV_VARS = [*FAMILY_ENV_VARS, *CULTURAL_SOCIAL_VARS, *SES_VARS]

# Echo the total, then each group under its own heading
print(f"\nTotal SOCIOENVIRONMENTAL variables: {len(SOCIOENV_VARS)}")
for section_header, section_vars in (
    ("\n--- Family Environment ---", FAMILY_ENV_VARS),
    ("\n--- Cultural/Social ---", CULTURAL_SOCIAL_VARS),
    ("\n--- SES ---", SES_VARS),
):
    print(section_header)
    for v in section_vars:
        print(f"  {v}")

In [None]:
# ----- DEPRESSION VARIABLES (PLSC Block 2 / Y) -----

# Summary scores
DEPRESSION_SUMMARY_VARS = [
    'mh_p_cbcl__dsm__dep_sum',      # CBCL Depression (Parent)
    'mh_y_ysr__dsm__dep_sum',       # YSR Depression (Youth)
]

# Core symptoms - Parent report
KSADS_PARENT_VARS = [
    'mh_p_ksads__dep__mood__pres_sx',   # Depressed mood (Parent)
    'mh_p_ksads__dep__anhed__pres_sx',  # Anhedonia (Parent)
    'mh_p_ksads__dep__fatig__pres_sx',  # Fatigue (Parent)
]

# Core symptoms - Youth report
KSADS_YOUTH_VARS = [
    'mh_y_ksads__dep__mood__pres_sx',   # Depressed mood (Youth)
    'mh_y_ksads__dep__anhed__pres_sx',  # Anhedonia (Youth)
    'mh_y_ksads__dep__fatig__pres_sx',  # Fatigue (Youth)
]

# Full Block 2 list: summary scores, then parent items, then youth items
DEPRESSION_VARS = DEPRESSION_SUMMARY_VARS + KSADS_PARENT_VARS + KSADS_YOUTH_VARS

print(f"Total DEPRESSION variables: {len(DEPRESSION_VARS)}")
for v in DEPRESSION_VARS:
    print(f"  {v}")

In [None]:
# ----- BRAIN VARIABLES: HIPPOCAMPUS & AMYGDALA (Stage 2) -----
# N-back task activation - UPDATE based on your ABCD data release

# Column names grouped by region of interest; within each region the entries
# are ordered all / 2b_face / 2b_place, left hemisphere before right.
BRAIN_ROI_VARS = {
    'Hippocampus': [
        'tfmri_nback_all_beta_l_hippocampus',
        'tfmri_nback_all_beta_r_hippocampus',
        'tfmri_nback_2b_face_beta_l_hippocampus',
        'tfmri_nback_2b_face_beta_r_hippocampus',
        'tfmri_nback_2b_place_beta_l_hippocampus',
        'tfmri_nback_2b_place_beta_r_hippocampus',
    ],
    'Amygdala': [
        'tfmri_nback_all_beta_l_amygdala',
        'tfmri_nback_all_beta_r_amygdala',
        'tfmri_nback_2b_face_beta_l_amygdala',
        'tfmri_nback_2b_face_beta_r_amygdala',
        'tfmri_nback_2b_place_beta_l_amygdala',
        'tfmri_nback_2b_place_beta_r_amygdala',
    ],
}

# Hippocampus ROIs
HIPPOCAMPUS_VARS = BRAIN_ROI_VARS['Hippocampus']

# Amygdala ROIs
AMYGDALA_VARS = BRAIN_ROI_VARS['Amygdala']

# Combined list: hippocampus columns first, then amygdala
BRAIN_VARS = [*HIPPOCAMPUS_VARS, *AMYGDALA_VARS]

print(f"Total BRAIN variables: {len(BRAIN_VARS)}")
for roi_label, roi_vars in BRAIN_ROI_VARS.items():
    print(f"  {roi_label}: {len(roi_vars)}")

## 3. Merge Data Tables

In [None]:
def merge_abcd_tables(tables_dict, on_cols=['src_subject_id', 'eventname'], how='inner'):
    """
    Merge multiple ABCD tables on subject ID and eventname.

    Tables are merged left to right in the iteration order of ``tables_dict``.
    Each merge uses only those ``on_cols`` that exist in BOTH the accumulated
    result and the incoming table, so a table carrying a subset of the key
    columns (for example subject ID but no eventname) is still merged on the
    keys the two sides share instead of raising ``KeyError``.

    Parameters
    ----------
    tables_dict : dict
        Maps a short table label to a DataFrame (or ``None``, which is skipped).
    on_cols : list of str
        Candidate key columns. The list is only read, never modified.
    how : str
        Join type passed to ``DataFrame.merge``.

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or ``None`` when no table could be used. Non-key
        columns that collide with an existing column get the suffix
        ``_<label>`` of the incoming table.
    """
    merged_df = None

    for name, df in tables_dict.items():
        if df is None:
            print(f"Skipping {name} - DataFrame is None")
            continue

        available_cols = [col for col in on_cols if col in df.columns]

        if len(available_cols) == 0:
            print(f"Skipping {name} - No merge columns found")
            continue

        if merged_df is None:
            merged_df = df.copy()
            print(f"Initialized with {name}: {len(merged_df)} rows")
            continue

        # Keys must exist on the left side too; otherwise merge() raises
        # KeyError (e.g. the first table had no 'eventname' column).
        shared_cols = [col for col in available_cols if col in merged_df.columns]
        if len(shared_cols) == 0:
            print(f"Skipping {name} - No merge columns shared with merged data")
            continue

        merged_df = merged_df.merge(df, on=shared_cols, how=how, suffixes=('', f'_{name}'))
        print(f"After merging {name}: {len(merged_df)} rows")

    return merged_df

# Candidate tables in merge order: (label used for column suffixes, DataFrame or None)
candidate_tables = [
    ('demo', demo_df),
    ('static', static_df),
    ('dynamic', dynamic_df),
    ('fes', fes_df),
    ('meim_y', meim_y_df),
    ('meim_p', meim_p_df),
    ('nbhsoc', nbhsoc_df),
    ('srpf', srpf_df),
    ('coi', coi_df),
    ('cbcl', cbcl_df),
    ('ysr', ysr_df),
    ('ksads_dep_p', ksads_dep_p_df),
    ('ksads_dep_y', ksads_dep_y_df),
    ('mri_qc', mri_qc_df),
    ('nback_aseg', nback_aseg_df),
]

# Create dictionary of tables to merge, leaving out entries that are None
tables_to_merge = {
    label: table for label, table in candidate_tables if table is not None
}
print(f"\nTables available for merging: {list(tables_to_merge.keys())}")

In [None]:
# Merge all tables
# NOTE: Uncomment when data is loaded. While the call below stays commented
# out, merged_df is never defined, so the timepoint-selection code that reads
# merged_df must stay commented out as well.
# merge_abcd_tables returns None when tables_to_merge contains no usable table.
# merged_df = merge_abcd_tables(tables_to_merge)

print("NOTE: Uncomment the merge function call once data is loaded")

## 4. Select Timepoint

In [None]:
# Define timepoint of interest
# The value is compared for exact equality against the 'eventname' column in
# the filter below, so it must match that column's labels in your release.
TIMEPOINT = 'ses-02A'  # 2-year follow-up - UPDATE AS NEEDED

# Filter to selected timepoint
# Uncomment when merged_df is available
# NOTE: if merged_df has no 'eventname' column, the else-branch keeps every row,
# i.e. no timepoint filtering is applied and nothing is printed about it.

# if 'eventname' in merged_df.columns:
#     analysis_df = merged_df[merged_df['eventname'] == TIMEPOINT].copy()
#     print(f"Subjects at {TIMEPOINT}: {len(analysis_df)}")
# else:
#     analysis_df = merged_df.copy()

print(f"Selected timepoint: {TIMEPOINT}")

## 5. Quality Control & Missing Data

In [None]:
def apply_qc_filters(df, motion_threshold=0.5, motion_var='tfmri_nback_all_meanmotion', qc_var='imgincl_nback_include'):
    """Apply quality control filters for neuroimaging data.

    Two filters are applied in sequence, each only if its column is present:

    1. keep rows where ``qc_var`` equals 1;
    2. keep rows where ``motion_var`` is <= ``motion_threshold`` (rows with a
       missing motion value fail the comparison and are dropped).

    When a column is absent the corresponding filter cannot run; a warning is
    printed so that unfiltered data is not mistaken for QC-passed data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; not modified.
    motion_threshold : float
        Inclusive upper bound on ``motion_var``.
    motion_var : str
        Name of the motion column.
    qc_var : str
        Name of the inclusion-flag column.

    Returns
    -------
    pandas.DataFrame
        The rows that passed every filter that could be applied (``df`` itself
        when neither column exists).
    """
    n_initial = len(df)

    if qc_var in df.columns:
        df = df[df[qc_var] == 1]
        print(f"After QC inclusion filter: {len(df)} ({n_initial - len(df)} removed)")
    else:
        print(f"Warning: QC inclusion column '{qc_var}' not found - inclusion filter NOT applied")

    if motion_var in df.columns:
        n_before = len(df)
        df = df[df[motion_var] <= motion_threshold]
        print(f"After motion filter: {len(df)} ({n_before - len(df)} removed)")
    else:
        print(f"Warning: motion column '{motion_var}' not found - motion filter NOT applied")

    return df

def assess_missing_data(df, variables):
    """Summarise missingness for each requested column.

    Returns a DataFrame with one row per entry of ``variables`` and the columns
    ``variable``, ``n_missing`` and ``pct_missing``. For a column present in
    ``df`` these hold the count of missing values and that count as a
    percentage of all rows (rounded to 2 decimals); for a column that is absent
    they hold the markers 'NOT FOUND' and 'N/A'.
    """
    total_rows = len(df)
    report_rows = []

    for name in variables:
        # Absent columns are reported with placeholder markers
        if name not in df.columns:
            report_rows.append({'variable': name, 'n_missing': 'NOT FOUND', 'pct_missing': 'N/A'})
            continue

        missing_count = df[name].isna().sum()
        missing_pct = (missing_count / total_rows) * 100
        report_rows.append({
            'variable': name,
            'n_missing': missing_count,
            'pct_missing': round(missing_pct, 2),
        })

    return pd.DataFrame(report_rows)

print("QC and missing data functions defined.")

## 6. Create PLSC Dataframes (Separate Blocks)

In [None]:
def create_plsc_dataframes_separate(df, socioenv_vars, depression_vars, brain_vars, covariate_vars, subject_id='src_subject_id'):
    """
    Split one analysis frame into the FOUR frames used by the two-stage PLSC
    analysis, each indexed by ``subject_id``:

    Stage 1 PLSC:
    1. Socioenvironmental dataframe (Block 1 / X)
    2. Depression dataframe (Block 2 / Y)

    Stage 2 (Brain correlation):
    3. Brain dataframe (Hippocampus & Amygdala)

    Plus:
    4. Covariate dataframe (for residualization)

    Requested columns that are not present in ``df`` are dropped; the number
    found per block is printed. ``covariate_vars`` maps a label to a column
    name, while the other three arguments are lists of column names.

    Returns a dict with the keys 'covariates', 'socioenv', 'depression',
    'brain' (DataFrames) and 'variable_lists' (the column names actually used
    for each block; for covariates, the filtered label -> column mapping).
    """
    columns_present = df.columns

    def _indexed_subset(value_cols):
        """Copy subject_id plus value_cols out of df and index by subject_id."""
        return df[[subject_id, *value_cols]].copy().set_index(subject_id)

    # Keep only the requested variables that exist in df
    socioenv_found = [name for name in socioenv_vars if name in columns_present]
    depression_found = [name for name in depression_vars if name in columns_present]
    brain_found = [name for name in brain_vars if name in columns_present]
    covars_found = {label: col for label, col in covariate_vars.items() if col in columns_present}

    print(f"Socioenvironmental variables found: {len(socioenv_found)}/{len(socioenv_vars)}")
    print(f"Depression variables found: {len(depression_found)}/{len(depression_vars)}")
    print(f"Brain variables found: {len(brain_found)}/{len(brain_vars)}")
    print(f"Covariate variables found: {len(covars_found)}/{len(covariate_vars)}")

    # Covariates first, then the PLSC blocks (X, Y), then the Stage 2 brain block
    covariate_frame = _indexed_subset(list(covars_found.values()))
    socioenv_frame = _indexed_subset(socioenv_found)
    depression_frame = _indexed_subset(depression_found)
    brain_frame = _indexed_subset(brain_found)

    used_variables = {
        'socioenv': socioenv_found,
        'depression': depression_found,
        'brain': brain_found,
        'covariates': covars_found,
    }
    return {
        'covariates': covariate_frame,
        'socioenv': socioenv_frame,
        'depression': depression_frame,
        'brain': brain_frame,
        'variable_lists': used_variables,
    }

print("PLSC dataframe creation function defined (separate blocks).")

In [None]:
# Create PLSC dataframes
# Uncomment when analysis_df is available

# plsc_data = create_plsc_dataframes_separate(
#     analysis_df,
#     socioenv_vars=SOCIOENV_VARS,
#     depression_vars=DEPRESSION_VARS,
#     brain_vars=BRAIN_VARS,
#     covariate_vars=COVARIATES
# )
# 
# print("\nDataframe shapes:")
# print(f"  Covariates: {plsc_data['covariates'].shape}")
# print(f"  Socioenvironmental (PLSC Block 1): {plsc_data['socioenv'].shape}")
# print(f"  Depression (PLSC Block 2): {plsc_data['depression'].shape}")
# print(f"  Brain - Hippo/Amyg (Stage 2): {plsc_data['brain'].shape}")

## 7. Save Dataframes for R Analysis

In [None]:
def save_plsc_dataframes(plsc_data, output_dir):
    """
    Write the PLSC dataframes and the variable lists to ``output_dir`` for the
    R analysis.

    ``plsc_data`` must provide DataFrames under 'covariates', 'socioenv',
    'depression' and 'brain', plus a JSON-serialisable 'variable_lists' entry.
    Four CSV files (index included) and one JSON file are written; the
    directory is created if needed. A line is printed per file. Returns None.
    """
    import json
    os.makedirs(output_dir, exist_ok=True)

    # (key in plsc_data, file name) pairs, written in this order:
    # covariates, PLSC Block 1, PLSC Block 2, Stage 2 brain data
    csv_targets = (
        ('covariates', 'covariate.csv'),
        ('socioenv', 'clean-socioenv.csv'),
        ('depression', 'clean-depression.csv'),
        ('brain', 'clean-brain-hippo-amyg.csv'),
    )
    for block_key, csv_name in csv_targets:
        frame = plsc_data[block_key]
        frame.to_csv(os.path.join(output_dir, csv_name))
        print(f"Saved: {csv_name} {frame.shape}")

    # Save variable lists
    lists_path = os.path.join(output_dir, 'variable_lists.json')
    with open(lists_path, 'w') as handle:
        json.dump(plsc_data['variable_lists'], handle, indent=2)
    print("Saved: variable_lists.json")

    print(f"\nAll files saved to: {output_dir}")

print("Save function defined.")

In [None]:
# Save dataframes
# Uncomment when plsc_data is available

# save_plsc_dataframes(plsc_data, OUTPUT_DIR)

## 8. Summary

### Files Created (once the merge, timepoint, and save cells above are uncommented and run):
1. **covariate.csv** - Demographic & nuisance variables for residualization
2. **clean-socioenv.csv** - Socioenvironmental factors (PLSC Block 1 / X)
3. **clean-depression.csv** - Depression symptoms (PLSC Block 2 / Y)  
4. **clean-brain-hippo-amyg.csv** - Hippocampus & Amygdala (Stage 2)
5. **variable_lists.json** - Reference file with all variable names

### Analysis Flow:
```
STAGE 1: PLSC
   Socioenvironmental (X) ←→ Depression (Y)
   → Identifies latent dimensions linking socio factors to depression
   → Extract latent variable scores

STAGE 2: Brain Correlation  
   PLSC Latent Scores ←→ Hippocampus/Amygdala Activation
   → Tests whether hippocampus/amygdala activation covaries with the socio-depression latent dimensions
```

### Next Steps:
1. Open `hippo_amyg_depression_plsc.Rmd` in RStudio
2. Update file paths to point to the derivatives folder
3. Run the PLSC analysis

In [None]:
# Report completion only when the files written by save_plsc_dataframes are
# actually present in OUTPUT_DIR; otherwise say what is missing instead of
# announcing success for a run that saved nothing.
# NOTE: this checks existence only - files left over from an earlier run also count.
expected_outputs = [
    'covariate.csv',
    'clean-socioenv.csv',
    'clean-depression.csv',
    'clean-brain-hippo-amyg.csv',
    'variable_lists.json',
]
missing_outputs = [
    fname for fname in expected_outputs
    if not os.path.exists(os.path.join(OUTPUT_DIR, fname))
]

print("="*60)
if missing_outputs:
    print("DATA PREPARATION INCOMPLETE")
else:
    print("DATA PREPARATION COMPLETE")
print("="*60)
print(f"\nOutput directory: {OUTPUT_DIR}")
if missing_outputs:
    print("\nMissing output files (uncomment and run the merge, timepoint, and save cells):")
    for fname in missing_outputs:
        print(f"  {fname}")
else:
    print("\nNext: Run hippo_amyg_depression_plsc.Rmd in R")