# Girls Wellbeing Analysis

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# Load the cleaned survey file and report the overall and per-group row counts.
df = pd.read_csv('../../1_data_collection/data/cleaned/girls_survey_clean.csv')

program_flag = df['in_program']
n_yes = (program_flag == 'yes').sum()
n_no = (program_flag == 'no').sum()

print(f"Total: n={len(df)}")
print(f"Participants: n={n_yes}")
print(f"Non-participants: n={n_no}")

In [None]:
def cohens_d(group1, group2):
    """Return Cohen's d for two independent samples.

    The difference ``mean(group1) - mean(group2)`` is divided by the pooled
    standard deviation, where each group's sample variance (``ddof=1``) is
    weighted by its degrees of freedom (``n - 1``).
    """
    size_a = len(group1)
    size_b = len(group2)

    # (n - 1) * sample variance, i.e. each group's contribution to the pool.
    weighted_var_a = (size_a - 1) * np.var(group1, ddof=1)
    weighted_var_b = (size_b - 1) * np.var(group2, ddof=1)
    pooled_sd = np.sqrt((weighted_var_a + weighted_var_b) / (size_a + size_b - 2))

    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_sd

def cohens_d_ci(group1, group2, confidence=0.95):
    """Return a ``(lower, upper)`` confidence interval around Cohen's d.

    The standard error is the square root of
    ``(n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2))`` and the interval
    half-width is that standard error times the two-sided critical value of
    a t distribution with ``n1 + n2 - 2`` degrees of freedom.
    """
    from scipy.stats import t as t_dist

    size_a, size_b = len(group1), len(group2)
    total = size_a + size_b
    d = cohens_d(group1, group2)

    se_d = np.sqrt(total / (size_a * size_b) + d**2 / (2 * total))

    # Two-sided interval: put alpha / 2 in each tail.
    tail = (1 - confidence) / 2
    dof = total - 2
    margin = t_dist.ppf(1 - tail, dof) * se_d

    return d - margin, d + margin

## Primary Outcomes

In [None]:
# (column name, display label) pairs for the outcomes compared between groups.
outcome_vars = [
    ('who5_score', 'WHO-5 Score'),
    ('social_index', 'Social Connection'),
    ('confidence_index', 'Confidence'),
]

banner = "=" * 70
print(banner)
print("GROUP COMPARISONS: PARTICIPANTS VS NON-PARTICIPANTS")
print(banner)

# Split the rows by program membership once; each outcome then drops its own
# missing values.
enrolled_rows = df.loc[df['in_program'] == 'yes']
other_rows = df.loc[df['in_program'] == 'no']

for column, title in outcome_vars:
    grp_yes = enrolled_rows[column].dropna()
    grp_no = other_rows[column].dropna()
    n_yes, n_no = len(grp_yes), len(grp_no)

    mean_yes, sd_yes = grp_yes.mean(), grp_yes.std(ddof=1)
    mean_no, sd_no = grp_no.mean(), grp_no.std(ddof=1)

    # Independent-samples t-test with SciPy's default settings; the reported
    # degrees of freedom are n1 + n2 - 2.
    t_value, p_value = stats.ttest_ind(grp_yes, grp_no)
    dof = n_yes + n_no - 2

    effect = cohens_d(grp_yes, grp_no)
    lower, upper = cohens_d_ci(grp_yes, grp_no)

    print(f"\n{title}")
    print("-" * 70)
    print(f"Participants:     M={mean_yes:.2f}, SD={sd_yes:.2f}, n={n_yes}")
    print(f"Non-participants: M={mean_no:.2f}, SD={sd_no:.2f}, n={n_no}")
    print(f"t({dof})={t_value:.3f}, p={p_value:.4f}")
    print(f"Cohen's d={effect:.3f}, 95% CI [{lower:.3f}, {upper:.3f}]")

## WHO-5 Component Analysis

In [None]:
# (column name, item wording) pairs for the individual WHO-5 items.
who_items = [
    ('who_cheerful', 'Cheerful and in good spirits'),
    ('who_calm', 'Calm and relaxed'),
    ('who_active', 'Active and vigorous'),
    ('who_rested', 'Fresh and rested'),
    ('who_interested', 'Interesting daily life'),
]

rule = "=" * 70
print(rule)
print("WHO-5 COMPONENTS")
print(rule)

# One p-value per item, collected for the Bonferroni check after the loop.
p_values = []

enrolled_rows = df.loc[df['in_program'] == 'yes']
other_rows = df.loc[df['in_program'] == 'no']

for column, wording in who_items:
    grp_yes = enrolled_rows[column].dropna()
    grp_no = other_rows[column].dropna()

    t_value, p_value = stats.ttest_ind(grp_yes, grp_no)
    effect = cohens_d(grp_yes, grp_no)
    p_values.append(p_value)

    print(f"\n{wording}")
    print(f"  Participants: M={grp_yes.mean():.2f}, SD={grp_yes.std(ddof=1):.2f}")
    print(f"  Non-participants: M={grp_no.mean():.2f}, SD={grp_no.std(ddof=1):.2f}")
    print(f"  t={t_value:.3f}, p={p_value:.4f}, d={effect:.3f}")

# Bonferroni: divide the 0.05 level by the number of item-level tests.
alpha_bonf = 0.05 / len(who_items)
n_significant = sum(1 for p_item in p_values if p_item < alpha_bonf)
print(f"\nBonferroni α = {alpha_bonf:.4f}")
print(f"Significant: {n_significant}/{len(p_values)}")

## Subgroup Analysis

In [None]:
# Repeat the participant vs non-participant comparison within each age group.
print("="*70)
print("SUBGROUP: AGE GROUPS")
print("="*70)

# Missing age labels are skipped: NaN cannot be sorted against string labels,
# and `df['age_group'] == NaN` would select no rows anyway.
for age in sorted(df['age_group'].dropna().unique()):
    print(f"\n{age}:")
    subset = df[df['age_group'] == age]

    for var, label in outcome_vars:
        # These must not be named `np`: this cell runs at module level, so
        # that would rebind the NumPy alias and break cohens_d (np.var,
        # np.sqrt) and every later cell that uses NumPy.
        in_prog = subset[subset['in_program']=='yes'][var].dropna()
        not_in_prog = subset[subset['in_program']=='no'][var].dropna()

        # Too few observations in either arm for a meaningful comparison.
        if len(in_prog) < 3 or len(not_in_prog) < 3:
            print(f"  {label}: insufficient n")
            continue

        t_stat, p_val = stats.ttest_ind(in_prog, not_in_prog)
        d = cohens_d(in_prog, not_in_prog)
        print(f"  {label}: t={t_stat:.2f}, p={p_val:.3f}, d={d:.2f}")

## Correlations

In [None]:
# Outcome columns whose pairwise Pearson correlations are reported.
corr_vars = ['who5_score', 'social_index', 'confidence_index']

print("="*70)
print("CORRELATIONS AMONG OUTCOMES")
print("="*70)

corr_matrix = df[corr_vars].corr()
print("\nCorrelation matrix:")
print(corr_matrix.round(3))

print("\nSignificance tests:")
for i, v1 in enumerate(corr_vars):
    for v2 in corr_vars[i + 1:]:
        # Drop rows missing EITHER variable in one step so the two series stay
        # aligned respondent-by-respondent. Dropping NaNs per column can leave
        # series of different lengths (pearsonr raises) or, worse, equal
        # lengths that pair values from different respondents.
        pair = df[[v1, v2]].dropna()
        r, p = stats.pearsonr(pair[v1], pair[v2])
        print(f"{v1} × {v2}: r={r:.3f}, p={p:.4f}")