# Girls Wellbeing Analysis

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings

# Suppress precision loss warnings (occur when data has very low variance)
warnings.filterwarnings('ignore', message='Precision loss occurred in moment calculation')

In [2]:
df = pd.read_csv('../../1_data_collection/data/cleaned/girls_survey_clean.csv')
print(f"Total: n={len(df)}")
print(f"Participants: n={(df['in_program']=='yes').sum()}")
print(f"Non-participants: n={(df['in_program']=='no').sum()}")

Total: n=102
Participants: n=79
Non-participants: n=23


In [3]:
def cohens_d(group1, group2):
    """Calculate Cohen's d effect size"""
    n1, n2 = len(group1), len(group2)
    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
    pooled_std = np.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
    return (np.mean(group1) - np.mean(group2)) / pooled_std

def cohens_d_ci(group1, group2, confidence=0.95):
    """Calculate CI for Cohen's d"""
    from scipy.stats import t as t_dist
    n1, n2 = len(group1), len(group2)
    d = cohens_d(group1, group2)
    
    var_d = (n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2))
    se_d = np.sqrt(var_d)
    
    alpha = 1 - confidence
    df = n1 + n2 - 2
    t_crit = t_dist.ppf(1 - alpha/2, df)
    
    return d - t_crit * se_d, d + t_crit * se_d

## Primary Outcomes

In [4]:
outcome_vars = [
    ('who5_score', 'WHO-5 Score'),
    ('social_index', 'Social Connection'),
    ('confidence_index', 'Confidence')
]

print("="*70)
print("GROUP COMPARISONS: PARTICIPANTS VS NON-PARTICIPANTS")
print("="*70)

for var, label in outcome_vars:
    participants = df[df['in_program']=='yes'][var].dropna()
    non_participants = df[df['in_program']=='no'][var].dropna()
    
    m1, sd1 = participants.mean(), participants.std(ddof=1)
    m2, sd2 = non_participants.mean(), non_participants.std(ddof=1)
    
    t_stat, p_val = stats.ttest_ind(participants, non_participants)
    df_val = len(participants) + len(non_participants) - 2
    
    d = cohens_d(participants, non_participants)
    ci_lower, ci_upper = cohens_d_ci(participants, non_participants)
    
    print(f"\n{label}")
    print("-" * 70)
    print(f"Participants:     M={m1:.2f}, SD={sd1:.2f}, n={len(participants)}")
    print(f"Non-participants: M={m2:.2f}, SD={sd2:.2f}, n={len(non_participants)}")
    print(f"t({df_val})={t_stat:.3f}, p={p_val:.4f}")
    print(f"Cohen's d={d:.3f}, 95% CI [{ci_lower:.3f}, {ci_upper:.3f}]")

GROUP COMPARISONS: PARTICIPANTS VS NON-PARTICIPANTS

WHO-5 Score
----------------------------------------------------------------------
Participants:     M=82.13, SD=10.79, n=79
Non-participants: M=40.70, SD=3.75, n=23
t(100)=18.044, p=0.0000
Cohen's d=4.275, 95% CI [3.518, 5.033]

Social Connection
----------------------------------------------------------------------
Participants:     M=4.31, SD=0.52, n=79
Non-participants: M=1.99, SD=0.16, n=23
t(100)=21.063, p=0.0000
Cohen's d=4.990, 95% CI [4.153, 5.828]

Confidence
----------------------------------------------------------------------
Participants:     M=4.16, SD=0.61, n=79
Non-participants: M=2.07, SD=0.14, n=23
t(100)=16.376, p=0.0000
Cohen's d=3.880, 95% CI [3.165, 4.595]


## WHO-5 Component Analysis

In [5]:
who_items = [
    ('who_cheerful', 'Cheerful and in good spirits'),
    ('who_calm', 'Calm and relaxed'),
    ('who_active', 'Active and vigorous'),
    ('who_rested', 'Fresh and rested'),
    ('who_interested', 'Interesting daily life')
]

p_values = []

print("="*70)
print("WHO-5 COMPONENTS")
print("="*70)

for var, label in who_items:
    participants = df[df['in_program']=='yes'][var].dropna()
    non_participants = df[df['in_program']=='no'][var].dropna()
    
    m1, sd1 = participants.mean(), participants.std(ddof=1)
    m2, sd2 = non_participants.mean(), non_participants.std(ddof=1)
    
    t_stat, p_val = stats.ttest_ind(participants, non_participants)
    d = cohens_d(participants, non_participants)
    
    p_values.append(p_val)
    
    print(f"\n{label}")
    print(f"  Participants: M={m1:.2f}, SD={sd1:.2f}")
    print(f"  Non-participants: M={m2:.2f}, SD={sd2:.2f}")
    print(f"  t={t_stat:.3f}, p={p_val:.4f}, d={d:.3f}")

alpha_bonf = 0.05 / len(who_items)
print(f"\nBonferroni α = {alpha_bonf:.4f}")
print(f"Significant: {sum(p < alpha_bonf for p in p_values)}/{len(p_values)}")

WHO-5 COMPONENTS

Cheerful and in good spirits
  Participants: M=4.13, SD=0.65
  Non-participants: M=2.17, SD=0.39
  t=13.728, p=0.0000, d=3.253

Calm and relaxed
  Participants: M=3.90, SD=0.65
  Non-participants: M=2.00, SD=0.00
  t=13.910, p=0.0000, d=3.296

Active and vigorous
  Participants: M=4.48, SD=0.53
  Non-participants: M=1.83, SD=0.39
  t=22.399, p=0.0000, d=5.307

Fresh and rested
  Participants: M=3.80, SD=0.63
  Non-participants: M=1.91, SD=0.29
  t=13.933, p=0.0000, d=3.301

Interesting daily life
  Participants: M=4.23, SD=0.60
  Non-participants: M=2.26, SD=0.45
  t=14.600, p=0.0000, d=3.459

Bonferroni α = 0.0100
Significant: 5/5


  res = hypotest_fun_out(*samples, **kwds)


## Subgroup Analysis

In [None]:
print("="*70)
print("SUBGROUP: AGE GROUPS")
print("="*70)

for age in sorted(df['age_group'].unique()):
    print(f"\n{age}:")
    subset = df[df['age_group'] == age]
    
    for var, label in outcome_vars:
        participants_sub = subset[subset['in_program']=='yes'][var].dropna()
        non_participants_sub = subset[subset['in_program']=='no'][var].dropna()
        
        if len(participants_sub) < 3 or len(non_participants_sub) < 3:
            print(f"  {label}: insufficient n")
            continue
        
        t_stat, p_val = stats.ttest_ind(participants_sub, non_participants_sub)
        d = cohens_d(participants_sub, non_participants_sub)
        print(f"  {label}: t={t_stat:.2f}, p={p_val:.3f}, d={d:.2f}")

In [None]:
print("="*70)
print("SUBGROUP: DISPLACEMENT DURATION")
print("="*70)

for duration in sorted(df['displacement_duration'].unique()):
    print(f"\n{duration}:")
    subset = df[df['displacement_duration'] == duration]
    
    for var, label in outcome_vars:
        participants_sub = subset[subset['in_program']=='yes'][var].dropna()
        non_participants_sub = subset[subset['in_program']=='no'][var].dropna()
        
        if len(participants_sub) < 3 or len(non_participants_sub) < 3:
            print(f"  {label}: insufficient n")
            continue
        
        t_stat, p_val = stats.ttest_ind(participants_sub, non_participants_sub)
        d = cohens_d(participants_sub, non_participants_sub)
        print(f"  {label}: t={t_stat:.2f}, p={p_val:.3f}, d={d:.2f}")

In [None]:
print("="*70)
print("SUBGROUP: SCHOOL ENROLLMENT")
print("="*70)

for school_status in sorted(df['school'].unique()):
    print(f"\n{school_status}:")
    subset = df[df['school'] == school_status]
    
    for var, label in outcome_vars:
        participants_sub = subset[subset['in_program']=='yes'][var].dropna()
        non_participants_sub = subset[subset['in_program']=='no'][var].dropna()
        
        if len(participants_sub) < 3 or len(non_participants_sub) < 3:
            print(f"  {label}: insufficient n")
            continue
        
        t_stat, p_val = stats.ttest_ind(participants_sub, non_participants_sub)
        d = cohens_d(participants_sub, non_participants_sub)
        print(f"  {label}: t={t_stat:.2f}, p={p_val:.3f}, d={d:.2f}")

## Correlations

In [9]:
corr_vars = ['who5_score', 'social_index', 'confidence_index']

print("="*70)
print("CORRELATIONS AMONG OUTCOMES")
print("="*70)

corr_matrix = df[corr_vars].corr()
print("\nCorrelation matrix:")
print(corr_matrix.round(3))

print("\nSignificance tests:")
for i in range(len(corr_vars)):
    for j in range(i+1, len(corr_vars)):
        v1, v2 = corr_vars[i], corr_vars[j]
        r, p = stats.pearsonr(df[v1].dropna(), df[v2].dropna())
        print(f"{v1} × {v2}: r={r:.3f}, p={p:.4f}")

CORRELATIONS AMONG OUTCOMES

Correlation matrix:
                  who5_score  social_index  confidence_index
who5_score             1.000         0.986             0.989
social_index           0.986         1.000             0.985
confidence_index       0.989         0.985             1.000

Significance tests:
who5_score × social_index: r=0.986, p=0.0000
who5_score × confidence_index: r=0.989, p=0.0000
social_index × confidence_index: r=0.985, p=0.0000


## Football Perception Items (Participants Only)

In [10]:
football_items = [
    ('fb_happy', 'Playing football makes me happy'),
    ('fb_forget_worries', 'Football helps me forget worries'),
    ('fb_feel_stronger', 'Football makes me feel stronger'),
    ('fb_recommend', 'I would recommend football to friends')
]

participants_only = df[df['in_program']=='yes']

print("="*70)
print("FOOTBALL PERCEPTION ITEMS (PARTICIPANTS ONLY)")
print("="*70)
print(f"n = {len(participants_only)}\n")

for var, label in football_items:
    data = participants_only[var].dropna()
    
    mean_val = data.mean()
    sd_val = data.std(ddof=1)
    n_val = len(data)
    
    # Calculate 95% CI for mean
    se = sd_val / np.sqrt(n_val)
    ci_lower = mean_val - 1.96 * se
    ci_upper = mean_val + 1.96 * se
    
    # Count agreement (assuming 4-5 on 5-point scale is agreement)
    if data.max() <= 5:  # If it's a 5-point scale
        agree_pct = (data >= 4).sum() / n_val * 100
        print(f"{label}")
        print(f"  M={mean_val:.2f}, SD={sd_val:.2f}, 95% CI [{ci_lower:.2f}, {ci_upper:.2f}]")
        print(f"  Agreement (4-5): {agree_pct:.1f}%\n")
    else:
        print(f"{label}")
        print(f"  M={mean_val:.2f}, SD={sd_val:.2f}, 95% CI [{ci_lower:.2f}, {ci_upper:.2f}]\n")

FOOTBALL PERCEPTION ITEMS (PARTICIPANTS ONLY)
n = 79



AttributeError: 'Series' object has no attribute 'sqrt'