# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Use seaborn's 'whitegrid' style for the plots drawn below.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Read the survey CSV (path is relative, so it depends on the working
# directory) and report how many rows were loaded.
survey_csv = '../../1_data_collection/data/cleaned/girls_survey_clean.csv'
df = pd.read_csv(survey_csv)
print(f"n = {df.shape[0]}")
# Kept as the final expression so the first rows are shown as the cell output.
df.head()

## Sample Characteristics

In [None]:
# Counts per value of `in_program`, followed by the share of rows equal to 'yes'.
print("Program participation:")
print(df['in_program'].value_counts())

n_participants = (df['in_program'] == 'yes').sum()
n_rows = len(df)  # denominator is every row in the frame
participation_pct = n_participants / n_rows * 100
print(f"\nParticipation rate: {participation_pct:.1f}%")

In [None]:
# Table of row counts: one row per `in_program` value, one column per
# `age_group` value; combinations that never occur are filled with 0.
print("Age distribution:")
age_counts = df.groupby(['in_program', 'age_group']).size()
age_table = age_counts.unstack(fill_value=0)
print(age_table)

In [None]:
# Table of row counts: one row per `in_program` value, one column per
# `displacement_duration` value; combinations that never occur are filled with 0.
print("Displacement duration:")
duration_counts = df.groupby(['in_program', 'displacement_duration']).size()
duration_table = duration_counts.unstack(fill_value=0)
print(duration_table)

In [None]:
# Table of row counts: one row per `in_program` value, one column per
# `school` value; combinations that never occur are filled with 0.
print("School enrollment:")
school_counts = df.groupby(['in_program', 'school']).size()
school_table = school_counts.unstack(fill_value=0)
print(school_table)

## Descriptive Statistics

In [None]:
# Outcome columns summarised in this cell.
outcome_vars = ['who5_score', 'social_index', 'confidence_index']

# Row masks for the two participation groups.
is_participant = df['in_program'] == 'yes'
is_non_participant = df['in_program'] == 'no'

# describe() summary of the outcome columns, printed once per group.
print("Participants:")
print(df.loc[is_participant, outcome_vars].describe())

print("\nNon-participants:")
print(df.loc[is_non_participant, outcome_vars].describe())

## Distribution Analysis

In [None]:
# Overlaid histograms of each outcome variable for participants vs.
# non-participants, one panel per entry in `outcome_vars`.
n_panels = len(outcome_vars)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4))
# plt.subplots returns a bare Axes (not an array) when there is one panel.
axes = np.atleast_1d(axes)

for ax, var in zip(axes, outcome_vars):
    # Drop missing values so the shared bin edges below are computed from
    # finite data (np.histogram_bin_edges rejects a NaN range).
    participants = df.loc[df['in_program'] == 'yes', var].dropna()
    non_participants = df.loc[df['in_program'] == 'no', var].dropna()

    # Use one set of bin edges for both groups. Passing bins=15 to each
    # hist() call separately would derive the edges from each group's own
    # min/max, giving bars of different widths that cannot be compared.
    pooled = pd.concat([participants, non_participants])
    bin_edges = np.histogram_bin_edges(pooled.to_numpy(), bins=15)

    ax.hist(participants, alpha=0.6, label='Participants', bins=bin_edges)
    ax.hist(non_participants, alpha=0.6, label='Non-participants', bins=bin_edges)
    ax.set_xlabel(var.replace('_', ' ').title())
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()