# causaldata Package Demo

**Author:** Dili Maduabum  
**Date:** December 2024  
**Package:** causaldata v0.1.0

This notebook demonstrates the main features of the `causaldata` package for simulating correlated mixed-type data.

In [None]:
# Setup - make the local package importable, then load it and the plotting stack.
# The sys.path tweak has to run before `from causaldata import ...` below.
import sys
sys.path.insert(0, '..')  # Add parent directory for local import (searched first)

from causaldata import MixedSimulator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the plots drawn in later cells
sns.set_style('whitegrid')
print("Setup complete!")

## Example 1: Basic Usage

Generate correlated data with mixed types: continuous income, binary treatment, and ordinal education.

In [None]:
# Build a simulator (n=1000) with a continuous, a binary and an ordinal variable
sim = MixedSimulator(n=1000)

# Variable definitions
sim.add_continuous("income", mean=50000, std=15000, min_val=0)
sim.add_binary("treated", prob=0.3)
education_levels = ["HS", "College", "Grad"]
education_probs = [0.3, 0.5, 0.2]
sim.add_ordinal("education", levels=education_levels, probs=education_probs)

# Requested pairwise correlations, applied in the order listed
for first, second, rho in [
    ("income", "education", 0.5),
    ("treated", "education", 0.2),
]:
    sim.set_correlation(first, second, rho)

# Draw the sample with a fixed seed
data = sim.generate(seed=42)

# Report the size, then show the first rows and the summary statistics
n_rows = len(data)
print(f"Generated {n_rows} observations\n")
for view in (data.head(10), data.describe()):
    display(view)

## Example 2: Visualize Income by Education

In [None]:
# Box plot of income within each education level, drawn on an explicit Axes
order = ["HS", "College", "Grad"]  # fixes the left-to-right order of the boxes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x="education", y="income", order=order, ax=ax)
ax.set_title("Income Distribution by Education Level\n(Correlation = 0.5)", fontsize=14)
ax.set_ylabel("Income ($)")
ax.set_xlabel("Education Level")
fig.tight_layout()
plt.show()

# Per-level mean income, highest first
mean_income = data.groupby("education")["income"].mean()
print("\nMean income by education level:")
print(mean_income.sort_values(ascending=False))

## Example 3: Correlation Preservation

Verify that specified correlations are preserved in the generated data.

In [None]:
# Compare requested correlations with those observed in the generated data.
# The targets live in a single mapping so that the simulator setup, the
# printed summary and the scatter plots cannot drift out of sync.
variables = ["x1", "x2", "x3"]
specified = {
    ("x1", "x2"): 0.7,
    ("x1", "x3"): -0.4,
    ("x2", "x3"): 0.3,
}

# Three continuous variables, each with mean 0 and std 1
sim2 = MixedSimulator(n=5000)
for name in variables:
    sim2.add_continuous(name, mean=0, std=1)
for (left, right), rho in specified.items():
    sim2.set_correlation(left, right, rho)

data2 = sim2.generate(seed=607)

# Compute the observed correlation matrix once and reuse it below
observed = data2[variables].corr()

# Compare specified vs. observed correlations
print("Specified correlations:")
for (left, right), rho in specified.items():
    # Width-4 right alignment lines up positive and negative values
    print(f"  {left} ↔ {right}: {rho:>4}")
print("\nObserved correlations:")
print(observed.round(3))

# Visualize: one scatter panel per variable pair
fig, axes = plt.subplots(1, len(specified), figsize=(15, 4))

# First panel keeps matplotlib's default colour; the others are explicit
panel_styles = [{}, {"color": "red"}, {"color": "green"}]
for ax, (left, right), style in zip(axes, specified, panel_styles):
    ax.scatter(data2[left], data2[right], alpha=0.3, s=10, **style)
    ax.set_title(f"{left} vs {right} (ρ = {observed.loc[left, right]:.3f})")
    ax.set_xlabel(left)
    ax.set_ylabel(right)

plt.tight_layout()
plt.show()

## Example 4: Method Chaining

Build simulators using a clean, fluent API.

In [None]:
# The whole simulation - variables, correlation and sampling - as one chained expression
year_levels = ["Freshman", "Sophomore", "Junior", "Senior"]
student_data = (
    MixedSimulator(n=500)
    .add_continuous("gpa", mean=3.2, std=0.5, min_val=0, max_val=4)
    .add_binary("scholarship", prob=0.4)
    .add_ordinal("year", levels=year_levels, probs=[0.25] * len(year_levels))
    .set_correlation("gpa", "scholarship", 0.6)
    .generate(seed=42)
)

print("Student data generated with method chaining:")
display(student_data.head())

# GPA distribution for each scholarship status, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(data=student_data, x="scholarship", y="gpa", ax=ax)
ax.set_title("GPA Distribution by Scholarship Status")
# Fix the tick positions before replacing their labels
ax.set_xticks([0, 1])
ax.set_xticklabels(["No Scholarship", "Scholarship"])
fig.tight_layout()
plt.show()

## Example 5: Simulating Treatment Effects with Confounding

Demonstrate how to create realistic causal inference scenarios.

In [None]:
# Simulate a treatment correlated with a confounder, add a known treatment
# effect, and compare a naive group difference with a regression-adjusted one.
from sklearn.linear_model import LinearRegression

TRUE_EFFECT = 10  # effect added to the outcome of treated units (the ATE)

# Create confounded treatment assignment
sim5 = MixedSimulator(n=2000)
sim5.add_continuous("confounder", mean=50, std=10)
sim5.add_binary("treated", prob=0.5)
sim5.add_continuous("baseline_outcome", mean=100, std=15)

# Confounder is correlated with both treatment and outcome
sim5.set_correlation("confounder", "treated", 0.6)
sim5.set_correlation("confounder", "baseline_outcome", 0.7)
# NOTE(review): no correlation is set between "treated" and
# "baseline_outcome". If MixedSimulator treats unspecified pairs as
# uncorrelated, the naive estimate below would be roughly unbiased and
# adjusting for the confounder would move the estimate away from the truth.
# Confirm the library default; a target near 0.6 * 0.7 = 0.42 may be intended.

data5 = sim5.generate(seed=2024)

# Add true treatment effect
data5['outcome'] = data5['baseline_outcome'] + TRUE_EFFECT * data5['treated']

# Naive comparison: groupby sorts the keys, so the last difference is
# mean(outcome | treated) - mean(outcome | control)
naive_effect = data5.groupby('treated')['outcome'].mean().diff().iloc[-1]

# Simple regression adjustment: outcome ~ treated + confounder
model = LinearRegression()
X = data5[['treated', 'confounder']]
y = data5['outcome']
model.fit(X, y)
controlled_effect = model.coef_[0]  # coefficient on 'treated' (first column of X)

# Results
print("Treatment Effect Estimation:")
print(f"  True effect:        {TRUE_EFFECT:.1f}")
print(f"  Naive (biased):     {naive_effect:.2f}")
print(f"  Adjusted (better):  {controlled_effect:.2f}")

# Visualize confounding
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Confounder by treatment
sns.violinplot(data=data5, x='treated', y='confounder', ax=axes[0])
axes[0].set_title('Confounder Distribution by Treatment Status')
# Fix tick positions before relabelling, as done for the right-hand panel;
# set_xticklabels alone relies on whatever ticks happen to be present
axes[0].set_xticks([0, 1])
axes[0].set_xticklabels(['Control', 'Treated'])

# Right: Outcome by treatment, colored by confounder
scatter = axes[1].scatter(data5['treated'], data5['outcome'],
                          c=data5['confounder'], cmap='viridis', alpha=0.5, s=20)
axes[1].set_xlabel('Treatment Status')
axes[1].set_ylabel('Outcome')
axes[1].set_title('Outcome by Treatment (colored by Confounder)')
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['Control', 'Treated'])
plt.colorbar(scatter, ax=axes[1], label='Confounder')

plt.tight_layout()
plt.show()

## Summary

This demo showed:

✅ **Basic functionality** - Mixed-type variable generation  
✅ **Correlation control** - Specified correlations preserved  
✅ **Bounded variables** - Min/max constraints work  
✅ **Clean API** - Method chaining for readable code  
✅ **Causal inference** - Realistic confounding scenarios  
✅ **Reproducibility** - Seed control for consistent results  

For more information:
- **GitHub:** https://github.com/dmaduabum/causaldata
- **Related issue (statsmodels):** https://github.com/statsmodels/statsmodels/issues/9603