This notebook shows how NormalPowerAnalysis and PowerAnalysis calculators give similar powers for a switchback experiment

In [1]:
from datetime import date

import numpy as np
from cluster_experiments import PowerAnalysis, ConstantPerturbator, BalancedClusteredSplitter, ExperimentAnalysis, ClusteredOLSAnalysis, NormalPowerAnalysis
import pandas as pd



# Create fake data
N = 10_000
clusters = [f"Cluster {i}" for i in range(10)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 15)]
df = pd.DataFrame(
    {
        "cluster": np.random.choice(clusters, size=N),
        "date": np.random.choice(dates, size=N),
    }
).assign(
    # Target is a linear combination of cluster and day of week, plus some noise
    cluster_id=lambda df: df["cluster"].astype("category").cat.codes,
    day_of_week=lambda df: pd.to_datetime(df["date"]).dt.dayofweek,
    target=lambda df: df["cluster_id"] + df["day_of_week"] + np.random.normal(size=N),
)


In [2]:
df.head()

Unnamed: 0,cluster,date,cluster_id,day_of_week,target
0,Cluster 8,2022-01-07,8,4,13.096328
1,Cluster 5,2022-01-10,5,0,4.356023
2,Cluster 2,2022-01-14,2,4,7.487109
3,Cluster 6,2022-01-05,6,2,7.35639
4,Cluster 9,2022-01-01,9,5,13.38889


Some clusters have a higher average outcome than others

In [30]:
cluster_cols = ["cluster", "date"]

splitter = BalancedClusteredSplitter(
    cluster_cols=cluster_cols,
)

perturbator = ConstantPerturbator()

analysis = ClusteredOLSAnalysis(
    cluster_cols=cluster_cols,
)

alpha = 0.05
n_simulations = 100
n_simulations_normal = 10

# Simulated power analysis, we use clustered splitter and ols clustered analysis
pw_simulated = PowerAnalysis(
    splitter=splitter,
    perturbator=perturbator,
    alpha=alpha,
    n_simulations=n_simulations,
    analysis=analysis,
)

# Normal power analysis, uses Central limit theorem to estimate power, and needs less simulations
pw_normal = NormalPowerAnalysis(
    splitter=splitter,
    alpha=alpha,
    n_simulations=n_simulations,
    analysis=analysis,
)


In [31]:
# power line for simulated
pw_simulated_line = pw_simulated.power_line(df, average_effects=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3])

In [32]:
# power line for normal
pw_normal_line = pw_normal.power_line(df, average_effects=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3])
pw_normal_line

{0.05: 0.10677146772831803,
 0.1: 0.28420471420327353,
 0.15: 0.5488103982168475,
 0.2: 0.7929723292605043,
 0.25: 0.9346005373723081,
 0.3: 0.9862779767444964}

In [33]:
pd.DataFrame(
    {
        "Average effect": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        "Simulated power": pw_simulated_line.values(),
        "Normal power": pw_normal_line.values(),
    }
)

Unnamed: 0,Average effect,Simulated power,Normal power
0,0.05,0.04,0.106771
1,0.1,0.07,0.284205
2,0.15,0.04,0.54881
3,0.2,0.02,0.792972
4,0.25,0.07,0.934601
5,0.3,0.04,0.986278


In [29]:
pw_simulated.power_analysis(df, average_effect=1)

1.0

In [27]:
pw_normal.power_analysis(df, average_effect=0.1)

0.28469297656183246