# Chapter 2 - Randomized Experiments and Stats Review

In [1]:
import numpy as np
import pandas as pd

# Loading Data

In [3]:
# CSV with the cross-sell email experiment data, hosted on GitHub
url = 'https://github.com/matheusfacure/causal-inference-in-python-code/blob/main/causal-inference-in-python/data/cross_sell_email.csv?raw=true'

treatment_column = "cross_sell_email"
control_group = "no_email"

df = pd.read_csv(url)
# Binary indicator (1 = control arm, 0 = any email arm), built with a
# vectorized comparison instead of a row-wise `apply`.
df["control"] = (df[treatment_column] == control_group).astype("int64")
df.tail()

Unnamed: 0,gender,cross_sell_email,age,conversion,control
318,0,long,18,0,0
319,1,no_email,16,0,1
320,0,no_email,15,0,1
321,1,no_email,16,0,1
322,1,long,24,1,0


# Analysis

Computing the mean conversion for each treatment group. In other words,

$$\mathbb{E}[Y | T = t_i]$$

In [4]:
# Per-arm average of every column, indexed by the treatment label
df_treatment_mean = df.groupby(treatment_column).mean()
df_treatment_mean

Unnamed: 0_level_0,gender,age,conversion,control
cross_sell_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
long,0.550459,21.752294,0.055046,0.0
no_email,0.542553,20.489362,0.042553,1.0
short,0.633333,20.991667,0.125,0.0


## Average Treatment Effect (ATE)

To compute the average treatment effect, we estimate the difference in mean outcome between each treatment group and the control group (with randomized assignment, this difference in means estimates the ATE):

$$ATE = \mathbb{E}[Y | T = t_{tr}] - \mathbb{E}[Y | T = t_{co}]$$

In [5]:
# Conversion rate of the control arm, used as the baseline for every row
control_conversion = df_treatment_mean.loc[control_group, "conversion"]
df_treatment_mean["conversion_control"] = control_conversion
# ATE per arm = arm conversion minus control conversion (0 on the control row)
df_treatment_mean["ate"] = df_treatment_mean["conversion"] - control_conversion
df_treatment_mean.sort_values(by="control", ascending=False)

Unnamed: 0_level_0,gender,age,conversion,control,conversion_control,ate
cross_sell_email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
no_email,0.542553,20.489362,0.042553,1.0,0.042553,0.0
long,0.550459,21.752294,0.055046,0.0,0.042553,0.012493
short,0.633333,20.991667,0.125,0.0,0.042553,0.082447


## Checking Randomization

Ideally, if randomization was done right, the feature variables should show no meaningful differences across treatment groups. That is:

- $gender_{tr} \approx gender_{co}$
- $age_{tr} \approx age_{co}$

A simple way to evaluate whether these differences are significant is to compute the **normalized differences between the treatment groups**:

$$\frac{\hat{\mu}_{tr} - \hat{\mu}_{co}}{\sqrt{\Big( \hat{\sigma}^2_{tr} + \hat{\sigma}^2_{co} \Big) / 2}}$$

and evaluate whether the absolute value of any of these differences is higher than 0.5 (_rule of thumb_).

In [6]:
X = ["gender", "age"]

# Group once and derive both moments from the same grouping
by_treatment = df.groupby(treatment_column)[X]
mu = by_treatment.mean()
var = by_treatment.var()

# Gap in means versus control, scaled by the square root of the averaged variances
mean_gap = mu - mu.loc[control_group]
pooled_std = np.sqrt((var + var.loc[control_group]) / 2)
norm_diff = mean_gap / pooled_std
norm_diff

Unnamed: 0_level_0,gender,age
cross_sell_email,Unnamed: 1_level_1,Unnamed: 2_level_1
long,0.015802,0.221423
no_email,0.0,0.0
short,0.184341,0.08737


In [7]:
norm_diff_threshold = 0.5
# Compare magnitudes: a large negative normalized difference is just as much
# an imbalance as a large positive one, so the sign must not hide it.
norm_diff.abs() > norm_diff_threshold

Unnamed: 0_level_0,gender,age
cross_sell_email,Unnamed: 1_level_1,Unnamed: 2_level_1
long,False,False
no_email,False,False
short,False,False


In [8]:
# Display the experiment DataFrame (the rendered output elides the middle rows)
df

Unnamed: 0,gender,cross_sell_email,age,conversion,control
0,0,short,15,0,0
1,1,short,27,0,0
2,1,long,17,0,0
3,1,long,34,0,0
4,1,no_email,14,0,1
...,...,...,...,...,...
318,0,long,18,0,0
319,1,no_email,16,0,1
320,0,no_email,15,0,1
321,1,no_email,16,0,1


# Standard Error of Estimates

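De Moivre's equation: the standard error of a sample mean is the standard deviation divided by the square root of the sample size.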

$$SE = \frac{\sigma}{\sqrt{n}}$$

In [9]:
def se(y: pd.Series) -> float:
    """Standard error of the mean of ``y``: sample std divided by sqrt(n).

    Args:
        y: Observations for a single group.

    Returns:
        ``y.std() / sqrt(n)``, where ``n`` is the number of non-missing
        observations in ``y``.
    """
    # `Series.std` skips NaN, so the divisor must count only non-NaN values;
    # `y.shape[0]` would include missing rows and understate the error.
    n_obs = y.count()
    return y.std() / np.sqrt(n_obs)

# Report the standard error of the conversion rate for each email variant
for treatment in df["cross_sell_email"].unique():
    conversions = df.loc[df["cross_sell_email"] == treatment, "conversion"]
    print(f"SE for {treatment} Email:", se(conversions))


SE for short Email: 0.030316953129541618
SE for long Email: 0.021946024609185506
SE for no_email Email: 0.020930611780338927


In [10]:
from scipy import stats

def ci(y: pd.Series, alpha=0.95) -> tuple[float, float]:
    """Normal-approximation confidence interval for the mean of ``y``.

    Args:
        y: Observations for a single group.
        alpha: Confidence level of the interval (e.g. ``0.95`` for a 95% CI).
            Note that this is the coverage, not the significance level.

    Returns:
        ``(lower, upper)`` bounds, computed as mean -/+ z * standard error.

    Raises:
        ValueError: If ``alpha`` is outside ``[0, 1]``; the normal quantile
            is undefined there and the bounds would silently become NaN.
    """
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must be a confidence level in [0, 1], got {alpha}")
    # Two-sided critical value: magnitude of the lower (1 - alpha) / 2 quantile
    z = np.abs(stats.norm.ppf((1 - alpha) / 2))
    exp_se = y.sem()
    exp_mu = y.mean()
    # Local name differs from the function name to avoid shadowing `ci`
    bounds = (exp_mu - z * exp_se, exp_mu + z * exp_se)
    return bounds

alpha = 0.95

# Confidence interval of the conversion rate for each email variant
for treatment in df["cross_sell_email"].unique():
    conversions = df.loc[df["cross_sell_email"] == treatment, "conversion"]
    print(f"{100*alpha}% CI for {treatment} Email:", ci(conversions, alpha))

95.0% CI for short Email: (0.06557986374510955, 0.18442013625489045)
95.0% CI for long Email: (0.012032453721799723, 0.09805928939746633)
95.0% CI for no_email Email: (0.0015299462255076238, 0.08357643675321577)


# Hypothesis Testing

## Difference of distributions

When two independent random variables each follow a normal distribution, their sum or difference is also normally distributed. For the difference, the mean and standard error are:

- $Mean_{diff}: \mu_1 - \mu_2$
- $SE_{diff}: \sqrt{SE_1^2 + SE_2^2}$

In [16]:
# Conversion outcomes for the two arms being compared (each selected once)
short_conversion = df.query("cross_sell_email == 'short'")["conversion"]
no_email_conversion = df.query("cross_sell_email == 'no_email'")["conversion"]

diff_mu = short_conversion.mean() - no_email_conversion.mean()
# SE of a difference: square root of the sum of the squared standard errors
diff_se = np.sqrt(short_conversion.sem()**2 + no_email_conversion.sem()**2)

# Named `diff_ci` rather than `ci` so the `ci()` function defined in an
# earlier cell is not overwritten by a tuple (which would break re-running it).
z_95 = 1.96  # two-sided 95% critical value of the standard normal
diff_ci = (diff_mu - z_95 * diff_se, diff_mu + z_95 * diff_se)

print(f"{95}% CI for the difference (short email - no email): {diff_ci}")


95% CI for the difference (short email - no email): (0.01023980847439844, 0.15465380854687816)
