In [11]:
import numpy as np
import pandas as pd


In [12]:
# Simulate a binary sample in which z influences both x and y.
p_z = 0.5                      # P(z = 1)
p_x_z = [0.9, 0.1]             # P(x = 1 | z), indexed by z
p_y_xz = [0.2, 0.4, 0.6, 0.8]  # P(y = 1 | x, z), indexed by x + 2*z

z = np.random.binomial(n=1, p=p_z, size=500)

# Per-row probability of x = 1, looked up from that row's z.
p_x = np.asarray(p_x_z)[z]
x = np.random.binomial(n=1, p=p_x, size=z.size)

# Per-row probability of y = 1, looked up from the combined (x, z) code.
p_y = np.asarray(p_y_xz)[x + 2 * z]
y = np.random.binomial(n=1, p=p_y, size=z.size)

# Only x and y go into the frame; z is not included.
# NOTE(review): a later cell rebinds the name `generate_dataset_0` to a function.
generate_dataset_0 = pd.DataFrame(dict(x=x, y=y))
generate_dataset_0

Unnamed: 0,x,y
0,0,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
495,1,0
496,1,1
497,0,1
498,0,0


In [13]:
def estimate_uplift(ds, z_score=1.96):
    """Estimate the difference in mean outcome between x == 1 and x == 0 rows.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame exposing the columns ``x`` (group indicator, compared against
        0 and 1) and ``y`` (outcome whose mean is compared).
    z_score : float, default 1.96
        Multiplier applied to the standard error of the difference. The
        default corresponds to the usual two-sided 95% normal interval;
        pass ``z_score=1.0`` to obtain the raw standard error.

    Returns
    -------
    dict
        ``"estimated_effect"``: mean of ``y`` where ``x == 1`` minus mean of
        ``y`` where ``x == 0``.
        ``"standard_error"``: ``z_score`` times the standard error of that
        difference. Despite the key name this is the *scaled* value (an
        interval half-width under the default); the key is kept unchanged
        for backward compatibility.
    """
    control = ds[ds.x == 0]
    treated = ds[ds.x == 1]

    effect = treated.y.mean() - control.y.mean()

    # Variance of a difference of two independent sample means, with each
    # group's own sample variance (no pooling across groups).
    variance_of_diff = treated.y.var() / len(treated) + control.y.var() / len(control)

    return {
        "estimated_effect": effect,
        "standard_error": z_score * np.sqrt(variance_of_diff),
    }

# Difference in mean y between the x == 1 and x == 0 rows of the sample built above.
estimate_uplift(generate_dataset_0)

{'estimated_effect': -0.1609101535603913, 'standard_error': 0.0866593277954973}

In [14]:
from scipy.stats import chi2_contingency

# Observed counts for every (x, y) combination. pd.crosstab reports 0 for a
# combination that never occurs, whereas counting through a pivot_table sum
# yields NaN there, which is not a valid count for the test below.
contingency_table = pd.crosstab(
    generate_dataset_0["x"], generate_dataset_0["y"]
).values

# lambda_="log-likelihood" selects the log-likelihood ratio (G-test) statistic
# instead of Pearson's chi-squared statistic.
_, p, _, _ = chi2_contingency(contingency_table, lambda_="log-likelihood")
# p-value
p

0.0004378489315638042

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
def generate_dataset_0(n_samples=500, set_X=None, show_z=False, random_state=None):
    """Draw a binary (x, y) sample in which a binary variable z influences both.

    Parameters
    ----------
    n_samples : int, default 500
        Number of rows to generate.
    set_X : sequence of 0/1 values, optional
        When given, used as the ``x`` column as-is instead of drawing ``x``
        from a distribution that depends on ``z``. Must have ``n_samples``
        entries.
    show_z : bool, default False
        When true, the returned frame also contains the ``z`` column.
    random_state : int, numpy RandomState/Generator, or None, default None
        Source of randomness. ``None`` draws from the global ``np.random``
        stream (the historical behaviour); an object exposing ``binomial``
        is used directly; anything else seeds a fresh
        ``np.random.RandomState``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` (plus ``z`` when ``show_z`` is true).

    Raises
    ------
    AssertionError
        If ``set_X`` is given and its length differs from ``n_samples``.
    """
    if random_state is None:
        rng = np.random  # module-level functions share the global stream
    elif hasattr(random_state, "binomial"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    p_z = 0.5                      # P(z = 1)
    p_x_z = [0.9, 0.1]             # P(x = 1 | z), indexed by z
    p_y_xz = [0.2, 0.4, 0.6, 0.8]  # P(y = 1 | x, z), indexed by x + 2*z

    z = rng.binomial(n=1, p=p_z, size=n_samples)
    if set_X is not None:
        assert len(set_X) == n_samples, "set_X must contain exactly n_samples entries"
        x = set_X
    else:
        p_x = np.choose(z, p_x_z)
        x = rng.binomial(n=1, p=p_x, size=n_samples)

    # Encode the (x, z) pair as a single index 0..3 into p_y_xz.
    p_y = np.choose(x + 2 * z, p_y_xz)
    y = rng.binomial(n=1, p=p_y, size=n_samples)

    if show_z:
        return pd.DataFrame({"x": x, "y": y, "z": z})
    return pd.DataFrame({"x": x, "y": y})

In [22]:
def run_ab_test(datagenerator, n_samples=10000, filter_=None):
    """Generate data with x assigned by the caller and estimate the uplift.

    Parameters
    ----------
    datagenerator : callable
        Called as ``datagenerator(n_samples=..., set_X=...)``; must return a
        frame accepted by ``estimate_uplift``.
    n_samples : int, default 10000
        Total number of rows. The first ``n_samples // 2`` rows get
        ``x = 1`` and the remaining rows get ``x = 0``.
    filter_ : callable, optional
        When given, called with the generated frame; its result is used to
        index the frame (``ds[filter_(ds)]``) before estimation.

    Returns
    -------
    dict
        Whatever ``estimate_uplift`` returns for the (optionally filtered)
        frame.
    """
    # Integer floor division avoids the float round-trip of int(n / 2).
    n_samples_a = n_samples // 2
    n_samples_b = n_samples - n_samples_a
    set_X = np.concatenate([np.ones(n_samples_a), np.zeros(n_samples_b)]).astype(np.int64)

    ds = datagenerator(n_samples=n_samples, set_X=set_X)
    # Identity check: `!= None` would dispatch to a custom __ne__ if present.
    if filter_ is not None:
        ds = ds[filter_(ds)].copy()
    return estimate_uplift(ds)

# Estimate the effect when x is assigned by run_ab_test instead of being drawn from z.
run_ab_test(generate_dataset_0)

{'estimated_effect': 0.195, 'standard_error': 0.019224927128366914}

### Correlation doesn’t imply causality. 
- X and Y are random variables, and we want to measure how forcing X to take a certain value changes the distribution of Y.
- We call the procedure of forcing a variable to take a certain value an *intervention*.
    - P(Y|do(X))
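- Let’s introduce two *potential outcome* random variables, Y0 and Y1: the value Y would take if X were forced to 0 or to 1, respectively. These random variables cannot both be observed directly for the same unit.
- The observed Y is defined in terms of them: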
    - Y=Y1 when X=1
    - Y=Y0 when X=0

Using these potential outcomes shifts the problem from one about how distributions change under the intervention to one about independent and identically distributed (i.i.d.) data with missing values: for each unit, only one of Y0 and Y1 is observed.

- ATE = E[Y1−Y0], the “Average Treatment Effect”
- ATT = E[Y1−Y0|X=1], the “Average Treatment effect on the Treated”
- ATC = E[Y1−Y0|X=0], the “Average Treatment effect on the Control”
