In [5]:
#https://medium.com/@akelleh/causal-inference-with-pandas-dataframes-fc3e64fce5d
#https://github.com/akelleh/causality
#http://www.degeneratestate.org/posts/2018/Mar/24/causal-inference-with-python-part-1-potential-outcomes/
#https://causalinferenceinpython.org/#installation


#from __future__ import division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn styling for the figures in this notebook:
# "whitegrid" axes style and the "colorblind" colour palette.
sns.set_style("whitegrid")
sns.set_palette("colorblind")

%matplotlib inline

import datagenerators as dg
from causalinference import CausalModel
from causalinference.utils import random_data
# Draw data from causalinference's `random_data` helper and wrap it in a CausalModel.
# NOTE(review): Y, D, X are presumably outcome, treatment indicator and covariates
# -- confirm against the causalinference documentation.
# `causal` is not referenced by any of the cells visible in this part of the notebook.
Y, D, X = random_data()
causal = CausalModel(Y, D, X)

In [7]:
observed_data_0 = dg.generate_dataset_0()
observed_data_0.head()
# x is the treatment: x=1 means the person wears a cool hat, x=0 means no hat.
# y is the outcome: y=1 means productive, y=0 means not productive.

Unnamed: 0,x,y
0,1,0
1,1,0
2,1,0
3,0,0
4,1,0


In [8]:
# Naive estimate of the causal effect: compare the mean outcome y
# between the treated (x=1) and untreated (x=0) groups.
def estimate_uplift(ds, z=1.96):
    """
    Estimate the difference in mean outcome between treated and untreated samples.

    Parameters
    ----------
    ds: pandas.DataFrame
        a dataframe of samples with a treatment column `x` (0 = untreated,
        1 = treated) and an outcome column `y`.
    z: float, optional
        critical value used to scale the standard error of the difference
        into the half-width of a confidence interval. The default, 1.96, is
        the two-sided 95% critical value of the normal distribution.

    Returns
    -------
    estimated_uplift: dict[str, float] containing two items:
        "estimated_effect" - mean of `y` where x == 1 minus mean of `y` where x == 0.
        "standard_error" - `z` times the standard error of that difference,
            i.e. the half-width of the confidence interval around
            "estimated_effect" (a 95% interval with the default `z`). The key
            name is kept for backward compatibility even though the value is
            not the bare standard error.

    Notes
    -----
    If either group is empty (or has a single row) the corresponding mean or
    variance is NaN and the NaN propagates into the result.
    """
    base = ds[ds.x == 0]
    variant = ds[ds.x == 1]

    delta = variant.y.mean() - base.y.mean()

    # Variance of a difference of two independent sample means:
    # var(y | x=1) / n_1 + var(y | x=0) / n_0, using pandas' default
    # sample variance (ddof=1).
    variance_of_delta = (
        variant.y.var() / len(variant) +
        base.y.var() / len(base)
    )
    delta_err = z * np.sqrt(variance_of_delta)

    return {"estimated_effect": delta, "standard_error": delta_err}

estimate_uplift(observed_data_0)
# In the recorded output, people with cool hats (x=1) are about 15.6 percentage
# points less likely to be productive than people without (x=0).

{'estimated_effect': -0.1556256908954004,
 'standard_error': 0.08681671433619813}

In [9]:
from scipy.stats import chi2_contingency

# Counts of samples for every (x, y) combination. pd.crosstab reports 0 for
# combinations that never occur, so chi2_contingency always receives a
# complete table of counts rather than NaN entries.
contingency_table = pd.crosstab(observed_data_0["x"], observed_data_0["y"]).values

# lambda_="log-likelihood" selects the log-likelihood-ratio (G-test) statistic
# instead of Pearson's chi-squared statistic.
_, p, _, _ = chi2_contingency(contingency_table, lambda_="log-likelihood")

# p-value: below 0.05 is conventionally treated as statistically significant.
p

0.0006864081722202652

In [10]:
 
def run_ab_test(datagenerator, n_samples=10000, filter_=None):
    """
    Simulate an A/B test and estimate the average treatment effect.

    Generates n_samples from datagenerator with the value of X fixed by the
    experimenter, so that half of the samples receive treatment X=1 and the
    rest receive X=0 (with an odd n_samples the X=0 group gets the extra
    sample), then feeds the result into `estimate_uplift`.

    Parameters
    ----------
    datagenerator: callable
        called as `datagenerator(n_samples=..., set_X=...)`; its return value
        is passed to `estimate_uplift`, so it must provide `x` and `y` columns.
    n_samples: int
        total number of samples to generate.
    filter_: callable, optional
        if given, called with the generated dataset; its return value is used
        to index the dataset (`ds[filter_(ds)]`) before estimation.

    Returns
    -------
    effect: dict
        the output of `estimate_uplift` on the (optionally filtered) samples.

    Notes
    -----
    The assignment is a deterministic split (first block X=1, second block
    X=0), not a shuffle.
    NOTE(review): the estimate is only unbiased if `datagenerator` draws its
    samples independently of their position -- confirm against datagenerators.
    """
    n_samples_a = n_samples // 2
    n_samples_b = n_samples - n_samples_a
    set_X = np.concatenate([np.ones(n_samples_a), np.zeros(n_samples_b)]).astype(np.int64)
    ds = datagenerator(n_samples=n_samples, set_X=set_X)
    # Identity check: `!= None` would invoke a custom __ne__ on filter_.
    if filter_ is not None:
        ds = ds[filter_(ds)].copy()
    return estimate_uplift(ds)

run_ab_test(dg.generate_dataset_0)
# With X assigned by the experimenter the estimated effect changes sign: about
# +0.20 in the recorded output, versus about -0.16 from the observational data.

{'estimated_effect': 0.19679999999999997,
 'standard_error': 0.019218387269410702}