In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy import stats
from analyzer import ABTestAnalyzer


In [2]:

fake_data = pd.read_csv('fake_data.csv')
fake_data

Unnamed: 0,user_id,group,revenue_pre,revenue_exp,volume_pre,clicks_exp
0,cf7f6389-95e8-44b1-88d9-71a784d0eea3,control,126.174822,131.495757,125.447071,13.0
1,6b129f09-fb5f-42df-bde3-a0daf9476c31,control,95.278575,105.803968,95.423081,10.0
2,f5ce5d40-a918-482f-bd4a-27ade2af0d97,treatment,101.397369,101.787418,99.848352,10.0
3,82592e1a-b6b7-42bf-a625-1fc138f92d3e,control,68.855613,56.851970,71.326561,5.0
4,549b14a8-fc08-4224-bf8e-931a7a50e6ff,control,112.974435,101.960805,112.344642,10.0
...,...,...,...,...,...,...
1958,074587da-a60a-4017-aa27-6d4c316efe71,treatment,80.804420,73.944887,79.248900,7.0
1959,d1a843de-a1db-479f-9e23-65867cc57120,treatment,54.919574,79.652109,54.698111,7.0
1960,c9e0f7e5-6c22-4434-9168-131e97841d9b,treatment,158.438671,141.914387,157.195166,14.0
1961,c69dd04c-80c5-4c09-b961-ff188eb22a20,control,119.798094,101.634467,120.691774,10.0


# Overall Test

In [5]:
results = ABTestAnalyzer.analyze(
    fake_data,
    success_metrics = ['revenue_exp'],
    group_col = 'group',
    X = ['revenue_pre'],
    mode = 'abtest'
)
results

{'revenue_exp': {'Mean': {'treatment': 101.74291867913796,
   'control': 99.42347070906241},
  'Variance': {'treatment': 1001.451554877565, 'control': 930.5085475845752},
  'Uplift': {'pcnt': 2.3328978092736476, 'absolute': 2.3194479700755437},
  'Significance': {'p-value': np.float64(0.0984783049357988),
   'ci_95': ConfidenceInterval(low=np.float64(-5.071216089846748), high=np.float64(0.432320149695689)),
   'ci_90': ConfidenceInterval(low=np.float64(-4.628469304398749), high=np.float64(-0.010426635752311064)),
   'ci_80': ConfidenceInterval(low=np.float64(-4.118226780655929), high=np.float64(-0.5206691594951294))},
  'Variance reduction': {'before': np.float64(966.6706102787891),
   'after': np.float64(966.6706102787891),
   'reduction_ratio': np.float64(0.0)},
  'Skewness': {}}}

# Test CUPED transformations.
Edge Cases:
- 1 cov 1 succ
- n cov 1 succ
- 1 cov n succ
- n cov n succ


In [4]:
fake_data

Unnamed: 0,user_id,group,revenue_pre,revenue_exp,volume_pre,clicks_exp
0,cf7f6389-95e8-44b1-88d9-71a784d0eea3,control,126.174822,131.495757,125.447071,13.0
1,6b129f09-fb5f-42df-bde3-a0daf9476c31,control,95.278575,105.803968,95.423081,10.0
2,f5ce5d40-a918-482f-bd4a-27ade2af0d97,treatment,101.397369,101.787418,99.848352,10.0
3,82592e1a-b6b7-42bf-a625-1fc138f92d3e,control,68.855613,56.851970,71.326561,5.0
4,549b14a8-fc08-4224-bf8e-931a7a50e6ff,control,112.974435,101.960805,112.344642,10.0
...,...,...,...,...,...,...
1958,074587da-a60a-4017-aa27-6d4c316efe71,treatment,80.804420,73.944887,79.248900,7.0
1959,d1a843de-a1db-479f-9e23-65867cc57120,treatment,54.919574,79.652109,54.698111,7.0
1960,c9e0f7e5-6c22-4434-9168-131e97841d9b,treatment,158.438671,141.914387,157.195166,14.0
1961,c69dd04c-80c5-4c09-b961-ff188eb22a20,control,119.798094,101.634467,120.691774,10.0


In [5]:
# CUPED with 1 cov and 1 succ

#INPUT
data = fake_data
pre_exp = ['revenue_pre']
exp = ['revenue_exp']

# get 'training' data
pre_exp_df = data[pre_exp]

# get demeaned 'training' data
pre_exp_demeaned_df = pre_exp_df - np.mean(pre_exp_df)

if len(pre_exp) == 1:
    pre_exp_demeaned_df = pre_exp_demeaned_df.values.reshape(-1, 1)

# get experiment data
exp_df = data[[*exp]]

#train theta
theta_model = LinearRegression(fit_intercept=False)
theta_model.fit(pre_exp_df, exp_df)

# cuped transform
cuped_exp_df = exp_df - theta_model.coef_[0] * (pre_exp_demeaned_df)

print(cuped_exp_df)

      revenue_exp
0      105.088363
1      110.212705
2      100.093227
3       87.615154
4       88.719569
...           ...
1958    92.790248
1959   124.315196
1960    83.326806
1961    81.587266
1962   124.796280

[1963 rows x 1 columns]


In [6]:
# CUPED with 2 cov and 1 succ

# This works with 1 -n cov

## INPUT
data = fake_data
pre_exp = ['revenue_pre', 'volume_pre']
exp = ['revenue_exp']

## CUPED transform parameters

# Legend
# n: number of individuals in population. (randomisation units)
# k: number of pre_exp dimensions
# w: number of success metrics

#1 get 'training' data and its mean.
pre_exp_df = data[pre_exp] # n x k pd df
pre_exp_df_mean = np.mean(pre_exp_df) # k x 1 pd df

#2 get experiment data
exp_df = data[[*exp]] # n x w pd df

#3 get theta vector.
theta_model = LinearRegression(fit_intercept=False)
theta_model.fit(pre_exp_df, exp_df)
theta = theta_model.coef_.flatten() # k x 1 (k,) np arr

# if len(pre_exp) == 1:
#     pre_exp_demeaned_df = pre_exp_demeaned_df.values.reshape(-1, 1)

## cuped transform of success metric

#1: Center the pre-experiment covariates
# n x k - k x 1 --> n x k (pandas broadcasted substraction)
pre_exp_df_centered = pre_exp_df - pre_exp_df_mean 

#2: Compute the CUPED adjustment
# dot product n x k @ k x 1 --> n x 1 (Series)
cuped_adjustment = pre_exp_df_centered @ theta 

#3: Positionally substract adjustment from intra-experiment success metrics
# n x 1 - n x 1 --> n x 1
exp_cuped = exp_df.sub(cuped_adjustment, axis=0)
exp_cuped


Unnamed: 0,revenue_exp
0,105.044744
1,110.225841
2,99.996424
3,87.779400
4,88.682370
...,...
1958,92.693118
1959,124.304768
1960,83.249564
1961,81.648909


In [7]:
# CUPED with 2 cov and 2 succ

## INPUT
data = fake_data
pre_exp_covariates = ['revenue_pre', 'volume_pre']
exp = ['revenue_exp', 'clicks_exp']
group_col = 'group'
randomisation_col = 'user_id'


# init
cuped_transformed_metrics = data[[randomisation_col, group_col]]

for success_metric in exp:

    ### Obtain CUPED parameters ###
    # n: number of individuals in population. (randomisation units)
    # k: number of pre_exp dimensions

    pre_exp_df = data[pre_exp_covariates]                       # n x k
    pre_exp_df_mean = np.mean(pre_exp_df)                       # k x 1 
    exp_df = data[success_metric]                               # n x 1 
    theta_model = LinearRegression(fit_intercept=False)
    theta_model.fit(pre_exp_df, exp_df)
    theta = theta_model.coef_.flatten()                         # k x 1 

    ### Compute CUPED transform ###

    #1: Center the pre-experiment covariates
    pre_exp_df_centered = pre_exp_df - pre_exp_df_mean          # n x k

    #2: Compute the CUPED adjustment
    cuped_adjustment = pre_exp_df_centered @ theta              # n x 1

    #3: Substract adjustment
    exp_cuped = exp_df.sub(cuped_adjustment, axis=0)            # n x 1

    #4 Add transformed col to cuped_exp_df
    cuped_transformed_metrics[success_metric] = exp_cuped

    ### Report Group Stats ###
    grouped = cuped_transformed_metrics.groupby(group_col)[success_metric]
    means = grouped.mean().to_dict()
    vars = grouped.var().to_dict()
    uplift_abs = means['treatment'] - means['control']
    uplift_pct = (
        uplift_abs * 100 / means['control']
        if means['control'] > 0 else None
    )

    control_exp = cuped_transformed_metrics.loc[cuped_transformed_metrics[group_col] == 'control', success_metric] 
    treatment_exp = cuped_transformed_metrics.loc[cuped_transformed_metrics[group_col] == 'treatment', success_metric]
    ttest_result = stats.ttest_ind(control_exp, treatment_exp, equal_var=False)

    ci95 = ttest_result.confidence_interval()

    sample_size_reduction = cuped_transformed_metrics[success_metric].var()*100/ data[success_metric].var()
    print(sample_size_reduction, "%")
    print(ttest_result.pvalue)

22.37391259423004 %
0.0007731144094472288
22.269406205585586 %
0.0002282255679638374


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cuped_transformed_metrics[success_metric] = exp_cuped
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cuped_transformed_metrics[success_metric] = exp_cuped


# Test GBOOST CUPED