In [8]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

from pathlib import Path
import random
import decimal

import pandas as pd
import numpy as np
import math

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf


# Detecting interactions between simultaneous A/B Tests

We will simulate a scenario with 2 simultaneous A/B tests, each with a real treatment effect, plus an interaction effect when users experience both treatments at the same time.

Treatment and interaction effects at population level:
- treatment 1 = 10
- treatment 2 = 5
- interaction (t1 * t2) = 5

In [9]:
# Seed NumPy's global RNG so the draws below are identical on every run
np.random.seed(42)

# Draw each of the four test-1 x test-2 cells from its own normal distribution
# (common standard deviation of 20). Each tuple is (population mean, sample
# size); the draws are made in the order listed.
group_aa, group_ab, group_ba, group_bb = (
    stats.norm.rvs(loc=mean, scale=20, size=size)
    for mean, size in (
        (100, 200),  # control 1 + control 2
        (105, 300),  # control 1 + treatment 2
        (110, 200),  # treatment 1 + control 2
        (120, 300),  # treatment 1 + treatment 2
    )
)

In [10]:
# Build one record per sampled value, tagged with 0/1 flags for each test.
# Records are emitted group by group, in the order aa, ab, ba, bb.
data = [
    {'test_1': in_test_1, 'test_2': in_test_2, 'values': value}
    for in_test_1, in_test_2, samples in (
        (0, 0, group_aa),
        (0, 1, group_ab),
        (1, 0, group_ba),
        (1, 1, group_bb),
    )
    for value in samples
]

# Turn the records into a long-format frame and show it
df = pd.DataFrame(data)
display(df)

Unnamed: 0,test_1,test_2,values
0,0,0,109.934283
1,0,0,97.234714
2,0,0,112.953771
3,0,0,130.460597
4,0,0,95.316933
...,...,...,...
995,1,1,114.377994
996,1,1,155.953731
997,1,1,132.816857
998,1,1,108.576420


If we analyze the tests separately, the interaction between the two treatments will bias our effect estimates, as we can see with the following analysis.

### Test 1 Analysis without controlling for interaction with Test 2
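Because of the interaction effect, our estimate of the effect of treatment 1 is biased upwards: the difference in group means below is about 13.5, versus a true effect of 10 for treatment 1 on its own.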

In [11]:
# Mean, standard deviation and sample count of the metric for each arm of
# test 1, pooling over test 2 assignment
group_stats = df.groupby(['test_1']).agg(
    mean=('values', 'mean'),
    std=('values', 'std'),
    count=('values', 'count'),
)
display(group_stats)

Unnamed: 0_level_0,mean,std,count
test_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,103.13676,19.873891,500
1,116.636522,20.470795,500


In [12]:
# Regress the metric on test 1 assignment alone (test 2 is left out of the model)
results = smf.ols(formula='values ~ test_1', data=df).fit()

# alpha=0.05 gives the [0.025, 0.975] interval columns in the summary table
model_summary = results.summary(alpha=0.05)
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 values   R-squared:                       0.101
Model:                            OLS   Adj. R-squared:                  0.100
Method:                 Least Squares   F-statistic:                     111.9
Date:                Sun, 14 Apr 2024   Prob (F-statistic):           7.22e-25
Time:                        23:04:09   Log-Likelihood:                -4422.4
No. Observations:                1000   AIC:                             8849.
Df Residuals:                     998   BIC:                             8859.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    103.1368      0.902    114.313      0.0

### Test 2 Analysis without controlling for interaction with Test 1
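Because of the interaction effect, our estimate of the effect of treatment 2 is also biased upwards: the difference in group means below is about 9.6, versus a true effect of 5 for treatment 2 on its own.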

In [13]:
# Mean, standard deviation and sample count of the metric for each arm of
# test 2, pooling over test 1 assignment
group_stats = df.groupby(['test_2']).agg(
    mean=('values', 'mean'),
    std=('values', 'std'),
    count=('values', 'count'),
)
display(group_stats)

Unnamed: 0_level_0,mean,std,count
test_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,104.136232,19.903885,400
1,113.720247,21.295475,600


In [14]:
# Regress the metric on test 2 assignment alone (test 1 is left out of the model)
results = smf.ols(formula='values ~ test_2', data=df).fit()

# alpha=0.05 gives the [0.025, 0.975] interval columns in the summary table
model_summary = results.summary(alpha=0.05)
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 values   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.048
Method:                 Least Squares   F-statistic:                     51.20
Date:                Sun, 14 Apr 2024   Prob (F-statistic):           1.61e-12
Time:                        23:04:09   Log-Likelihood:                -4450.5
No. Observations:                1000   AIC:                             8905.
Df Residuals:                     998   BIC:                             8915.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    104.1362      1.038    100.371      0.0

# Analysing Test 1 and Test 2 accounting for the interaction effect
This time, since we are controlling for the interaction effect, the estimated effects for both tests are closer to the real effects. We are also able to estimate the interaction effect that applies when a user is presented with both treatments at the same time.


In [15]:
# Mean, standard deviation and sample count of the metric for each of the
# four test 1 x test 2 combinations
group_stats = df.groupby(['test_1', 'test_2']).agg(
    mean=('values', 'mean'),
    std=('values', 'std'),
    count=('values', 'count'),
)
display(group_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
test_1,test_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,99.184581,18.620078,200
0,1,105.771546,20.273174,300
1,0,109.087883,19.958172,200
1,1,121.668949,19.254178,300


In [16]:
# Regress the metric on both tests; `test_1 * test_2` in the formula adds
# both main effects and their interaction term (three model terms in total)
results = smf.ols(formula='values ~ test_1 * test_2', data=df).fit()

# alpha=0.05 gives the [0.025, 0.975] interval columns in the summary table
model_summary = results.summary(alpha=0.05)
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:                 values   R-squared:                       0.154
Model:                            OLS   Adj. R-squared:                  0.152
Method:                 Least Squares   F-statistic:                     60.63
Date:                Sun, 14 Apr 2024   Prob (F-statistic):           5.26e-36
Time:                        23:04:10   Log-Likelihood:                -4391.6
No. Observations:                1000   AIC:                             8791.
Df Residuals:                     996   BIC:                             8811.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        99.1846      1.385     71.624

# References

https://vista.io/blog/detecting-interaction-effects-in-online-experimentation