In [19]:
import pandas as pd
import plotly.express as px

df = pd.read_csv('ab_test_data.csv')
type_correct = df['converted'].dtype #int64
converting = df.groupby('group')['converted'].sum()
count_people = df.groupby('group')['converted'].count()
part_of_converting = round(converting / count_people * 100,2)
print(f"Group A: {part_of_converting['A']}% converted (N={converting['A']} out of {count_people['A']})")
print(f"Group B: {part_of_converting['B']}% converted (N={converting['B']} out of {count_people['B']})")
def ok_fail(converting, count_people):
    if converting['A'] >= 5 or count_people['A'] >= 5:
        return 'OK'
    elif converting['B'] >= 5 or count_people['B'] >= 5:
        return 'OK'
    else:
        return 'FAIL'
print(f"Assumption check for z-test: \nGroup A: n*p= {converting['A']}, n*(1-p)= {count_people['A'] - converting['A']} -> {ok_fail(converting, count_people)} \nGroup B: n*p= {converting['B']}, n*(1-p)= {count_people['B'] - converting['B']} -> {ok_fail(converting, count_people)}")

# Variance
variance = round(df.groupby('group')['converted'].var(),2)
variance_relation_A = variance['A'].max() / variance['A'].min()
variance_relation_B = variance['B'].max() / variance['B'].min()
def variance_high(variance_relation_A, variance_relation_B):
    if variance_relation_A > 2:
        return 'High variance imbalance'
    elif variance_relation_B > 2:
        return 'High variance imbalance'
    else:
        return 'Balance'
print(f"Variable: \nGroup A: {variance['A']}, our variable is {variance_relation_A} -> {variance_high(variance_relation_A, variance_relation_B)} \nGroup B: {variance['B']}, our variable is {variance_relation_B} -> {variance_high(variance_relation_A, variance_relation_B)}")

results = pd.DataFrame({'group': ['A', 'B'], 'converted': [converting['A'], converting['B']]})

fig = px.bar(results, x='group', y='converted', title='Conversion Rate by Group',
    labels={'conversion_rate': 'Conversion Rate', 'group': 'Test Group'})
fig.write_html('ab_test_conversion_rates.html')

def get_recommendation(converting, count_people, variance_relation_A, variance_relation_B):
    assumption_ok = ok_fail(converting, count_people)
    converting_balance = variance_high(variance_relation_A, variance_relation_B)
    if assumption_ok == 'OK' and converting_balance  == 'Balance':
        return 'Not recommendation'
    else:
        return 'Proceed with z-test / Use non-parametric test / Collect more data'


print(f"=== A/B TEST READINESS REPORT === \n- Data quality: OK \n- z-test assumptions: {ok_fail(converting, count_people)} \n- Variance balance: {variance_high(variance_relation_A, variance_relation_B)} \n-Recommendation: {get_recommendation(converting, count_people, variance_relation_A, variance_relation_B)}")

Group A: 8.35% converted (N=424 out of 5076)
Group B: 9.59% converted (N=472 out of 4924)
Assumption check for z-test: 
Group A: n*p= 424, n*(1-p)= 4652 -> OK 
Group B: n*p= 472, n*(1-p)= 4452 -> OK
Variable: 
Group A: 0.08, our variable is 1.0 -> Balance 
Group B: 0.09, our variable is 1.0 -> Balance
=== A/B TEST READINESS REPORT === 
- Data quality: OK 
- z-test assumptions: OK 
- Variance balance: Balance 
-Recommendation: Not recommendation
