In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from app.spark_context import get_serverless_spark_session
# Obtain the session object from the project helper; it is used further down
# through `spark.sql(...)` to run the SQL query.
spark = get_serverless_spark_session()

In [3]:
# Load the SQL text of the example query from the project tree.
# The encoding is pinned so the read does not depend on the machine's locale
# default (which differs between platforms).
with open(
    './projects/ab-test-analysis/sqls/example_continuous_response.sql',
    'r',
    encoding='utf-8',
) as file:
    sql_query = file.read()

In [None]:
# Show the SQL text that was just read; print() keeps its line breaks readable.
print(sql_query)

In [5]:
# Run the query through the Spark session and convert the result to a pandas
# DataFrame for the local analysis below.
df = spark.sql(sqlQuery=sql_query).toPandas()

In [None]:
# Preview the first rows only instead of rendering the whole query result.
df.head()

In [7]:
from ab_test_analysis.analysis import split_test_control_groups
# Column passed as the group column, and the value passed as the test-group name.
group_colname = "non_random_group"
test_group_name = "a"
# Split the query result into two frames via the project helper.
# NOTE(review): presumably rows whose group equals `test_group_name` land in
# `df_test` and all remaining rows in `df_control` -- confirm against the helper.
df_test, df_control = split_test_control_groups(
    df_ab_test=df,
    group_colname=group_colname,
    test_group_name=test_group_name
)

In [None]:
# Preview the first rows of the test-group frame instead of rendering all of it.
df_test.head()

In [10]:
# Name of the response column analysed in this notebook.
response_name = "amount_spent"
# Response values of each group, cast to float before the numeric computations.
response_test = df_test[response_name].astype(float)
response_control = df_control[response_name].astype(float)

In [11]:
import numpy as np
import pandas as pd
def compute_cohen_d(response_test: pd.Series, response_control: pd.Series) -> float:
    """Return Cohen's d for two samples, using the pooled standard deviation.

    The numerator is ``response_control.mean() - response_test.mean()``, so a
    positive result means the control mean is the larger of the two.

    Args:
        response_test: Observations from the test group.
        response_control: Observations from the control group.

    Returns:
        The mean difference (control minus test) divided by the pooled
        standard deviation of both samples.
    """
    n_control = len(response_control)
    n_test = len(response_test)

    # Each sample variance is weighted by its degrees of freedom (n - 1).
    weighted_variance_sum = (n_control - 1) * response_control.var() + (n_test - 1) * response_test.var()
    pooled_std = np.sqrt(weighted_variance_sum / (n_control + n_test - 2))

    mean_gap = response_control.mean() - response_test.mean()
    return mean_gap / pooled_std

In [None]:
def get_cohen_d_explanation(
    d: float
) -> str:
    """Translate a Cohen's d value into a qualitative effect-size label.

    The labels follow Cohen's rules of thumb as extended by Sawilowsky (2009).
    Each benchmark is the magnitude at which its label starts to apply:
    0.01 "very small", 0.2 "small", 0.5 "medium", 0.8 "large",
    1.2 "very large", 2.0 "huge". Magnitudes below 0.01 are labelled
    "negligible". Only the magnitude of ``d`` is used; its sign is ignored.

    Args:
        d: Cohen's d value; may be negative.

    Returns:
        The effect-size label, or "undefined" when ``d`` is NaN.
    """
    abs_d = abs(d)

    # NaN compares False against every bound, so without this guard it would
    # silently be reported with one of the regular labels.
    if abs_d != abs_d:
        return "undefined"

    # Ordered from the largest benchmark down; the first one reached wins.
    benchmarks = (
        (2.0, "huge"),
        (1.2, "very large"),
        (0.8, "large"),
        (0.5, "medium"),
        (0.2, "small"),
        (0.01, "very small"),
    )
    for lower_bound, effect_size in benchmarks:
        if abs_d >= lower_bound:
            return effect_size
    return "negligible"

In [None]:
def get_cohen_d_result(
    response_control: pd.Series,
    response_test: pd.Series,
) -> str:
    """Build a one-sentence summary of Cohen's d for two response samples.

    Args:
        response_control: Observations from the control group.
        response_test: Observations from the test group.

    Returns:
        A sentence giving Cohen's d rounded to three decimals together with
        the qualitative label returned by ``get_cohen_d_explanation``.
    """
    # Keyword arguments keep the two samples from being swapped, because this
    # function and compute_cohen_d list them in opposite order.
    d_value = compute_cohen_d(
        response_test=response_test,
        response_control=response_control,
    )
    label = get_cohen_d_explanation(d=d_value)
    return f"Cohen's d = {d_value:.3f}, implying that the effect size is {label}"

In [38]:
from scipy import stats
# Rebuild the per-group response series from the response column chosen
# earlier (`response_name`) instead of repeating its literal name here.
response_test = df_test[response_name].astype(float)
response_control = df_control[response_name].astype(float)
# Mann-Whitney U test on the two response samples. The alternative is spelled
# out so the result matches the two-sided t-test below and does not depend on
# the default of the installed SciPy version.
mwu_test_result = stats.mannwhitneyu(
    response_test,
    response_control,
    alternative="two-sided",
)


In [None]:
from ab_test_analysis.visualisation import plot_histogram
# Plot the response per group, reusing the column names defined earlier so
# that changing them in one place keeps every analysis step in sync.
fig_histogram = plot_histogram(
    df_ab_test=df,
    group_colname=group_colname,
    response_colname=response_name
)

fig_histogram.show()

In [60]:
from ab_test_analysis.analysis import get_ttest_result
# Two-sided t-test via the project helper, driven by the same column and
# group names as the earlier split instead of repeated string literals.
t_test_result = get_ttest_result(
    df_ab_test=df,
    group_colname=group_colname,
    response_colname=response_name,
    test_group_name=test_group_name,
    alternative_hypothesis="two-sided"
)

In [None]:
# Display the object returned by get_ttest_result as the cell output.
t_test_result