In [43]:
from scipy import stats
import numpy as np
import pandas as pd
from scipy.stats import t, ttest_ind, chisquare
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import multipletests

# Hypothesis Test for difference in proportions between two groups
This notebook provides code to perform Hypothesis Test for difference in proportions between two groups using the Z-Test for proportions.

The code below is define the functions necessary to perform the hypothesis test for difference in proportions between two groups (control vs treatment), check for sample ratio mismatch and make p-value adjustments in case of an experiment with multiple treatment variatns (ieg ABC test).

Inputs:

*   dataframe with the summary statistics of the test
  *   variant name
  *   number of sucesses (ieg: users that converted)
  *   total sample size (ieg: total users)


Outputs:
*   dataframe with the test results for each variant



In [44]:
# Function 1: perform a z-test for proportions from summary statistics and returns z and p values
def proportion_z_test(success_variant, users_variant, success_control, users_control, hypothesis):

    # Number of successes (number of events in each group)
    count = np.array([success_variant, success_control])

    # Total number of observations (total number in each group)
    nobs = np.array([users_variant, users_control])

    # Z Test
    z, p_value = proportions_ztest(count, nobs, alternative=hypothesis, value=0)

    return z,p_value



# Function 2: returns if the results are statistically significant or not, based on informed level of significance (alpha)
def confidence(p, alpha):
    if p < alpha:
        return ['Statsig',1-round(p,4)]
    else:
        return ['Inconclusive', np.nan]



# Function 3: tests for Sample Ratio Mismatch
def srm_check(df, users_col):

    observed = df[users_col].tolist() # observed users per variation
    expected = [ df[users_col].sum()/ df[users_col].size]* df[users_col].size # expected users per variation

    chi = chisquare(f_obs=observed, f_exp=expected).pvalue # p-value of the one-way chisquare test (null hypothesis is that the data has the expected frequencies)

    if chi < 0.01 :
        return f'WARNING: Possible Sample Ratio Mismatch (p-value: {chi})'
    else:
        return 'SRM Check: No evidence of Sample Ratio Mismatch'



# Function 4: adjust p-values for multiple testing (when we have more then 1 treatment variant)
def multiple_tests_proportions(df, p_values, alpha):

    if df.index.size > 2:
        adj_pvalues = multipletests(df[p_values].dropna(), method='holm')[1]
        adj_pvalues = np.append(adj_pvalues, np.nan)
        adj_pvalues = adj_pvalues.tolist()
        df['p_value_adj'] = adj_pvalues
        df['adj_significance'] = np.where(df['p_value_adj'] < alpha, 'Statsig', np.where(pd.isna(df['p_value_adj'])==True,np.nan,'Inconclusive'))

        columns = ["variant","p_value", "p_value_adj", "adj_significance", "value","users","conversion", "lift", "ic_inf_perc", "ic_sup_perc"]
        updated_table = df.reindex(columns=columns)

    else:
        columns = ["variant","p_value","significance","value","users","conversion", "lift", "ic_inf_perc", "ic_sup_perc"]
        updated_table = df.reindex(columns=columns)

    return updated_table



# Function 5: returns a dataframe with the results of the test
def df_proportion_z_test(df, variant_col, trials_col, success_col, alpha, hypothesis):

# selects the information from the control variant and calculates conversion rate
    row_control = list(df[df[variant_col] == 'control'].iterrows())[0][1]
    conversion_control = (row_control[success_col]/row_control[trials_col])

# creates dictionary that will be used to create our results table
    records = [dict(
        variant='control',
        value=row_control[success_col],
        users=row_control[trials_col],
        conversion="%.2f%%" % (conversion_control * 100),
        significance=np.nan,
       # confid=np.nan,
        lift=np.nan
    )]

# iterates over the treatment variants (if there is more then one treatment variant)
    for row in df[df[variant_col] != 'control'].iterrows():
        row_variant = row[1]

# calculates p-value
        z, p_value = proportion_z_test(
            row_variant[success_col], row_variant[trials_col],
            row_control[success_col], row_control[trials_col],
            hypothesis
        )

# test for significance and estimates lift
        conversion_variant = (row_variant[success_col]/row_variant[trials_col])
        lift = (conversion_variant/conversion_control) - 1
        se = np.sqrt(conversion_variant * (1 - conversion_variant) / row_variant[trials_col] + conversion_control * (1 - conversion_control) / row_control[trials_col])
        cv_control = np.sqrt(conversion_control * (1 - conversion_control) / row_control[trials_col]) / conversion_control
        cv_variant = np.sqrt(conversion_variant * (1 - conversion_variant) / row_variant[trials_col]) / conversion_variant

        if hypothesis == 'two-sided':
            moe = (se * stats.norm.ppf(1-alpha+(alpha)/2)) / conversion_control # margin of error
            ic_inf = lift - moe
            ic_sup = lift + moe
            ic_inf = "%.2f%%" % (ic_inf * 100)
            ic_sup = "%.2f%%" % (ic_sup * 100)

            # calculate confidence interval of the percentage change using the delta method
            ic_inf_perc = (lift+1) * (1 - stats.norm.ppf(1-alpha+(alpha)/2) * np.sqrt(cv_control**2 + cv_variant**2 - (1.96**2) * cv_control**2 * cv_variant**2))/(1-1.96* cv_control**2)-1
            ic_sup_perc = (lift+1) * (1 + stats.norm.ppf(1-alpha+(alpha)/2) * np.sqrt(cv_control**2 + cv_variant**2 - (1.96**2) * cv_control**2 * cv_variant**2))/(1-1.96* cv_control**2)-1
            ic_inf_perc = "%.2f%%" % (ic_inf_perc * 100)
            ic_sup_perc = "%.2f%%" % (ic_sup_perc * 100)

        elif hypothesis == 'larger':
            moe = (se * stats.norm.ppf(1-alpha)) / conversion_control
            ic_inf = lift - moe
            ic_inf= "%.2f%%" % (ic_inf * 100)
            ic_sup = np.inf

            # calculate confidence interval of the percentage change using the delta method
            ic_inf_perc = (lift+1) * (1 - stats.norm.ppf(1-alpha) * np.sqrt(cv_control**2 + cv_variant**2 - (1.96**2) * cv_control**2 * cv_variant**2))/(1-1.96* cv_control**2)-1
            ic_sup_perc = np.inf
            ic_inf_perc = "%.2f%%" % (ic_inf_perc * 100)

        else:
            moe = (se * stats.norm.ppf(1-alpha)) / conversion_control
            ic_sup = lift + moe
            ic_sup = "%.2f%%" % (ic_sup * 100)
            ic_inf= np.inf

            # calculate confidence interval of the percentage change using the delta method
            ic_sup_perc = (lift+1) * (1 + stats.norm.ppf(1-alpha) * np.sqrt(cv_control**2 + cv_variant**2 - (1.96**2) * cv_control**2 * cv_variant**2))/(1-1.96* cv_control**2)-1
            ic_inf_perc = np.inf
            ic_sup_perc = "%.2f%%" % (ic_sup_perc * 100)

# Append the results of the variants in the dictionary with the initial data
        records.append(dict(
            variant=row_variant[variant_col],
            value=row_variant[success_col],
            users=row_variant[trials_col],
            conversion= "%.2f%%" % (conversion_variant * 100),
            brute_lift=lift,
            lift="%.2f%%" % (lift * 100),
            significance=confidence(p_value,alpha)[0],
           # confid=confidence(p_value,alpha)[1],
            p_value= round(p_value,3),
            moe = "%.2f%%" % (moe * 100),
            ic_inf = ic_inf,
            ic_sup = ic_sup,
            ic_sup_perc = ic_sup_perc,
            ic_inf_perc = ic_inf_perc
        ))

        table = pd.DataFrame.from_records(records)\
          .sort_values(by="brute_lift", ascending=False)\
          [["variant","p_value","significance","value","users","conversion", "lift", "ic_inf_perc", "ic_sup_perc"]]

    return table


# Function 6: combine the functions above into the final function
def complete_proportion_test(df, variant_col, trials_col, success_col, alpha, hypothesis):

    # Execute test of proportions, return the table with the results of the test
    table = df_proportion_z_test(df, variant_col, trials_col, success_col, alpha, hypothesis)

    # Test for inconsistent population
    srm = srm_check(df, trials_col)

    # Adjust for multiple tests
    updated_table = multiple_tests_proportions(table, 'p_value',alpha)

    return display(srm, updated_table)



# Example 1 - A/B Test

To ilustrate the use of the function defined above, we will create a summary table with the successes and sample sizes of each variant in our experiment.

We also need to define the significance level and the type of hypothesis we are testing (two-sided, larger or smaller).

The significance level is how much we want to limit our chance of incurring a type I error (false positive), usually it's set to 0.05.

The types of hypothesis are:
* Two-sided: the null hypothesis is that 'there is no difference between the treatment and the control'
* Larger:  the null hypothesis is that 'the control is not larger then the treatment'
* Smaller: the null hypothesis is that 'the control is not smaller then the treatment'

In [45]:
# Create a dictionary with the data
data = {
    'variant': ['control', 'treatment'],
    'successes': [360, 400],
    'trials': [1000, 1000]
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Display the DataFrame
display(df)


Unnamed: 0,variant,successes,trials
0,control,360,1000
1,treatment,400,1000


In [46]:
complete_proportion_test(df, 'variant', 'trials', 'successes', alpha=0.05, hypothesis='two-sided')

'SRM Check: No evidence of Sample Ratio Mismatch'

Unnamed: 0,variant,p_value,significance,value,users,conversion,lift,ic_inf_perc,ic_sup_perc
1,treatment,0.065,Inconclusive,400,1000,40.00%,11.11%,-0.99%,23.99%
0,control,,,360,1000,36.00%,,,


In [47]:
complete_proportion_test(df, 'variant', 'trials', 'successes', alpha=0.05, hypothesis='larger')

'SRM Check: No evidence of Sample Ratio Mismatch'

Unnamed: 0,variant,p_value,significance,value,users,conversion,lift,ic_inf_perc,ic_sup_perc
1,treatment,0.033,Statsig,400,1000,40.00%,11.11%,1.02%,inf
0,control,,,360,1000,36.00%,,,


## Example 2 - A/B/C Test
In this example the treatment variants are tested against the control variant, and the p-value is corrected for multiple testing.

In [57]:
# Create a dictionary with the data
data = {
    'variant': ['control', 'treatment A', 'treatment B'],
    'successes': [200, 245, 120],
    'trials': [1000, 1025, 800]
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Display the DataFrame
display(df)


Unnamed: 0,variant,successes,trials
0,control,200,1000
1,treatment A,245,1025
2,treatment B,120,800


In [58]:
complete_proportion_test(df, 'variant', 'trials', 'successes', alpha=0.05, hypothesis='two-sided')



Unnamed: 0,variant,p_value,p_value_adj,adj_significance,value,users,conversion,lift,ic_inf_perc,ic_sup_perc
1,treatment A,0.034,0.034,Statsig,245,1025,23.90%,19.51%,0.62%,40.29%
2,treatment B,0.006,0.012,Statsig,120,800,15.00%,-25.00%,-39.93%,-8.89%
0,control,,,,200,1000,20.00%,,,


In [59]:
complete_proportion_test(df, 'variant', 'trials', 'successes', alpha=0.05, hypothesis='smaller')



Unnamed: 0,variant,p_value,p_value_adj,adj_significance,value,users,conversion,lift,ic_inf_perc,ic_sup_perc
1,treatment A,0.983,0.983,Inconclusive,245,1025,23.90%,19.51%,inf,37.10%
2,treatment B,0.003,0.006,Statsig,120,800,15.00%,-25.00%,inf,-11.38%
0,control,,,,200,1000,20.00%,,,
