In [12]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [13]:
# load data
data = pd.read_csv('effect_tb.csv',header = None)
data.columns = ["dt","user_id","label","strategy_id"]

# no need to use dt
data = data.drop(columns = "dt")
data.head(3)

Unnamed: 0,user_id,label,strategy_id
0,1,0,1
1,1000004,0,1
2,1000004,0,2


In [15]:
# table summary
data.describe()

Unnamed: 0,user_id,label,strategy_id
count,2645958.0,2645958.0,2645958.0
mean,3112995.0,0.01456297,1.395761
std,1828262.0,0.1197952,0.692048
min,1.0,0.0,1.0
25%,1526772.0,0.0,1.0
50%,3062184.0,0.0,1.0
75%,4721132.0,0.0,2.0
max,6265402.0,1.0,3.0


In [16]:
# distinct count of columns
data.nunique()

user_id        2410683
label                2
strategy_id          3
dtype: int64

In [17]:
data[data.duplicated(keep = False)].sort_values(by = ["user_id"])

Unnamed: 0,user_id,label,strategy_id
8529,1027,0,1
1485546,1027,0,1
1579415,1471,0,1
127827,1471,0,1
404862,2468,0,1
...,...,...,...
1382121,6264633,0,1
1382245,6264940,0,1
2575140,6264940,0,1
1382306,6265082,0,3


In [7]:
# drop duplicate
data = data.drop_duplicates()

# check if any duplicates left
data[data.duplicated(keep = False)]

Unnamed: 0,user_id,label,vehicle_id


In [18]:
data.pivot_table(index = "strategy_id", columns = "label", values = "user_id",
                aggfunc = "count", margins = True)

label,0,1,All
strategy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1892099,23936,1916035
2,406371,6307,412678
3,308955,8290,317245
All,2607425,38533,2645958


In [19]:
data.dtypes

user_id        int64
label          int64
strategy_id    int64
dtype: object

In [40]:
# Sample Size Calculator
# control group
data[data["strategy_id"] == 1]["label"].mean()
import math
from statsmodels.stats.power import TTestIndPower
effect_size = 0.02  # expected change
alpha = 0.05  # confidence
power = 0.8  # statistical power
ratio = 0.2  # sample size with 2 groups

analysis = TTestIndPower()
sample_size = analysis.solve_power(effect_size=effect_size, alpha=alpha, power=power, ratio=ratio)
sample_size = math.ceil(sample_size)  

print(f"min_sample_size: {sample_size}")

min_sample_size: 117735


In [21]:
data["strategy_id"].value_counts()
# Sample size > min sample size

strategy_id
1    1916035
2     412678
3     317245
Name: count, dtype: int64

In [28]:
print("Control Group: ", data[data["strategy_id"] == 1]["label"].mean())
print("Strategy 1: ", data[data["strategy_id"] == 2]["label"].mean())
print("Strategy 2: ", data[data["strategy_id"] == 3]["label"].mean())


Control Group:  0.012492464908000114
Strategy 1:  0.015283102079587475
Strategy 2:  0.026131223502340463


In [49]:

n_old = len(data[data.strategy_id == 1])  # control group
n_new = len(data[data.strategy_id == 3])  # strategy


c_old = len(data[data.strategy_id ==1][data.label == 1])
c_new = len(data[data.strategy_id ==3][data.label == 1])


r_old = c_old / n_old
r_new = c_new / n_new


r = (c_old + c_new) / (n_old + n_new)

print("total rate：", r)

total rate： 0.014429896833357214


  c_old = len(data[data.strategy_id ==1][data.label == 1])
  c_new = len(data[data.strategy_id ==3][data.label == 1])


In [57]:
# strategy 1
import statsmodels.stats.proportion as sp
z_score, p = sp.proportions_ztest([c_old, c_new],[n_old, n_new], alternative = "smaller")
print("z：",z_score,"，pvalue：", p)
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
effect_size = proportion_effectsize(c_old / n_old, c_new / n_new)
power_analysis = NormalIndPower()
power = power_analysis.solve_power(effect_size=effect_size, nobs1=n_old, alpha=0.05, ratio=n_new/n_old, alternative='smaller')
print("Statistical power:", power)

z： -59.66600946268368 ，pvalue： 0.0
Statistical power: 1.0


In [58]:
# strategy 1
z_score, p = sp.proportions_ztest([c_old, len(data[data.strategy_id ==2][data.label == 1])],[n_old, len(data[data.strategy_id == 2])], alternative = "smaller")
print("z：",z_score,"，pvalue：", p)

from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
effect_size = proportion_effectsize(c_old / n_old, c_new / n_new)
power_analysis = NormalIndPower()
power = power_analysis.solve_power(effect_size=effect_size, nobs1=n_old, alpha=0.05, ratio=n_new/n_old, alternative='smaller')
print("Statistical power:", power)

z： -14.362726203811503 ，pvalue： 4.433468512724253e-47
Statistical power: 1.0


  z_score, p = sp.proportions_ztest([c_old, len(data[data.strategy_id ==2][data.label == 1])],[n_old, len(data[data.strategy_id == 2])], alternative = "smaller")


In [61]:
import numpy as np
import statsmodels.stats.proportion as sp

# Generate random groupings
# Assuming control group data has strategy_id == 1
control_data = data[data.strategy_id == 1]

# Randomly split the control group into three groups
np.random.seed(42)  # To ensure consistent randomization
control_data['group'] = np.random.choice(['A1', 'A2', 'A3'], size=len(control_data), replace=True)
# Display the size of each random sample group
n_A1 = len(control_data[control_data.group == 'A1'])
n_A2 = len(control_data[control_data.group == 'A2'])
n_A3 = len(control_data[control_data.group == 'A3'])

print(f"Sample size of A1: {n_A1}")
print(f"Sample size of A2: {n_A2}")
print(f"Sample size of A3: {n_A3}")

# Select two groups for the A/A test, assuming we choose A1 and A2 for comparison
n_A1 = len(control_data[control_data.group == 'A1'])
n_A2 = len(control_data[control_data.group == 'A2'])

c_A1 = len(control_data[control_data.group == 'A1'][control_data.label == 1])
c_A2 = len(control_data[control_data.group == 'A2'][control_data.label == 1])

# Calculate the conversion rates for A1 and A2
r_A1 = c_A1 / n_A1
r_A2 = c_A2 / n_A2

# Calculate the total conversion rate
r_total = (c_A1 + c_A2) / (n_A1 + n_A2)
print("Total rate:", r_total)

# A/A test to verify whether there is a significant difference in conversion rates between the two groups
z_score, p_value = sp.proportions_ztest([c_A1, c_A2], [n_A1, n_A2], alternative="two-sided")

print(f"A/A Test - z-score: {z_score}, p-value: {p_value}")

# Interpret test results
if p_value > 0.05:
    print("A/A Test did not reject the null hypothesis, user segmentation is reasonable.")
else:
    print("A/A Test rejected the null hypothesis, user segmentation may have issues.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data['group'] = np.random.choice(['A1', 'A2', 'A3'], size=len(control_data), replace=True)


Sample size of A1: 638230
Sample size of A2: 638840
Sample size of A3: 638965


  c_A1 = len(control_data[control_data.group == 'A1'][control_data.label == 1])


Total rate: 0.012442544261473528
A/A Test - z-score: -2.5418134831300656, p-value: 0.011027900460504928
A/A Test rejected the null hypothesis, user segmentation may have issues.


  c_A2 = len(control_data[control_data.group == 'A2'][control_data.label == 1])


In [63]:
# A1 vs A3 
c_A3 = len(control_data[control_data.group == 'A3'][control_data.label == 1])

# Calculate the conversion rates for A1 and A3
r_A3 = c_A3 / n_A3

# Calculate the total conversion rate for A1 and A3
r_total_A1_A3 = (c_A1 + c_A3) / (n_A1 + n_A3)
print("Total rate A1 vs A3:", r_total_A1_A3)
r_A3 = c_A3 / n_A3

# Calculate the total conversion rate for A1 and A3
r_total_A1_A3 = (c_A1 + c_A3) / (n_A1 + n_A3)
print("Total rate A1 vs A3:", r_total_A1_A3)

# A/A test for A1 vs A3
z_score_A1_A3, p_value_A1_A3 = sp.proportions_ztest([c_A1, c_A3], [n_A1, n_A3], alternative="two-sided")

print(f"A/A Test A1 vs A3 - z-score: {z_score_A1_A3}, p-value: {p_value_A1_A3}")

# Interpret A1 vs A3 test results
if p_value_A1_A3 > 0.05:
    print("A/A Test A1 vs A3 did not reject the null hypothesis, user segmentation is reasonable.")
else:
    print("A/A Test A1 vs A3 rejected the null hypothesis, user segmentation may have issues.")

Total rate A1 vs A3: 0.012392782621291188
Total rate A1 vs A3: 0.012392782621291188
A/A Test A1 vs A3 - z-score: -2.038684138526655, p-value: 0.0414815617913717
A/A Test A1 vs A3 rejected the null hypothesis, user segmentation may have issues.


  c_A3 = len(control_data[control_data.group == 'A3'][control_data.label == 1])
