# Chapter 2: A/B testing: Evaluating a modification of your system 

In [None]:
import numpy as np
import scipy
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
from e4e import E4E

# Helper from the e4e module, configured for chapter 2; used below for plot
# colors, annotation styling, reference lines, and saving figures.
e4e = E4E(chapter=2)

## 2.1	Run an ad hoc experiment

### 2.1.1	Simulate the trading system

In [None]:
# Listing 2.1 Simulate the trading system
def trading_system(exchange):
    """Simulate the execution cost of a single trade on ``exchange``.

    The cost is a fixed per-exchange base (12 for "ASDAQ", 10 for "BYSE")
    plus one standard-normal draw from NumPy's global random generator.

    Args:
        exchange: Exchange name, either "ASDAQ" or "BYSE".

    Returns:
        The simulated execution cost as a float.

    Raises:
        ValueError: If ``exchange`` is not one of the known exchanges.
    """
    if exchange == "ASDAQ":
        execution_cost = 12
    elif exchange == "BYSE":
        execution_cost = 10
    else:
        # Without this branch an unknown name fell through and surfaced as a
        # confusing UnboundLocalError on the next line.
        raise ValueError(f"unknown exchange: {exchange!r}")
    execution_cost += np.random.normal()
    return execution_cost

In [None]:
# Seed NumPy's global RNG so the simulated cost below is reproducible.
np.random.seed(17)
trading_system("ASDAQ")

### 2.1.2	Compare execution costs

In [None]:
# One simulated trade per exchange, drawn consecutively from the same seeded
# random stream.
np.random.seed(17)
print(trading_system("ASDAQ"))
print(trading_system("BYSE"))

In [None]:
# Same single-trade comparison, repeated with a different seed.
np.random.seed(18)
print(trading_system("ASDAQ"))
print(trading_system("BYSE"))

#### Variation

In [None]:
np.random.seed(17)
# 1000 simulated execution costs per exchange (all ASDAQ draws come first).
a = np.array([trading_system("ASDAQ") for _ in range(1000)])
b = np.array([trading_system("BYSE") for _ in range(1000)])
# Overlay the two distributions as 25-bin histograms on the same axes.
plt.hist(a, 25, color=e4e.color_1)
plt.hist(b, 25, color=e4e.color_2)
plt.legend(["ASDAQ", "BYSE"])
plt.xlabel("execution cost (mips)")
# Save through the e4e helper; the argument is presumably the figure number.
e4e.save_fig(2)

In [None]:
# Indices where the BYSE sample is cheaper than the ASDAQ sample at the same
# position, then the fraction of all positions for which that holds.
i = np.where(b < a)[0]
len(i) / len(b)

In [None]:
# Mean of 100 simulated costs per exchange, starting from a fixed seed.
np.random.seed(17)
print(np.array([trading_system("ASDAQ") for _ in range(100)]).mean())
print(np.array([trading_system("BYSE") for _ in range(100)]).mean())

In [None]:
# Same computation without reseeding: the draws continue from whatever state
# the global RNG is currently in, so the means differ from a seeded run.
print(np.array([trading_system("ASDAQ") for _ in range(100)]).mean())
print(np.array([trading_system("BYSE") for _ in range(100)]).mean())

#### Bias

In [None]:
# Listing 2.2 A simulator that accounts for time of day
def trading_system_tod(exchange, time_of_day):
    """Simulate one execution cost including a time-of-day bias.

    Adds a fixed offset (2.5 in the "morning", 0 in the "afternoon") to the
    cost returned by ``trading_system(exchange)``.

    Args:
        exchange: Exchange name, passed through to ``trading_system``.
        time_of_day: Either "morning" or "afternoon".

    Returns:
        The biased simulated execution cost as a float.

    Raises:
        ValueError: If ``time_of_day`` is not "morning" or "afternoon".
    """
    if time_of_day == "morning":
        bias = 2.5
    elif time_of_day == "afternoon":
        bias = 0
    else:
        # Without this branch an unknown value surfaced as an
        # UnboundLocalError on the return line.
        raise ValueError(f"unknown time_of_day: {time_of_day!r}")
    return bias + trading_system(exchange)

In [None]:
# Same exchange measured at two times of day: mean of 100 simulated costs each.
np.random.seed(17)
print(np.array([trading_system_tod("ASDAQ", "morning") for _ in range(100)]).mean())
print(np.array([trading_system_tod("ASDAQ", "afternoon") for _ in range(100)]).mean())

In [None]:
# BYSE is sampled only in the morning and ASDAQ only in the afternoon, so the
# exchange difference and the time-of-day bias are mixed in these two means.
np.random.seed(17)
print(np.array([trading_system_tod("BYSE", "morning") for _ in range(100)]).mean())
print(np.array([trading_system_tod("ASDAQ", "afternoon") for _ in range(100)]).mean())

In [None]:
# Same morning-BYSE vs afternoon-ASDAQ comparison with 1000 draws each
# (no reseed in this cell).
print(np.array([trading_system_tod("BYSE", "morning") for _ in range(1000)]).mean())
print(np.array([trading_system_tod("ASDAQ", "afternoon") for _ in range(1000)]).mean())

In [None]:
# Listing 2.3 A randomized measurement
def randomized_measurement():
    """Measure both exchanges with random assignment across the day.

    For the morning and then the afternoon, 100 trades are simulated; each
    trade is routed to ASDAQ or BYSE by a fair coin flip
    (``np.random.randint(2)``), so both exchanges are sampled at both times
    of day.

    Returns:
        A tuple ``(asdaq_mean, byse_mean)`` of the mean simulated cost
        observed on each exchange.
    """
    costs = {"ASDAQ": [], "BYSE": []}
    for tod in ("morning", "afternoon"):
        for _ in range(100):
            # Coin flip first, then the simulated trade: same RNG draw order
            # on every iteration.
            exchange = "ASDAQ" if np.random.randint(2) == 0 else "BYSE"
            costs[exchange].append(trading_system_tod(exchange, tod))
    asdaq_mean = np.array(costs["ASDAQ"]).mean()
    byse_mean = np.array(costs["BYSE"]).mean()
    return (asdaq_mean, byse_mean)

In [None]:
# Seeded run of the randomized measurement; yields (ASDAQ mean, BYSE mean).
np.random.seed(17)
randomized_measurement()

## 2.2	Take a precise measurement

### 2.2.1	Mitigate measurement variation with replication

In [None]:
# Three individual simulated ASDAQ costs from a fixed seed; reused by the
# cells that follow.
np.random.seed(17)
measurements = np.array([trading_system("ASDAQ") for _ in range(3)])
print(measurements)

In [None]:
# Aggregate (mean) of the individual measurements.
measurements.mean()

In [None]:
# Deviation of each measurement from 12, the ASDAQ base cost hard-coded in
# trading_system.
print(measurements - 12)

In [None]:
# Deviation of the aggregate (mean) from the same base cost of 12.
measurements.mean() - 12

In [None]:
# Root-mean-square deviation of the measurements from the base cost of 12.
np.sqrt(((measurements - 12) ** 2).mean())

In [None]:
# Root-mean-square deviation from the sample mean, written out by hand.
np.sqrt(((measurements - measurements.mean()) ** 2).mean())

In [None]:
# NumPy's built-in standard deviation; with the default ddof=0 it divides by
# N, like the hand-written root-mean-square deviation from the mean.
measurements.std()

In [None]:
# Listing 2.4 Aggregate measurement
def aggregate_measurement(exchange, num_individual_measurements):
    """Return the mean of repeated simulated measurements on ``exchange``.

    Args:
        exchange: Exchange name, passed through to ``trading_system``.
        num_individual_measurements: How many individual costs to simulate.

    Returns:
        The mean of the simulated execution costs.
    """
    samples = [trading_system(exchange) for _ in range(num_individual_measurements)]
    return np.array(samples).mean()

In [None]:
# Compare the spread of single simulated costs with the spread of
# 3-measurement aggregates (1000 draws each).
plt.hist(np.array([trading_system("ASDAQ") for _ in range(1000)]), color=e4e.color_1)
plt.hist(
    np.array([aggregate_measurement("ASDAQ", 3) for _ in range(1000)]), color=e4e.color_2
)
# Unit spelled "mips", as in the other execution-cost figures in this file.
plt.xlabel("execution cost (mips)")
# Print the current axis limits.
print(plt.axis())
plt.legend(["individual", "aggregate of 3"])
e4e.save_fig(3)

In [None]:
# Overlay 1000 draws each of individual costs and of aggregates over 3, 30,
# and 300 measurements to show how the spread changes with aggregate size.
plt.hist(np.array([trading_system("ASDAQ") for _ in range(1000)]), color=e4e.color_1)
plt.hist(
    np.array([aggregate_measurement("ASDAQ", 3) for _ in range(1000)]), color=e4e.color_2
)
plt.hist(
    np.array([aggregate_measurement("ASDAQ", 30) for _ in range(1000)]), color=e4e.color_3
)
plt.hist(
    np.array([aggregate_measurement("ASDAQ", 300) for _ in range(1000)]), color=e4e.color_4
)
# Unit spelled "mips", as in the other execution-cost figures in this file.
plt.xlabel("execution cost (mips)")
plt.legend(["individual", "aggregate of 3", "aggregate of 30", "aggregate of 300"])
e4e.save_fig(4)

In [None]:
# One 300-measurement aggregate per exchange, starting from a fixed seed.
np.random.seed(17)
print(aggregate_measurement("ASDAQ", 300))
print(aggregate_measurement("BYSE", 300))

In [None]:
# The same aggregates again without reseeding, i.e. from fresh random draws.
print(aggregate_measurement("ASDAQ", 300))
print(aggregate_measurement("BYSE", 300))

In [None]:
# Distributions of 1000 300-measurement aggregates for each exchange.
plt.hist(
    np.array([aggregate_measurement("ASDAQ", 300) for _ in range(1000)]), color=e4e.color_1
)
plt.hist(
    np.array([aggregate_measurement("BYSE", 300) for _ in range(1000)]), color=e4e.color_2
)

# (6.6141395990492065, 13.526618148811357, 0.0, 270.9)
# The tuple above is presumably axis limits printed for an earlier figure;
# the x-range is pinned to (roughly) those values while keeping this plot's
# own y-range.
c = plt.axis()
plt.axis([6.614, 13.52, c[2], c[3]])

# Unit spelled "mips", as in the other execution-cost figures in this file.
plt.xlabel("execution cost (mips)")

plt.legend(["ASDAQ", "BYSE"])
e4e.save_fig(5)

In [None]:
np.random.seed(17)
# 1000 aggregates at each of three aggregate sizes.
agg_3 = np.array([aggregate_measurement("ASDAQ", 3) for _ in range(1000)])
agg_30 = np.array([aggregate_measurement("ASDAQ", 30) for _ in range(1000)])
agg_300 = np.array([aggregate_measurement("ASDAQ", 300) for _ in range(1000)])

# Standard deviation of the aggregate at each size.
print(agg_3.std(), agg_30.std(), agg_300.std())

In [None]:
# Listing 2.5 Aggregate measurement with SE
def aggregate_measurement_with_se(exchange, num_individual_measurements):
    """Return the mean of repeated measurements and its standard error.

    Args:
        exchange: Exchange name, passed through to ``trading_system``.
        num_individual_measurements: How many individual costs to simulate.

    Returns:
        A tuple ``(mean, se)`` where ``se`` is the standard deviation of the
        individual measurements (NumPy's default, ddof=0) divided by the
        square root of ``num_individual_measurements``.
    """
    samples = np.array(
        [trading_system(exchange) for _ in range(num_individual_measurements)]
    )
    # Local names deliberately avoid reusing `aggregate_measurement`, which is
    # also the name of a function in this file.
    mean_cost = samples.mean()
    standard_error = samples.std() / np.sqrt(num_individual_measurements)
    return mean_cost, standard_error

In [None]:
# (mean, standard error) of 300 simulated measurements on each exchange.
np.random.seed(17)
print(aggregate_measurement_with_se("ASDAQ", 300))
print(aggregate_measurement_with_se("BYSE", 300))

In [None]:
# A mean plus one standard error; the literals are presumably rounded values
# copied from the BYSE output of the preceding cell -- confirm.
10.05 + 0.057

In [None]:
# A mean minus one standard error; the literals are presumably rounded values
# copied from the ASDAQ output above -- confirm.
12.00 - 0.060

## 2.3	Run an A/B test

In [None]:
np.random.seed(17)
# Histogram (20 bins) of 1000 simulated 10-measurement ASDAQ aggregates.
plt.hist(
    np.array([aggregate_measurement_with_se("ASDAQ", 10)[0] for _ in range(1000)]),
    20,
    color=e4e.color_1,
)
plt.xlabel("potential ASDAQ\naggregate measurement\nvalues (mips)")
# NOTE(review): trading_system("ASDAQ") is centered on 12, so a marker at
# 102 + 10 = 112 and annotation coordinates near 112-120 look far outside the
# histogram's x-range -- confirm these coordinates are intended.
e4e.vertical_line(102 + 10)
plt.annotate(
    "actual\naggregate\nmeasurement",
    xy=[112.3, 90],
    xytext=[120, 110],
    arrowprops=e4e.arrow_props,
)
e4e.save_fig(7)

In [None]:
np.random.seed(17)
num_individual_measurements = 10
# Mean and standard error of 10 simulated measurements per exchange.
agg_asdaq, se_asdaq = aggregate_measurement_with_se("ASDAQ", num_individual_measurements)
agg_byse, se_byse = aggregate_measurement_with_se("BYSE", num_individual_measurements)
# Difference of the two aggregates (BYSE minus ASDAQ) and its standard error,
# combining the two standard errors in quadrature.
delta = agg_byse - agg_asdaq
se_delta = np.sqrt(se_byse**2 + se_asdaq**2)

In [None]:
# 10000 standard-normal draws shown as a 30-bin histogram.
z = np.random.normal(size=(10000,))
plt.hist(z, 30, color=e4e.color_1)
plt.xlabel("z")
# Mark z = -1.64 and label the tail to its left as "5%".
e4e.vertical_line(-1.64)
plt.annotate("5%", xy=[-2.25, 112], xytext=[-3, 600], arrowprops=e4e.arrow_props)

e4e.save_fig(8)

### 2.3.2	Design the A/B test

In [None]:
# Listing 2.6 A/B test design
def ab_test_design(sd_1_delta, prac_sig, *, z=1.64):
    """Return the number of individual measurements an A/B test should take.

    Computes ``(z * sd_1_delta / prac_sig) ** 2`` and rounds up.

    Args:
        sd_1_delta: Standard deviation of the difference between one
            individual measurement from each variant.
        prac_sig: Practical-significance threshold, i.e. the size of
            difference the test is designed around.
        z: z threshold used in the design. The default, 1.64, is the
            value this file's z histogram marks as the 5% tail; it was
            previously hard-coded.

    Returns:
        The required number of individual measurements, rounded up with
        ``np.ceil`` (a NumPy float, as before).
    """
    num_individual_measurements = (z * sd_1_delta / prac_sig) ** 2
    return np.ceil(num_individual_measurements)

In [None]:
np.random.seed(17)
# Estimate the per-measurement standard deviation from 100 simulated ASDAQ
# costs and reuse the same value for BYSE.
sd_1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
sd_1_byse = sd_1_asdaq
# Standard deviation of the difference of one measurement from each exchange.
sd_1_delta = np.sqrt(sd_1_asdaq**2 + sd_1_byse**2)
prac_sig = 1.0
ab_test_design(sd_1_delta, prac_sig)

#### False negatives

In [None]:
# Two side-by-side panels: (a) one standard-normal z histogram, (b) the same
# kind of histogram overlaid with a copy shifted left by 1.64 + 0.84.
fig, (ax1, ax2) = plt.subplots(1, 2)

# Panel (a): 10000 standard-normal draws with z = -1.64 marked.
z = np.random.normal(size=(10000,))
ax1.hist(z, 30, color=e4e.color_1)
ax1.set_xlabel("z")
e4e.vertical_line(-1.64, ax=ax1)

ax1.annotate("5%", xy=[-2.25, 112], xytext=[-3.7, 600], arrowprops=e4e.arrow_props)

ax1.text(-3.5, 900, "(a)")
e4e.aspect_square(ax1)


# Panel (b): a fresh set of draws, semi-transparent so both histograms show.
z = np.random.normal(size=(10000,))
ax2.hist(z, 30, color=e4e.color_1, alpha=0.5)
ax2.set_xlabel("z")
e4e.vertical_line(-1.64, clr=e4e.color_1, ax=ax2)
ax2.annotate("5%", xy=[-2.25, 112], xytext=[-6, 600], arrowprops=e4e.arrow_props)

ax2.annotate("20%", xy=[0, 112], xytext=[3, 600], arrowprops=e4e.arrow_props)
# The same draws shifted left by 1.64 + 0.84.
ax2.hist(-(1.64 + 0.84) + z, 30, color=e4e.color_2, alpha=0.5)
# Fix the x-range and the lower y-limit; keep the automatic upper y-limit.
c = ax2.axis()
ax2.axis([-8, 5.5, 0, c[3]])
ax2.text(-7, 800, "(b)")
e4e.aspect_square(ax2)

e4e.save_fig(10)

In [None]:
# Sum of the two z offsets used for the shifted histogram above; corresponds
# to the 2.48 constant in ab_test_design_2.
1.64 + 0.84

In [None]:
# Listing 2.7 A/B test design with power analysis
def ab_test_design_2(sd_1_delta, prac_sig, *, z=2.48):
    """Return the number of individual measurements, with power analysis.

    Computes ``(z * sd_1_delta / prac_sig) ** 2`` and rounds up.

    Args:
        sd_1_delta: Standard deviation of the difference between one
            individual measurement from each variant.
        prac_sig: Practical-significance threshold, i.e. the size of
            difference the test is designed around.
        z: Combined z threshold. The default, 2.48, is the 1.64 + 0.84 sum
            this file uses for its "5%" / "20%" figure; it was previously
            hard-coded. It is kept as a single literal so default results
            are unchanged.

    Returns:
        The required number of individual measurements, rounded up with
        ``np.ceil`` (a NumPy float, as before).
    """
    num_individual_measurements = (z * sd_1_delta / prac_sig) ** 2
    return np.ceil(num_individual_measurements)

In [None]:
np.random.seed(17)
# Same standard-deviation estimate as for the first design (100 simulated
# ASDAQ costs, reused for BYSE), now fed to the power-analysis design.
sd_1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
sd_1_byse = sd_1_asdaq
sd_1_delta = np.sqrt(sd_1_asdaq**2 + sd_1_byse**2)
prac_sig = 1.0
ab_test_design_2(sd_1_delta, prac_sig)

### 2.3.3	Measure and analyze

In [None]:
# Listing 2.8 Measure stage
def measure(min_individual_measurements):
    """Collect randomized individual measurements from both exchanges.

    Each iteration flips a fair coin (``np.random.randint(2)``) to pick an
    exchange and records one simulated execution cost for it. Sampling
    continues until *both* exchanges have at least
    ``min_individual_measurements`` observations, so one exchange may end
    up with more than the minimum.

    Args:
        min_individual_measurements: Minimum number of observations required
            from each exchange.

    Returns:
        A tuple ``(ind_asdaq, ind_byse)`` of NumPy arrays holding the
        individual measurements for each exchange.
    """
    ind_asdaq = []
    ind_byse = []
    # `or`, not `and`: keep sampling while either exchange is still short.
    # With `and`, the loop ended as soon as the first exchange reached the
    # minimum, leaving the other one with too few measurements.
    while (
        len(ind_asdaq) < min_individual_measurements
        or len(ind_byse) < min_individual_measurements
    ):
        if np.random.randint(2) == 0:
            ind_asdaq.append(trading_system("ASDAQ"))
        else:
            ind_byse.append(trading_system("BYSE"))
    return np.array(ind_asdaq), np.array(ind_byse)

In [None]:
# Listing 2.9 Analyze stage
def analyze(ind_asdaq, ind_byse):
    """Return the z score of the BYSE-minus-ASDAQ difference in mean cost.

    Args:
        ind_asdaq: NumPy array of individual ASDAQ measurements.
        ind_byse: NumPy array of individual BYSE measurements.

    Returns:
        The difference of the two sample means divided by the standard error
        of that difference (the two per-exchange standard errors combined in
        quadrature).
    """

    def mean_and_se(samples):
        # Sample mean and its standard error (std uses NumPy's default ddof=0).
        return samples.mean(), samples.std() / np.sqrt(len(samples))

    mean_asdaq, stderr_asdaq = mean_and_se(ind_asdaq)
    mean_byse, stderr_byse = mean_and_se(ind_byse)

    stderr_diff = np.sqrt(stderr_asdaq**2 + stderr_byse**2)
    return (mean_byse - mean_asdaq) / stderr_diff

In [None]:
np.random.seed(17)
# Repeat the design step: estimate the standard deviation from 100 simulated
# ASDAQ costs (reused for BYSE) and compute the required sample size.
sd_1_asdaq = np.array([trading_system("ASDAQ") for _ in range(100)]).std()
sd_1_byse = sd_1_asdaq
sd_1_delta = np.sqrt(sd_1_asdaq**2 + sd_1_byse**2)
# Practical-significance threshold (named prac_sig in the earlier cells).
PS = 1.0
ab_test_design_2(sd_1_delta, PS)

In [None]:
np.random.seed(17)
# 16 is presumably the sample size produced by the design cell above --
# confirm it matches that output.
ind_asdaq, ind_byse = measure(16)

In [None]:
# Observed difference in mean cost (BYSE minus ASDAQ).
ind_byse.mean() - ind_asdaq.mean()

In [None]:
# z score of that difference, as computed by analyze().
analyze(ind_asdaq, ind_byse)