# Chapter 5
# Contextual bandits: Make targeted decisions

In [None]:
import numpy as np
import scipy
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
from e4e import E4E

# Helper object from the project-local e4e module, constructed for chapter 5.
# Later cells read its color_1 / color_2 / alpha_err / arrow_props attributes
# and call its save_fig, horizontal_line and aspect_square methods.
e4e = E4E(chapter=5)

## 5.1	Model a business metric offline to make decisions online

### 5.1.1	Define the prediction model

#### SIMULATE THE VIEWING TIME

In [None]:
# Listing 5.1 Simulate viewing time
def measure_viewing_time(context, action_weights):
    """Simulate a single noisy viewing-time measurement.

    The logarithm of the result is the mean of the element-wise product of
    ``context`` and ``action_weights`` plus Gaussian noise scaled by 0.1;
    exponentiating keeps the returned viewing time positive.
    """
    signal = np.mean(context * action_weights)
    noise = 0.1 * np.random.normal()
    return np.exp(signal + noise)

In [None]:
# Seed the global NumPy RNG so the figure is reproducible.
np.random.seed(17)
# One weight per context feature (5 features).
action_weights = np.random.normal(size=(5,))
# 1000 simulated viewing times, each for a freshly drawn standard-normal context.
m = [measure_viewing_time(context=np.random.normal(size=(5,)), action_weights=action_weights) 
     for _ in range(1000)]
# Histogram of the simulated viewing times using 25 bins.
plt.hist(m,25, color=e4e.color_1);
plt.xlabel('viewing time')
plt.ylabel('counts')
e4e.save_fig(2)

#### FIT THE PREDICTION MODEL

In [None]:
# Listing 5.2 A logged sample
class Sample:
    """One logged decision: the context seen, the action taken, the reward observed."""

    def __init__(self, context, action, reward):
        # Store the three fields verbatim; no copying or validation is done.
        self.context, self.action, self.reward = context, action, reward

In [None]:
# Listing 5.3 Collect logs for each action
def collect_logs_by_action(num_actions, logs):
    """Group logged rewards and contexts by the action that produced them.

    Returns ``(samples_y, samples_x)``: two lists with ``num_actions``
    entries each, where entry ``a`` holds the rewards (respectively the
    contexts) of every log whose ``action`` equals ``a``, in log order.
    Actions that never appear in ``logs`` keep an empty list.
    """
    rewards_by_action = []
    contexts_by_action = []
    for _ in range(num_actions):
        rewards_by_action.append([])
        contexts_by_action.append([])

    for entry in logs:
        slot = entry.action
        rewards_by_action[slot].append(entry.reward)
        contexts_by_action[slot].append(entry.context)

    return rewards_by_action, contexts_by_action

In [None]:
# Listing 5.4 Build a model for each action
def build_models(num_features, samples_y, samples_x):
    """Fit one least-squares linear model per action.

    Args:
        num_features: length of the coefficient vector to return for an
            action that has no samples.
        samples_y: per-action lists of rewards.
        samples_x: per-action lists of context vectors, aligned with
            ``samples_y``.

    Returns:
        A list with one coefficient vector per action. Actions with at
        least one sample get the least-squares solution; actions with no
        samples get an all-zero vector of length ``num_features``.
    """
    betas = []
    for y, x in zip(samples_y, samples_x):  # one (rewards, contexts) pair per action
        y = np.array(y)
        x = np.array(x)
        if len(y) > 0:
            # Normal equations solved with a pseudo-inverse, so a singular
            # x.T @ x (e.g. a feature that was never observed) still
            # produces a coefficient vector instead of raising.
            beta = np.linalg.pinv(x.T @ x) @ x.T @ y
        else:
            # No data for this action: fall back to a model that predicts 0.
            beta = np.zeros(shape=(num_features,))
        betas.append(beta)
    return betas

### 5.1.2	Add the decision-making component

In [None]:
# Listing 5.5 A greedy recommender
class RecommenderGreedy:
    """Recommender that always picks the action with the highest predicted reward.

    One linear model (coefficient vector) is kept per action; the prediction
    for an action is the dot product of the context with that vector.
    """

    def __init__(self, num_features, num_actions):
        self._num_features = num_features
        self._num_actions = num_actions

    def reset(self):
        """Replace every per-action model with random standard-normal coefficients."""
        # Use the sizes stored on the instance rather than module-level
        # globals, so the object works for any size it was constructed with.
        self._betas = [
            np.random.normal(size=(self._num_features,))
            for _ in range(self._num_actions)
        ]

    def fit_offline(self, logs):
        """Refit all per-action models from the given logged samples."""
        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = build_models(self._num_features, samples_y, samples_x)

    def policy(self, context):
        """Return the index of the action with the largest predicted reward.

        Ties are resolved in favour of the lowest action index.
        """
        predictions = [context @ beta for beta in self._betas]
        # argmax returns the first maximum, matching a strict ">" scan.
        return int(np.argmax(predictions))

### 5.1.3	Run and evaluate the greedy recommender

In [None]:
def log_production_data(action_weights, recommender, num_decisions=100):
    """Run the recommender on simulated users and log every decision.

    Args:
        action_weights: 2-D array indexed as ``[feature, action]``; column
            ``action`` is passed to ``measure_viewing_time``.
        recommender: object with a ``policy(context)`` method returning an
            action index.
        num_decisions: number of simulated users to serve (default 100).

    Returns:
        ``(avg_viewing_time, logs)`` where ``logs`` is a list of ``Sample``
        objects, one per decision, and ``avg_viewing_time`` is the mean of
        their rewards.

    Raises:
        ValueError: if ``num_decisions`` is not positive (the average would
            otherwise divide by zero).
    """
    if num_decisions <= 0:
        raise ValueError("num_decisions must be positive")

    num_features = len(action_weights)
    logs = []
    total_viewing_time = 0
    for _ in range(num_decisions):
        # Binary features describing the user.
        context = np.random.randint(2, size=(num_features,))
        # The first "feature" is a constant, acting as the intercept term.
        context[0] = 1
        action = recommender.policy(context)  # choose best post
        viewing_time = measure_viewing_time(
            context=context, action_weights=action_weights[:, action]
        )
        logs.append(Sample(context, action, viewing_time))
        total_viewing_time += viewing_time
    avg_viewing_time = total_viewing_time / num_decisions
    return avg_viewing_time, logs

In [None]:
def run_experiment_sequence(action_weights, num_actions, recommender, num_days=14):
    """Simulate consecutive days of serving followed by an offline refit.

    The recommender is reset once, then for each day it serves a batch of
    decisions and is refit on every log collected since the first day.

    Args:
        action_weights: weights of the data-generating process, forwarded to
            ``log_production_data``.
        num_actions: not used by this function; kept so existing callers
            keep working.
        recommender: object providing ``reset()``, ``policy(context)`` and
            ``fit_offline(logs)``.
        num_days: number of serve-then-refit cycles (default 14, i.e. two
            weeks).

    Returns:
        1-D array with the average viewing time of each day.
    """
    avg_viewing_times = []
    all_logs = []
    recommender.reset()
    for _ in range(num_days):
        avg_viewing_time, logs = log_production_data(action_weights, recommender)
        avg_viewing_times.append(avg_viewing_time)
        all_logs.extend(logs)
        # Refit on all data from day one until now, not just today's batch.
        recommender.fit_offline(all_logs)

    return np.array(avg_viewing_times)

In [None]:
def run_sequences(action_weights, num_actions, recommender):
    """Repeat the experiment sequence 10 times and summarise it per day.

    Returns ``(mean, se)``: the per-day mean of the average viewing time
    across runs, and the per-day standard deviation across runs divided by
    the square root of the number of runs.
    """
    num_runs = 10
    per_run = np.array([
        run_experiment_sequence(action_weights, num_actions, recommender)
        for _ in range(num_runs)
    ])
    # Rows are runs, so reducing over axis 0 gives one value per day.
    mean = per_run.mean(axis=0)
    se = per_run.std(axis=0) / np.sqrt(num_runs)
    return mean, se

In [None]:
# Problem size: 5 context features and 30 candidate actions.
num_features = 5
num_actions = 30

# Seed the global NumPy RNG so the experiment is reproducible.
np.random.seed(17)
action_weights = np.random.normal(size=(num_features, num_actions)) # the dgp; fixed values
recommender = RecommenderGreedy(num_features, num_actions)
# Per-day mean and standard error of the average viewing time across runs.
mean, se = run_sequences(action_weights, num_actions, recommender)
# Keep the fitted per-action models the recommender holds after the runs.
betas_g = recommender._betas

In [None]:
# Mean viewing time per day for the greedy recommender.
plt.plot(mean, '.-', color=e4e.color_1)
# Shaded band of one standard error around the mean.
plt.fill_between(np.arange(len(mean)),
                 mean - se,
                 mean + se,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

# Plateau level: the average of the curve from day index 4 onward.
asymp = mean[4:].mean()
e4e.horizontal_line(asymp, e4e.color_1)

# Label the plateau with its value, formatted to two decimals.
plt.annotate(f"asymptote, mean viewing time = {asymp:.2f}", xy=[0, asymp-.01],
             xytext=[2, 1.4],
             arrowprops=e4e.arrow_props
            )

plt.xlabel('day')
plt.ylabel('mean viewing time')
e4e.save_fig(4)

## 5.2	Explore actions with epsilon-greedy

### 5.2.1	Missing counterfactuals degrade predictions

In [None]:
# action = 1, no missing data
# Each context is a one-hot vector, so every feature is observed exactly once.
contexts = [
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1]
]
rewards = [
    .6,
    .9,
    1.3
]
x = np.array(contexts)
y = np.array(rewards)
# Least-squares coefficients via the normal equations with a pseudo-inverse.
beta_1 = np.linalg.pinv(x.T @ x) @ (x.T@y)
print (beta_1)

# predicted response of user a to action=1
context_a = [0,0,1]
print (context_a @ beta_1)

# predicted response of user b to action=1
context_b = [1,0,1]
print (context_b @ beta_1)

In [None]:
# action = 1, missing data about feature #3
# No logged context has the third feature set, so its column in x is all zeros.
contexts = [
    [1, 0, 0],
    [0, 1, 0]
]
rewards = [
    0.6,
    0.9
]
x = np.array(contexts)
y = np.array(rewards)
# Same fit as with the full data; x.T @ x is singular here, hence the pseudo-inverse.
beta_1m = np.linalg.pinv(x.T @ x) @ (x.T@y)
print (beta_1m)

# context_a and context_b are the user contexts defined in the previous cell.
# predicted response of user a to action=1
print (context_a @ beta_1m)

# predicted response of user b to action=1
print (context_b @ beta_1m)

#### FEEDBACK LOOPS

### 5.2.2	Explore with epsilon-greedy to collect counterfactuals

In [None]:
# Listing 5.6 Epsilon-greedy recommender
class RecommenderEpsilonGreedy:
    """Greedy recommender that explores a uniformly random action with probability ``eps``.

    One linear model (coefficient vector) is kept per action; the prediction
    for an action is the dot product of the context with that vector.
    """

    def __init__(self, num_features, num_actions, eps=0.1):
        self._num_features = num_features
        self._num_actions = num_actions
        self._eps = eps

    def reset(self):
        """Replace every per-action model with random standard-normal coefficients."""
        # Use the sizes stored on the instance rather than module-level
        # globals, so the object works for any size it was constructed with.
        self._betas = [
            np.random.normal(size=(self._num_features,))
            for _ in range(self._num_actions)
        ]

    def fit_offline(self, logs):
        """Refit all per-action models from the given logged samples."""
        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = build_models(self._num_features, samples_y, samples_x)

    def policy(self, context):
        """Return an action index for ``context``.

        With probability ``eps`` the action is drawn uniformly at random
        (exploration); otherwise the action with the largest predicted
        reward is returned, ties going to the lowest index.
        """
        if np.random.uniform(0, 1) < self._eps:
            return np.random.randint(0, self._num_actions)
        predictions = [context @ beta for beta in self._betas]
        # argmax returns the first maximum, matching a strict ">" scan.
        return int(np.argmax(predictions))

In [None]:
# Re-seed the global NumPy RNG so this run is reproducible.
np.random.seed(17)
# Explore a random action on 10% of decisions.
recommender = RecommenderEpsilonGreedy(num_features, num_actions, eps=0.1)
mean_eps, se_eps = run_sequences(action_weights, num_actions, recommender)
# Keep the fitted per-action models the recommender holds after the runs.
betas_eg = recommender._betas

In [None]:
# Solid line: greedy recommender; dashed line: epsilon-greedy recommender.
plt.plot(mean, '.-', color=e4e.color_1)
plt.plot(mean_eps, '.--', color=e4e.color_1)

# One-standard-error band around the greedy curve.
plt.fill_between(np.arange(len(mean)),
                 mean - se,
                 mean + se,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

# One-standard-error band around the epsilon-greedy curve.
plt.fill_between(np.arange(len(mean_eps)),
                 mean_eps - se_eps,
                 mean_eps + se_eps,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

# Legend entries are listed in the same order as the two plt.plot calls above.
plt.legend(['RecommenderGreedy', 'RecommenderEpsilonGreedy'])
plt.xlabel('day')
plt.ylabel('mean viewing time')
e4e.save_fig(7)

## 5.3 Explore parameters by Thompson sampling

### 5.3.1	Create an ensemble of prediction models

In [None]:
# Seed the global NumPy RNG so the example is reproducible.
np.random.seed(17)
# 100 simulated visit counts, each an integer between 3 and 7 inclusive.
visits = np.array([3 + int(5*np.random.uniform()) for _ in range(100)])
# Bootstrap resample: draw len(visits) indices uniformly with replacement.
# The size argument is the one-element tuple (len(visits),).
i = np.random.randint(len(visits), size=(len(visits),))
bs_visits = visits[i]
# Compare mean and standard deviation of the measured and bootstrap sets.
print (visits.mean(), visits.std())
print (bs_visits.mean(), bs_visits.std())

In [None]:
# Two side-by-side panels: measured sample set (left), bootstrap resample (right).
fig, (ax1, ax2) = plt.subplots(1, 2)

# Histogram of the measured visits using 25 bins.
ax1.hist(visits, 25, color=e4e.color_1);
e4e.aspect_square(ax1)
ax1.set_title('Measured sample set')

# Histogram of the bootstrap resample, same binning.
ax2.hist(bs_visits, 25, color=e4e.color_1);
e4e.aspect_square(ax2)
ax2.set_title('Bootstrap sample set')

e4e.save_fig(9)

In [None]:
# Listing 5.7 Thompson sampling recommender
class RecommenderThompsonSampling:
    """Recommender that acts greedily on a randomly chosen bootstrap model set.

    ``num_bs_samples`` ensembles are kept; each ensemble holds one
    coefficient vector per action, fitted on a bootstrap resample of that
    action's logs. Every decision picks one ensemble at random and returns
    the action it predicts to be best.
    """

    def __init__(
        self, num_features, num_actions,
        num_bs_samples
    ):
        self._num_features = num_features
        self._num_actions = num_actions
        self._num_bs_samples = num_bs_samples

    def reset(self):
        """Replace every model in every ensemble with random standard-normal coefficients."""
        # Use the sizes stored on the instance rather than module-level
        # globals, so the object works for any size it was constructed with.
        self._betas = [
            [
                np.random.normal(size=(self._num_features,))
                for _ in range(self._num_actions)
            ]
            for _ in range(self._num_bs_samples)
        ]

    def _bs_sample(self, samples_y, samples_x):
        """Return a per-action bootstrap resample of the rewards and contexts.

        For each action with data, the same randomly drawn indices (with
        replacement, as many as there are samples) are applied to rewards
        and contexts so the pairs stay aligned. Actions without data are
        passed through as empty arrays.
        """
        bs_samples_y = []
        bs_samples_x = []
        for action in range(self._num_actions):
            y = np.array(samples_y[action])
            x = np.array(samples_x[action])
            if len(y) > 0:
                i = np.random.randint(0, len(y), size=(len(y),))
                y = y[i]
                x = x[i, :]
            bs_samples_y.append(y)
            bs_samples_x.append(x)
        return bs_samples_y, bs_samples_x

    def fit_offline(self, logs):
        """Refit every ensemble, each on its own bootstrap resample of ``logs``."""
        samples_y, samples_x = collect_logs_by_action(
            self._num_actions, logs
        )
        self._betas = []
        for _ in range(self._num_bs_samples):
            bs_samples_y, bs_samples_x = self._bs_sample(
                samples_y, samples_x
            )
            self._betas.append(build_models(
                self._num_features, bs_samples_y, bs_samples_x
            ))

    def policy(self, context):
        """Pick one ensemble uniformly at random and return its best action.

        Ties are resolved in favour of the lowest action index.
        """
        i_beta = np.random.randint(0, self._num_bs_samples)
        beta = self._betas[i_beta]
        predictions = [context @ beta[action] for action in range(self._num_actions)]
        # argmax returns the first maximum, matching a strict ">" scan.
        return int(np.argmax(predictions))

In [None]:
# Re-seed the global NumPy RNG so this run is reproducible.
np.random.seed(17)
# Thompson sampling with 30 bootstrap model ensembles.
recommender = RecommenderThompsonSampling(num_features, num_actions, num_bs_samples=30)
mean_ts, se_ts = run_sequences(action_weights, num_actions, recommender)

In [None]:
# Solid: greedy; dashed: epsilon-greedy; dotted: Thompson sampling.
plt.plot(mean, '.-', color=e4e.color_1)
plt.plot(mean_eps, '.--', color=e4e.color_1)
plt.plot(mean_ts, ':.', color=e4e.color_1)

# One-standard-error band around the greedy curve.
plt.fill_between(np.arange(len(mean)),
                 mean - se,
                 mean + se,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

# One-standard-error band around the epsilon-greedy curve.
plt.fill_between(np.arange(len(mean_eps)),
                 mean_eps - se_eps,
                 mean_eps + se_eps,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

# One-standard-error band around the Thompson-sampling curve; the x-axis is
# sized from mean_ts itself so the band always matches the data it shades.
plt.fill_between(np.arange(len(mean_ts)),
                 mean_ts - se_ts,
                 mean_ts + se_ts,
                 color=e4e.color_2, alpha=e4e.alpha_err, linewidth=1);

plt.xlabel('day')
plt.ylabel('mean viewing time')
# Legend entries are listed in the same order as the three plt.plot calls above.
plt.legend(['RecommenderGreedy', 'RecommenderEpsilonGreedy', 'RecommenderThompsonSampling'])
e4e.save_fig(10)

### 5.3.2	Randomized probability matching

In [None]:
class RecommenderThompsonSamplingInstrumented:
    """Thompson-sampling recommender that also records how often ensembles agree.

    Behaves like a bootstrap Thompson-sampling recommender, and for every
    decision additionally records the fraction of ensembles whose best
    action equals the action actually chosen. ``mean_vs_day`` collects the
    mean of those fractions each time ``fit_offline`` is called.
    """

    def __init__(
        self, num_features, num_actions,
        num_bs_samples
    ):
        self._num_features = num_features
        self._num_actions = num_actions
        self._num_bs_samples = num_bs_samples

    def reset(self):
        """Randomise every model in every ensemble and clear the instrumentation."""
        # Use the sizes stored on the instance rather than module-level
        # globals, so the object works for any size it was constructed with.
        self._betas = [
            [
                np.random.normal(size=(self._num_features,))
                for _ in range(self._num_actions)
            ]
            for _ in range(self._num_bs_samples)
        ]
        self._p_best = []
        self.mean_vs_day = []

    def _bs_sample(self, samples_y, samples_x):
        """Return a per-action bootstrap resample of the rewards and contexts.

        For each action with data, the same randomly drawn indices (with
        replacement, as many as there are samples) are applied to rewards
        and contexts so the pairs stay aligned. Actions without data are
        passed through as empty arrays.
        """
        bs_samples_y = []
        bs_samples_x = []
        for action in range(self._num_actions):
            y = np.array(samples_y[action])
            x = np.array(samples_x[action])
            if len(y) > 0:
                i = np.random.randint(0, len(y), size=(len(y),))
                y = y[i]
                x = x[i, :]
            bs_samples_y.append(y)
            bs_samples_x.append(x)
        return bs_samples_y, bs_samples_x

    def fit_offline(self, logs):
        """Record the agreement statistic, then refit every ensemble on ``logs``."""
        # _p_best is only cleared in reset(), so this is the running mean
        # over every decision since the last reset, not just the latest batch.
        self.mean_vs_day.append(np.array(self._p_best).mean())

        samples_y, samples_x = collect_logs_by_action(
            self._num_actions, logs
        )
        self._betas = []
        for _ in range(self._num_bs_samples):
            bs_samples_y, bs_samples_x = self._bs_sample(
                samples_y, samples_x
            )
            self._betas.append(build_models(
                self._num_features, bs_samples_y, bs_samples_x
            ))

    def _best_post(self, context, beta):
        """Return the action with the largest prediction under one ensemble.

        Ties are resolved in favour of the lowest action index.
        """
        predictions = [context @ beta[action] for action in range(self._num_actions)]
        # argmax returns the first maximum, matching a strict ">" scan.
        return int(np.argmax(predictions))

    def policy(self, context):
        """Choose an action by Thompson sampling and record ensemble agreement.

        The best action of every ensemble is computed, one ensemble is drawn
        uniformly at random and its best action is returned. The fraction of
        ensembles that agree with the returned action is appended to the
        internal ``_p_best`` list.
        """
        best_posts = [
            self._best_post(context, self._betas[i_beta])
            for i_beta in range(self._num_bs_samples)
        ]

        i_beta = np.random.randint(self._num_bs_samples)
        action_best = best_posts[i_beta]

        # Share of ensembles whose best action matches the chosen one.
        p_post = best_posts.count(action_best) / self._num_bs_samples
        self._p_best.append(p_post)
        return action_best

In [None]:
# Re-seed the global NumPy RNG so this run is reproducible.
np.random.seed(17)
recommender = RecommenderThompsonSamplingInstrumented(num_features, num_actions, num_bs_samples=30)
# The returned (mean, se) is discarded; only the statistics recorded on the
# recommender itself are used afterwards.
run_sequences(action_weights, num_actions, recommender);

In [None]:
# mean_vs_day is cleared by reset() at the start of every sequence, so it holds
# the values recorded during the most recent sequence only.
m = np.array(recommender.mean_vs_day)
plt.plot(m,'.--', color=e4e.color_1)

plt.xlabel('day')
plt.ylabel('avg. $p_{best}(action)$')
e4e.save_fig(12)