In [14]:
import numpy as np
import pandas as pd
from scipy.special import softmax

n_match_picks = 1000
n_context_features = 8
n_arms = 7

X_context = np.random.randint(10, size=(n_match_picks, n_context_features))
X_action = np.random.randint(10, size=(n_match_picks, 1))
Y_reward = np.random.randint(10, size=(n_match_picks, 1))

In [15]:
X_context.shape, X_action.shape, Y_reward.shape

((1000, 8), (1000, 1), (1000, 1))

In [210]:
class Bandit(object):
    def __init__(self, n_context_features, n_arms, step_size=0.05, baseline=False):
        self.n_context_features = n_context_features
        self.n_arms = n_arms
        self.step_size = step_size
        self.baseline = baseline
        self.reward_sum = 0
        self.iters = 0

        # start at uniform
        self.theta = np.zeros((self.n_context_features, self.n_arms))
        
    @property
    def current_baseline(self):
        return self.reward_sum / self.iters if self.baseline and self.iters else 0
        
    def predict_proba(self, X):
        return softmax((X @ self.theta).reshape(-1, self.n_arms), axis=1)
    
    def predict(self, X, deterministic=True):
        predictions = self.predict_proba(X)
        if deterministic:
            return predictions.argmax()
        cumsum = predictions.cumsum(axis=1)
        random_val = np.random.random_sample(len(cumsum))
        binarized = (cumsum.T < random_val).astype(int).sum(axis=0)
        return binarized
    
    def theta_gradient(self, X, actual_action):
        return np.eye(self.n_arms)[actual_action].squeeze() - self.predict_proba(X)
    
    def update_theta(self, X, action, reward):
        self.reward_sum += reward.sum()
        self.iters += len(reward)
        self.theta = self.theta + self.step_size * (reward.T - self.current_baseline) @ self.theta_gradient(X, action)

In [211]:
bandit = Bandit(n_context_features, n_arms)
bandit.predict_proba(X_context[:2])
bandit.predict(X_context[0], deterministic=False)
bandit.predict(X_context[:2], deterministic=False)
bandit.predict(X_context[0], deterministic=True)
bandit.theta_gradient(X_context[0], X_action[0])
bandit.theta_gradient(X_context[:2], X_action[:2])
print(bandit.theta)
bandit.update_theta(X_context[:2], X_action[:2], Y_reward[:2])
print(bandit.theta)
bandit = Bandit(n_context_features, n_arms, baseline=True)
bandit.update_theta(X_context[:2], X_action[:2], Y_reward[:2])
print(bandit.theta)

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
[[-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]
 [-0.06428571  0.33571429 -0.06428571 -0.01428571 -0.06428571 -0.06428571
  -0.06428571]]
[[ 0.     0.175  0.    -0.175  0.     0.     0.   ]
 [ 0.     0.175  0.    -0.175  0.     0.  