In [1]:
from bandit import Bandit
from context_engineering_functions import *
from logging_policy import LoggingPolicy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
import os

The evaluation takes a Bandit object (which holds the policy to evaluate) and a LoggingPolicy object.
We estimate the policy's value using the Direct Method: for each action we fit a ridge regression, with importance weights, to the observed rewards.



In [2]:
# Load the cleaned pick/veto data and split it chronologically (no shuffling):
# the first 80% of rows are used for training, the last 20% for testing.
data_directory = '../data/clean/'
# The vetoes-only frame is loaded as well, but it is not used below.
map_pick_context, vetoes_only_context = create_basic_pick_veto_triples(data_directory)
map_pick_context_train, map_pick_context_test = train_test_split(
    map_pick_context,
    test_size=.2,
    train_size=.8,
    shuffle=False,
)

# Context features are the columns whose name ends with 'is_available'.
cols = [name for name in map_pick_context if name.endswith('is_available')]

# Context matrix (X), chosen action (A) and observed reward (Y) for each split.
X_train, A_train, Y_train = (
    map_pick_context_train[cols].values,
    map_pick_context_train['X_Action'].values,
    map_pick_context_train['Y_reward'].values,
)
X_test, A_test, Y_test = (
    map_pick_context_test[cols].values,
    map_pick_context_test['X_Action'].values,
    map_pick_context_test['Y_reward'].values,
)


In [3]:
# Number of arms (actions) the bandit chooses between.
n_arms = 7

# Fixed row index used for spot checks throughout the notebook
# (randomly chosen by dice roll).
n = 4569

# The context dimensionality is the number of feature columns.
context_dim = X_train.shape[1]
bandit = Bandit(context_dim, n_arms, step_size=0.01)

# Action probabilities of the untrained bandit for the spot-check row.
print('Probabilities for random row: ')
untrained_probabilities = bandit.predict_proba(X_train[n])
print(untrained_probabilities)

Probabilities for random row: 
[[0.2 0.2 0.  0.2 0.2 0.2 0. ]]


In [4]:
# Build the logging policy from the training frame and its logged actions.
logged_actions_train = map_pick_context_train['X_Action']
lp = LoggingPolicy(map_pick_context_train, logged_actions_train)

In [5]:
# One pass over the training data, updating the bandit one example at a time.
# TO DO: multiple epochs, parameter tuning
for context_row, action, reward in zip(X_train, A_train, Y_train):
    # update_theta is given the context as a single-row 2-D array.
    bandit.update_theta(context_row.reshape(1, -1), action, reward)

In [6]:
# Action probabilities for the spot-check row after the training pass above.
probabilities_after_pass = bandit.predict_proba(X_train[n])
print(probabilities_after_pass)

[[0.16526486 0.2547316  0.         0.26307424 0.1534473  0.163482
  0.        ]]


### Multi-epoch loop

In [10]:
# Additional full passes over the training data. The bandit is not
# re-initialised here, so these epochs continue from its current parameters.
# TO DO: parameter tuning
n_epochs = 10
for epoch in range(n_epochs):
    for context_row, action, reward in zip(X_train, A_train, Y_train):
        bandit.update_theta(context_row.reshape(1, -1), action, reward)

In [11]:
# Action probabilities for the spot-check row after the multi-epoch training.
probabilities_after_epochs = bandit.predict_proba(X_train[n])
print(probabilities_after_epochs)

[[0.14596562 0.25995308 0.         0.27543339 0.15278697 0.16586094
  0.        ]]


In [7]:
# Compare the bandit's choice for the spot-check row with what was logged.
sample_context = X_train[n]

print('Probabilities for random row after training: ')
print(bandit.predict_proba(sample_context))
print('Selected action: ')
print(bandit.predict(sample_context))

# Logged action and reward for the same row.
print('Actual action and reward for random row: ')
print(f'Action: {A_train[n]}, reward: {Y_train[n]}')

Probabilities for random row after training: 
[[0.16526486 0.2547316  0.         0.26307424 0.1534473  0.163482
  0.        ]]
Selected action: 
[3]
Actual action and reward for random row: 
Action: 4, reward: 0


MHS: You had `X_train` hardcoded in the functions a few times (now fixed), and I updated your assert in `evaluate()` because I removed `log_policy.Y` from `LoggingPolicy`.

In [22]:
def train_value_estimator(context_train,map_picks_train,actions_train,rewards_train,log_policy,target_bandit):
    '''
    Trains one importance-weighted RidgeCV reward model per observed action.
    The fitted models are used for direct method estimation.

    Input:
        context_train (np.array(n x k)): n is number of train examples. k is dimensionality of context.
        map_picks_train (df): one row per train example, in the same order as context_train.
            map_picks_train and context_train must be created using this line of code:
            "context_train = map_pick_context_train[cols].values"
        actions_train (np.array(n)): actions taken. Each action is also used as a
            column index into the propensity matrices.
        rewards_train (np.array(n)): rewards received
        log_policy: a LoggingPolicy object, which needs to have a function predict_proba(self,row)
            and an attribute map_cols (its length sets the number of propensity columns).
        target_bandit: a Bandit object, which needs to have a function predict_proba(self,context)

    Returns:
        dict mapping each distinct action in actions_train to the RidgeCV model
        fitted on the examples where that action was taken.

    Raises:
        ValueError: if the four data inputs do not describe the same number of examples.
        AssertionError: if the number of propensity columns differs from the
            number of distinct actions observed in actions_train.
    '''
    actions_train = np.asarray(actions_train)
    rewards_train = np.asarray(rewards_train)
    n_examples = context_train.shape[0]

    # The propensity buffers below are allocated with np.empty and filled row by
    # row; a length mismatch would leave uninitialised rows (or overrun), so
    # reject it up front.
    if not (len(map_picks_train) == len(actions_train) == len(rewards_train) == n_examples):
        raise ValueError(
            'context_train, map_picks_train, actions_train and rewards_train '
            'must all have the same number of examples'
        )

    all_actions = np.unique(actions_train) # return from unique is already sorted
    n_propensity_cols = len(log_policy.map_cols)

    # Logging-policy propensities: predict_proba is called on each dataframe row.
    log_propensities = np.empty((n_examples, n_propensity_cols))
    for ii, (_, row) in enumerate(map_picks_train.iterrows()):
        log_propensities[ii, :] = log_policy.predict_proba(row)

    # Target-policy propensities: predict_proba is called on each context vector.
    target_propensities = np.empty((n_examples, n_propensity_cols))
    for ii in range(n_examples):
        target_propensities[ii, :] = target_bandit.predict_proba(context_train[ii, :])

    # Check to make sure these are both n x k. It is possible that not all
    # actions were chosen in the training data, in which case this fails.
    assert log_propensities.shape == target_propensities.shape
    assert log_propensities.shape == (n_examples, len(all_actions)), \
        'number of propensity columns differs from number of observed actions'

    # Fit a model for each action
    action_to_model_dict = {}
    for action in all_actions:
        took_action = actions_train == action
        t_prop_action = target_propensities[took_action, action]
        l_prop_action = log_propensities[took_action, action]
        # Importance weight = target propensity / logging propensity; examples
        # with zero logging propensity get weight 0 instead of dividing by zero.
        importance_weights = np.divide(
            t_prop_action,
            l_prop_action,
            out=np.zeros_like(t_prop_action),
            where=l_prop_action != 0,
        )
        model = RidgeCV()
        model.fit(context_train[took_action, :], rewards_train[took_action], sample_weight=importance_weights)
        action_to_model_dict[action] = model
    # Models are fit
    return action_to_model_dict

def evaluate(context_test,map_picks_test,actions_test,rewards_test,log_policy,target_bandit,action_to_model_dict):
    '''
    Estimates the value of target_bandit's policy from logged data.

    Input:
        context_test (np.array(n x k)): n is number of examples. k is dimensionality of context.
        map_picks_test (df): one row per example, in the same order as context_test.
            Each row is passed to log_policy.predict_proba.
        actions_test (np.array(n)): logged actions, as integer arm indices in [0, n_arms).
        rewards_test (np.array(n)): logged rewards.
        log_policy: a LoggingPolicy object with predict_proba(self,row), map_cols and pa_x_dict.
        target_bandit: a Bandit object with predict_proba(self,context) and n_arms.
        action_to_model_dict (dict): maps an arm index to a fitted model with a
            predict(context) method, as returned by train_value_estimator.
            Arms without a model are given a predicted reward of 0.

    Returns:
        dict with these entries:
            'mean': plain average of the logged rewards.
            'IW': mean of reward * importance weight.
            'SN_IW': sum(reward * weight) / sum(weight); NaN if the weights sum to 0.
            'Direct_Method_IW': mean over examples of the model-predicted reward
                averaged under the target policy's action probabilities.

    Raises:
        ValueError: if the data inputs have different lengths, or an action in
            actions_test is not a valid arm index.
        AssertionError: if target_bandit.n_arms disagrees with the logging policy.
    '''
    actions_test = np.asarray(actions_test)
    rewards_test = np.asarray(rewards_test)
    num_rows = context_test.shape[0]
    num_actions = target_bandit.n_arms
    # NOTE(review): key 6 is hard-coded; presumably any entry of pa_x_dict has
    # one value per arm -- confirm against LoggingPolicy.
    assert target_bandit.n_arms == len(log_policy.pa_x_dict[6])

    # The buffers below are np.empty and filled row by row, so every input must
    # describe the same examples.
    if not (len(map_picks_test) == len(actions_test) == len(rewards_test) == num_rows):
        raise ValueError(
            'context_test, map_picks_test, actions_test and rewards_test '
            'must all have the same number of examples'
        )
    if actions_test.size and (actions_test.min() < 0 or actions_test.max() >= num_actions):
        raise ValueError('actions_test contains an action outside [0, n_arms)')

    est = {}
    est["mean"] = np.mean(rewards_test)

    #Create Logging policies propensity distribution
    log_propensities = np.empty((num_rows, len(log_policy.map_cols)))
    for ii, (_, row) in enumerate(map_picks_test.iterrows()):
        log_propensities[ii, :] = log_policy.predict_proba(row)

    #Create target policies propensity distribution
    target_propensities = np.empty((num_rows, num_actions))
    for ii in range(num_rows):
        target_propensities[ii, :] = target_bandit.predict_proba(context_test[ii, :])

    #(Self-normalized) Importance weighted value estimator
    # Weight = target / logging propensity; 0 wherever the logging propensity is 0.
    importance_weights_matrix = np.divide(
        target_propensities,
        log_propensities,
        out=np.zeros_like(target_propensities),
        where=log_propensities != 0,
    )
    # Pick, for every example, the weight of the action that was actually logged.
    # Integer indexing is used instead of np.choose, which limits the number of arms.
    importance_weights = importance_weights_matrix[np.arange(num_rows), actions_test]
    weighted_rewards = rewards_test * importance_weights

    est['IW'] = weighted_rewards.mean()

    #SN IW estimator
    weight_total = importance_weights.sum()
    est['SN_IW'] = weighted_rewards.sum() / weight_total if weight_total != 0 else np.nan

    #Direct Method
    # Zero-initialised so that an arm without a fitted model contributes 0
    # rather than uninitialised memory.
    predicted_rewards = np.zeros((num_rows, num_actions))
    #Create predicted reward distribution
    for action, model in action_to_model_dict.items():
        predicted_rewards[:, action] = model.predict(context_test)
    # estimate
    est['Direct_Method_IW'] = (predicted_rewards * target_propensities).sum(axis=1).mean()
    return est
    

In [9]:
# Fit the per-action reward models on the training split, weighting examples
# by the trained bandit's propensities relative to the logging policy's.
action_to_model_dict = train_value_estimator(
    context_train=X_train,
    map_picks_train=map_pick_context_train,
    actions_train=A_train,
    rewards_train=Y_train,
    log_policy=lp,
    target_bandit=bandit,
)

In [10]:
# Baseline for comparison: a freshly constructed bandit that is never updated,
# used below as the "uniform policy".
baseline_context_dim = X_train.shape[1]
untrained_bandit = Bandit(baseline_context_dim, n_arms, step_size=0.01)

MHS: Nothing important is different down here, except that I added the test-data section too.

### Eval on train data

In [23]:
# Value estimates for the trained bandit on the training split
# (the *_test parameters simply receive the data to evaluate on).
print("Trained Bandit:")
evaluate(
    context_test=X_train,
    map_picks_test=map_pick_context_train,
    actions_test=A_train,
    rewards_test=Y_train,
    log_policy=lp,
    target_bandit=bandit,
    action_to_model_dict=action_to_model_dict,
)

Trained Bandit:


{'mean': 0.5495714285714286,
 'IW': 1.2872922442626826,
 'SN_IW': 0.5567765383510583,
 'Direct_Method_IW': 0.5571072945472595}

In [24]:
# Value estimates for the untrained (uniform) bandit on the training split
# (the *_test parameters simply receive the data to evaluate on).
print("Uniform Policy:")
evaluate(
    context_test=X_train,
    map_picks_test=map_pick_context_train,
    actions_test=A_train,
    rewards_test=Y_train,
    log_policy=lp,
    target_bandit=untrained_bandit,
    action_to_model_dict=action_to_model_dict,
)

Uniform Policy:


{'mean': 0.5495714285714286,
 'IW': 1.254442289045975,
 'SN_IW': 0.5558585769907851,
 'Direct_Method_IW': 0.5557734225238844}

### Eval on test data

In [25]:
# Value estimates for the trained bandit on the held-out test split.
print("Trained Bandit:")
evaluate(
    context_test=X_test,
    map_picks_test=map_pick_context_test,
    actions_test=A_test,
    rewards_test=Y_test,
    log_policy=lp,
    target_bandit=bandit,
    action_to_model_dict=action_to_model_dict,
)

Trained Bandit:
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training

{'mean': 0.5362649914334666,
 'IW': 1.1683165867327745,
 'SN_IW': 0.5673088334865963,
 'Direct_Method_IW': 0.558290557623652}

In [26]:
# Value estimates for the untrained (uniform) bandit on the held-out test split.
print("Uniform Policy:")
evaluate(
    context_test=X_test,
    map_picks_test=map_pick_context_test,
    actions_test=A_test,
    rewards_test=Y_test,
    log_policy=lp,
    target_bandit=untrained_bandit,
    action_to_model_dict=action_to_model_dict,
)

Uniform Policy:
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training

{'mean': 0.5362649914334666,
 'IW': 1.155900524260706,
 'SN_IW': 0.5674112357209264,
 'Direct_Method_IW': 0.5568874670342726}