In [1]:
from bandit import Bandit
#from logging_policy import LoggingPolicy

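The estimator takes a Bandit object, which wraps the target policy we want to evaluate, and a LoggingPolicy object, which is the policy that generated the logged data.
We estimate the target policy's value using the Direct Method: for each action we fit an importance-weighted ridge regression to the observed rewards, then average the predicted rewards under the target policy's action distribution.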



In [None]:
def train_value_estimator(context_train,actions_train,rewards_train,log_policy,target_bandit):
    '''
    Trains one importance-weighted RidgeCV reward model per logged action, for use
    in direct-method off-policy value estimation.

    Input:
        context_train (np.array(n x d)): n is number of train examples. d is dimensionality of context.
        actions_train (np.array(n)): action taken for each train example.
        rewards_train (np.array(n)): reward received for each train example.
        log_policy: a LoggingPolicy object, which needs to have a function get_pa_x(self,context)
            returning an (n x k) array of action propensities, k being the number of actions.
        target_bandit: a Bandit object, which needs to have a function predict_proba(self,context)
            returning an (n x k) array of action probabilities.

    Output:
        dict mapping each action observed in actions_train to its fitted RidgeCV model.

    Column j of both propensity arrays is taken to belong to the j-th smallest action
    label found in actions_train (the identity mapping when labels are 0..k-1).
    The shape assertions below fail if some action never appears in the training log.
    '''
    actions_train = np.asarray(actions_train)
    rewards_train = np.asarray(rewards_train)

    all_actions = np.unique(actions_train) # return from unique is already sorted
    log_propensities = log_policy.get_pa_x(context_train)
    target_propensities = target_bandit.predict_proba(context_train)

    # Both propensity arrays must be (n examples x k actions), with every action
    # represented in the training log so that columns and labels line up.
    assert log_propensities.shape == target_propensities.shape
    assert log_propensities.shape == (context_train.shape[0], len(all_actions))

    # Fit a model for each action
    action_to_model_dict = {}
    for column, action in enumerate(all_actions):
        taken = actions_train == action
        # One weight per example: pi_target(a|x) / pi_log(a|x) for the action
        # that was actually taken, i.e. a single column, not the whole row.
        importance_weights = target_propensities[taken, column] / log_propensities[taken, column]
        model = RidgeCV()
        model.fit(context_train[taken, :], rewards_train[taken], sample_weight=importance_weights)
        action_to_model_dict[action] = model
    # Models are fit
    return action_to_model_dict


In [None]:
def evaluate(context_test,actions_test,rewards_test,target_bandit,action_to_model_dict):
    '''
    Direct-method estimate of the target policy's value on a test set.

    Input:
        context_test (np.array(n x d)): n is number of test examples. d is dimensionality of context.
        actions_test: logged test actions. Not used by the direct method; kept for a uniform interface.
        rewards_test: logged test rewards. Not used by the direct method; kept for a uniform interface.
        target_bandit: a Bandit object, which needs to have a function predict_proba(self,context)
            returning an (n x k) array of action probabilities.
        action_to_model_dict: dict mapping each action to a fitted model exposing predict(context).

    Output:
        The mean over test contexts of sum_a pi_target(a|x) * predicted_reward(x, a).

    Column j of the target distribution is taken to belong to the j-th smallest
    action key (the identity mapping when actions are labelled 0..k-1).
    '''
    all_actions = sorted(action_to_model_dict.keys())
    predicted_rewards = np.empty((context_test.shape[0], len(all_actions)))
    # Every column is written below, so the uninitialised np.empty buffer is safe.
    for column, action in enumerate(all_actions):
        model = action_to_model_dict[action]
        predicted_rewards[:, column] = model.predict(context_test)

    target_action_distribution = target_bandit.predict_proba(context_test)
    val_est = (predicted_rewards*target_action_distribution).sum(axis=1).mean()
    return val_est
    