In [24]:
import os

import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

from bandit import Bandit
from context_engineering_functions import *
from logging_policy import LoggingPolicy


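Off-policy evaluation needs two objects: a `Bandit`, which holds the target policy we want to evaluate, and a `LoggingPolicy`, which models the policy that generated the logged data.
We estimate the target policy's value with the Direct Method: for each action we fit an importance-weighted ridge regression that predicts the reward from the context.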



In [58]:
def train_value_estimator(context_train,map_picks_train,actions_train,rewards_train,log_policy,target_bandit):
    '''
    Train one importance-weighted RidgeCV reward model per action for Direct Method estimation.

    Input:
        context_train (np.array(n x k)): n is the number of train examples, k the dimensionality of the context.
        map_picks_train: the logged data for the same n examples; it is passed unchanged
            to log_policy.predict_proba.
        actions_train (np.array(n)): actions taken.
        rewards_train (np.array(n)): rewards received.
        log_policy: a LoggingPolicy object, which needs to have a function predict_proba(self,context).
        target_bandit: a Bandit object, which needs to have a function predict_proba(self,context).

    Returns:
        dict mapping every action observed in actions_train to its fitted RidgeCV model.

    Raises:
        AssertionError: if the two propensity matrices differ in shape, or are not
            (n x number of distinct observed actions).
    '''
    context_train = np.asarray(context_train)
    actions_train = np.asarray(actions_train)
    rewards_train = np.asarray(rewards_train)

    all_actions = np.unique(actions_train) # return from unique is already sorted
    action_to_model_dict = {}
    log_propensities = np.asarray(log_policy.predict_proba(map_picks_train))
    target_propensities = np.asarray(target_bandit.predict_proba(context_train))

    # Both matrices must be (n x number of actions). This fails if some action never
    # appears in actions_train, because then there are fewer distinct actions than columns.
    assert log_propensities.shape == target_propensities.shape, (
        'logging and target propensities differ in shape: '
        f'{log_propensities.shape} vs {target_propensities.shape}')
    assert log_propensities.shape == (context_train.shape[0], len(all_actions)), (
        f'expected propensities of shape {(context_train.shape[0], len(all_actions))}, '
        f'got {log_propensities.shape}')

    # Fit a model for each action. Column j of the propensity matrices is taken to
    # belong to the j-th smallest action; with actions labelled 0..k-1 that is the action id.
    for column, action in enumerate(all_actions):
        taken = actions_train == action
        # One weight per example: target probability of the logged action divided by
        # the logging probability of that same action (sample_weight must be 1-D).
        importance_weights = target_propensities[taken, column] / log_propensities[taken, column]
        model = RidgeCV()
        model.fit(context_train[taken], rewards_train[taken], sample_weight=importance_weights)
        action_to_model_dict[action] = model
    # Models are fit
    return action_to_model_dict
def evaluate(context_test,actions_test,rewards_test,target_bandit,action_to_model_dict):
    '''
    Direct Method estimate of the target policy's value on held-out contexts.

    For every test context, each per-action model predicts a reward; the predictions are
    averaged under the target policy's action distribution and then over all contexts.

    Input:
        context_test (np.array(m x k)): m test contexts.
        actions_test: logged test actions. Not used by the Direct Method; kept for interface compatibility.
        rewards_test: logged test rewards. Not used by the Direct Method; kept for interface compatibility.
        target_bandit: a Bandit object, which needs to have a function predict_proba(self,context).
        action_to_model_dict: dict mapping action -> fitted reward model with a predict(context) method,
            as returned by train_value_estimator.

    Returns:
        float: the estimated value of the target policy.

    Raises:
        ValueError: if target_bandit.predict_proba does not return an (m x number of models) matrix.
    '''
    # Sorted actions give the column order: column j belongs to the j-th smallest action.
    all_actions = sorted(action_to_model_dict)
    num_actions = len(all_actions)
    predicted_rewards = np.empty((context_test.shape[0],num_actions))
    for column, action in enumerate(all_actions):
        model = action_to_model_dict[action]
        predicted_rewards[:,column] = model.predict(context_test)

    target_action_distribution = np.asarray(target_bandit.predict_proba(context_test))
    # Fail loudly instead of letting numpy broadcast a mis-shaped distribution.
    if target_action_distribution.shape != predicted_rewards.shape:
        raise ValueError(
            f'target action distribution has shape {target_action_distribution.shape}, '
            f'expected {predicted_rewards.shape}')
    val_est = (predicted_rewards*target_action_distribution).sum(axis=1).mean()
    return val_est
    

In [45]:
# Load the engineered map-pick table and split it chronologically (no shuffling) 80/20.
data_directory = './csgo_clean/'
map_pick_context = create_basic_triples(data_directory)
map_pick_context_train, map_pick_context_test = train_test_split(
    map_pick_context,
    test_size=.2,
    train_size=.8,
    shuffle=False,
)

# The context consists of the per-map availability flags.
cols = [name for name in map_pick_context if name.endswith('is_available')]


def _to_arrays(frame):
    """Return (context matrix, actions, rewards) as numpy arrays for one split."""
    return frame[cols].values, frame['X_Action'].values, frame['Y_reward'].values


X_train, A_train, Y_train = _to_arrays(map_pick_context_train)
X_test, A_test, Y_test = _to_arrays(map_pick_context_test)




Finished Basic Context Engineering


In [46]:
n_arms = 7  # one arm per map; the context has seven `*_is_available` columns

# randomly chosen by dice roll
n = 4569

# Context dimensionality comes from the training matrix; step_size is the update step.
bandit = Bandit(X_train.shape[1], n_arms, step_size=0.01)

print('Probabilities for random row: ')
# `X` is never defined in this notebook. The split above uses shuffle=False, so row n of
# the training matrix is the same row of the unsplit data as long as n < len(X_train).
print(bandit.predict_proba(X_train[n]))

Probabilities for random row: 
[[0.2 0.2 0.  0.2 0.2 0.2 0. ]]


In [47]:
# Build the logging policy from the training table and its logged actions.
lp = LoggingPolicy(
    map_pick_context_train,
    map_pick_context_train['X_Action'],
)

In [48]:
# One update per logged (context, action, reward) triple, in logged order.
for row_context, row_action, row_reward in zip(X_train, A_train, Y_train):
    bandit.update_theta(row_context.reshape(1, -1), row_action, row_reward)

In [49]:
# `X`, `A` and `Y` are never defined in this notebook; use the training arrays, whose
# row n is the same example because the train/test split is not shuffled.
print('Probabilities for random row after training: ')
print(bandit.predict_proba(X_train[n]))
print('Selected action: ')
print(bandit.predict(X_train[n]))

print('Actual action and reward for random row: ')
print(f'Action: {A_train[n]}, reward: {Y_train[n]}')

Probabilities for random row after training: 
[[0.16526486 0.2547316  0.         0.26307424 0.1534473  0.163482
  0.        ]]
Selected action: 
[3]
Actual action and reward for random row: 
Action: 4, reward: 0


In [59]:
# Rewards live in `Y_train`; `R_train` is never defined in this notebook.
action_to_model_dict = train_value_estimator(
    X_train,
    map_pick_context_train,
    A_train,
    Y_train,
    log_policy=lp,
    target_bandit=bandit,
)

AssertionError: 

In [51]:
# Display the training context matrix (the `*_is_available` columns of the training split).
X_train

array([[1, 1, 1, ..., 0, 1, 0],
       [0, 1, 1, ..., 0, 1, 0],
       [0, 1, 1, ..., 0, 1, 1],
       ...,
       [1, 0, 1, ..., 0, 1, 1],
       [0, 0, 1, ..., 0, 1, 1],
       [1, 1, 0, ..., 1, 1, 0]])

In [60]:
# Display the training split of the map-pick table that is passed to the logging policy.
map_pick_context_train

Unnamed: 0,MatchId,de_dust2_is_available,de_inferno_is_available,de_mirage_is_available,de_nuke_is_available,de_overpass_is_available,de_train_is_available,de_vertigo_is_available,DecisionTeamId,OtherTeamId,DecisionOrder,X_Action,Y_reward
2,4,1,1,1,1,0,1,0,12,6,3,0,0
3,4,0,1,1,1,0,1,0,6,12,4,3,1
9,5,0,1,1,1,0,1,1,9,5,3,6,1
10,5,0,1,1,1,0,1,0,5,9,4,3,0
16,7,1,1,1,1,0,1,0,4,11,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20052,5054,1,0,0,1,0,1,1,13,528,4,0,1
20055,5054,0,0,0,0,0,0,1,13,528,7,6,1
20058,5055,1,0,1,1,0,1,1,123,53,3,0,1
20059,5055,0,0,1,1,0,1,1,53,123,4,5,0
