In [1]:
from bandit import Bandit
from context_engineering_functions import *
from logging_policy import LoggingPolicy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from evaluation import train_value_estimator, evaluate
import os

This notebook needs two objects: a `Bandit`, which holds the policy to evaluate, and a `LoggingPolicy`.
We estimate the policy's value using the Direct Method, in which we fit a ridge regression with importance weights to the reward function.



In [2]:
# Load the map-pick table; the helper also returns a vetoes-only table, which
# is loaded here but not used in the cells shown below.
data_directory = './csgo_clean/'
map_pick_context, vetoes_only_context = create_basic_pick_veto_triples(data_directory)

# Unshuffled 80/20 split, so row order is preserved: the leading 80% of rows
# become the training set and the trailing 20% the test set.
map_pick_context_train, map_pick_context_test = train_test_split(
    map_pick_context,
    test_size=.2,
    train_size=.8,
    shuffle=False,
)

# Feature columns are the ones whose name ends in 'is_available'.
cols = [name for name in map_pick_context if name.endswith('is_available')]

# Pull features (X), actions (A) and rewards (Y) out of each split as arrays.
X_train, A_train, Y_train = (
    map_pick_context_train[key].values for key in (cols, 'X_Action', 'Y_reward')
)
X_test, A_test, Y_test = (
    map_pick_context_test[key].values for key in (cols, 'X_Action', 'Y_reward')
)


In [3]:
# Number of arms handed to the Bandit constructor.
n_arms = 7

# Row index used for spot checks throughout the notebook. It was originally
# chosen at random and is hard-coded here.
n = 4569

# The Bandit is built from the feature-column count of X_train and n_arms.
bandit = Bandit(X_train.shape[1], n_arms, step_size=0.01)

# Show the policy's output for row n before any update has been applied.
print('Probabilities for random row: ')
initial_probs = bandit.predict_proba(X_train[n])
print(initial_probs)

Probabilities for random row: 
[[0.2 0.2 0.  0.2 0.2 0.2 0. ]]


In [4]:
# Logging policy object, constructed from the training rows together with
# their 'X_Action' column.
lp = LoggingPolicy(
    map_pick_context_train,
    map_pick_context_train['X_Action'],
)

In [5]:
# One pass over the training rows: each row's features (reshaped to a single
# 2-D row), action and reward are fed to one update_theta call.
# TODO: parameter tuning (further epochs are run in the "Multi-epoch loop" cell).
for context_row, action, reward in zip(X_train, A_train, Y_train):
    bandit.update_theta(context_row.reshape(1, -1), action, reward)

In [6]:
# Spot-check row n again, now that one pass of updates has been applied.
print(bandit.predict_proba(X_train[n]))

[[0.16526486 0.2547316  0.         0.26307424 0.1534473  0.163482
  0.        ]]


### Multi-epoch loop

In [7]:
# Run n_epochs further passes over the training rows. The bandit is not
# re-created here, so these updates stack on top of the single pass run in the
# earlier cell, and re-running this cell keeps updating the same object.
# TODO: parameter tuning
n_epochs = 10
for epoch in range(n_epochs):
    for context_row, action, reward in zip(X_train, A_train, Y_train):
        bandit.update_theta(context_row.reshape(1, -1), action, reward)

In [8]:
# Spot-check row n once more after the additional epochs.
print(bandit.predict_proba(X_train[n]))

[[0.15566783 0.25676085 0.         0.27968643 0.1492302  0.15865469
  0.        ]]


In [9]:
# Compare what the trained bandit outputs for row n with the action and reward
# recorded for that row in the training data.
spot_row = X_train[n]

print('Probabilities for random row after training: ')
print(bandit.predict_proba(spot_row))
print('Selected action: ')
print(bandit.predict(spot_row))

print('Actual action and reward for random row: ')
print(f'Action: {A_train[n]}, reward: {Y_train[n]}')

Probabilities for random row after training: 
[[0.15566783 0.25676085 0.         0.27968643 0.1492302  0.15865469
  0.        ]]
Selected action: 
[3]
Actual action and reward for random row: 
Action: 4, reward: 0


In [10]:
# Fit the value estimator on the training split, passing both the logging
# policy and the trained bandit. The result is later handed to evaluate().
# Presumably a mapping from action to fitted model, going by the name; confirm
# against evaluation.train_value_estimator.
action_to_model_dict = train_value_estimator(
    X_train,
    map_pick_context_train,
    A_train,
    Y_train,
    log_policy=lp,
    target_bandit=bandit,
)

In [11]:
# Baseline ("uniform policy"): a Bandit constructed with the same arguments as
# the trained one, but with no update_theta calls applied to it.
untrained_bandit = Bandit(X_train.shape[1], n_arms, step_size=0.01)

MHS: Nothing important is different below, except that I also added the test-data evaluation section.

### Eval on train data

In [12]:
# Evaluate the trained bandit on the training split, i.e. the same rows that
# were used to update it and to fit action_to_model_dict (in-sample).
print("Trained Bandit:")
evaluate(
    X_train,
    map_pick_context_train,
    A_train,
    Y_train,
    log_policy=lp,
    target_bandit=bandit,
    action_to_model_dict=action_to_model_dict,
)

Trained Bandit:


{'mean': 0.5495714285714286,
 'IW': 1.3000251226930468,
 'SN_IW': 0.5582780985246645,
 'Direct_Method_IW': 0.5583079306013885}

In [13]:
# Evaluate the untrained (uniform) baseline on the training split.
# NOTE(review): action_to_model_dict was fitted with target_bandit=bandit (the
# trained policy) and is reused here for a different target. If the fit depends
# on the target policy, this baseline may need its own estimator; confirm.
print("Uniform Policy:")
evaluate(
    X_train,
    map_pick_context_train,
    A_train,
    Y_train,
    log_policy=lp,
    target_bandit=untrained_bandit,
    action_to_model_dict=action_to_model_dict,
)

Uniform Policy:


{'mean': 0.5495714285714286,
 'IW': 1.254442289045975,
 'SN_IW': 0.5558585769907851,
 'Direct_Method_IW': 0.5559663867915816}

### Eval on test data

In [14]:
# Evaluate the trained bandit on the held-out test split. The logging policy
# (lp) and action_to_model_dict were both built from the training split only.
print("Trained Bandit:")
evaluate(
    X_test,
    map_pick_context_test,
    A_test,
    Y_test,
    log_policy=lp,
    target_bandit=bandit,
    action_to_model_dict=action_to_model_dict,
)

Trained Bandit:
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training

{'mean': 0.5362649914334666,
 'IW': 1.1751889281300871,
 'SN_IW': 0.5701120532080091,
 'Direct_Method_IW': 0.5596151524302753}

In [15]:
# Evaluate the untrained (uniform) baseline on the held-out test split.
# NOTE(review): action_to_model_dict was fitted with target_bandit=bandit (the
# trained policy) and is reused here for a different target. If the fit depends
# on the target policy, this baseline may need its own estimator; confirm.
print("Uniform Policy:")
evaluate(
    X_test,
    map_pick_context_test,
    A_test,
    Y_test,
    log_policy=lp,
    target_bandit=untrained_bandit,
    action_to_model_dict=action_to_model_dict,
)

Uniform Policy:
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 18 not seen during training. Using default policy.
Team ID 549 not seen during training. Using default policy.
Team ID 549 not seen during training

{'mean': 0.5362649914334666,
 'IW': 1.155900524260706,
 'SN_IW': 0.5674112357209264,
 'Direct_Method_IW': 0.5570846554003637}