In [1]:
import context_engineering_functions as cef
import pickle
import pandas as pd

In [2]:
data = '../data/clean/'

basic_picks, basic_vetos = cef.create_basic_pick_veto_triples(data)
proportion_picks, proportion_vetos = cef.create_basic_pick_veto_triples(data, pick_reward_function=cef.get_proportion_rewards)

# Added
proportion_picks['Y_reward_01'] = basic_picks['Y_reward']
proportion_vetos['Y_reward_01'] = basic_vetos['Y_reward']
basic_picks['Y_reward_01'] = basic_picks['Y_reward']
basic_vetos['Y_reward_01'] = basic_vetos['Y_reward']

In [3]:
basic_vetos[basic_vetos.isna().any(axis=1)]

Unnamed: 0,MatchId,de_dust2_is_available,de_inferno_is_available,de_mirage_is_available,de_nuke_is_available,de_overpass_is_available,de_train_is_available,de_vertigo_is_available,DecisionTeamId,OtherTeamId,...,OtherTeam_de_inferno_WinPct,OtherTeam_de_mirage_WinPct,OtherTeam_de_nuke_WinPct,OtherTeam_de_overpass_WinPct,OtherTeam_de_train_WinPct,OtherTeam_de_vertigo_WinPct,DecisionOrder,X_Action,Y_reward,Y_reward_01
2516,1236,1,1,1,1,1,1,1,79,51,...,,,,,0.545455,,1,5,0.5,0.5


In [4]:
basic_vetos.dropna(inplace=True)
proportion_vetos.dropna(inplace=True)

In [None]:
only_basic_picks.shape, only_proportion_picks.shape

In [None]:
basic_picks.shape, basic_vetos.shape

In [None]:
proportion_picks.shape, proportion_vetos.shape

In [None]:
proportion_picks.head()

In [5]:
from bandit import Bandit, VetoBandit, ComboBandit, EpisodicBandit, BothBandit

In [9]:
# bandit_types = [(Bandit, None),
#                 (Bandit, VetoBandit),
#                 (ComboBandit, None),
#                 (EpisodicBandit, None),
#                 ]

datasets = {#'basic': (basic_picks, None),
            #'proportion': (proportion_picks, None),
            'basic_veto': (basic_picks, basic_vetos),
            'proportion_veto': (proportion_picks, proportion_vetos),
            }

contexts = [# 'maps_only',
            #'winprob',
            #'map_winprob',
            'both',
            ]

baselines = [True]

step_sizes = [ # 5e-6,
              1e-5, # 5e-5,  # better for basic
#               1e-4, # 5e-4,
#               1e-3, # 5e-3,
#               1e-2, 5e-2,1e-1  # better for proportion
              ]

epochs = [1#,3
          # 3, 5, 10,
          ]

def get_cols(x):
    cols = ['de_dust2_is_available',
            'de_inferno_is_available',
            'de_mirage_is_available',
            'de_nuke_is_available',
            'de_overpass_is_available',
            'de_train_is_available',
            'de_vertigo_is_available',
            ]

    if x in ('winprob', 'both'):
        cols.extend(['DecisionTeam_WinPercent',
                     'OtherTeam_WinPercent'])
    if x in ('map_winprob', 'both'):
        cols.extend([col for col in proportion_picks.columns if 'WinPct' in col])
    return cols

In [10]:
from logging_policy import LoggingPolicy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from evaluation import train_value_estimator, evaluate
results=[]

In [11]:
# BOTH BANDIT!

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        # Reverse sign of veto rewards
        veto_df['Y_reward'] = veto_df['Y_reward']*-1
        veto_df['Y_reward_01'] = veto_df['Y_reward_01']*-1
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test, Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                            Y.values, Y_01.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values

                    bandit = BothBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
#                     for _ in range(n_epochs):
#                         for i in range(X.shape[0]):
#                             bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
#                         actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train_01,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result_veto = evaluate(X_test[X_test['action_type']=='veto'][cols].values,
                                      X_test[X_test['action_type']=='veto'],
                                      A_test[X_test['action_type']=='veto'],
                                      Y_test_01[X_test['action_type']=='veto'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='veto']['action_type']
                                      )
                    
                    result_pick = evaluate(X_test[X_test['action_type']=='pick'][cols].values,
                                      X_test[X_test['action_type']=='pick'],
                                      A_test[X_test['action_type']=='pick'],
                                      Y_test_01[X_test['action_type']=='pick'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='pick']['action_type']
                                      )
                    
                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'bothbandit-pick'
                    result_pick['action_type'] = 'pick'
                    
                    result_veto['dataset'] = dataset
                    result_veto['context'] = context
                    result_veto['baseline'] = baseline
                    result_veto['step_size'] = step_size
                    result_veto['n_epochs_actual'] = actual_epochs_trained
                    result_veto['bandit_type'] = 'bothbandit-veto'
                    result_veto['action_type'] = 'veto'
                    
                    results.append(result_pick)
                    results.append(result_veto)

Training combo bandit for 1 epochs, with baseline, step size 1e-05, context both, dataset basic_veto
Training combo bandit for 1 epochs, with baseline, step size 1e-05, context both, dataset proportion_veto


In [12]:
results

[{'mean': 0.5489530277306168,
  'IW': 1.1877404394878919,
  'SN_IW': 0.568256941912775,
  'Direct_Method_IW': 0.5417076336946948,
  'dataset': 'basic_veto',
  'context': 'both',
  'baseline': True,
  'step_size': 1e-05,
  'n_epochs_actual': 0,
  'bandit_type': 'bothbandit-pick',
  'action_type': 'pick'},
 {'mean': -0.014406260930395242,
  'IW': -0.05313593936678146,
  'SN_IW': -0.01797692646400901,
  'Direct_Method_IW': -0.0033076278684943303,
  'dataset': 'basic_veto',
  'context': 'both',
  'baseline': True,
  'step_size': 1e-05,
  'n_epochs_actual': 0,
  'bandit_type': 'bothbandit-veto',
  'action_type': 'veto'},
 {'mean': 0.5489530277306168,
  'IW': 1.1877404394878919,
  'SN_IW': 0.568256941912775,
  'Direct_Method_IW': 0.5417076336946948,
  'dataset': 'proportion_veto',
  'context': 'both',
  'baseline': True,
  'step_size': 1e-05,
  'n_epochs_actual': 0,
  'bandit_type': 'bothbandit-pick',
  'action_type': 'pick'},
 {'mean': 0.014406260930395242,
  'IW': 0.05313593936678146,
  'S

In [None]:
# simple bandit

results = []

for dataset, (pick_df, veto_df) in datasets.items():

    # Skip vetoes here
    if dataset in ('basic_veto', 'proportion_veto'):
        continue
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    # Added
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test,Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                Y.values, Y_01.values, test_size=0.2, random_state=13)
    
    lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training simple bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values # numpy array for bandit

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X.shape[0]):
                            bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
                        actual_epochs_trained += 1

                    # eval on test set
                    try:
                        action_to_model_dict = train_value_estimator(X,
                                                                     X_train,
                                                                     A_train,
                                                                     Y_train_01,
                                                                     log_policy=lp,
                                                                     target_bandit=bandit,
                                                                     )

                        result = evaluate(X_test[cols].values,
                                          X_test,
                                          A_test,
                                          Y_test_01,
                                          log_policy=lp,
                                          target_bandit=bandit,
                                          action_to_model_dict=action_to_model_dict,
                                          )
                    except ValueError:
                        result = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'simplebandit'

                    results.append(result)

In [None]:
results

In [None]:
# SAVE Results
with open('results-simple.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
# LOAD Results
with open('results-simple.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
## split bandit

for dataset, (pick_df, veto_df) in datasets.items():
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']

    # Added
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test,Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                Y.values, Y_01.values, test_size=0.2, random_state=13)

    if dataset in ('basic_veto', 'proportion_veto'):
        # Added
        X_veto = veto_df
        A_veto = veto_df['X_Action']
        Y_veto = veto_df['Y_reward']
        Y_veto_01 = veto_df['Y_reward_01']
        (X_veto_train, X_veto_test,
         A_veto_train, A_veto_test,
         Y_veto_train, Y_veto_test,
        Y_veto_train_01, Y_veto_test_01) = train_test_split(X_veto, A_veto.values, Y_veto.values, Y_veto_01.values, test_size=0.2, random_state=13)
        # LP takes vetoes too
        lp = LoggingPolicy(X_train, X_train['X_Action'],X_veto_train,X_veto_train['X_Action'])
    else:
        continue
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training split bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        X_veto = X_veto_train[cols].values
                        X_veto_train.reset_index(drop=True, inplace=True)

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)

                    veto_bandit = VetoBandit(n_features,
                                             n_arms=7,
                                             step_size=step_size,
                                             baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        # for i in range(X.shape[0]):
                        #     bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
                        if veto_df is not None:
                            for matchid in veto_df['MatchId'].unique():
                                indices = X_veto_train[X_veto_train['MatchId'] == matchid].index
                                veto_bandit.update_theta(X_veto[indices],
                                                         A_veto_train[indices],
                                                         Y_veto_train[indices])
                        actual_epochs_trained += 1

                    # eval on test set
                        #action_to_model_dict = train_value_estimator(X,
                        #                                             X_train,
                        #                                             A_train,
                        #                                             Y_train_01,
                        #                                             log_policy=lp,
                        #                                             target_bandit=bandit,
                        #                                             )

                    result_pick = {}

                        #result_pick = evaluate(X_test[cols].values,
                        #                       X_test,
                        #                       A_test,
                        #                       Y_test_01,
                        #                       log_policy=lp,
                        #                       target_bandit=bandit,
                        #                       action_to_model_dict=action_to_model_dict,
                        #                       )
                        
                    if dataset in ('basic_veto', 'proportion_veto'):

                        action_to_model_dict = train_value_estimator(X_veto,
                                                                     X_veto_train,
                                                                     A_veto_train,
                                                                     Y_veto_train_01,
                                                                     log_policy=lp,
                                                                     target_bandit=veto_bandit,
                                                                     veto_flags=pd.Series(['veto']*X_veto_train.shape[0])
                                                                     )

                        result_veto = evaluate(X_veto_test[cols].values,
                                               X_veto_test,
                                               A_veto_test,
                                               Y_veto_test_01,
                                               log_policy=lp,
                                               target_bandit=veto_bandit,
                                               action_to_model_dict=action_to_model_dict,
                                               veto_flags=pd.Series(['veto']*X_veto_test.shape[0])
                                               )

                    # result_pick = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}
                    # I commented this out because it is bad?
#                     if dataset in ('basic_veto', 'proportion_veto'):
#                         result_veto = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'splitbandit-pick'
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        result_veto['dataset'] = dataset
                        result_veto['context'] = context
                        result_veto['baseline'] = baseline
                        result_veto['step_size'] = step_size
                        result_veto['n_epochs_actual'] = actual_epochs_trained
                        result_veto['bandit_type'] = 'splitbandit-veto'

                    # results.append(result_pick)
                    if dataset in ('basic_veto', 'proportion_veto'):
                        results.append(result_veto)

In [None]:
basic_vetos['Y_reward '] = -1*basic_vetos['Y_reward ']

In [None]:
basic_vetos['Y_reward'] = -1*basic_vetos['Y_reward']
basic_vetos

In [None]:
with open('results-split.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('results-split.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# BOTH BANDIT!

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        # Reverse sign of veto rewards
        veto_df['Y_reward'] = veto_df['Y_reward']*-1
        veto_df['Y_reward_01'] = veto_df['Y_reward_01']*-1
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test, Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                            Y.values, Y_01.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values

                    bandit = BothBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X.shape[0]):
                            bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
                        actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train_01,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result_veto = evaluate(X_test[X_test['action_type']=='veto'][cols].values,
                                      X_test[X_test['action_type']=='veto'],
                                      A_test[X_test['action_type']=='veto'],
                                      Y_test_01[X_test['action_type']=='veto'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='veto']['action_type']
                                      )
                    
                    result_pick = evaluate(X_test[X_test['action_type']=='pick'][cols].values,
                                      X_test[X_test['action_type']=='pick'],
                                      A_test[X_test['action_type']=='pick'],
                                      Y_test_01[X_test['action_type']=='pick'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='pick']['action_type']
                                      )
                    
                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'bothbandit-pick'
                    result_pick['action_type'] = 'pick'
                    
                    result_veto['dataset'] = dataset
                    result_veto['context'] = context
                    result_veto['baseline'] = baseline
                    result_veto['step_size'] = step_size
                    result_veto['n_epochs_actual'] = actual_epochs_trained
                    result_veto['bandit_type'] = 'bothbandit-veto'
                    result_veto['action_type'] = 'veto'
                    
                    results.append(result_pick)
                    results.append(result_veto)

In [None]:
# Picks only BANDIT!

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        # Reverse sign of veto rewards
        veto_df['Y_reward'] = veto_df['Y_reward']*-1
        veto_df['Y_reward_01'] = veto_df['Y_reward_01']*-1
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test, Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                            Y.values, Y_01.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    X_picks = X_train[X_train['action_type']=='pick'][cols].values
                    A_picks = A_train[X_train['action_type']=='pick']
                    Y_picks = Y_train[X_train['action_type']=='pick']
                    
                    bandit = BothBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X_picks.shape[0]):
                            bandit.update_theta(X_picks[i].reshape(1, -1), A_picks[i], Y_picks[i])
                        actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train_01,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result_veto = evaluate(X_test[X_test['action_type']=='veto'][cols].values,
                                      X_test[X_test['action_type']=='veto'],
                                      A_test[X_test['action_type']=='veto'],
                                      Y_test_01[X_test['action_type']=='veto'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='veto']['action_type']
                                      )
                    
                    result_pick = evaluate(X_test[X_test['action_type']=='pick'][cols].values,
                                      X_test[X_test['action_type']=='pick'],
                                      A_test[X_test['action_type']=='pick'],
                                      Y_test_01[X_test['action_type']=='pick'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='pick']['action_type']
                                      )
                    
                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'bothbandit-pick'
                    result_pick['action_type'] = 'pick'
                    
                    result_veto['dataset'] = dataset
                    result_veto['context'] = context
                    result_veto['baseline'] = baseline
                    result_veto['step_size'] = step_size
                    result_veto['n_epochs_actual'] = actual_epochs_trained
                    result_veto['bandit_type'] = 'bothbandit-veto'
                    result_veto['action_type'] = 'veto'
                    
                    results.append(result_pick)
                    results.append(result_veto)

In [None]:
# Veto only BANDIT!

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        # Reverse sign of veto rewards
        veto_df['Y_reward'] = veto_df['Y_reward']*-1
        veto_df['Y_reward_01'] = veto_df['Y_reward_01']*-1
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test, Y_train_01, Y_test_01 = train_test_split(X, A.values, \
                                                                            Y.values, Y_01.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    X_veto = X_train[X_train['action_type']=='veto'][cols].values
                    A_veto = A_train[X_train['action_type']=='veto']
                    Y_veto = Y_train[X_train['action_type']=='veto']
                    
                    bandit = BothBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X_veto.shape[0]):
                            bandit.update_theta(X_veto[i].reshape(1, -1), A_veto[i], Y_veto[i])
                        actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train_01,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result_veto = evaluate(X_test[X_test['action_type']=='veto'][cols].values,
                                      X_test[X_test['action_type']=='veto'],
                                      A_test[X_test['action_type']=='veto'],
                                      Y_test_01[X_test['action_type']=='veto'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='veto']['action_type']
                                      )
                    
                    result_pick = evaluate(X_test[X_test['action_type']=='pick'][cols].values,
                                      X_test[X_test['action_type']=='pick'],
                                      A_test[X_test['action_type']=='pick'],
                                      Y_test_01[X_test['action_type']=='pick'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='pick']['action_type']
                                      )
                    
                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'bothbandit-pick'
                    result_pick['action_type'] = 'pick'
                    
                    result_veto['dataset'] = dataset
                    result_veto['context'] = context
                    result_veto['baseline'] = baseline
                    result_veto['step_size'] = step_size
                    result_veto['n_epochs_actual'] = actual_epochs_trained
                    result_veto['bandit_type'] = 'bothbandit-veto'
                    result_veto['action_type'] = 'veto'
                    
                    results.append(result_pick)
                    results.append(result_veto)

In [None]:
results

In [None]:
df_results = pd.DataFrame.from_records(results)
df_results.dropna(inplace=True)
df_results[df_results['dataset']=='basic_veto']

In [None]:
filtered = df_results[(df_results['bandit_type'] == 'episodictbandit-pick')
                      & ~(df_results['dataset'].str.endswith('veto'))
                      & (df_results['dataset'].str.startswith('basic'))
                      ]


print(filtered.sort_values(by=['bandit_type', 'dataset', 'SN_IW'], ascending=False).head(1))
filtered = df_results[(df_results['bandit_type'] == 'episodictbandit-pick')
                      & ~(df_results['dataset'].str.endswith('veto'))
                      & ~(df_results['dataset'].str.startswith('basic'))
                      ]


print(filtered.sort_values(by=['bandit_type', 'dataset', 'SN_IW'], ascending=False).head(1))
filtered = df_results[(df_results['bandit_type'] == 'episodictbandit-pick')
                      & (df_results['dataset'].str.endswith('veto'))
                      & (df_results['dataset'].str.startswith('basic'))
                      ]


print(filtered.sort_values(by=['bandit_type', 'dataset', 'SN_IW'], ascending=False).head(1))
filtered = df_results[(df_results['bandit_type'] == 'episodictbandit-pick')
                      & (df_results['dataset'].str.endswith('veto'))
                      & ~(df_results['dataset'].str.startswith('basic'))
                      ]


print(filtered.sort_values(by=['bandit_type', 'dataset', 'SN_IW'], ascending=False).head(1))

In [None]:
df_results[(df_results['bandit_type'] == 'splitbandit-pick')
                      & (df_results['dataset'].str.endswith('veto'))
                      & (df_results['dataset'].str.startswith('basic'))
                      ]

In [None]:
with open('results-combo-abridged.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('results-combo-abridged.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# episodic bandit

for dataset, (pick_df, veto_df) in datasets.items():
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    Y_01 = pick_df['Y_reward_01']
    
    # train test split - match-based since episodic
    
    train_matchids, test_matchids = train_test_split(X['MatchId'], test_size=0.2, random_state=13)
    
    
    X_train = X[X['MatchId'].isin(train_matchids)]
    A_train = A[X['MatchId'].isin(train_matchids)].values
    Y_train = Y[X['MatchId'].isin(train_matchids)].values
    Y_train_01 = Y_01[X['MatchId'].isin(train_matchids)].values
    
    X_test = X[X['MatchId'].isin(test_matchids)]
    A_test = A[X['MatchId'].isin(test_matchids)].values
    Y_test = Y[X['MatchId'].isin(test_matchids)].values
    Y_test_01 = Y_01[X['MatchId'].isin(test_matchids)].values

    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in [1e-5,1e-4,1e-3,5e-3]:
                for n_epochs in epochs:
                    print(f'Training episodic bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    X_train.reset_index(drop=True, inplace=True)

                    bandit = EpisodicBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for matchid in X_train['MatchId'].unique():
                            indices = X_train[X_train['MatchId'] == matchid].index

                            bandit.update_theta(X[indices], A_train[indices],
                                                Y_train[indices], action_types=X_train['action_type'].loc[indices].values)
                        actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train_01,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result_veto = evaluate(X_test[X_test['action_type']=='veto'][cols].values,
                                      X_test[X_test['action_type']=='veto'],
                                      A_test[X_test['action_type']=='veto'],
                                      Y_test_01[X_test['action_type']=='veto'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='veto']['action_type']
                                      )
                    
                    result_pick = evaluate(X_test[X_test['action_type']=='pick'][cols].values,
                                      X_test[X_test['action_type']=='pick'],
                                      A_test[X_test['action_type']=='pick'],
                                      Y_test_01[X_test['action_type']=='pick'],
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test[X_test['action_type']=='pick']['action_type']
                                      )
                    
                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'episodictbandit-pick'
                    result_pick['action_type'] = 'pick'
                    
                    result_veto['dataset'] = dataset
                    result_veto['context'] = context
                    result_veto['baseline'] = baseline
                    result_veto['step_size'] = step_size
                    result_veto['n_epochs_actual'] = actual_epochs_trained
                    result_veto['bandit_type'] = 'episodicbandit-veto'
                    result_veto['action_type'] = 'veto'
                    
                    results.append(result_pick)
                    results.append(result_veto)
#                         action_to_model_dict = train_value_estimator(X,
#                                                                      X_train,
#                                                                      A_train,
#                                                                      Y_train_01,
#                                                                      log_policy=lp,
#                                                                      target_bandit=bandit,
#                                                                      veto_flags=X_train['action_type']
#                                                                      )

#                         result = evaluate(X_test[cols].values,
#                                           X_test,
#                                           A_test,
#                                           Y_test_01,
#                                           log_policy=lp,
#                                           target_bandit=bandit,
#                                           action_to_model_dict=action_to_model_dict,
#                                           veto_flags=X_test['action_type']
#                                           )
#                     except ValueError:
#                         result = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

#                     result['dataset'] = dataset
#                     result['context'] = context
#                     result['baseline'] = baseline
#                     result['step_size'] = step_size
#                     result['n_epochs_actual'] = actual_epochs_trained
#                     result['bandit_type'] = 'episodicbandit'
                    
#                     results.append(result)

In [None]:
results

In [None]:
with open('results-episodic.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('results-episodic.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
with open('results-full.pckl', 'wb') as f:
    pickle.dump(results, f)