In [1]:
import context_engineering_functions as cef
import pickle
import pandas as pd

In [2]:
data = 'csgo_map_picks/clean/'

only_basic_picks, _ = cef.create_basic_pick_veto_triples(data)
only_proportion_picks, _ = cef.create_basic_pick_veto_triples(data, pick_reward_function=cef.get_proportion_rewards)
basic_picks, basic_vetos = cef.create_basic_pick_veto_triples(data)
proportion_picks, proportion_vetos = cef.create_basic_pick_veto_triples(data, pick_reward_function=cef.get_proportion_rewards)

In [24]:
basic_vetos[basic_vetos.isna().any(axis=1)]

Unnamed: 0,MatchId,de_dust2_is_available,de_inferno_is_available,de_mirage_is_available,de_nuke_is_available,de_overpass_is_available,de_train_is_available,de_vertigo_is_available,DecisionTeamId,OtherTeamId,...,OtherTeam_de_inferno_WinPct,OtherTeam_de_mirage_WinPct,OtherTeam_de_nuke_WinPct,OtherTeam_de_overpass_WinPct,OtherTeam_de_train_WinPct,OtherTeam_de_vertigo_WinPct,DecisionOrder,X_Action,Y_reward,action_type
2516,1236,1,1,1,1,1,1,1,79,51,...,,,,,0.545455,,1,5,0.5,veto


In [25]:
basic_vetos.dropna(inplace=True)
proportion_vetos.dropna(inplace=True)

In [3]:
only_basic_picks.shape, only_proportion_picks.shape

((8751, 29), (8751, 29))

In [4]:
basic_picks.shape, basic_vetos.shape

((8751, 29), (14377, 29))

In [5]:
proportion_picks.shape, proportion_vetos.shape

((8751, 29), (14377, 29))

In [6]:
proportion_picks.head()

Unnamed: 0,MatchId,de_dust2_is_available,de_inferno_is_available,de_mirage_is_available,de_nuke_is_available,de_overpass_is_available,de_train_is_available,de_vertigo_is_available,DecisionTeamId,OtherTeamId,...,OtherTeam_de_dust2_WinPct,OtherTeam_de_inferno_WinPct,OtherTeam_de_mirage_WinPct,OtherTeam_de_nuke_WinPct,OtherTeam_de_overpass_WinPct,OtherTeam_de_train_WinPct,OtherTeam_de_vertigo_WinPct,DecisionOrder,X_Action,Y_reward
2,4,1,1,1,1,0,1,0,12,6,...,0.607143,0.622222,0.611111,0.507692,0.375,0.477273,0.609756,3,0,-0.055556
3,4,0,1,1,1,0,1,0,6,12,...,0.607143,0.622222,0.611111,0.507692,0.375,0.477273,0.609756,4,3,0.28
9,5,0,1,1,1,0,1,1,9,5,...,0.393939,0.5,0.357143,0.463415,0.58,0.571429,0.45,3,6,0.454545
10,5,0,1,1,1,0,1,0,5,9,...,0.393939,0.5,0.357143,0.463415,0.58,0.571429,0.45,4,3,-0.103448
16,7,1,1,1,1,0,1,0,4,11,...,0.470588,0.432432,0.5,0.592593,0.454545,0.411765,0.585366,3,1,0.28


In [7]:
from bandit import Bandit, VetoBandit, ComboBandit, EpisodicBandit

In [17]:
# bandit_types = [(Bandit, None),
#                 (Bandit, VetoBandit),
#                 (ComboBandit, None),
#                 (EpisodicBandit, None),
#                 ]

datasets = {# 'basic': (only_basic_picks, None),
            # 'proportion': (only_proportion_picks, None),
            'basic_veto': (basic_picks, basic_vetos),
            'proportion_veto': (proportion_picks, proportion_vetos),
            }

contexts = [# 'maps_only',
            #'winprob',
            #'map_winprob',
            'both',
            ]

baselines = [True, False]

step_sizes = [1e-6, # 5e-6,
              1e-5, # 5e-5,  # better for basic
              1e-4, # 5e-4,
              1e-3, # 5e-3,
              # 1e-2, 5e-2,  # better for proportion
              ]

epochs = [1,
          # 3, 5, 10,
          ]

def get_cols(x):
    cols = ['de_dust2_is_available',
            'de_inferno_is_available',
            'de_mirage_is_available',
            'de_nuke_is_available',
            'de_overpass_is_available',
            'de_train_is_available',
            'de_vertigo_is_available',
            ]

    if x in ('winprob', 'both'):
        cols.extend(['DecisionTeam_WinPercent',
                     'OtherTeam_WinPercent'])
    if x in ('map_winprob', 'both'):
        cols.extend([col for col in proportion_picks.columns if 'WinPct' in col])
    return cols

In [9]:
from logging_policy import LoggingPolicy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from evaluation import train_value_estimator, evaluate

In [9]:
# simple bandit

results = []

for dataset, (pick_df, veto_df) in datasets.items():

    # Skip vetoes here
    if dataset in ('basic_veto', 'proportion_veto'):
        continue
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training simple bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values # numpy array for bandit

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X.shape[0]):
                            bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
                        actual_epochs_trained += 1

                    # eval on test set
                    try:
                        action_to_model_dict = train_value_estimator(X,
                                                                     X_train,
                                                                     A_train,
                                                                     Y_train,
                                                                     log_policy=lp,
                                                                     target_bandit=bandit,
                                                                     )

                        result = evaluate(X_test[cols].values,
                                          X_test,
                                          A_test,
                                          Y_test,
                                          log_policy=lp,
                                          target_bandit=bandit,
                                          action_to_model_dict=action_to_model_dict,
                                          )
                    except ValueError:
                        result = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'simplebandit'

                    results.append(result)

Training simple bandit for 1 epochs, with baseline, step size 1e-06, context maps_only, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context maps_only, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context maps_only, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context maps_only, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context maps_only, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context maps_only, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 5e-06, context maps_only, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 5e-06, context maps_only, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 1e-05, context maps_only, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 1e-05, context map

Training simple bandit for 10 epochs, without baseline, step size 0.05, context maps_only, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 1e-06, context winprob, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context winprob, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context winprob, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context winprob, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context winprob, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context winprob, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 5e-06, context winprob, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 5e-06, context winprob, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 1e-05, context winprob, dataset

Training simple bandit for 1 epochs, with baseline, step size 1e-06, context map_winprob, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context map_winprob, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context map_winprob, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context map_winprob, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context map_winprob, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context map_winprob, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 5e-06, context map_winprob, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 5e-06, context map_winprob, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 1e-05, context map_winprob, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 

Training simple bandit for 5 epochs, without baseline, step size 0.05, context map_winprob, dataset basic
Training simple bandit for 10 epochs, without baseline, step size 0.05, context map_winprob, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 1e-06, context both, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context both, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context both, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context both, dataset basic
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context both, dataset basic
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context both, dataset basic
Training simple bandit for 5 epochs, with baseline, step size 5e-06, context both, dataset basic
Training simple bandit for 10 epochs, with baseline, step size 5e-06, context both, dataset basic
Training 

Training simple bandit for 3 epochs, with baseline, step size 1e-06, context maps_only, dataset proportion
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context maps_only, dataset proportion
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context maps_only, dataset proportion
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context maps_only, dataset proportion
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context maps_only, dataset proportion
Training simple bandit for 5 epochs, with baseline, step size 5e-06, context maps_only, dataset proportion
Training simple bandit for 10 epochs, with baseline, step size 5e-06, context maps_only, dataset proportion
Training simple bandit for 1 epochs, with baseline, step size 1e-05, context maps_only, dataset proportion
Training simple bandit for 3 epochs, with baseline, step size 1e-05, context maps_only, dataset proportion
Training simple bandit for 5 epochs

Training simple bandit for 3 epochs, without baseline, step size 0.05, context maps_only, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.05, context maps_only, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.05, context maps_only, dataset proportion
Training simple bandit for 1 epochs, with baseline, step size 1e-06, context winprob, dataset proportion
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context winprob, dataset proportion
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context winprob, dataset proportion
Training simple bandit for 10 epochs, with baseline, step size 1e-06, context winprob, dataset proportion
Training simple bandit for 1 epochs, with baseline, step size 5e-06, context winprob, dataset proportion
Training simple bandit for 3 epochs, with baseline, step size 5e-06, context winprob, dataset proportion
Training simple bandit for 5 epochs, with

Training simple bandit for 5 epochs, without baseline, step size 0.01, context winprob, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.01, context winprob, dataset proportion
Training simple bandit for 1 epochs, without baseline, step size 0.05, context winprob, dataset proportion
Training simple bandit for 3 epochs, without baseline, step size 0.05, context winprob, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.05, context winprob, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.05, context winprob, dataset proportion
Training simple bandit for 1 epochs, with baseline, step size 1e-06, context map_winprob, dataset proportion
Training simple bandit for 3 epochs, with baseline, step size 1e-06, context map_winprob, dataset proportion
Training simple bandit for 5 epochs, with baseline, step size 1e-06, context map_winprob, dataset proportion
Training simple bandit for 10

Training simple bandit for 3 epochs, without baseline, step size 0.005, context map_winprob, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.005, context map_winprob, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.005, context map_winprob, dataset proportion
Training simple bandit for 1 epochs, without baseline, step size 0.01, context map_winprob, dataset proportion
Training simple bandit for 3 epochs, without baseline, step size 0.01, context map_winprob, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.01, context map_winprob, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.01, context map_winprob, dataset proportion
Training simple bandit for 1 epochs, without baseline, step size 0.05, context map_winprob, dataset proportion
Training simple bandit for 3 epochs, without baseline, step size 0.05, context map_winprob, dataset proport

Training simple bandit for 1 epochs, without baseline, step size 0.005, context both, dataset proportion
Training simple bandit for 3 epochs, without baseline, step size 0.005, context both, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.005, context both, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.005, context both, dataset proportion
Training simple bandit for 1 epochs, without baseline, step size 0.01, context both, dataset proportion
Training simple bandit for 3 epochs, without baseline, step size 0.01, context both, dataset proportion
Training simple bandit for 5 epochs, without baseline, step size 0.01, context both, dataset proportion
Training simple bandit for 10 epochs, without baseline, step size 0.01, context both, dataset proportion
Training simple bandit for 1 epochs, without baseline, step size 0.05, context both, dataset proportion
Training simple bandit for 3 epochs, without baseline, ste

In [10]:
# SAVE Results
with open('results-simple.pckl', 'wb') as f:
    pickle.dump(results, f)

In [11]:
# LOAD Results
with open('results-simple.pckl', 'rb') as f:
    results = pickle.load(f)

In [29]:
## split bandit

for dataset, (pick_df, veto_df) in datasets.items():
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']

    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    if dataset in ('basic_veto', 'proportion_veto'):
    
        X_veto = veto_df
        A_veto = veto_df['X_Action']
        Y_veto = veto_df['Y_reward']

        (X_veto_train, X_veto_test,
         A_veto_train, A_veto_test,
         Y_veto_train, Y_veto_test) = train_test_split(X_veto, A_veto.values, Y_veto.values, test_size=0.2, random_state=13)
        # LP takes vetoes too
        lp = LoggingPolicy(X_train, X_train['X_Action'],X_veto_train,X_veto_train['X_Action'])
    else:
        continue
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training split bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        X_veto = X_veto_train[cols].values
                        X_veto_train.reset_index(drop=True, inplace=True)

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)

                    veto_bandit = VetoBandit(n_features,
                                             n_arms=7,
                                             step_size=step_size,
                                             baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        # for i in range(X.shape[0]):
                        #     bandit.update_theta(X[i].reshape(1, -1), A_train[i], Y_train[i])
                        if veto_df is not None:
                            for matchid in veto_df['MatchId'].unique():
                                indices = X_veto_train[X_veto_train['MatchId'] == matchid].index
                                veto_bandit.update_theta(X_veto[indices],
                                                         A_veto_train[indices],
                                                         Y_veto_train[indices])
                        actual_epochs_trained += 1

                    # eval on test set
                        #action_to_model_dict = train_value_estimator(X,
                        #                                             X_train,
                        #                                             A_train,
                        #                                             Y_train,
                        #                                             log_policy=lp,
                        #                                             target_bandit=bandit,
                        #                                             )

                    result_pick = {}

                        #result_pick = evaluate(X_test[cols].values,
                        #                       X_test,
                        #                       A_test,
                        #                       Y_test,
                        #                       log_policy=lp,
                        #                       target_bandit=bandit,
                        #                       action_to_model_dict=action_to_model_dict,
                        #                       )
                        
                    if dataset in ('basic_veto', 'proportion_veto'):

                        action_to_model_dict = train_value_estimator(X_veto,
                                                                     X_veto_train,
                                                                     A_veto_train,
                                                                     Y_veto_train,
                                                                     log_policy=lp,
                                                                     target_bandit=veto_bandit,
                                                                     veto_flags=pd.Series(['veto']*X_veto_train.shape[0])
                                                                     )

                        result_veto = evaluate(X_veto_test[cols].values,
                                               X_veto_test,
                                               A_veto_test,
                                               Y_veto_test,
                                               log_policy=lp,
                                               target_bandit=veto_bandit,
                                               action_to_model_dict=action_to_model_dict,
                                               veto_flags=pd.Series(['veto']*X_veto_test.shape[0])
                                               )

                    # result_pick = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}
                    if dataset in ('basic_veto', 'proportion_veto'):
                        result_veto = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'splitbandit-pick'
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        result_veto['dataset'] = dataset
                        result_veto['context'] = context
                        result_veto['baseline'] = baseline
                        result_veto['step_size'] = step_size
                        result_veto['n_epochs_actual'] = actual_epochs_trained
                        result_veto['bandit_type'] = 'splitbandit-veto'

                    # results.append(result_pick)
                    if dataset in ('basic_veto', 'proportion_veto'):
                        results.append(result_veto)

Training split bandit for 1 epochs, with baseline, step size 1e-06, context both, dataset basic_veto
Training split bandit for 1 epochs, with baseline, step size 1e-05, context both, dataset basic_veto
Training split bandit for 1 epochs, with baseline, step size 0.0001, context both, dataset basic_veto
Training split bandit for 1 epochs, with baseline, step size 0.001, context both, dataset basic_veto
Training split bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic_veto
Training split bandit for 1 epochs, without baseline, step size 1e-05, context both, dataset basic_veto
Training split bandit for 1 epochs, without baseline, step size 0.0001, context both, dataset basic_veto
Training split bandit for 1 epochs, without baseline, step size 0.001, context both, dataset basic_veto
Training split bandit for 1 epochs, with baseline, step size 1e-06, context both, dataset proportion_veto
Training split bandit for 1 epochs, with baseline, step size 1e-05, cont

In [30]:
with open('results-split.pckl', 'wb') as f:
    pickle.dump(results, f)

In [31]:
with open('results-split.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# combo bandit

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values

                    bandit = ComboBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for i in range(X.shape[0]):
                            bandit.update_theta(X[i].reshape(1, -1), A_train[i].reshape(1,),
                                                Y_train[i].reshape(1,),
                                                action_type=X_train['action_type'].values[i])
                        actual_epochs_trained += 1


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result = evaluate(X_test[cols].values,
                                      X_test,
                                      A_test,
                                      Y_test,
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test['action_type']
                                      )
                    
                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'combobandit'
                    
                    results.append(result)

Training combo bandit for 1 epochs, with baseline, step size 1e-06, context both, dataset basic_veto


In [None]:
with open('results-combo-abridged.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('results-combo-abridged.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# episodic bandit

for dataset, (pick_df, veto_df) in datasets.items():
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split - match-based since episodic
    
    train_matchids, test_matchids = train_test_split(X['MatchId'], test_size=0.2, random_state=13)
    
    
    X_train = X[X['MatchId'].isin(train_matchids)]
    A_train = A[X['MatchId'].isin(train_matchids)].values
    Y_train = Y[X['MatchId'].isin(train_matchids)].values

    X_test = X[X['MatchId'].isin(test_matchids)]
    A_test = A[X['MatchId'].isin(test_matchids)].values
    Y_test = Y[X['MatchId'].isin(test_matchids)].values

    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training episodic bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    X_train.reset_index(drop=True, inplace=True)

                    bandit = EpisodicBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 0
                    
                    for _ in range(n_epochs):
                        for matchid in X_train['MatchId'].unique():
                            indices = X_train[X_train['MatchId'] == matchid].index

                            bandit.update_theta(X[indices], A_train[indices],
                                                Y_train[indices], action_types=X_train['action_type'].loc[indices].values)
                        actual_epochs_trained += 1


                    # eval on test set
                    try:
                        action_to_model_dict = train_value_estimator(X,
                                                                     X_train,
                                                                     A_train,
                                                                     Y_train,
                                                                     log_policy=lp,
                                                                     target_bandit=bandit,
                                                                     veto_flags=X_train['action_type']
                                                                     )

                        result = evaluate(X_test[cols].values,
                                          X_test,
                                          A_test,
                                          Y_test,
                                          log_policy=lp,
                                          target_bandit=bandit,
                                          action_to_model_dict=action_to_model_dict,
                                          veto_flags=X_test['action_type']
                                          )
                    except ValueError:
                        result = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'episodicbandit'
                    
                    results.append(result)

In [None]:
with open('results-episodic.pckl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
with open('results-episodic.pckl', 'rb') as f:
    results = pickle.load(f)

In [None]:
with open('results-full.pckl', 'wb') as f:
    pickle.dump(results, f)