In [1]:
import context_engineering_functions as cef
import pickle
import pandas as pd

In [2]:
data = '../clean/'

only_basic_picks, _ = cef.create_basic_pick_veto_triples(data)
only_proportion_picks, _ = cef.create_basic_pick_veto_triples(data, pick_reward_function=cef.get_proportion_rewards)
basic_picks, basic_vetos = cef.create_basic_pick_veto_triples(data)
proportion_picks, proportion_vetos = cef.create_basic_pick_veto_triples(data, pick_reward_function=cef.get_proportion_rewards)

In [3]:
basic_vetos[basic_vetos.isna().any(axis=1)]

Unnamed: 0,MatchId,de_dust2_is_available,de_inferno_is_available,de_mirage_is_available,de_nuke_is_available,de_overpass_is_available,de_train_is_available,de_vertigo_is_available,DecisionTeamId,OtherTeamId,...,OtherTeam_de_dust2_WinPct,OtherTeam_de_inferno_WinPct,OtherTeam_de_mirage_WinPct,OtherTeam_de_nuke_WinPct,OtherTeam_de_overpass_WinPct,OtherTeam_de_train_WinPct,OtherTeam_de_vertigo_WinPct,DecisionOrder,X_Action,Y_reward
2516,1236,1,1,1,1,1,1,1,79,51,...,,,,,,0.545455,,1,5,0.5


In [4]:
basic_vetos.dropna(inplace=True)
proportion_vetos.dropna(inplace=True)

In [5]:
from bandit import Bandit, VetoBandit, ComboBandit, EpisodicBandit

In [6]:
# bandit_types = [(Bandit, None),
#                 (Bandit, VetoBandit),
#                 (ComboBandit, None),
#                 (EpisodicBandit, None),
#                 ]

datasets = {'basic': (only_basic_picks, None),
            'proportion': (only_proportion_picks, None),
            'basic_veto': (basic_picks, basic_vetos),
            'proportion_veto': (proportion_picks, proportion_vetos),
            }

contexts = [# 'maps_only',
            #'winprob',
            #'map_winprob',
            'both',
            ]

baselines = [False]

step_sizes = [1e-6, # 5e-6,
              ]

epochs = [1,
          # 3, 5, 10,
          ]

def get_cols(x):
    cols = ['de_dust2_is_available',
            'de_inferno_is_available',
            'de_mirage_is_available',
            'de_nuke_is_available',
            'de_overpass_is_available',
            'de_train_is_available',
            'de_vertigo_is_available',
            ]

    if x in ('winprob', 'both'):
        cols.extend(['DecisionTeam_WinPercent',
                     'OtherTeam_WinPercent'])
    if x in ('map_winprob', 'both'):
        cols.extend([col for col in proportion_picks.columns if 'WinPct' in col])
    return cols

In [7]:
from logging_policy import LoggingPolicy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from evaluation import train_value_estimator, evaluate

In [8]:
# simple bandit

results = []

for dataset, (pick_df, veto_df) in datasets.items():
    # Skip vetoes here
    if dataset in ('basic_veto', 'proportion_veto'):
        continue
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training simple bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values # numpy array for bandit

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)
                    # eval on test set

                    action_to_model_dict = train_value_estimator(X,
                                                                X_train,
                                                                A_train,
                                                                Y_train,
                                                                log_policy=lp,
                                                                target_bandit=bandit,
                                                                )

                    result = evaluate(X_test[cols].values,
                                          X_test,
                                          A_test,
                                          Y_test,
                                          log_policy=lp,
                                          target_bandit=bandit,
                                          action_to_model_dict=action_to_model_dict,
                                          )

                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = n_epochs
                    result['bandit_type'] = 'simplebandit'

                    results.append(result)

Training simple bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic
Training simple bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset proportion


In [9]:
pd.DataFrame.from_dict(results)

Unnamed: 0,mean,IW,SN_IW,Direct_Method_IW,dataset,context,baseline,step_size,n_epochs_actual,bandit_type
0,0.551685,1.270104,0.575141,0.540777,basic,both,False,1e-06,1,simplebandit
1,0.007557,0.020938,0.009482,0.014302,proportion,both,False,1e-06,1,simplebandit


In [10]:
## split bandit

for dataset, (pick_df, veto_df) in datasets.items():
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']

    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    if dataset in ('basic_veto', 'proportion_veto'):
    
        X_veto = veto_df
        A_veto = veto_df['X_Action']
        Y_veto = veto_df['Y_reward']

        (X_veto_train, X_veto_test,
         A_veto_train, A_veto_test,
         Y_veto_train, Y_veto_test) = train_test_split(X_veto, A_veto.values, Y_veto.values, test_size=0.2, random_state=13)
        # LP takes vetoes too
        lp = LoggingPolicy(X_train, X_train['X_Action'],X_veto_train,X_veto_train['X_Action'])
    else:
        continue
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training split bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        X_veto = X_veto_train[cols].values
                        X_veto_train.reset_index(drop=True, inplace=True)

                    bandit = Bandit(n_features,
                                    n_arms=7,
                                    step_size=step_size,
                                    baseline=baseline)

                    veto_bandit = VetoBandit(n_features,
                                             n_arms=7,
                                             step_size=step_size,
                                             baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 1
                    
                    # eval on test set
                        #action_to_model_dict = train_value_estimator(X,
                        #                                             X_train,
                        #                                             A_train,
                        #                                             Y_train,
                        #                                             log_policy=lp,
                        #                                             target_bandit=bandit,
                        #                                             )

                    result_pick = {}

                        #result_pick = evaluate(X_test[cols].values,
                        #                       X_test,
                        #                       A_test,
                        #                       Y_test,
                        #                       log_policy=lp,
                        #                       target_bandit=bandit,
                        #                       action_to_model_dict=action_to_model_dict,
                        #                       )
                        
                    if dataset in ('basic_veto', 'proportion_veto'):

                        action_to_model_dict = train_value_estimator(X_veto,
                                                                     X_veto_train,
                                                                     A_veto_train,
                                                                     Y_veto_train,
                                                                     log_policy=lp,
                                                                     target_bandit=veto_bandit,
                                                                     veto_flags=pd.Series(['veto']*X_veto_train.shape[0])
                                                                     )

                        result_veto = evaluate(X_veto_test[cols].values,
                                               X_veto_test,
                                               A_veto_test,
                                               Y_veto_test,
                                               log_policy=lp,
                                               target_bandit=veto_bandit,
                                               action_to_model_dict=action_to_model_dict,
                                               veto_flags=pd.Series(['veto']*X_veto_test.shape[0])
                                               )

                    # result_pick = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}
                    #if dataset in ('basic_veto', 'proportion_veto'):
                     #   result_veto = {'mean': None, 'IW': None, 'SN_IW': None, 'Direct_Method_IW': None}

                    result_pick['dataset'] = dataset
                    result_pick['context'] = context
                    result_pick['baseline'] = baseline
                    result_pick['step_size'] = step_size
                    result_pick['n_epochs_actual'] = actual_epochs_trained
                    result_pick['bandit_type'] = 'splitbandit-pick'
                    
                    if dataset in ('basic_veto', 'proportion_veto'):
                        result_veto['dataset'] = dataset
                        result_veto['context'] = context
                        result_veto['baseline'] = baseline
                        result_veto['step_size'] = step_size
                        result_veto['n_epochs_actual'] = actual_epochs_trained
                        result_veto['bandit_type'] = 'splitbandit-veto'

                    # results.append(result_pick)
                    if dataset in ('basic_veto', 'proportion_veto'):
                        results.append(result_veto)

Training split bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic_veto
Training split bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset proportion_veto


In [11]:
pd.DataFrame.from_dict(results)

Unnamed: 0,mean,IW,SN_IW,Direct_Method_IW,dataset,context,baseline,step_size,n_epochs_actual,bandit_type
0,0.551685,1.270104,0.575141,0.540777,basic,both,False,1e-06,1,simplebandit
1,0.007557,0.020938,0.009482,0.014302,proportion,both,False,1e-06,1,simplebandit
2,-0.006954,-0.053706,-0.020533,-0.003989,basic_veto,both,False,1e-06,1,splitbandit-veto
3,-0.006954,-0.053706,-0.020533,-0.003989,proportion_veto,both,False,1e-06,1,splitbandit-veto


In [12]:
# combo bandit

for dataset, (pick_df, veto_df) in datasets.items():
    
    if dataset not in ('basic_veto', 'proportion_veto'):
        continue
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split
    X_train, X_test, A_train, A_test, Y_train, Y_test = train_test_split(X, A.values, Y.values, test_size=0.2, random_state=13)
    
    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training combo bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values

                    bandit = ComboBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 1
                    


                    # eval on test set
                    action_to_model_dict = train_value_estimator(X,
                                                                 X_train,
                                                                 A_train,
                                                                 Y_train,
                                                                 log_policy=lp,
                                                                 target_bandit=bandit,
                                                                 veto_flags=X_train['action_type']
                                                                 )

                    result = evaluate(X_test[cols].values,
                                      X_test,
                                      A_test,
                                      Y_test,
                                      log_policy=lp,
                                      target_bandit=bandit,
                                      action_to_model_dict=action_to_model_dict,
                                      veto_flags=X_test['action_type']
                                      )
                    
                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'combobandit'
                    
                    results.append(result)

Training combo bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic_veto
Training combo bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset proportion_veto


In [13]:
pd.DataFrame.from_dict(results)

Unnamed: 0,mean,IW,SN_IW,Direct_Method_IW,dataset,context,baseline,step_size,n_epochs_actual,bandit_type
0,0.551685,1.270104,0.575141,0.540777,basic,both,False,1e-06,1,simplebandit
1,0.007557,0.020938,0.009482,0.014302,proportion,both,False,1e-06,1,simplebandit
2,-0.006954,-0.053706,-0.020533,-0.003989,basic_veto,both,False,1e-06,1,splitbandit-veto
3,-0.006954,-0.053706,-0.020533,-0.003989,proportion_veto,both,False,1e-06,1,splitbandit-veto
4,0.200781,0.420843,0.160313,0.204873,basic_veto,both,False,1e-06,1,combobandit
5,-0.003456,-0.023165,-0.008824,0.001929,proportion_veto,both,False,1e-06,1,combobandit


In [14]:
# episodic bandit

for dataset, (pick_df, veto_df) in datasets.items():
    
    pick_df['action_type'] = 'pick'
    
    if dataset in ('basic_veto', 'proportion_veto'):
        veto_df['action_type'] = 'veto'
        pick_df = pd.concat([pick_df, veto_df], axis=0, ignore_index=True)
    
    X = pick_df
    A = pick_df['X_Action']
    Y = pick_df['Y_reward']
    
    # train test split - match-based since episodic
    
    train_matchids, test_matchids = train_test_split(X['MatchId'], test_size=0.2, random_state=13)
    
    
    X_train = X[X['MatchId'].isin(train_matchids)]
    A_train = A[X['MatchId'].isin(train_matchids)].values
    Y_train = Y[X['MatchId'].isin(train_matchids)].values

    X_test = X[X['MatchId'].isin(test_matchids)]
    A_test = A[X['MatchId'].isin(test_matchids)].values
    Y_test = Y[X['MatchId'].isin(test_matchids)].values

    # LoggingPolicy either includes veto data or not
    if dataset in ('basic_veto', 'proportion_veto'):
        lp = LoggingPolicy(X_train[X_train['action_type']=='pick'], X_train[X_train['action_type']=='pick']['X_Action'], \
                       X_train[X_train['action_type']=='veto'], X_train[X_train['action_type']=='veto']['X_Action'])
    else:
        lp = LoggingPolicy(X_train, X_train['X_Action'])

    for context in contexts:
        for baseline in baselines:
            for step_size in step_sizes:
                for n_epochs in epochs:
                    print(f'Training episodic bandit for {n_epochs} epochs, {"with" if baseline else "without"} '
                          f'baseline, step size {step_size}, context {context}, dataset {dataset}')
                    
                    cols = get_cols(context)
                    
                    n_features = len(cols)
                    X = X_train[cols].values
                    X_train.reset_index(drop=True, inplace=True)

                    bandit = EpisodicBandit(n_features,
                                         n_arms=7,
                                         step_size=step_size,
                                         baseline=baseline)

                    # train bandit
                    
                    actual_epochs_trained = 1



                    # eval on test set

                    action_to_model_dict = train_value_estimator(X,
                                                                     X_train,
                                                                     A_train,
                                                                     Y_train,
                                                                     log_policy=lp,
                                                                     target_bandit=bandit,
                                                                     veto_flags=X_train['action_type']
                                                                     )

                    result = evaluate(X_test[cols].values,
                                          X_test,
                                          A_test,
                                          Y_test,
                                          log_policy=lp,
                                          target_bandit=bandit,
                                          action_to_model_dict=action_to_model_dict,
                                          veto_flags=X_test['action_type']
                                          )
                    
                    result['dataset'] = dataset
                    result['context'] = context
                    result['baseline'] = baseline
                    result['step_size'] = step_size
                    result['n_epochs_actual'] = actual_epochs_trained
                    result['bandit_type'] = 'episodicbandit'
                    
                    results.append(result)

Training episodic bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic
Training episodic bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset proportion
Training episodic bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset basic_veto
Training episodic bandit for 1 epochs, without baseline, step size 1e-06, context both, dataset proportion_veto


In [15]:
pd.DataFrame.from_dict(results)

Unnamed: 0,mean,IW,SN_IW,Direct_Method_IW,dataset,context,baseline,step_size,n_epochs_actual,bandit_type
0,0.551685,1.270104,0.575141,0.540777,basic,both,False,1e-06,1,simplebandit
1,0.007557,0.020938,0.009482,0.014302,proportion,both,False,1e-06,1,simplebandit
2,-0.006954,-0.053706,-0.020533,-0.003989,basic_veto,both,False,1e-06,1,splitbandit-veto
3,-0.006954,-0.053706,-0.020533,-0.003989,proportion_veto,both,False,1e-06,1,splitbandit-veto
4,0.200781,0.420843,0.160313,0.204873,basic_veto,both,False,1e-06,1,combobandit
5,-0.003456,-0.023165,-0.008824,0.001929,proportion_veto,both,False,1e-06,1,combobandit
6,0.559705,1.357502,0.579982,0.547084,basic,both,False,1e-06,1,episodicbandit
7,0.011847,0.017011,0.007268,0.007108,proportion,both,False,1e-06,1,episodicbandit
8,0.202015,0.48142,0.191587,0.201668,basic_veto,both,False,1e-06,1,episodicbandit
9,-0.002156,-0.019975,-0.007949,-0.003149,proportion_veto,both,False,1e-06,1,episodicbandit
